Buckets:

alucchi's picture
download
raw
3.59 kB
#!/usr/bin/env python3
# Dictionary-substitution preprocessor + tuned xz pipeline for the enwik8 corpus:
# replace frequent multi-byte patterns with byte values absent from the corpus,
# compress the result, and package a self-contained decompressor.
import json, os, re, subprocess, zipfile
from collections import Counter
# NOTE(review): `re` and `Counter` appear unused in this file — confirm before removing.
from pathlib import Path
root = Path(__file__).resolve().parent
# enwik8 is expected two directories up, under shared_resources/.
src = (Path(__file__).resolve().parents[2] / 'shared_resources' / 'enwik8').resolve()
raw = src.read_bytes()
# Candidate patterns: XML-heavy + common wiki fragments.
# List order does not affect the outcome: candidates are re-ranked by
# estimated gain before any substitution is chosen.
candidates = [
    b"<page>", b"</page>", b"<title>", b"</title>", b"<id>", b"</id>",
    b"<revision>", b"</revision>", b"<timestamp>", b"</timestamp>",
    b"<contributor>", b"</contributor>", b"<username>", b"</username>",
    b"<text", b"</text>", b"<sha1>", b"</sha1>", b"<minor />",
    b"[[", b"]]", b"{{", b"}}", b"&quot;", b"&lt;", b"&gt;", b"&amp;",
    b" the ", b" and ", b" of ", b" to ", b" in ", b" is ", b" for ", b" on ",
    b"|", b"||", b"==", b"===", b"----", b"\n*", b"\n#", b"\n\n",
]
# Byte values that never occur in the corpus are free to serve as
# one-byte substitution codes (ascending order, like the original scan).
used = set(raw)
unused = sorted(set(range(256)) - used)
# Score by estimated byte gain if replaced with one byte.
def gain(p: bytes, data=None) -> int:
    """Estimate bytes saved by replacing every non-overlapping occurrence
    of pattern *p* with a single substitution byte.

    Parameters:
        p: candidate pattern to score.
        data: corpus to scan; defaults to the module-level ``raw`` so
            existing ``gain(p)`` call sites keep working.

    Returns:
        ``count * (len(p) - 1)`` — zero when the pattern never occurs.
    """
    if data is None:
        data = raw
    # bytes.count is non-overlapping, matching bytes.replace semantics.
    c = data.count(p)
    if c == 0:
        return 0
    return c * (len(p) - 1)
# Rank candidates by estimated savings, keep only the clear winners, and
# cap the selection by the number of free byte codes available.
scored = [(gain(pat), pat) for pat in candidates]
ranked = sorted(scored, reverse=True)
selected = [pat for score, pat in ranked if score > 20000][: len(unused)]
# zip truncates to the shorter sequence, so no explicit slice of `unused`
# is needed here.
subs = list(zip(unused, selected))
# Replace longer patterns first for determinism.
enc = raw
for code, pat in sorted(subs, key=lambda item: len(item[1]), reverse=True):
    enc = enc.replace(pat, bytes([code]))
# Persist the substituted corpus, then compress it with a tuned xz invocation.
pre_path = root / 'preprocessed.bin'
pre_path.write_bytes(enc)
archive = root / 'archive.xz'
xz_cmd = [
    'xz', '-k', '-c', '-9e',
    '--lzma2=dict=512MiB,nice=273,mf=bt4,mode=normal,lc=4,lp=0,pb=0',
    str(pre_path),
]
# xz writes the compressed stream to stdout (-c); capture it into the archive.
with archive.open('wb') as archive_f:
    subprocess.run(xz_cmd, check=True, stdout=archive_f)
# Build decompressor payload.
dec_dir = root / 'decompressor'
dec_dir.mkdir(exist_ok=True)
# The generated script is self-contained: it decompresses the archive to a
# temp file, undoes the byte substitutions using subs.json from its own
# directory, then removes the temp file.
# Fix: the original opened pp for writing as subprocess stdout without ever
# closing the handle (fd leak / ResourceWarning); use a `with` block instead.
(dec_dir / 'decompress.py').write_text('''#!/usr/bin/env python3
import json, subprocess, sys
from pathlib import Path
inp = Path(sys.argv[1]) if len(sys.argv) > 1 else Path('archive.xz')
out = Path(sys.argv[2]) if len(sys.argv) > 2 else Path('enwik8.out')
work = inp.parent
pp = work / 'tmp.preprocessed.bin'
with pp.open('wb') as f:
    subprocess.run(['xz','-d','-c',str(inp)], check=True, stdout=f)
subs = json.loads((Path(__file__).with_name('subs.json')).read_text())
data = pp.read_bytes()
for code, pat_hex in subs:
    data = data.replace(bytes([code]), bytes.fromhex(pat_hex))
out.write_bytes(data)
pp.unlink(missing_ok=True)
''')
# Substitution table shipped alongside the script: (code byte, pattern hex) pairs.
(dec_dir / 'subs.json').write_text(json.dumps([(c, p.hex()) for c, p in subs]))
os.chmod(dec_dir / 'decompress.py', 0o755)
# Package the decompressor; its zipped size counts toward the reported total.
zip_path = root / 'decompressor.zip'
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=9) as zf:
    for member in ('decompress.py', 'subs.json'):
        zf.write(dec_dir / member, arcname=member)
# Verify roundtrip.
out = root / 'enwik8.out'
decode_cmd = ['python3', str(dec_dir / 'decompress.py'), str(archive), str(out)]
subprocess.run(decode_cmd, check=True)
# The reconstructed corpus must match the original byte-for-byte.
if out.read_bytes() != raw:
    raise SystemExit('roundtrip failed')
# Summarize sizes and emit the results record (JSON file + stdout).
archive_b = archive.stat().st_size
dec_b = zip_path.stat().st_size
total = archive_b + dec_b
# enwik8 is 10^8 bytes, so bits-per-character = 8 * total_bytes / 1e8.
bpc = round(8 * total / 1e8, 3)
res = {
    'agent_id': 'AutoZip',
    'experiment': 'Auto dictionary substitution + tuned xz',
    'method': 'dict-auto-xz',
    'archive_bytes': archive_b,
    'decompressor_zip_bytes': dec_b,
    'total_bytes': total,
    'bpc': bpc,
    'num_subs': len(subs),
    'notes': 'Candidate-based substitutions using bytes absent from enwik8; tuned xz backend.',
}
(root / 'results.json').write_text(json.dumps(res, indent=2) + '\n')
print(json.dumps(res))

Xet Storage Details

Size:
3.59 kB
·
Xet hash:
144ebd855c215f1743ecf9de67c9b4eae9c9299ff5911958a465173bb2b5ce7e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.