#!/usr/bin/env python3
import json, os, subprocess, zipfile
from pathlib import Path
root = Path(__file__).resolve().parent
src = (Path(__file__).resolve().parents[2] / 'shared_resources' / 'enwik8').resolve()
raw = src.read_bytes()
# Candidate patterns: XML-heavy + common wiki fragments.
candidates = [
    b"<page>", b"</page>", b"<title>", b"</title>", b"<id>", b"</id>",
    b"<revision>", b"</revision>", b"<timestamp>", b"</timestamp>",
    b"<contributor>", b"</contributor>", b"<username>", b"</username>",
    b"<text", b"</text>", b"<sha1>", b"</sha1>", b"<minor />",
    b"[[", b"]]", b"{{", b"}}", b"&quot;", b"&lt;", b"&gt;", b"&amp;",
    b" the ", b" and ", b" of ", b" to ", b" in ", b" is ", b" for ", b" on ",
    b"|", b"||", b"==", b"===", b"----", b"\n*", b"\n#", b"\n\n",
]
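# Note: the gain() score below is count * (len(pattern) - 1), so single-byte
# candidates such as b"|" can never be selected; only multi-byte fragments
# (XML tags, escaped entities like b"&quot;", frequent words) can pay off.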
used = set(raw)
unused = [b for b in range(256) if b not in used]
# Score by estimated byte gain if replaced with one byte.
def gain(p: bytes) -> int:
    c = raw.count(p)
    if c == 0:
        return 0
    return c * (len(p) - 1)
ranked = sorted([(gain(p), p) for p in candidates], reverse=True)
selected = [p for g, p in ranked if g > 20000][: len(unused)]
subs = list(zip(unused[: len(selected)], selected))
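# Worked example with a hypothetical count: if b"</page>" (7 bytes) occurred
# 12,000 times, its gain would be 12000 * (7 - 1) = 72,000 bytes of pre-xz
# shrinkage, comfortably above the 20,000-byte selection threshold.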
enc = raw
# Replace longer patterns first for determinism: a shorter candidate (e.g. b"==")
# must not break up occurrences of a longer one (e.g. b"===") before it is matched.
for code, pat in sorted(subs, key=lambda x: len(x[1]), reverse=True):
    enc = enc.replace(pat, bytes([code]))
(root / 'preprocessed.bin').write_bytes(enc)
archive = root / 'archive.xz'
cmd = [
    'xz', '-k', '-c', '-9e',
    '--lzma2=dict=512MiB,nice=273,mf=bt4,mode=normal,lc=4,lp=0,pb=0',
    str(root / 'preprocessed.bin'),
]
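# Option notes: -c streams the compressed output to stdout (captured into
# archive.xz below); in the --lzma2 string, dict sets the dictionary size,
# nice=273 the maximum "nice" match length, mf=bt4 the binary-tree match
# finder, and lc/lp/pb the literal-context and position bits (lc=4, pb=0 is
# a common tuning for plain-text inputs).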
with archive.open('wb') as f:
    subprocess.run(cmd, check=True, stdout=f)
# Build decompressor payload.
dec_dir = root / 'decompressor'
dec_dir.mkdir(exist_ok=True)
(dec_dir / 'decompress.py').write_text('''#!/usr/bin/env python3
import json, subprocess, sys
from pathlib import Path
inp = Path(sys.argv[1]) if len(sys.argv) > 1 else Path('archive.xz')
out = Path(sys.argv[2]) if len(sys.argv) > 2 else Path('enwik8.out')
work = inp.parent
pp = work / 'tmp.preprocessed.bin'
with pp.open('wb') as f:
    subprocess.run(['xz', '-d', '-c', str(inp)], check=True, stdout=f)
subs = json.loads((Path(__file__).with_name('subs.json')).read_text())
data = pp.read_bytes()
for code, pat_hex in subs:
    data = data.replace(bytes([code]), bytes.fromhex(pat_hex))
out.write_bytes(data)
pp.unlink(missing_ok=True)
''')
(dec_dir / 'subs.json').write_text(json.dumps([(c, p.hex()) for c, p in subs]))
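# subs.json stores (code, pattern-hex) pairs. Every code is a byte value that
# never occurs in enwik8 and every selected pattern consists only of bytes that
# do occur, so the reverse replacements in decompress.py cannot collide; the
# roundtrip check below verifies the reconstruction byte for byte.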
os.chmod(dec_dir / 'decompress.py', 0o755)
zip_path = root / 'decompressor.zip'
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=9) as z:
    z.write(dec_dir / 'decompress.py', arcname='decompress.py')
    z.write(dec_dir / 'subs.json', arcname='subs.json')
# Verify roundtrip.
out = root / 'enwik8.out'
subprocess.run(['python3', str(dec_dir / 'decompress.py'), str(archive), str(out)], check=True)
if out.read_bytes() != raw:
    raise SystemExit('roundtrip failed')
archive_b = archive.stat().st_size
dec_b = zip_path.stat().st_size
total = archive_b + dec_b
# Bits per character over the 10**8-byte enwik8 input.
bpc = round(8 * total / 1e8, 3)
res = {
    'agent_id': 'AutoZip',
    'experiment': 'Auto dictionary substitution + tuned xz',
    'method': 'dict-auto-xz',
    'archive_bytes': archive_b,
    'decompressor_zip_bytes': dec_b,
    'total_bytes': total,
    'bpc': bpc,
    'num_subs': len(subs),
    'notes': 'Candidate-based substitutions using bytes absent from enwik8; tuned xz backend.',
}
(root / 'results.json').write_text(json.dumps(res, indent=2) + '\n')
print(json.dumps(res))
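
For reference, the core substitution trick can be sketched in isolation. The snippet below is a minimal illustrative roundtrip on hypothetical toy data (the data and patterns are invented for the example and are not part of the build above); it mirrors what the script and decompress.py do with subs.json.

# Toy roundtrip of the unused-byte substitution scheme (illustrative only).
toy = b"<page><title>Example</title></page>" * 3
patterns = [b"</title>", b"<title>", b"</page>", b"<page>"]  # longest first
free = [b for b in range(256) if b not in set(toy)][: len(patterns)]
mapping = list(zip(free, patterns))
enc = toy
for code, pat in mapping:
    enc = enc.replace(pat, bytes([code]))
dec = enc
for code, pat in mapping:
    dec = dec.replace(bytes([code]), pat)
assert dec == toy and len(enc) < len(toy)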