Buckets:
ml-intern-explorers/hutter-prize-collab / artifacts /dict_greedy_xz_AutoZip_bounded /run_experiment.py
#!/usr/bin/env python3
"""Greedy dictionary-substitution preprocessor for enwik8 with a tuned xz backend.

Collects frequent byte patterns, greedily maps them to unused byte codes
(validated on a 2 MB calibration prefix), then compresses the result with xz.
"""
import json, re, subprocess, zipfile, os
from collections import Counter
from pathlib import Path

root = Path(__file__).resolve().parent
raw = (root.parents[1] / 'shared_resources' / 'enwik8').read_bytes()
cal = raw[:2_000_000]  # calibration prefix: cheap proxy for full-file gains

# Hoist the byte-presence set: evaluating `set(raw)` inside the comprehension
# condition would rescan the ~100 MB input once per candidate byte (256x).
present = set(raw)
unused = [b for b in range(256) if b not in present]

# latin1 maps every byte 0-255 to a code point, so errors='ignore' never fires.
text = raw.decode('latin1', errors='ignore')

# Hand-picked XML/wiki markup plus frequent English function words.
# NOTE(review): the scraped source showed mangled literals here (b""", b"<",
# b">"); restored as the XML entity forms common in enwik8 — confirm intent.
base_cands = [
    b"<page>", b"</page>", b"<title>", b"</title>", b"<id>", b"</id>",
    b"<revision>", b"</revision>", b"<text", b"</text>",
    b"[[", b"]]", b"{{", b"}}",
    b"&quot;", b"&lt;", b"&gt;",
    b" the ", b" and ", b" of ", b" to ", b" in ",
]
for w, _ in Counter(re.findall(r"[A-Za-z]{4,14}", text)).most_common(150):
    base_cands.append((" " + w + " ").encode('latin1'))

# Deduplicate while preserving first-seen order; keep lengths 3..16 bytes.
seen = set()
cands = []
for c in base_cands:
    if 3 <= len(c) <= 16 and c not in seen:
        seen.add(c)
        cands.append(c)

# Rank by estimated raw saving (occurrences * bytes saved each); keep top 60.
cands = sorted(cands, key=lambda p: raw.count(p) * (len(p) - 1), reverse=True)[:60]
# Aggressive LZMA2 settings tuned for English text: huge dictionary, maximal
# nice match length, bt4 match finder, 4 literal-context bits, 0 position bits.
xz = ['xz', '-k', '-c', '-9e',
      '--lzma2=dict=512MiB,nice=273,mf=bt4,mode=normal,lc=4,lp=0,pb=0']

def xzs(b):
    """Return the size in bytes of *b* after compression with the tuned xz."""
    proc = subprocess.run(xz, input=b, stdout=subprocess.PIPE, check=True)
    return len(proc.stdout)
# Greedy forward selection on the calibration prefix: a pattern is accepted
# only if swapping it for a spare byte code strictly shrinks the xz output.
accepted = []
cur = cal
best = xzs(cur)
for pat in cands:
    if len(accepted) >= len(unused):
        break  # every spare byte code is already assigned
    code = unused[len(accepted)]
    trial = cur.replace(pat, bytes([code]))
    trial_size = xzs(trial)
    if trial_size < best:
        accepted.append((code, pat))
        cur, best = trial, trial_size
# Apply the accepted substitutions to the full file, longest pattern first so
# a shorter pattern cannot break up occurrences of a longer one.
enc = raw
for code, pat in sorted(accepted, key=lambda item: len(item[1]), reverse=True):
    enc = enc.replace(pat, bytes([code]))
(root / 'preprocessed.bin').write_bytes(enc)

# Compress the preprocessed file to archive.xz with the tuned settings.
with (root / 'archive.xz').open('wb') as f:
    subprocess.run(xz + [str(root / 'preprocessed.bin')], stdout=f, check=True)
dec = root / 'decompressor'
dec.mkdir(exist_ok=True)
# Generated standalone decompressor. Fixes vs. the previous one-liner: the
# old `for c,h in subs: d=...; out.write_bytes(d); tmp.unlink(...)` body ran
# the write and unlink on EVERY iteration (and never wrote the output at all
# when the substitution list was empty), and the tmp file handle passed to
# subprocess was never closed. The script below writes once, after the loop.
(dec / 'decompress.py').write_text(
    "#!/usr/bin/env python3\n"
    "import json, subprocess, sys\n"
    "from pathlib import Path\n"
    "inp = Path(sys.argv[1])\n"
    "out = Path(sys.argv[2])\n"
    "tmp = inp.parent / 'tmp.bin'\n"
    "with tmp.open('wb') as fh:\n"
    "    subprocess.run(['xz', '-d', '-c', str(inp)], stdout=fh, check=True)\n"
    "subs = json.loads(Path(__file__).with_name('subs.json').read_text())\n"
    "d = tmp.read_bytes()\n"
    "for c, h in subs:\n"
    "    d = d.replace(bytes([c]), bytes.fromhex(h))\n"
    "out.write_bytes(d)\n"
    "tmp.unlink(missing_ok=True)\n"
)
os.chmod(dec / 'decompress.py', 0o755)
# Substitution table shipped alongside the script: (code, pattern-hex) pairs.
(dec / 'subs.json').write_text(json.dumps([(c, p.hex()) for c, p in accepted]))
with zipfile.ZipFile(root / 'decompressor.zip', 'w', zipfile.ZIP_DEFLATED, compresslevel=9) as z:
    z.write(dec / 'decompress.py', 'decompress.py')
    z.write(dec / 'subs.json', 'subs.json')
# Round-trip verification: run the shipped decompressor and require
# byte-exact equality with the original input.
subprocess.run(['python3', str(dec / 'decompress.py'),
                str(root / 'archive.xz'), str(root / 'enwik8.out')], check=True)
# Explicit check instead of `assert`, which is silently stripped under -O.
if (root / 'enwik8.out').read_bytes() != raw:
    raise RuntimeError('round-trip verification failed: enwik8.out != enwik8')

a = (root / 'archive.xz').stat().st_size
d = (root / 'decompressor.zip').stat().st_size
res = {
    'agent_id': 'AutoZip',
    'experiment': 'Bounded greedy substitutions (2MB cal) + tuned xz',
    'method': 'dict-greedy-xz-bounded',
    'archive_bytes': a,
    'decompressor_zip_bytes': d,
    'total_bytes': a + d,
    # bits per character over the 10^8-byte enwik8 input
    'bpc': round(8 * (a + d) / 1e8, 3),
    'num_subs': len(accepted),
}
(root / 'results.json').write_text(json.dumps(res, indent=2) + '\n')
(root / 'accepted_patterns.txt').write_text('\n'.join(repr(p) for _, p in accepted))
print(json.dumps(res))
Xet Storage Details
- Size: 3.03 kB
- Xet hash: cd49ce211f844532b6af248cecf6ff90bed42026784ee1af0baf4021943d48f0

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.