Buckets:

alucchi's picture
download
raw
2.63 kB
#!/usr/bin/env python3
"""Dictionary-substitution preprocessor + tuned xz pipeline for enwik8.

Replaces frequent byte patterns with unused single-byte codes, compresses the
result with a tuned LZMA2 filter chain, and packages a standalone decompressor.
"""
# One import per line (PEP 8); grouped stdlib-only.
import json
import os
import subprocess
import zipfile
from pathlib import Path

root = Path(__file__).resolve().parent
# enwik8 corpus lives two directory levels up, in a shared location.
raw = (root.parents[1] / 'shared_resources' / 'enwik8').read_bytes()
# Seed from prior successful candidate list
seed=[b"<page>",b"</page>",b"<title>",b"</title>",b"<id>",b"</id>",b"<revision>",b"</revision>",b"<timestamp>",b"</timestamp>",b"<text",b"</text>",b"[[",b"]]",b"{{",b"}}",b"&quot;",b"&lt;",b"&gt;",b"&amp;",b" the ",b" and ",b" of ",b" to ",b" in ",b" is ",b" for ",b" on ",b"==",b"===",b"\n\n",b"\n*",b"\n#"]
# Keep only patterns that actually occur and have strong gain
cand = []
for p in seed:
    c = raw.count(p)
    # Gain = bytes saved: each occurrence shrinks from len(p) bytes to 1.
    g = c * (len(p) - 1)
    if c > 0 and g > 15000:
        cand.append((g, p))
# Optimize order: higher gain first, then longer pattern
cand = [p for _, p in sorted(cand, key=lambda x: (x[0], len(x[1])), reverse=True)]
# Hoist set(raw) out of the comprehension: building it inside the condition
# would rescan the entire corpus once per candidate byte value (up to 256x).
present = set(raw)
unused = [b for b in range(256) if b not in present]
# Pair each free byte code with a pattern; zip truncates to the shorter list.
subs = list(zip(unused[:len(cand)], cand[:len(unused)]))
# Apply substitutions in the optimized (gain-sorted) order, not length order.
enc = raw
for byte_code, pattern in subs:
    enc = enc.replace(pattern, bytes([byte_code]))
(root / 'preprocessed.bin').write_bytes(enc)
# Heavily tuned LZMA2 filter chain; stream xz's stdout straight into archive.xz.
xz_cmd = [
    'xz', '-k', '-c', '-9e',
    '--lzma2=dict=512MiB,nice=273,mf=bt4,mode=normal,lc=4,lp=0,pb=0',
    str(root / 'preprocessed.bin'),
]
with (root / 'archive.xz').open('wb') as archive_out:
    subprocess.run(xz_cmd, stdout=archive_out, check=True)
dec = root / 'decompressor'
dec.mkdir(exist_ok=True)
# Standalone decoder: xz -d the archive, then undo substitutions in reverse.
# FIX: the previous one-liner put out.write_bytes() and tmp.unlink() inside
# the for loop (output rewritten once per substitution, and never written at
# all when subs was empty); they now run once, after the loop. The temp-file
# handle is also closed deterministically via `with`.
(dec / 'decompress.py').write_text(
    "import json,subprocess,sys\n"
    "from pathlib import Path\n"
    "inp=Path(sys.argv[1])\n"
    "out=Path(sys.argv[2])\n"
    "tmp=inp.parent/'tmp.bin'\n"
    "with tmp.open('wb') as f:\n"
    "    subprocess.run(['xz','-d','-c',str(inp)],stdout=f,check=True)\n"
    "subs=json.loads(Path(__file__).with_name('subs.json').read_text())\n"
    "d=tmp.read_bytes()\n"
    "for c,h in reversed(subs):\n"
    "    d=d.replace(bytes([c]),bytes.fromhex(h))\n"
    "out.write_bytes(d)\n"
    "tmp.unlink(missing_ok=True)\n"
)
os.chmod(dec / 'decompress.py', 0o755)
# Substitution table shipped as JSON: (code, hex-encoded pattern) pairs.
(dec / 'subs.json').write_text(json.dumps([(c, p.hex()) for c, p in subs]))
# Bundle the decoder and its substitution table into one deflated zip.
with zipfile.ZipFile(root / 'decompressor.zip', 'w',
                     zipfile.ZIP_DEFLATED, compresslevel=9) as bundle:
    for member in ('decompress.py', 'subs.json'):
        bundle.write(dec / member, member)
# Round-trip check: the shipped decompressor must reproduce enwik8 exactly.
subprocess.run(['python3', str(dec / 'decompress.py'),
                str(root / 'archive.xz'), str(root / 'enwik8.out')], check=True)
# Explicit check rather than `assert`: assert is stripped under `python -O`,
# and a silent round-trip failure would invalidate every reported number.
if (root / 'enwik8.out').read_bytes() != raw:
    raise RuntimeError('round-trip verification failed: output != enwik8')
A = (root / 'archive.xz').stat().st_size
D = (root / 'decompressor.zip').stat().st_size
res = {
    'agent_id': 'AutoZip',
    'experiment': 'Order-optimized substitutions + tuned xz',
    'method': 'dict-order-opt-xz',
    'archive_bytes': A,
    'decompressor_zip_bytes': D,
    'total_bytes': A + D,
    # bits per character over the 10^8-byte corpus.
    'bpc': round(8 * (A + D) / 1e8, 3),
    'num_subs': len(subs),
}
(root / 'results.json').write_text(json.dumps(res, indent=2) + '\n')
print(json.dumps(res))

Xet Storage Details

Size:
2.63 kB
·
Xet hash:
544a5d345dcd8007f57f72d0c4f24e99e5311137dd77cd0a6a28c101f06a3d58

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.