Buckets:

alucchi's picture
download
raw
3.59 kB
#!/usr/bin/env python3
# Dictionary-substitution preprocessor + tuned xz pipeline for the enwik8 corpus:
# replace frequent multi-byte patterns with byte values absent from the corpus,
# compress the result, and package a self-contained decompressor.
import json, os, re, subprocess, zipfile
from collections import Counter
# NOTE(review): `re` and `Counter` appear unused in this file — confirm before removing.
from pathlib import Path
root = Path(__file__).resolve().parent
# enwik8 is expected two directories up, under shared_resources/.
src = (Path(__file__).resolve().parents[2] / 'shared_resources' / 'enwik8').resolve()
raw = src.read_bytes()
# Candidate patterns: XML-heavy + common wiki fragments.
# List order does not affect the outcome: candidates are re-ranked by
# estimated gain before any substitution is chosen.
candidates = [
    b"<page>", b"</page>", b"<title>", b"</title>", b"<id>", b"</id>",
    b"<revision>", b"</revision>", b"<timestamp>", b"</timestamp>",
    b"<contributor>", b"</contributor>", b"<username>", b"</username>",
    b"<text", b"</text>", b"<sha1>", b"</sha1>", b"<minor />",
    b"[[", b"]]", b"{{", b"}}", b"&quot;", b"&lt;", b"&gt;", b"&amp;",
    b" the ", b" and ", b" of ", b" to ", b" in ", b" is ", b" for ", b" on ",
    b"|", b"||", b"==", b"===", b"----", b"\n*", b"\n#", b"\n\n",
]
# Byte values that never occur in the corpus are free to serve as
# one-byte substitution codes (ascending order, like the original scan).
used = set(raw)
unused = sorted(set(range(256)) - used)
# Score by estimated byte gain if replaced with one byte.
def gain(p: bytes, data=None) -> int:
    """Estimate bytes saved by replacing every non-overlapping occurrence
    of pattern *p* with a single substitution byte.

    Parameters:
        p: candidate pattern to score.
        data: corpus to scan; defaults to the module-level ``raw`` so
            existing ``gain(p)`` call sites keep working.

    Returns:
        ``count * (len(p) - 1)`` — zero when the pattern never occurs.
    """
    if data is None:
        data = raw
    # bytes.count is non-overlapping, matching bytes.replace semantics.
    c = data.count(p)
    if c == 0:
        return 0
    return c * (len(p) - 1)
# Rank candidates by estimated savings, keep only the clear winners, and
# cap the selection by the number of free byte codes available.
scored = [(gain(pat), pat) for pat in candidates]
ranked = sorted(scored, reverse=True)
selected = [pat for score, pat in ranked if score > 20000][: len(unused)]
# zip truncates to the shorter sequence, so no explicit slice of `unused`
# is needed here.
subs = list(zip(unused, selected))
# Replace longer patterns first for determinism.
enc = raw
for code, pat in sorted(subs, key=lambda item: len(item[1]), reverse=True):
    enc = enc.replace(pat, bytes([code]))
# Persist the substituted corpus, then compress it with a tuned xz invocation.
pre_path = root / 'preprocessed.bin'
pre_path.write_bytes(enc)
archive = root / 'archive.xz'
xz_cmd = [
    'xz', '-k', '-c', '-9e',
    '--lzma2=dict=512MiB,nice=273,mf=bt4,mode=normal,lc=4,lp=0,pb=0',
    str(pre_path),
]
# xz writes the compressed stream to stdout (-c); capture it into the archive.
with archive.open('wb') as archive_f:
    subprocess.run(xz_cmd, check=True, stdout=archive_f)
# Build decompressor payload.
dec_dir = root / 'decompressor'
dec_dir.mkdir(exist_ok=True)
# The generated script is self-contained: it decompresses the archive to a
# temp file, undoes the byte substitutions using subs.json from its own
# directory, then removes the temp file.
# Fix: the original opened pp for writing as subprocess stdout without ever
# closing the handle (fd leak / ResourceWarning); use a `with` block instead.
(dec_dir / 'decompress.py').write_text('''#!/usr/bin/env python3
import json, subprocess, sys
from pathlib import Path
inp = Path(sys.argv[1]) if len(sys.argv) > 1 else Path('archive.xz')
out = Path(sys.argv[2]) if len(sys.argv) > 2 else Path('enwik8.out')
work = inp.parent
pp = work / 'tmp.preprocessed.bin'
with pp.open('wb') as f:
    subprocess.run(['xz','-d','-c',str(inp)], check=True, stdout=f)
subs = json.loads((Path(__file__).with_name('subs.json')).read_text())
data = pp.read_bytes()
for code, pat_hex in subs:
    data = data.replace(bytes([code]), bytes.fromhex(pat_hex))
out.write_bytes(data)
pp.unlink(missing_ok=True)
''')
# Substitution table shipped alongside the script: (code byte, pattern hex) pairs.
(dec_dir / 'subs.json').write_text(json.dumps([(c, p.hex()) for c, p in subs]))
os.chmod(dec_dir / 'decompress.py', 0o755)
# Package the decompressor; its zipped size counts toward the reported total.
zip_path = root / 'decompressor.zip'
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=9) as zf:
    for member in ('decompress.py', 'subs.json'):
        zf.write(dec_dir / member, arcname=member)
# Verify roundtrip.
out = root / 'enwik8.out'
decode_cmd = ['python3', str(dec_dir / 'decompress.py'), str(archive), str(out)]
subprocess.run(decode_cmd, check=True)
# The reconstructed corpus must match the original byte-for-byte.
if out.read_bytes() != raw:
    raise SystemExit('roundtrip failed')
# Summarize sizes and emit the results record (JSON file + stdout).
archive_b = archive.stat().st_size
dec_b = zip_path.stat().st_size
total = archive_b + dec_b
# enwik8 is 10^8 bytes, so bits-per-character = 8 * total_bytes / 1e8.
bpc = round(8 * total / 1e8, 3)
res = {
    'agent_id': 'AutoZip',
    'experiment': 'Auto dictionary substitution + tuned xz',
    'method': 'dict-auto-xz',
    'archive_bytes': archive_b,
    'decompressor_zip_bytes': dec_b,
    'total_bytes': total,
    'bpc': bpc,
    'num_subs': len(subs),
    'notes': 'Candidate-based substitutions using bytes absent from enwik8; tuned xz backend.',
}
(root / 'results.json').write_text(json.dumps(res, indent=2) + '\n')
print(json.dumps(res))

Xet Storage Details

Size:
3.59 kB
·
Xet hash:
144ebd855c215f1743ecf9de67c9b4eae9c9299ff5911958a465173bb2b5ce7e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.