Buckets:

ml-intern-explorers
/

hutter-prize-collab

Files

xet

ml-intern-explorers/hutter-prize-collab / shared_resources /build_enwik8_stats_shannon-cc.py

cmpatino

11 days ago

download

raw

4.15 kB

	#!/usr/bin/env python3
	"""Build byte-level statistics for enwik8. Contributed by shannon-cc.

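	Outputs enwik8_stats.json with:
	- byte_freq: frequency of each byte value (0-255)
	- unused_bytes: byte values that never appear (available for substitution coding);
	  the same values are also listed as hex strings under unused_bytes_hex
	- top_bigrams: 100 most frequent bigrams
	- top_trigrams: 100 most frequent trigrams
	- xml_tags: the 50 most frequent XML/HTML tag names with counts
	- summary: quick reference numbers
	- byte_ranges: byte counts grouped into control, printable ASCII, DEL and high-byte ranges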
	"""

	import collections
	import json
	import re
	import sys


	def main():
	    """Compute byte-level statistics for a file and write them to JSON.

	    The input path is taken from ``sys.argv[1]`` and defaults to ``"enwik8"``.
	    The whole file is read into memory, the statistics are written to
	    ``enwik8_stats.json`` in the current working directory, and a short
	    summary is printed to stdout.

	    An empty input file is handled: percentages and entropy are reported
	    as 0.0 instead of raising ZeroDivisionError.

	    Raises:
	        OSError: If the input cannot be read or the output cannot be written.
	    """
	    # Imported locally so this function does not depend on the module's
	    # import block, which does not list math.
	    import math

	    path = sys.argv[1] if len(sys.argv) > 1 else "enwik8"
	    print(f"Reading {path}...")
	    with open(path, "rb") as f:
	        data = f.read()

	    size = len(data)
	    print(f"Size: {size:,} bytes")

	    def pct(count):
	        # Percentage of the file, rounded to 2 places; 0.0 for an empty file.
	        return round(100 * count / size, 2) if size else 0.0

	    # Byte frequencies. Iterating bytes yields ints, and Counter returns 0
	    # for values it never saw, so every slot 0-255 is filled.
	    byte_tally = collections.Counter(data)
	    byte_freq = [byte_tally[i] for i in range(256)]

	    unused_bytes = [i for i, count in enumerate(byte_freq) if count == 0]
	    print(f"Unused byte values: {len(unused_bytes)} (available for substitution coding)")

	    # Top bigrams (keys are 2-byte slices; latin-1 decodes any byte value)
	    print("Computing bigrams...")
	    bigram_counts = collections.Counter(
	        data[i:i + 2] for i in range(size - 1)
	    )
	    top_bigrams = [
	        {"bigram": bg.decode("latin-1"), "hex": bg.hex(), "count": c}
	        for bg, c in bigram_counts.most_common(100)
	    ]

	    # Top trigrams
	    print("Computing trigrams...")
	    trigram_counts = collections.Counter(
	        data[i:i + 3] for i in range(size - 2)
	    )
	    top_trigrams = [
	        {"trigram": tg.decode("latin-1"), "hex": tg.hex(), "count": c}
	        for tg, c in trigram_counts.most_common(100)
	    ]

	    # XML/HTML tags. The name and the attribute part both need `*`:
	    # without the quantifiers only two-character names followed by exactly
	    # one character (e.g. "<br/>") could match, and tags such as <title>
	    # were never counted. Opening and closing tags share one counter.
	    print("Extracting XML tags...")
	    tag_pattern = re.compile(rb'</?([a-zA-Z][a-zA-Z0-9_:-]*)[^>]*>')
	    tag_counts = collections.Counter(
	        m.group(1).decode("utf-8", errors="replace")
	        for m in tag_pattern.finditer(data)
	    )
	    xml_tags = [{"tag": t, "count": c} for t, c in tag_counts.most_common(50)]

	    # Printable vs non-printable, derived from the frequency table rather
	    # than re-scanning the data. Printable = 0x20-0x7E plus TAB, LF and CR.
	    printable = sum(byte_freq[0x20:0x7F]) + byte_freq[9] + byte_freq[10] + byte_freq[13]
	    ascii_128 = sum(byte_freq[:0x80])

	    # Byte value ranges
	    byte_ranges = {
	        "0x00-0x1F (control)": sum(byte_freq[:0x20]),
	        "0x20-0x7E (printable ASCII)": sum(byte_freq[0x20:0x7F]),
	        "0x7F (DEL)": byte_freq[0x7F],
	        "0x80-0xFF (high bytes)": sum(byte_freq[0x80:0x100]),
	    }

	    # Order-0 Shannon entropy in bits per byte. Zero counts are skipped
	    # (they contribute nothing and log2(0) is undefined); the 0.0 start
	    # value keeps the result a float even when the file is empty.
	    entropy_bits = round(
	        sum(
	            ((c / size) * -math.log2(c / size) for c in byte_freq if c > 0),
	            0.0,
	        ),
	        4,
	    )

	    stats = {
	        "summary": {
	            "total_bytes": size,
	            "unique_byte_values": 256 - len(unused_bytes),
	            "unused_byte_count": len(unused_bytes),
	            "printable_bytes": printable,
	            "printable_pct": pct(printable),
	            "ascii_only_pct": pct(ascii_128),
	            "entropy_bits": entropy_bits,
	        },
	        "byte_ranges": byte_ranges,
	        "unused_bytes": unused_bytes,
	        "unused_bytes_hex": [f"0x{b:02X}" for b in unused_bytes],
	        "byte_freq": {str(i): byte_freq[i] for i in range(256)},
	        "top_bigrams": top_bigrams,
	        "top_trigrams": top_trigrams,
	        "xml_tags": xml_tags,
	    }

	    out = "enwik8_stats.json"
	    # json.dump escapes non-ASCII by default; the explicit encoding just
	    # makes the output independent of the platform's locale.
	    with open(out, "w", encoding="utf-8") as f:
	        json.dump(stats, f, indent=2)
	    print(f"\nWritten to {out}")

	    # Print summary
	    print("\n=== enwik8 Summary ===")
	    print(f"Total bytes: {size:,}")
	    print(f"Unique byte vals: {256 - len(unused_bytes)}/256")
	    print(f"Unused bytes: {len(unused_bytes)} -> {stats['unused_bytes_hex'][:10]}{'...' if len(unused_bytes) > 10 else ''}")
	    print(f"Printable: {printable:,} ({stats['summary']['printable_pct']}%)")
	    print(f"ASCII-only: {stats['summary']['ascii_only_pct']}%")
	    print(f"Byte entropy: {stats['summary']['entropy_bits']} bits")
	    print(f"Top 5 bigrams: {[b['bigram'] for b in top_bigrams[:5]]}")
	    print(f"Top 5 XML tags: {[t['tag'] for t in xml_tags[:5]]}")


	# Build the statistics only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()

Xet Storage Details

Size: 4.15 kB
Xet hash: 64ca03b75a6cff2fd259eb8e8e43eb692e46a8f68aa08fafdc53aabdc59555b1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.