Buckets:

ml-intern-explorers/hutter-prize-collab / shared_resources /build_enwik8_stats_shannon-cc.py
cmpatino's picture
download
raw
4.15 kB
#!/usr/bin/env python3
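"""Build byte-level statistics for enwik8. Contributed by shannon-cc.

Usage: python3 build_enwik8_stats_shannon-cc.py [PATH]
PATH defaults to "enwik8" in the current directory.

Writes enwik8_stats.json to the current directory with:
- summary: quick reference numbers (sizes, percentages, byte entropy)
- byte_ranges: byte counts grouped by value range (control, printable ASCII, DEL, high bytes)
- unused_bytes / unused_bytes_hex: byte values that never appear (available for substitution coding)
- byte_freq: frequency of each byte value (0-255)
- top_bigrams: 100 most frequent bigrams
- top_trigrams: 100 most frequent trigrams
- xml_tags: the 50 most frequent XML/HTML tag names with counts
"""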
import collections
import json
import math
import re
import sys
def _top_ngrams(data: bytes, n: int, label: str, limit: int = 100) -> list:
    """Return the ``limit`` most frequent ``n``-byte sequences in ``data``.

    Windows overlap (stride of one byte). Each entry is a dict holding the
    n-gram decoded as Latin-1 under the key ``label``, its hex form under
    "hex", and its occurrence count under "count". Input shorter than ``n``
    bytes yields an empty list.
    """
    counts = collections.Counter(
        data[i:i + n] for i in range(len(data) - n + 1)
    )
    return [
        {label: gram.decode("latin-1"), "hex": gram.hex(), "count": count}
        for gram, count in counts.most_common(limit)
    ]


def _count_tags(data: bytes, limit: int = 50) -> list:
    """Return the ``limit`` most frequent XML/HTML tag names in ``data``.

    Opening and closing tags are tallied together under the bare tag name.
    Each entry is a dict with the keys "tag" and "count".
    """
    tag_pattern = re.compile(rb'</?([a-zA-Z][a-zA-Z0-9_:-]*)[^>]*>')
    counts = collections.Counter(
        match.group(1).decode("utf-8", errors="replace")
        for match in tag_pattern.finditer(data)
    )
    return [
        {"tag": tag, "count": count}
        for tag, count in counts.most_common(limit)
    ]


def _percent(part: int, total: int) -> float:
    """Return ``part`` as a percentage of ``total`` (2 decimals); 0.0 if ``total`` is 0."""
    return round(100 * part / total, 2) if total else 0.0


def _byte_entropy(byte_freq: list, total: int) -> float:
    """Return the order-0 Shannon entropy in bits per byte, rounded to 4 decimals.

    ``byte_freq`` holds one count per byte value and ``total`` is their sum.
    An empty input (``total == 0``) has entropy 0.0.
    """
    if not total:
        return 0.0
    entropy = -sum(
        (count / total) * math.log2(count / total)
        for count in byte_freq
        if count
    )
    # Adding 0.0 normalises IEEE negative zero (single-valued input) to 0.0.
    return round(entropy, 4) + 0.0


def main() -> None:
    """Compute byte-level statistics for a file and write them as JSON.

    The input path is taken from ``sys.argv[1]`` and defaults to "enwik8".
    The whole file is read into memory. Results are written to
    "enwik8_stats.json" in the current directory and a short summary is
    printed to stdout.

    Raises:
        OSError: if the input cannot be read or the output cannot be written.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else "enwik8"
    print(f"Reading {path}...")
    with open(path, "rb") as f:
        data = f.read()
    size = len(data)
    print(f"Size: {size:,} bytes")

    # Byte frequencies: one pass over the data, then a dense 256-entry table.
    counts = collections.Counter(data)
    byte_freq = [counts[value] for value in range(256)]
    unused_bytes = [value for value, count in enumerate(byte_freq) if count == 0]
    print(f"Unused byte values: {len(unused_bytes)} (available for substitution coding)")

    print("Computing bigrams...")
    top_bigrams = _top_ngrams(data, 2, "bigram")

    print("Computing trigrams...")
    top_trigrams = _top_ngrams(data, 3, "trigram")

    print("Extracting XML tags...")
    xml_tags = _count_tags(data)

    # Class totals come from the histogram, so the data is not scanned again.
    # "Printable" means 0x20-0x7E plus tab, line feed and carriage return.
    printable = sum(byte_freq[0x20:0x7F]) + byte_freq[9] + byte_freq[10] + byte_freq[13]
    ascii_128 = sum(byte_freq[:0x80])

    byte_ranges = {
        "0x00-0x1F (control)": sum(byte_freq[:0x20]),
        "0x20-0x7E (printable ASCII)": sum(byte_freq[0x20:0x7F]),
        "0x7F (DEL)": byte_freq[0x7F],
        "0x80-0xFF (high bytes)": sum(byte_freq[0x80:0x100]),
    }

    summary = {
        "total_bytes": size,
        "unique_byte_values": 256 - len(unused_bytes),
        "unused_byte_count": len(unused_bytes),
        "printable_bytes": printable,
        "printable_pct": _percent(printable, size),
        "ascii_only_pct": _percent(ascii_128, size),
        "entropy_bits": _byte_entropy(byte_freq, size),
    }
    stats = {
        "summary": summary,
        "byte_ranges": byte_ranges,
        "unused_bytes": unused_bytes,
        "unused_bytes_hex": [f"0x{b:02X}" for b in unused_bytes],
        "byte_freq": {str(i): byte_freq[i] for i in range(256)},
        "top_bigrams": top_bigrams,
        "top_trigrams": top_trigrams,
        "xml_tags": xml_tags,
    }

    out = "enwik8_stats.json"
    # json.dump escapes non-ASCII by default; the encoding is pinned so the
    # result does not depend on the platform's locale.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)
    print(f"\nWritten to {out}")

    # Print summary
    print("\n=== enwik8 Summary ===")
    print(f"Total bytes: {size:,}")
    print(f"Unique byte vals: {256 - len(unused_bytes)}/256")
    print(f"Unused bytes: {len(unused_bytes)} -> {stats['unused_bytes_hex'][:10]}{'...' if len(unused_bytes) > 10 else ''}")
    print(f"Printable: {printable:,} ({summary['printable_pct']}%)")
    print(f"ASCII-only: {summary['ascii_only_pct']}%")
    print(f"Byte entropy: {summary['entropy_bits']} bits")
    print(f"Top 5 bigrams: {[b['bigram'] for b in top_bigrams[:5]]}")
    print(f"Top 5 XML tags: {[t['tag'] for t in xml_tags[:5]]}")
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Xet Storage Details

Size:
4.15 kB
·
Xet hash:
64ca03b75a6cff2fd259eb8e8e43eb692e46a8f68aa08fafdc53aabdc59555b1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.