Buckets:
| #!/usr/bin/env python3 | |
| """Build byte-level statistics for enwik8. Contributed by shannon-cc. | |
| Outputs enwik8_stats.json with: | |
| - byte_freq: frequency of each byte value (0-255) | |
| - unused_bytes: byte values that never appear (available for substitution coding) | |
| - top_bigrams: 100 most frequent bigrams | |
| - top_trigrams: 100 most frequent trigrams | |
| - xml_tags: the 50 most frequent XML/HTML tag names with counts | |
| - summary: quick reference numbers | |
| """ | |
| import collections | |
| import json | |
| import re | |
| import sys | |
def _top_ngrams(data, n, key, limit=100):
    """Return the most frequent n-byte sequences in ``data`` as JSON-ready dicts.

    Args:
        data: The raw input bytes.
        n: Length in bytes of each sequence (2 for bigrams, 3 for trigrams).
        key: Dictionary key under which the latin-1 decoded sequence is stored.
        limit: Maximum number of entries to return.

    Returns:
        A list of ``{key: text, "hex": hex_string, "count": occurrences}``
        dicts, most frequent first. Empty when ``data`` is shorter than ``n``.
    """
    counts = collections.Counter(
        data[i:i + n] for i in range(len(data) - n + 1)
    )
    return [
        # latin-1 maps every byte value to exactly one code point, so the
        # decode can never fail on arbitrary binary input.
        {key: gram.decode("latin-1"), "hex": gram.hex(), "count": count}
        for gram, count in counts.most_common(limit)
    ]


def _byte_entropy(byte_freq, size):
    """Return the order-0 Shannon entropy in bits per byte, rounded to 4 places.

    Args:
        byte_freq: Occurrence count for each byte value.
        size: Total number of bytes counted.

    Returns:
        The entropy as a non-negative float; ``0.0`` when ``size`` is zero.
    """
    import math

    if size == 0:
        return 0.0
    entropy = -sum(
        (c / size) * math.log2(c / size) for c in byte_freq if c > 0
    )
    # With a single distinct byte value the sum is 0.0 and negating it gives
    # -0.0; adding 0.0 normalises that so the JSON never shows "-0.0".
    return round(entropy, 4) + 0.0


def main():
    """Compute byte-level statistics for a file and write them to JSON.

    The input path is taken from ``sys.argv[1]`` and defaults to ``"enwik8"``.
    The whole file is read into memory, the statistics are written to
    ``enwik8_stats.json`` in the current directory, and a short summary is
    printed to stdout. An empty input file yields zeroed percentages and
    entropy instead of a division error.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else "enwik8"
    print(f"Reading {path}...")
    with open(path, "rb") as f:
        data = f.read()
    size = len(data)
    print(f"Size: {size:,} bytes")

    # Byte frequencies: one pass over the data; everything byte-level below
    # is derived from this table rather than rescanning the input.
    counts = collections.Counter(data)
    byte_freq = [counts[i] for i in range(256)]
    unused_bytes = [i for i in range(256) if byte_freq[i] == 0]
    print(f"Unused byte values: {len(unused_bytes)} (available for substitution coding)")

    print("Computing bigrams...")
    top_bigrams = _top_ngrams(data, 2, "bigram")

    print("Computing trigrams...")
    top_trigrams = _top_ngrams(data, 3, "trigram")

    # XML/HTML tags: count opening and closing tags together by tag name.
    print("Extracting XML tags...")
    tag_pattern = re.compile(rb'</?([a-zA-Z][a-zA-Z0-9_:-]*)[^>]*>')
    tag_counts = collections.Counter(
        m.group(1).decode("utf-8", errors="replace")
        for m in tag_pattern.finditer(data)
    )
    xml_tags = [{"tag": t, "count": c} for t, c in tag_counts.most_common(50)]

    # Byte value ranges
    byte_ranges = {
        "0x00-0x1F (control)": sum(byte_freq[0x00:0x20]),
        "0x20-0x7E (printable ASCII)": sum(byte_freq[0x20:0x7F]),
        "0x7F (DEL)": byte_freq[0x7F],
        "0x80-0xFF (high bytes)": sum(byte_freq[0x80:]),
    }

    # Printable = visible ASCII plus tab, line feed and carriage return.
    printable = (
        byte_ranges["0x20-0x7E (printable ASCII)"]
        + byte_freq[9] + byte_freq[10] + byte_freq[13]
    )
    ascii_128 = sum(byte_freq[:0x80])

    def percent(count):
        # Guard the empty-file case, which would otherwise divide by zero.
        return round(100 * count / size, 2) if size else 0.0

    stats = {
        "summary": {
            "total_bytes": size,
            "unique_byte_values": 256 - len(unused_bytes),
            "unused_byte_count": len(unused_bytes),
            "printable_bytes": printable,
            "printable_pct": percent(printable),
            "ascii_only_pct": percent(ascii_128),
            "entropy_bits": _byte_entropy(byte_freq, size),
        },
        "byte_ranges": byte_ranges,
        "unused_bytes": unused_bytes,
        "unused_bytes_hex": [f"0x{b:02X}" for b in unused_bytes],
        "byte_freq": {str(i): byte_freq[i] for i in range(256)},
        "top_bigrams": top_bigrams,
        "top_trigrams": top_trigrams,
        "xml_tags": xml_tags,
    }

    out = "enwik8_stats.json"
    # json.dump escapes non-ASCII by default, so the file content is pure
    # ASCII; the explicit encoding just removes the platform dependency.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)
    print(f"\nWritten to {out}")

    # Print summary
    print("\n=== enwik8 Summary ===")
    print(f"Total bytes: {size:,}")
    print(f"Unique byte vals: {256 - len(unused_bytes)}/256")
    print(f"Unused bytes: {len(unused_bytes)} -> {stats['unused_bytes_hex'][:10]}{'...' if len(unused_bytes) > 10 else ''}")
    print(f"Printable: {printable:,} ({stats['summary']['printable_pct']}%)")
    print(f"ASCII-only: {stats['summary']['ascii_only_pct']}%")
    print(f"Byte entropy: {stats['summary']['entropy_bits']} bits")
    print(f"Top 5 bigrams: {[b['bigram'] for b in top_bigrams[:5]]}")
    print(f"Top 5 XML tags: {[t['tag'] for t in xml_tags[:5]]}")
# Script entry point: run the analysis only when this file is executed
# directly, so importing the module does not trigger any file I/O.
if __name__ == "__main__":
    main()
Xet Storage Details
- Size: 4.15 kB
- Xet hash: 64ca03b75a6cff2fd259eb8e8e43eb692e46a8f68aa08fafdc53aabdc59555b1
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.