knoxel
/

gzip-text-classifier

Model card Files Files and versions

xet

Community

knoxel commited on 10 days ago

Commit

fa1acc2

verified ·

1 Parent(s): a7279bf

Upload gzip_classifier.py

Browse files

Files changed (1) hide show

gzip_classifier.py +124 -0

gzip_classifier.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""
+Implementation of "Less is More: Parameter-Free Text Classification with Gzip"
+Paper: https://arxiv.org/abs/2212.09410 (Jiang et al., 2022)
+Zero parameters. Zero training. CPU only. 15 lines of core logic.
+Usage:
+    python gzip_classifier.py
+"""
+import gzip
+import numpy as np
+import time
+import json
+from collections import Counter
+from datasets import load_dataset
+from multiprocessing import Pool, cpu_count
+# ============================================================
+# Configuration
+# ============================================================
+TRAIN_SAMPLES_PER_CLASS = 500
+TEST_SAMPLES = 200
+K_VALUES = [1, 2, 3, 5, 7]
+RANDOM_SEED = 42
+NUM_WORKERS = max(1, cpu_count() - 1)
+LABEL_NAMES = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
+def ncd(x1, x2, Cx1=None, Cx2=None):
+    """Normalized Compression Distance using gzip.
+    NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+    """
+    if Cx1 is None:
+        Cx1 = len(gzip.compress(x1.encode()))
+    if Cx2 is None:
+        Cx2 = len(gzip.compress(x2.encode()))
+    Cx1x2 = len(gzip.compress(" ".join([x1, x2]).encode()))
+    return (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
+def gzip_classify(test_text, train_texts, train_labels, train_compressed, k=7):
+    """Classify text using gzip NCD + kNN."""
+    Cx1 = len(gzip.compress(test_text.encode()))
+    distances = []
+    for j, (x2, Cx2) in enumerate(zip(train_texts, train_compressed)):
+        d = ncd(test_text, x2, Cx1, Cx2)
+        distances.append(d)
+    sorted_idx = np.argsort(distances)
+    top_k_labels = [train_labels[i] for i in sorted_idx[:k]]
+    return Counter(top_k_labels).most_common(1)[0][0]
+def compute_distances_worker(args):
+    """Worker function for parallel distance computation."""
+    test_idx, x1, Cx1, train_texts_local, train_compressed_local = args
+    distances = np.zeros(len(train_texts_local))
+    for j in range(len(train_texts_local)):
+        x1x2 = " ".join([x1, train_texts_local[j]])
+        Cx1x2 = len(gzip.compress(x1x2.encode()))
+        Cx2 = train_compressed_local[j]
+        distances[j] = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
+    return test_idx, distances
+if __name__ == "__main__":
+    print("=" * 60)
+    print("Gzip + kNN Text Classification (Jiang et al., 2022)")
+    print("Paper: https://arxiv.org/abs/2212.09410")
+    print("=" * 60)
+    # Load dataset
+    print("Loading AG News...")
+    dataset = load_dataset("fancyzhx/ag_news")
+    np.random.seed(RANDOM_SEED)
+    # Stratified sampling
+    train_indices = []
+    for label in range(4):
+        label_indices = [i for i, l in enumerate(dataset["train"]["label"]) if l == label]
+        sampled = np.random.choice(label_indices, TRAIN_SAMPLES_PER_CLASS, replace=False)
+        train_indices.extend(sampled)
+    np.random.shuffle(train_indices)
+    train_texts = [dataset["train"]["text"][i] for i in train_indices]
+    train_labels = np.array([dataset["train"]["label"][i] for i in train_indices])
+    test_indices = np.random.choice(len(dataset["test"]), TEST_SAMPLES, replace=False)
+    test_texts = [dataset["test"]["text"][i] for i in test_indices]
+    test_labels = np.array([dataset["test"]["label"][i] for i in test_indices])
+    print(f"Train: {len(train_texts)}, Test: {len(test_texts)}")
+    # Pre-compress
+    train_compressed = [len(gzip.compress(t.encode())) for t in train_texts]
+    test_compressed = [len(gzip.compress(t.encode())) for t in test_texts]
+    # Compute distance matrix
+    print(f"Computing {len(test_texts)}x{len(train_texts)} NCD matrix...")
+    t0 = time.time()
+    distance_matrix = np.zeros((len(test_texts), len(train_texts)))
+    for i in range(len(test_texts)):
+        Cx1 = test_compressed[i]
+        for j in range(len(train_texts)):
+            x1x2 = " ".join([test_texts[i], train_texts[j]])
+            Cx1x2 = len(gzip.compress(x1x2.encode()))
+            Cx2 = train_compressed[j]
+            distance_matrix[i, j] = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
+        if (i + 1) % 20 == 0:
+            print(f"  {i+1}/{len(test_texts)}")
+    print(f"Done in {time.time() - t0:.1f}s")
+    # Sweep k values
+    print("\nResults:")
+    for k in K_VALUES:
+        sorted_idx = np.argsort(distance_matrix, axis=1)[:, :k]
+        preds = np.array([Counter(train_labels[sorted_idx[i]].tolist()).most_common(1)[0][0]
+                         for i in range(len(test_texts))])
+        acc = np.mean(preds == test_labels)
+        print(f"  k={k}: accuracy={acc:.4f}")