{ "paper": { "title": "Less is More: Parameter-Free Text Classification with Gzip", "arxiv_id": "2212.09410", "authors": "Zhiying Jiang, Matthew Y. R. Yang, Mikhail Tsirlin, Raphael Tang, Jimmy Lin", "year": 2022 }, "method": { "name": "gzip + NCD + kNN", "num_parameters": 0, "requires_training": false, "requires_gpu": false, "description": "Normalized Compression Distance using gzip as compressor with k-nearest-neighbor classification. NCD(x,y) = (C(xy) - min(C(x),C(y))) / max(C(x),C(y))" }, "dataset": "fancyzhx/ag_news", "config": { "train_samples_per_class": 500, "total_train_samples": 2000, "test_samples": 200, "k_values_tested": [1, 2, 3, 5, 7], "best_k": 7, "compressor": "gzip", "random_seed": 42 }, "best_result": { "k": 7, "accuracy": 0.775, "macro_f1": 0.773 }, "sweep_results": { "k1": {"accuracy": 0.725, "macro_f1": 0.720}, "k2": {"accuracy": 0.725, "macro_f1": 0.720}, "k3": {"accuracy": 0.735, "macro_f1": 0.733}, "k5": {"accuracy": 0.760, "macro_f1": 0.755}, "k7": {"accuracy": 0.775, "macro_f1": 0.773} }, "paper_comparison": { "paper_full_train_accuracy": 0.937, "paper_bert_accuracy": 0.944, "our_accuracy": 0.775, "our_train_fraction": "2000/120000" }, "hardware": "CPU only (cpu-basic, 2 vCPU)" }