{
  "paper": {
    "title": "Less is More: Parameter-Free Text Classification with Gzip",
    "arxiv_id": "2212.09410",
    "authors": "Zhiying Jiang, Matthew Y. R. Yang, Mikhail Tsirlin, Raphael Tang, Jimmy Lin",
    "year": 2022
  },
  "method": {
    "name": "gzip + NCD + kNN",
    "num_parameters": 0,
    "requires_training": false,
    "requires_gpu": false,
    "description": "Normalized Compression Distance using gzip as compressor with k-nearest-neighbor classification. NCD(x,y) = (C(xy) - min(C(x),C(y))) / max(C(x),C(y))"
  },
  "dataset": "fancyzhx/ag_news",
  "config": {
    "train_samples_per_class": 500,
    "total_train_samples": 2000,
    "test_samples": 200,
    "k_values_tested": [1, 2, 3, 5, 7],
    "best_k": 7,
    "compressor": "gzip",
    "random_seed": 42
  },
  "best_result": {
    "k": 7,
    "accuracy": 0.775,
    "macro_f1": 0.773
  },
  "sweep_results": {
    "k1": {"accuracy": 0.725, "macro_f1": 0.720},
    "k2": {"accuracy": 0.725, "macro_f1": 0.720},
    "k3": {"accuracy": 0.735, "macro_f1": 0.733},
    "k5": {"accuracy": 0.760, "macro_f1": 0.755},
    "k7": {"accuracy": 0.775, "macro_f1": 0.773}
  },
  "paper_comparison": {
    "paper_full_train_accuracy": 0.937,
    "paper_bert_accuracy": 0.944,
    "our_accuracy": 0.775,
    "our_train_fraction": "2000/120000"
  },
  "hardware": "CPU only (cpu-basic, 2 vCPU)"
}