narcolepticchicken
/

speculative-tool-actions

Model card Files Files and versions

narcolepticchicken committed on 1 day ago

Commit

2748cc5

·

verified ·

1 Parent(s): 965a8e4

Upload eval_results_v2.json

Files changed (1) hide show

eval_results_v2.json +28 -0

eval_results_v2.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "metrics": {
+    "A": {"accuracy": 0.400, "avg_cost": 1.000, "description": "Strong 8B model only (Qwen3-8B frozen, no fine-tune)"},
+    "B": {"accuracy": 0.510, "avg_cost": 0.150, "description": "Cheap 1.7B proposer only (Qwen3-1.7B + LoRA fine-tuned)"},
+    "C": {"accuracy": 0.400, "avg_cost": 1.100, "accept_rate": 0.000, "description": "Cheap proposer + 8B verifier (ACCEPT/REJECT). Verifier rejected ALL proposals."},
+    "D": {"accuracy": 0.510, "avg_cost": 0.250, "accept_rate": 0.215, "mean_score": -1.515, "description": "Cheap proposer + 4B reward model. 21.5% acceptance rate, all scores negative."},
+    "E": {"pending": true, "description": "Multi-proposal reranking (n=3) with 4B reward model. Job in progress."}
+  },
+  "baselines": {
+    "random": 0.111,
+    "majority": 0.240,
+    "majority_class": "final_answer"
+  },
+  "n": 200,
+  "distribution": {
+    "ask_clarification": 20,
+    "final_answer": 48,
+    "file_read": 27,
+    "tool_call": 31,
+    "repair": 10,
+    "retrieval": 27,
+    "file_write": 18,
+    "verifier": 14,
+    "BLOCKED": 5
+  },
+  "hardware": "dual A10G (a10g-largex2, 48GB VRAM)",
+  "note": "Config E crashed due to variable shadowing (re=list). Re-running separately."
+}