narcolepticchicken committed on
Commit
2748cc5
·
verified ·
1 Parent(s): 965a8e4

Upload eval_results_v2.json

Browse files
Files changed (1) hide show
  1. eval_results_v2.json +28 -0
eval_results_v2.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metrics": {
3
+ "A": {"accuracy": 0.400, "avg_cost": 1.000, "description": "Strong 8B model only (Qwen3-8B frozen, no fine-tune)"},
4
+ "B": {"accuracy": 0.510, "avg_cost": 0.150, "description": "Cheap 1.7B proposer only (Qwen3-1.7B + LoRA fine-tuned)"},
5
+ "C": {"accuracy": 0.400, "avg_cost": 1.100, "accept_rate": 0.000, "description": "Cheap proposer + 8B verifier (ACCEPT/REJECT). Verifier rejected ALL proposals."},
6
+ "D": {"accuracy": 0.510, "avg_cost": 0.250, "accept_rate": 0.215, "mean_score": -1.515, "description": "Cheap proposer + 4B reward model. 21.5% acceptance rate, all scores negative."},
7
+ "E": {"pending": true, "description": "Multi-proposal reranking (n=3) with 4B reward model. Job in progress."}
8
+ },
9
+ "baselines": {
10
+ "random": 0.111,
11
+ "majority": 0.240,
12
+ "majority_class": "final_answer"
13
+ },
14
+ "n": 200,
15
+ "distribution": {
16
+ "ask_clarification": 20,
17
+ "final_answer": 48,
18
+ "file_read": 27,
19
+ "tool_call": 31,
20
+ "repair": 10,
21
+ "retrieval": 27,
22
+ "file_write": 18,
23
+ "verifier": 14,
24
+ "BLOCKED": 5
25
+ },
26
+ "hardware": "dual A10G (a10g-largex2, 48GB VRAM)",
27
+ "note": "Config E crashed due to variable shadowing (re=list). Re-running separately."
28
+ }