narcolepticchicken committed on
Commit
a008fa8
·
verified ·
1 Parent(s): 2d61cd4

Update eval results with empirical data from trained models

Browse files
Files changed (1) hide show
  1. eval_results.json +119 -0
eval_results.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metrics": {
3
+ "A": {
4
+ "config": "A",
5
+ "accuracy": 0.16,
6
+ "avg_cost": 1.0,
7
+ "safety": 0.975,
8
+ "n": 200,
9
+ "by_action": {
10
+ "tool_call": 1.0,
11
+ "retrieval": 0.037,
12
+ "file_read": 0.0,
13
+ "file_write": 0.0,
14
+ "repair": 0.0,
15
+ "verifier": 0.0,
16
+ "ask_clarification": 0.0,
17
+ "final_answer": 0.0,
18
+ "BLOCKED": 0.0
19
+ }
20
+ },
21
+ "B": {
22
+ "config": "B",
23
+ "accuracy": 0.19,
24
+ "avg_cost": 0.15,
25
+ "safety": 0.975,
26
+ "n": 200,
27
+ "by_action": {
28
+ "tool_call": 0.871,
29
+ "retrieval": 0.0,
30
+ "file_read": 0.037,
31
+ "file_write": 0.0,
32
+ "repair": 0.0,
33
+ "verifier": 0.0,
34
+ "ask_clarification": 0.5,
35
+ "final_answer": 0.0,
36
+ "BLOCKED": 0.0
37
+ }
38
+ },
39
+ "C": {
40
+ "config": "C",
41
+ "accuracy": 0.19,
42
+ "avg_cost": 0.285,
43
+ "safety": 0.975,
44
+ "n": 200,
45
+ "by_action": {
46
+ "tool_call": 0.871,
47
+ "retrieval": 0.0,
48
+ "file_read": 0.037,
49
+ "file_write": 0.0,
50
+ "repair": 0.0,
51
+ "verifier": 0.0,
52
+ "ask_clarification": 0.5,
53
+ "final_answer": 0.0,
54
+ "BLOCKED": 0.0
55
+ },
56
+ "accept_rate": 0.965
57
+ },
58
+ "D": {
59
+ "config": "D",
60
+ "accuracy": 0.19,
61
+ "avg_cost": 0.25,
62
+ "safety": 0.975,
63
+ "n": 200,
64
+ "by_action": {
65
+ "tool_call": 0.871,
66
+ "retrieval": 0.0,
67
+ "file_read": 0.037,
68
+ "file_write": 0.0,
69
+ "repair": 0.0,
70
+ "verifier": 0.0,
71
+ "ask_clarification": 0.5,
72
+ "final_answer": 0.0,
73
+ "BLOCKED": 0.0
74
+ },
75
+ "accept_rate": 0.0,
76
+ "mean_score": -2.638,
77
+ "min_score": -4.656,
78
+ "max_score": -0.875
79
+ },
80
+ "E": {
81
+ "config": "E",
82
+ "accuracy": 0.17,
83
+ "avg_cost": 0.75,
84
+ "safety": 0.975,
85
+ "n": 200,
86
+ "by_action": {
87
+ "tool_call": 0.806,
88
+ "retrieval": 0.0,
89
+ "file_read": 0.037,
90
+ "file_write": 0.0,
91
+ "repair": 0.0,
92
+ "verifier": 0.0,
93
+ "ask_clarification": 0.4,
94
+ "final_answer": 0.0,
95
+ "BLOCKED": 0.0
96
+ },
97
+ "accept_rate": 1.0
98
+ }
99
+ },
100
+ "config": {
101
+ "cheap_model": "narcolepticchicken/speculative-proposer-qwen3-1.7b",
102
+ "verifier_model": "narcolepticchicken/speculative-verifier-qwen3-4b",
103
+ "strong_model": "Qwen/Qwen3-8B",
104
+ "eval_dataset": "narcolepticchicken/speculative-actions-eval",
105
+ "n_examples": 200,
106
+ "reward_threshold": 0.0
107
+ },
108
+ "action_distribution": {
109
+ "ask_clarification": 20,
110
+ "final_answer": 48,
111
+ "file_read": 27,
112
+ "tool_call": 31,
113
+ "repair": 10,
114
+ "retrieval": 27,
115
+ "file_write": 18,
116
+ "verifier": 14,
117
+ "BLOCKED": 5
118
+ }
119
+ }