narcolepticchicken commited on
Commit
95c42d4
·
verified ·
1 Parent(s): 8414451

Upload eval/benchmark_results.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. eval/benchmark_results.json +54 -0
eval/benchmark_results.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aco_v8": {
3
+ "name": "aco_v8",
4
+ "success_rate": 0.796,
5
+ "avg_cost": 0.7781665000000001,
6
+ "model_cost": 0.7544249999999999,
7
+ "tool_cost": 0.0213615,
8
+ "ver_cost": 0.0023799999999999997,
9
+ "avg_context_tokens": 9352.864,
10
+ "verifications": 238,
11
+ "avg_tools": 2.727,
12
+ "escalations": 0,
13
+ "downgrades": 0
14
+ },
15
+ "frontier": {
16
+ "name": "always_frontier",
17
+ "success_rate": 0.91,
18
+ "avg_cost": 1.0413615,
19
+ "model_cost": 1.0413615,
20
+ "tool_cost": 0.0,
21
+ "ver_cost": 0.0,
22
+ "avg_context_tokens": 8000.0,
23
+ "verifications": 2000,
24
+ "avg_tools": 0.0,
25
+ "escalations": 0,
26
+ "downgrades": 0
27
+ },
28
+ "heuristic": {
29
+ "name": "heuristic",
30
+ "success_rate": 0.845,
31
+ "avg_cost": 0.9203665,
32
+ "model_cost": 0.9203665,
33
+ "tool_cost": 0.0,
34
+ "ver_cost": 0.0,
35
+ "avg_context_tokens": 8000.0,
36
+ "verifications": 2000,
37
+ "avg_tools": 0.0,
38
+ "escalations": 0,
39
+ "downgrades": 0
40
+ },
41
+ "cheap": {
42
+ "name": "always_cheap",
43
+ "success_rate": 0.2985,
44
+ "avg_cost": 0.07136150000000001,
45
+ "model_cost": 0.07136150000000001,
46
+ "tool_cost": 0.0,
47
+ "ver_cost": 0.0,
48
+ "avg_context_tokens": 8000.0,
49
+ "verifications": 2000,
50
+ "avg_tools": 0.0,
51
+ "escalations": 0,
52
+ "downgrades": 0
53
+ }
54
+ }