narcolepticchicken committed on
Commit
522d0b6
·
verified ·
1 Parent(s): a22b742

Upload eval/eval_bert_partC.py

Browse files
Files changed (1) hide show
  1. eval/eval_bert_partC.py +91 -0
eval/eval_bert_partC.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# ── Evaluate ──
# Replays every routing policy over the recorded per-problem traces and
# accumulates, per policy, the number of resolved problems ("success"), the
# summed cost ("cost") and the number of problems scored ("n").


def _score_single_shot(stats, model_results, model, fallback_model):
    """Score a policy that runs exactly one model on the problem.

    Args:
        stats: The policy's accumulator dict (keys "success", "cost", "n");
            updated in place.
        model_results: Mapping of model name -> recorded result for this
            problem; each result is read through its 'resolved' and 'cost'
            keys.
        model: The model the policy selected.
        fallback_model: Model whose result is used when ``model`` has no
            recorded result for this problem.
    """
    if model in model_results:
        outcome = model_results[model]
        stats['success'] += int(outcome['resolved'])
        stats['cost'] += outcome['cost']
    else:
        # No trace for the routed model: score the fallback model instead.
        # If that is missing too, count a failure at a nominal cost of 0.3.
        outcome = model_results.get(fallback_model, {})
        stats['success'] += int(outcome.get('resolved', 0))
        stats['cost'] += outcome.get('cost', 0.3)
    stats['n'] += 1


def _score_with_feedback(stats, model_results, first_model, first_tier,
                         fallback_model, max_tier=5):
    """Score a cascade: routed model, then one tier up, then the fallback.

    The cascade stops at the first model that resolved the problem. Every
    model that is actually consulted (i.e. has a recorded result and was not
    already tried) is charged, including the attempts that failed; the
    previous inline version dropped the cost of failed escalation attempts,
    which understated the cost of the feedback policies.

    Args:
        stats: The policy's accumulator dict (keys "success", "cost", "n");
            updated in place.
        model_results: Mapping of model name -> recorded result for this
            problem.
        first_model: The model the router picked.
        first_tier: The (floor-adjusted) tier the router picked; the
            escalation target is looked up in ``TIER_TO_SWE`` at
            ``first_tier + 1``.
        fallback_model: Last-resort model, also used when the escalated tier
            has no entry in ``TIER_TO_SWE``.
        max_tier: Highest tier the escalation step may reach.
    """
    first = model_results.get(first_model)
    # A routed model without a recorded result is charged a nominal 0.01.
    spent = first['cost'] if first is not None else 0.01
    solved = first is not None and bool(first['resolved'])

    if not solved:
        up_model = TIER_TO_SWE.get(min(first_tier + 1, max_tier), fallback_model)
        # Tier capping / the TIER_TO_SWE default can make the escalation
        # target coincide with a model that already ran; never retry (or
        # re-charge) the same model.
        tried = {first_model}
        for candidate in (up_model, fallback_model):
            if candidate in tried or candidate not in model_results:
                continue
            tried.add(candidate)
            attempt = model_results[candidate]
            spent += attempt['cost']
            if attempt['resolved']:
                solved = True
                break

    stats['success'] += int(solved)
    stats['cost'] += spent
    stats['n'] += 1


policies = defaultdict(lambda: {"success": 0, "cost": 0.0, "n": 0})

# Loop-invariant model names, hoisted out of the per-problem loop.
f_model = 'claude-opus-4.7'    # "always frontier" policy and fallback model
c_model = 'deepseek-v4-flash'  # "always cheap" policy

print("\n[4] Evaluating all policies...")
for model_results in traces.values():
    if not model_results:
        # Nothing was recorded for this problem, so there is no problem text
        # to route on and no result to score.
        continue

    # Every per-model record carries the problem; read it from the first one.
    problem = next(iter(model_results.values()))['problem']
    task_type = classify_task(problem)
    floor = TASK_FLOOR.get(task_type, 2)

    # Oracle: among the models that resolved the problem, take the one whose
    # tier has the lowest TIER_COST and charge its recorded cost. If no model
    # resolved it, charge the smallest recorded cost and count a failure.
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: TIER_COST.get(MODEL_TIER[x[0]], 1.0))
        policies['oracle']['success'] += 1
        policies['oracle']['cost'] += cheapest[1]['cost']
    else:
        policies['oracle']['cost'] += min(r['cost'] for r in model_results.values())
    policies['oracle']['n'] += 1

    # Always frontier (only scored on problems the frontier model ran on)
    if f_model in model_results:
        policies['frontier']['success'] += int(model_results[f_model]['resolved'])
        policies['frontier']['cost'] += model_results[f_model]['cost']
        policies['frontier']['n'] += 1

    # BERT: routed tier, raised to the task-type floor
    bert_tier, _ = route_bert(problem)
    bert_tier = max(bert_tier, floor)
    m_bert = TIER_TO_SWE.get(bert_tier, f_model)
    _score_single_shot(policies['bert'], model_results, m_bert, f_model)

    # v11 XGBoost: routed tier, raised to the task-type floor
    v11_tier, _, _ = route_v11(problem)
    v11_tier = max(v11_tier, floor)
    m_v11 = TIER_TO_SWE.get(v11_tier, f_model)
    _score_single_shot(policies['v11_xgboost'], model_results, m_v11, f_model)

    # BERT + feedback
    _score_with_feedback(policies['bert_feedback'], model_results,
                         m_bert, bert_tier, f_model)

    # v11 + feedback
    _score_with_feedback(policies['v11_feedback'], model_results,
                         m_v11, v11_tier, f_model)

    # Always cheap (only scored on problems the cheap model ran on)
    if c_model in model_results:
        policies['always_cheap']['success'] += int(model_results[c_model]['resolved'])
        policies['always_cheap']['cost'] += model_results[c_model]['cost']
        policies['always_cheap']['n'] += 1