narcolepticchicken committed on
Commit
911104d
·
verified ·
1 Parent(s): 8a5c26f

Upload eval/eval_bert_partC.py

Browse files
Files changed (1) hide show
  1. eval/eval_bert_partC.py +29 -25
eval/eval_bert_partC.py CHANGED
@@ -3,7 +3,9 @@
3
  policies = defaultdict(lambda: {"success":0,"cost":0.0,"n":0})
4
 
5
  print("\n[4] Evaluating all policies...")
6
- for iid, model_results in traces.items():
 
 
7
  problem = next(iter(model_results.values()))['problem']
8
  task_type = classify_task(problem)
9
  floor = TASK_FLOOR.get(task_type, 2)
@@ -25,8 +27,8 @@ for iid, model_results in traces.items():
25
  policies['frontier']['cost'] += model_results[f_model]['cost']
26
  policies['frontier']['n'] += 1
27
 
28
- # BERT
29
- bert_tier, bert_conf = route_bert(problem)
30
  bert_tier = max(bert_tier, floor)
31
  m_bert = TIER_TO_SWE.get(bert_tier, f_model)
32
  if m_bert in model_results:
@@ -37,19 +39,19 @@ for iid, model_results in traces.items():
37
  policies['bert']['cost'] += model_results.get(f_model,{}).get('cost',0.3)
38
  policies['bert']['n'] += 1
39
 
40
- # v11 XGBoost
41
- v11_tier, v11_conf, v11_probs = route_v11(problem)
42
- v11_tier = max(v11_tier, floor)
43
- m_v11 = TIER_TO_SWE.get(v11_tier, f_model)
44
- if m_v11 in model_results:
45
- policies['v11_xgboost']['success'] += int(model_results[m_v11]['resolved'])
46
- policies['v11_xgboost']['cost'] += model_results[m_v11]['cost']
47
  else:
48
- policies['v11_xgboost']['success'] += int(model_results.get(f_model,{}).get('resolved',0))
49
- policies['v11_xgboost']['cost'] += model_results.get(f_model,{}).get('cost',0.3)
50
- policies['v11_xgboost']['n'] += 1
51
 
52
- # BERT + feedback
53
  if m_bert in model_results and model_results[m_bert]['resolved']:
54
  policies['bert_feedback']['success'] += 1
55
  policies['bert_feedback']['cost'] += model_results[m_bert]['cost']
@@ -66,22 +68,22 @@ for iid, model_results in traces.items():
66
  policies['bert_feedback']['cost'] += model_results.get(m_bert,{}).get('cost',0.01)
67
  policies['bert_feedback']['n'] += 1
68
 
69
- # v11 + feedback
70
- if m_v11 in model_results and model_results[m_v11]['resolved']:
71
- policies['v11_feedback']['success'] += 1
72
- policies['v11_feedback']['cost'] += model_results[m_v11]['cost']
73
  else:
74
- up_tier = min(v11_tier + 1, 5)
75
  m_up = TIER_TO_SWE.get(up_tier, f_model)
76
  if m_up in model_results and model_results[m_up]['resolved']:
77
- policies['v11_feedback']['success'] += 1
78
- policies['v11_feedback']['cost'] += model_results.get(m_v11,{}).get('cost',0.01) + model_results[m_up]['cost']
79
  elif f_model in model_results and model_results[f_model]['resolved']:
80
- policies['v11_feedback']['success'] += 1
81
- policies['v11_feedback']['cost'] += model_results.get(m_v11,{}).get('cost',0.01) + model_results[f_model]['cost']
82
  else:
83
- policies['v11_feedback']['cost'] += model_results.get(m_v11,{}).get('cost',0.01)
84
- policies['v11_feedback']['n'] += 1
85
 
86
  # Always cheap
87
  c_model = 'deepseek-v4-flash'
@@ -89,3 +91,5 @@ for iid, model_results in traces.items():
89
  policies['always_cheap']['success'] += int(model_results[c_model]['resolved'])
90
  policies['always_cheap']['cost'] += model_results[c_model]['cost']
91
  policies['always_cheap']['n'] += 1
 
 
 
3
  policies = defaultdict(lambda: {"success":0,"cost":0.0,"n":0})
4
 
5
  print("\n[4] Evaluating all policies...")
6
+ for idx, (iid, model_results) in enumerate(traces.items()):
7
+ if idx % 100 == 0:
8
+ print(f" Progress: {idx}/{len(traces)}")
9
  problem = next(iter(model_results.values()))['problem']
10
  task_type = classify_task(problem)
11
  floor = TASK_FLOOR.get(task_type, 2)
 
27
  policies['frontier']['cost'] += model_results[f_model]['cost']
28
  policies['frontier']['n'] += 1
29
 
30
+ # BERT (per-tier success prediction with cascade)
31
+ bert_tier, bert_conf, bert_probs = route_bert(problem)
32
  bert_tier = max(bert_tier, floor)
33
  m_bert = TIER_TO_SWE.get(bert_tier, f_model)
34
  if m_bert in model_results:
 
39
  policies['bert']['cost'] += model_results.get(f_model,{}).get('cost',0.3)
40
  policies['bert']['n'] += 1
41
 
42
+ # v10 XGBoost
43
+ v10_tier, v10_conf, v10_probs = route_v10(problem)
44
+ v10_tier = max(v10_tier, floor)
45
+ m_v10 = TIER_TO_SWE.get(v10_tier, f_model)
46
+ if m_v10 in model_results:
47
+ policies['v10_xgboost']['success'] += int(model_results[m_v10]['resolved'])
48
+ policies['v10_xgboost']['cost'] += model_results[m_v10]['cost']
49
  else:
50
+ policies['v10_xgboost']['success'] += int(model_results.get(f_model,{}).get('resolved',0))
51
+ policies['v10_xgboost']['cost'] += model_results.get(f_model,{}).get('cost',0.3)
52
+ policies['v10_xgboost']['n'] += 1
53
 
54
+ # BERT + feedback (escalate on failure)
55
  if m_bert in model_results and model_results[m_bert]['resolved']:
56
  policies['bert_feedback']['success'] += 1
57
  policies['bert_feedback']['cost'] += model_results[m_bert]['cost']
 
68
  policies['bert_feedback']['cost'] += model_results.get(m_bert,{}).get('cost',0.01)
69
  policies['bert_feedback']['n'] += 1
70
 
71
+ # v10 + feedback
72
+ if m_v10 in model_results and model_results[m_v10]['resolved']:
73
+ policies['v10_feedback']['success'] += 1
74
+ policies['v10_feedback']['cost'] += model_results[m_v10]['cost']
75
  else:
76
+ up_tier = min(v10_tier + 1, 5)
77
  m_up = TIER_TO_SWE.get(up_tier, f_model)
78
  if m_up in model_results and model_results[m_up]['resolved']:
79
+ policies['v10_feedback']['success'] += 1
80
+ policies['v10_feedback']['cost'] += model_results.get(m_v10,{}).get('cost',0.01) + model_results[m_up]['cost']
81
  elif f_model in model_results and model_results[f_model]['resolved']:
82
+ policies['v10_feedback']['success'] += 1
83
+ policies['v10_feedback']['cost'] += model_results.get(m_v10,{}).get('cost',0.01) + model_results[f_model]['cost']
84
  else:
85
+ policies['v10_feedback']['cost'] += model_results.get(m_v10,{}).get('cost',0.01)
86
+ policies['v10_feedback']['n'] += 1
87
 
88
  # Always cheap
89
  c_model = 'deepseek-v4-flash'
 
91
  policies['always_cheap']['success'] += int(model_results[c_model]['resolved'])
92
  policies['always_cheap']['cost'] += model_results[c_model]['cost']
93
  policies['always_cheap']['n'] += 1
94
+
95
+ print(f" Progress: {len(traces)}/{len(traces)} - DONE")