File size: 4,576 Bytes
522d0b6 911104d 522d0b6 911104d 522d0b6 911104d 522d0b6 911104d 522d0b6 911104d 522d0b6 911104d 522d0b6 911104d 522d0b6 911104d 522d0b6 911104d 522d0b6 911104d 522d0b6 911104d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# ── Evaluate ──
# Replays every recorded trace against each routing policy and accumulates,
# per policy, the number of resolved instances ("success"), the summed cost
# ("cost") and the number of instances scored ("n").


def _score_routed(stats, model_results, model, fallback_model):
    """Score a single-shot routing decision into ``stats``.

    If ``model`` has a recorded result, its outcome and cost are used.
    Otherwise the result of ``fallback_model`` is used, and when that is
    missing as well the instance counts as unresolved with a cost of 0.3.
    """
    if model in model_results:
        outcome = model_results[model]
        stats['success'] += int(outcome['resolved'])
        stats['cost'] += outcome['cost']
    else:
        outcome = model_results.get(fallback_model, {})
        stats['success'] += int(outcome.get('resolved', 0))
        stats['cost'] += outcome.get('cost', 0.3)
    stats['n'] += 1


def _score_with_feedback(stats, model_results, model, tier, frontier_model,
                         max_tier=5):
    """Score a routing decision that escalates after a failed first attempt.

    The first attempt goes to ``model``. If it did not resolve the instance
    (or has no recorded result), the model one tier above ``tier`` (capped at
    ``max_tier``) is checked, then ``frontier_model``. The first of those that
    resolved the instance is charged on top of the first attempt's cost; a
    failed intermediate escalation is not charged. A missing first-attempt
    cost is taken as 0.01.
    """
    first_attempt = model_results.get(model, {})
    if model in model_results and first_attempt['resolved']:
        stats['success'] += 1
        stats['cost'] += first_attempt['cost']
    else:
        sunk_cost = first_attempt.get('cost', 0.01)
        escalated = TIER_TO_SWE.get(min(tier + 1, max_tier), frontier_model)
        for candidate in (escalated, frontier_model):
            if candidate in model_results and model_results[candidate]['resolved']:
                stats['success'] += 1
                stats['cost'] += sunk_cost + model_results[candidate]['cost']
                break
        else:
            # Nothing resolved the instance: only the first attempt is charged.
            stats['cost'] += sunk_cost
    stats['n'] += 1


policies = defaultdict(lambda: {"success": 0, "cost": 0.0, "n": 0})

# Loop-invariant model names, hoisted out of the per-instance loop.
f_model = 'claude-opus-4.7'
c_model = 'deepseek-v4-flash'

print("\n[4] Evaluating all policies...")
for idx, (iid, model_results) in enumerate(traces.items()):
    if idx % 100 == 0:
        print(f" Progress: {idx}/{len(traces)}")

    # The problem text is read from whichever model result comes first.
    problem = next(iter(model_results.values()))['problem']
    task_type = classify_task(problem)
    # Lower bound applied to the tier chosen by either router (default 2).
    floor = TASK_FLOOR.get(task_type, 2)

    # Oracle: among the models that resolved the instance, take the one whose
    # tier has the lowest TIER_COST and charge its recorded cost. If none
    # resolved it, charge the smallest recorded cost and count no success.
    oracle = policies['oracle']
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: TIER_COST.get(MODEL_TIER[x[0]], 1.0))
        oracle['success'] += 1
        oracle['cost'] += cheapest[1]['cost']
    else:
        oracle['cost'] += min(r['cost'] for r in model_results.values())
    oracle['n'] += 1

    # Always frontier
    # NOTE(review): instances without a frontier result are left out of this
    # policy's n — confirm that denominator is intended.
    if f_model in model_results:
        frontier = policies['frontier']
        frontier['success'] += int(model_results[f_model]['resolved'])
        frontier['cost'] += model_results[f_model]['cost']
        frontier['n'] += 1

    # BERT (per-tier success prediction with cascade)
    bert_tier, bert_conf, bert_probs = route_bert(problem)
    bert_tier = max(bert_tier, floor)
    m_bert = TIER_TO_SWE.get(bert_tier, f_model)
    _score_routed(policies['bert'], model_results, m_bert, f_model)

    # v10 XGBoost
    v10_tier, v10_conf, v10_probs = route_v10(problem)
    v10_tier = max(v10_tier, floor)
    m_v10 = TIER_TO_SWE.get(v10_tier, f_model)
    _score_routed(policies['v10_xgboost'], model_results, m_v10, f_model)

    # BERT + feedback (escalate on failure)
    _score_with_feedback(policies['bert_feedback'], model_results,
                         m_bert, bert_tier, f_model)

    # v10 + feedback
    _score_with_feedback(policies['v10_feedback'], model_results,
                         m_v10, v10_tier, f_model)

    # Always cheap
    # NOTE(review): instances without a result for the cheap model are left
    # out of this policy's n — confirm that denominator is intended.
    if c_model in model_results:
        always_cheap = policies['always_cheap']
        always_cheap['success'] += int(model_results[c_model]['resolved'])
        always_cheap['cost'] += model_results[c_model]['cost']
        always_cheap['n'] += 1

print(f" Progress: {len(traces)}/{len(traces)} - DONE")
|