|
|
| |
# ---------------------------------------------------------------------------
# [4] Replay every routing policy over the recorded traces.
#
# `traces` maps an instance id to {model_name: result}; each result is read
# for the keys 'problem', 'resolved' and 'cost'.  For every policy we
# accumulate, under policies[<name>]:
#   success - number of instances the policy resolved
#   cost    - summed cost of the model run(s) the policy is charged for
#   n       - number of instances counted for the policy
# ---------------------------------------------------------------------------
policies = defaultdict(lambda: {"success": 0, "cost": 0.0, "n": 0})

# Loop-invariant model names: the fallback/"frontier" model and the model
# used by the always-cheap baseline.
f_model = 'claude-opus-4.7'
c_model = 'deepseek-v4-flash'


def _score_direct(stats, model_results, model, fallback_model):
    """Charge one instance to a single-shot routing policy.

    Uses the recorded run of `model` when present; otherwise falls back to
    the recorded run of `fallback_model`, and if that is missing too, counts
    the instance as unresolved with a default cost of 0.3.
    """
    if model in model_results:
        outcome = model_results[model]
        stats['success'] += int(outcome['resolved'])
        stats['cost'] += outcome['cost']
    else:
        outcome = model_results.get(fallback_model, {})
        stats['success'] += int(outcome.get('resolved', 0))
        stats['cost'] += outcome.get('cost', 0.3)
    stats['n'] += 1


def _score_with_feedback(stats, model_results, tier, first_model, frontier_model):
    """Charge one instance to a route-then-escalate policy.

    Order of attempts: `first_model`, then the model mapped to the next tier
    (capped at 5), then `frontier_model`.  The first attempt is always
    charged (0.01 when no run of `first_model` was recorded); on a later
    success the succeeding model's cost is added on top.
    """
    if first_model in model_results and model_results[first_model]['resolved']:
        stats['success'] += 1
        stats['cost'] += model_results[first_model]['cost']
    else:
        first_cost = model_results.get(first_model, {}).get('cost', 0.01)
        escalated = TIER_TO_SWE.get(min(tier + 1, 5), frontier_model)
        if escalated in model_results and model_results[escalated]['resolved']:
            stats['success'] += 1
            stats['cost'] += first_cost + model_results[escalated]['cost']
        elif frontier_model in model_results and model_results[frontier_model]['resolved']:
            # NOTE(review): the failed `escalated` attempt is not charged
            # here (nor in the branch below) - confirm this is intended.
            stats['success'] += 1
            stats['cost'] += first_cost + model_results[frontier_model]['cost']
        else:
            stats['cost'] += first_cost
    stats['n'] += 1


print("\n[4] Evaluating all policies...")
for idx, (iid, model_results) in enumerate(traces.items()):
    if idx % 100 == 0:
        print(f" Progress: {idx}/{len(traces)}")

    # Every result of an instance carries the problem text; take it from
    # whichever result comes first.
    problem = next(iter(model_results.values()))['problem']
    task_type = classify_task(problem)
    # Lowest tier the routers may pick for this task type (2 when unlisted).
    floor = TASK_FLOOR.get(task_type, 2)

    # --- oracle: the resolving model with the lowest tier cost -------------
    # Ranking uses TIER_COST of the model's tier (1.0 when the tier is
    # unlisted), while the amount charged is that run's recorded 'cost'.
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: TIER_COST.get(MODEL_TIER[x[0]], 1.0))
        policies['oracle']['success'] += 1
        policies['oracle']['cost'] += cheapest[1]['cost']
    else:
        # Nothing resolved: charge the cheapest recorded run.
        policies['oracle']['cost'] += min(r['cost'] for r in model_results.values())
    policies['oracle']['n'] += 1

    # --- frontier: always use f_model --------------------------------------
    # NOTE(review): the original indentation was lost; `n` is assumed to be
    # counted for every instance, like the other policies - confirm.
    if f_model in model_results:
        policies['frontier']['success'] += int(model_results[f_model]['resolved'])
        policies['frontier']['cost'] += model_results[f_model]['cost']
    policies['frontier']['n'] += 1

    # --- bert router (single shot) -----------------------------------------
    bert_tier, bert_conf, bert_probs = route_bert(problem)
    bert_tier = max(bert_tier, floor)
    m_bert = TIER_TO_SWE.get(bert_tier, f_model)
    _score_direct(policies['bert'], model_results, m_bert, f_model)

    # --- v10 router (single shot) ------------------------------------------
    v10_tier, v10_conf, v10_probs = route_v10(problem)
    v10_tier = max(v10_tier, floor)
    m_v10 = TIER_TO_SWE.get(v10_tier, f_model)
    _score_direct(policies['v10_xgboost'], model_results, m_v10, f_model)

    # --- routers with escalation on failure --------------------------------
    _score_with_feedback(policies['bert_feedback'], model_results, bert_tier, m_bert, f_model)
    _score_with_feedback(policies['v10_feedback'], model_results, v10_tier, m_v10, f_model)

    # --- always_cheap: always use c_model ----------------------------------
    # NOTE(review): same assumption about `n` as for the frontier policy.
    if c_model in model_results:
        policies['always_cheap']['success'] += int(model_results[c_model]['resolved'])
        policies['always_cheap']['cost'] += model_results[c_model]['cost']
    policies['always_cheap']['n'] += 1

print(f" Progress: {len(traces)}/{len(traces)} - DONE")
|
|