File size: 4,576 Bytes
522d0b6
 
 
 
 
911104d
 
 
522d0b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911104d
 
522d0b6
 
 
 
 
 
 
 
 
 
911104d
 
 
 
 
 
 
522d0b6
911104d
 
 
522d0b6
911104d
522d0b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911104d
 
 
 
522d0b6
911104d
522d0b6
 
911104d
 
522d0b6
911104d
 
522d0b6
911104d
 
522d0b6
 
 
 
 
 
 
911104d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

# ── Evaluate ──
# Replays every routing policy over the recorded traces and accumulates, per
# policy, the number of resolved instances ("success"), the summed spend
# ("cost") and the number of instances seen ("n").
policies = defaultdict(lambda: {"success":0,"cost":0.0,"n":0})

# Loop-invariant model names used by the fixed-model baselines and as the
# last-resort fallback of the routed policies.
f_model = 'claude-opus-4.7'
c_model = 'deepseek-v4-flash'


def _tally_fixed(stats, model_results, model):
    """Score a policy that always sends the instance to ``model``.

    An instance without a recorded run for ``model`` still counts towards
    ``n`` but contributes neither success nor cost.
    """
    if model in model_results:
        run = model_results[model]
        stats['success'] += int(run['resolved'])
        stats['cost'] += run['cost']
    stats['n'] += 1


def _tally_routed(stats, model_results, model, fallback_model):
    """Score a single-shot routed policy that picked ``model``.

    When ``model`` has no recorded run, the run of ``fallback_model`` is used
    instead; if that is missing too, the instance counts as a failure with a
    default cost of 0.3.
    """
    if model in model_results:
        run = model_results[model]
        stats['success'] += int(run['resolved'])
        stats['cost'] += run['cost']
    else:
        fallback = model_results.get(fallback_model, {})
        stats['success'] += int(fallback.get('resolved', 0))
        stats['cost'] += fallback.get('cost', 0.3)
    stats['n'] += 1


def _tally_with_escalation(stats, model_results, primary_model, tier,
                           fallback_model, max_tier=5, missing_cost=0.01):
    """Score a routed policy that escalates after a failed attempt.

    The cascade is: ``primary_model`` -> the model one tier above ``tier``
    (capped at ``max_tier``) -> ``fallback_model``.  It stops at the first
    model that resolved the instance.

    Every distinct model that is actually attempted is charged, including
    failed escalation steps, so the reported cost reflects the full cascade.
    A model that shows up twice in the cascade (e.g. the escalated tier maps
    to the fallback model, or the tier is already at ``max_tier``) is tried
    and charged only once.  A missing primary run is charged ``missing_cost``;
    missing escalation runs are skipped at no cost.
    """
    stats['n'] += 1
    escalated_model = TIER_TO_SWE.get(min(tier + 1, max_tier), fallback_model)

    first = model_results.get(primary_model, {})
    stats['cost'] += first.get('cost', missing_cost)
    if first.get('resolved'):
        stats['success'] += 1
        return

    attempted = {primary_model}
    for model in (escalated_model, fallback_model):
        if model in attempted or model not in model_results:
            continue
        attempted.add(model)
        run = model_results[model]
        stats['cost'] += run['cost']
        if run['resolved']:
            stats['success'] += 1
            return


print("\n[4] Evaluating all policies...")
for idx, (iid, model_results) in enumerate(traces.items()):
    if idx % 100 == 0:
        print(f"  Progress: {idx}/{len(traces)}")
    # Every per-model record of an instance carries the problem text; read it
    # from whichever record comes first.
    problem = next(iter(model_results.values()))['problem']
    task_type = classify_task(problem)
    # Lowest tier a router may pick for this task type (2 for unknown types).
    floor = TASK_FLOOR.get(task_type, 2)

    # Oracle: among the models that resolved the instance, take the one whose
    # tier has the lowest TIER_COST (tiers without an entry are priced at 1.0)
    # and charge its recorded cost.  If nothing resolved the instance, charge
    # the cheapest recorded run.
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: TIER_COST.get(MODEL_TIER[x[0]], 1.0))
        policies['oracle']['success'] += 1
        policies['oracle']['cost'] += cheapest[1]['cost']
    else:
        policies['oracle']['cost'] += min(r['cost'] for r in model_results.values())
    policies['oracle']['n'] += 1

    # Always frontier
    _tally_fixed(policies['frontier'], model_results, f_model)

    # BERT (per-tier success prediction with cascade)
    bert_tier, bert_conf, bert_probs = route_bert(problem)
    bert_tier = max(bert_tier, floor)
    m_bert = TIER_TO_SWE.get(bert_tier, f_model)
    _tally_routed(policies['bert'], model_results, m_bert, f_model)

    # v10 XGBoost
    v10_tier, v10_conf, v10_probs = route_v10(problem)
    v10_tier = max(v10_tier, floor)
    m_v10 = TIER_TO_SWE.get(v10_tier, f_model)
    _tally_routed(policies['v10_xgboost'], model_results, m_v10, f_model)

    # BERT + feedback (escalate on failure)
    _tally_with_escalation(policies['bert_feedback'], model_results,
                           m_bert, bert_tier, f_model)

    # v10 + feedback
    _tally_with_escalation(policies['v10_feedback'], model_results,
                           m_v10, v10_tier, f_model)

    # Always cheap
    _tally_fixed(policies['always_cheap'], model_results, c_model)

print(f"  Progress: {len(traces)}/{len(traces)} - DONE")