narcolepticchicken committed on
Commit 9ced75f · verified · 1 Parent(s): 9234215

Upload training/swe_bench_eval.py with huggingface_hub
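The commit message indicates the upload was made with the huggingface_hub client. A minimal sketch of the kind of call that produces such a commit (the repo id is a placeholder; the target repository is not shown on this page):

    # Hypothetical reconstruction of the upload call; "<namespace>/<repo>" is a
    # placeholder for the actual repository, which this page does not show.
    from huggingface_hub import HfApi

    HfApi().upload_file(
        path_or_fileobj="training/swe_bench_eval.py",
        path_in_repo="training/swe_bench_eval.py",
        repo_id="<namespace>/<repo>",
        commit_message="Upload training/swe_bench_eval.py with huggingface_hub",
    )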

Files changed (1)
  1. training/swe_bench_eval.py +215 -0
training/swe_bench_eval.py ADDED
@@ -0,0 +1,215 @@
+ #!/usr/bin/env python3
+ """Real SWE-bench benchmark: evaluate the ACO router against SWE-Router traces."""
+ import sys, json
+ from collections import defaultdict
+ from datasets import load_dataset
+
+ MODELS = ['claude-opus-4.7', 'gpt-5-mini', 'gpt-5-nano', 'gpt-5.2',
+           'gemini-2.5-pro', 'gemini-3-pro', 'deepseek-v3.2', 'deepseek-v4-flash']
+
+ # Approximate model tier mapping based on capability
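+ # Tiers run from 1 (cheapest) to 5 (specialist); see TIER_TO_SWE below.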
+ MODEL_TIER = {
+     'deepseek-v4-flash': 1, 'gpt-5-nano': 1,
+     'gpt-5-mini': 2, 'deepseek-v3.2': 2,
+     'gemini-2.5-pro': 3,
+     'claude-opus-4.7': 4, 'gpt-5.2': 4,
+     'gemini-3-pro': 5,
+ }
+
+ print("="*80)
+ print("REAL SWE-BENCH BENCHMARK: ACO vs ALWAYS-FRONTIER")
+ print("="*80)
+
+ # Load all traces
+ print("\n[1] Loading SWE-Router traces...")
+ traces = defaultdict(dict)
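+ # traces[instance_id][model] records whether the run resolved the instance,
+ # its cost, its API-call count, and a 200-char problem snippet.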
+ for model in MODELS:
+     ds = load_dataset(f'SWE-Router/swebench-verified-{model}', split='test')
+     for row in ds:
+         iid = row['instance_id']
+         traces[iid][model] = {
+             'resolved': row['resolved'],
+             'cost': float(row['instance_cost']),
+             'api_calls': int(row['api_calls']),
+             'problem': row['problem_statement'][:200],
+         }
+     print(f" {model}: loaded")
+
+ print(f"\n Total tasks: {len(traces)}")
+ print(f" Total traces: {sum(len(v) for v in traces.values())}")
+
+ # For each task, determine the cheapest successful model and the optimal tier.
+ print("\n[2] Analyzing per-task results...")
+ task_analysis = []
+ for iid, model_results in traces.items():
+     resolved_models = [(m, r) for m, r in model_results.items() if r['resolved']]
+     if resolved_models:
+         cheapest = min(resolved_models, key=lambda x: x[1]['cost'])
+         optimal_tier = MODEL_TIER[cheapest[0]]
+         optimal_cost = cheapest[1]['cost']
+     else:
+         # No model solved this task: the best any policy can do is fail cheaply.
+         optimal_tier = 5
+         optimal_cost = min(r['cost'] for r in model_results.values())
+     frontier_models = [(m, r) for m, r in model_results.items()
+                        if MODEL_TIER[m] >= 4 and r['resolved']]
+     frontier_cost = min(r['cost'] for _, r in frontier_models) if frontier_models else float('inf')
+     task_analysis.append({
+         'instance_id': iid,
+         'optimal_tier': optimal_tier,
+         'optimal_cost': optimal_cost,
+         'frontier_cost': frontier_cost,
+         'n_resolved': len(resolved_models),
+         'n_models': len(model_results),
+     })
+
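+ # Note: optimal_cost, frontier_cost, and n_resolved are recorded for analysis;
+ # the policy simulation below looks tasks up by instance_id only.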
+ n = len(task_analysis)
+ opt_tier_dist = defaultdict(int)
+ for t in task_analysis:
+     opt_tier_dist[t['optimal_tier']] += 1
+
+ print(" Optimal tier distribution:")
+ for tier in sorted(opt_tier_dist.keys()):
+     print(f"   Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/n*100:.1f}%)")
+
+ # Simulate routing policies
+ print("\n[3] Simulating routing policies...")
+
+ # For each task, determine what ACO would have routed
+ sys.path.insert(0, "/app")
+ from aco.classifier import TaskCostClassifier
+ from aco.router import ModelCascadeRouter
+
+ classifier = TaskCostClassifier()
+ router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl",
+                             task_floor={"coding": 3})
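+ # Assumption: task_floor looks like it sets a minimum routing tier (here 3)
+ # for "coding" tasks; the ACO router's semantics are not verified here.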
+
+ # Map ACO tiers to SWE-Router models
+ TIER_TO_SWE = {
+     1: 'deepseek-v4-flash',  # cheapest available
+     2: 'gpt-5-mini',         # cheap cloud
+     3: 'deepseek-v3.2',      # medium (close in cost)
+     4: 'claude-opus-4.7',    # frontier
+     5: 'gemini-3-pro',       # specialist/expert
+ }
+
+ def route_aco(problem_text):
+     """Classify the problem and route it; returns (tier, mapped SWE-Router model, dynamic difficulty)."""
+     pred = classifier.classify(problem_text)
+     r = router.route(problem_text, "coding", pred["difficulty"], pred)
+     model = TIER_TO_SWE.get(r.tier, 'claude-opus-4.7')
+     return r.tier, model, r.dynamic_difficulty
+
+ # Evaluate each policy
+ policy_results = defaultdict(lambda: {"success": 0, "cost": 0.0, "n": 0})
+
+ for t in task_analysis:
+     iid = t['instance_id']
+     model_results = traces[iid]
+     problem = next(iter(model_results.values()))['problem']
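+     # Note: routing below sees only the 200-char problem snippet stored above.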
+
+     # Policy: always frontier (tier 4)
+     frontier_model = 'claude-opus-4.7'
+     if frontier_model in model_results:
+         r = model_results[frontier_model]
+         policy_results['always_frontier']['success'] += int(r['resolved'])
+         policy_results['always_frontier']['cost'] += r['cost']
+         policy_results['always_frontier']['n'] += 1
+
+     # Policy: always cheap (tier 1)
+     cheap_model = 'deepseek-v4-flash'
+     if cheap_model in model_results:
+         r = model_results[cheap_model]
+         policy_results['always_cheap']['success'] += int(r['resolved'])
+         policy_results['always_cheap']['cost'] += r['cost']
+         policy_results['always_cheap']['n'] += 1
+
+     # Policy: ACO router
+     tier, model, diff = route_aco(problem)
+     if model in model_results:
+         r = model_results[model]
+         policy_results['aco_v8']['success'] += int(r['resolved'])
+         policy_results['aco_v8']['cost'] += r['cost']
+     elif frontier_model in model_results:
+         # No trace for the routed model: fall back to the frontier trace.
+         r = model_results[frontier_model]
+         policy_results['aco_v8']['success'] += int(r['resolved'])
+         policy_results['aco_v8']['cost'] += r['cost']
+     policy_results['aco_v8']['n'] += 1
+
+     # Policy: oracle (cheapest successful model, chosen with hindsight)
+     resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
+     if resolved:
+         cheapest = min(resolved, key=lambda x: x[1]['cost'])
+         policy_results['oracle']['success'] += 1
+         policy_results['oracle']['cost'] += cheapest[1]['cost']
+     else:
+         # Unsolvable task: count a failure at the cheapest attempt's cost.
+         policy_results['oracle']['cost'] += min(r['cost'] for r in model_results.values())
+     policy_results['oracle']['n'] += 1
+
+ # Print results
+ print(f"\n\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
+ print("-"*50)
+ fr = policy_results['always_frontier']
+ fr_cost = fr['cost']/fr['n']
+ for name in ['oracle', 'aco_v8', 'always_frontier', 'always_cheap']:
+     r = policy_results[name]
+     sr = r['success']/r['n']
+     ac = r['cost']/r['n']
+     cr = (1 - ac/fr_cost)*100  # cost reduction relative to always_frontier
+     print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")
+
+ # v9 with feedback: use ACO's initial tier; if that model fails, escalate one
+ # tier (up to 5) and pay for both attempts.
+ policy_v9 = {"success": 0, "cost": 0.0, "n": 0}
+ for t in task_analysis:
+     iid = t['instance_id']
+     model_results = traces[iid]
+     problem = next(iter(model_results.values()))['problem']
+     tier, model, diff = route_aco(problem)
+
+     if model in model_results and model_results[model]['resolved']:
+         # ACO's initial choice succeeded
+         policy_v9['success'] += 1
+         policy_v9['cost'] += model_results[model]['cost']
+     elif tier < 5:
+         # Initial attempt failed (or has no trace): try one tier up and pay
+         # for both the failed attempt and the escalation.
+         up_model = TIER_TO_SWE.get(tier + 1, 'claude-opus-4.7')
+         policy_v9['cost'] += model_results.get(model, {}).get('cost', 0)
+         policy_v9['cost'] += model_results.get(up_model, {}).get('cost', 0)
+         if up_model in model_results and model_results[up_model]['resolved']:
+             policy_v9['success'] += 1
+     else:
+         policy_v9['cost'] += model_results.get(model, {}).get('cost', 0)
+     policy_v9['n'] += 1
+
+ policy_results['aco_v9_feedback'] = policy_v9
+
+ # Final comparison
+ print("\n\nFINAL REAL-WORLD SWE-BENCH RESULTS:")
+ print(f"{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
+ print("-"*50)
+ for name in ['oracle', 'aco_v9_feedback', 'aco_v8', 'always_frontier', 'always_cheap']:
+     r = policy_results[name]
+     sr = r['success']/r['n']
+     ac = r['cost']/r['n']
+     cr = (1 - ac/fr_cost)*100
+     print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")
+
+ # Save
+ save_data = {}
+ for name, r in policy_results.items():
+     save_data[name] = {"success": r["success"]/r["n"],
+                        "avg_cost": r["cost"]/r["n"],
+                        "n": r["n"]}
+ with open("/app/swe_bench_results.json", "w") as f:
+     json.dump(save_data, f, indent=2)
+ print("\nSaved to /app/swe_bench_results.json")
+ print("DONE!")