Upload training/swe_bench_eval.py with huggingface_hub
Browse files- training/swe_bench_eval.py +215 -0
training/swe_bench_eval.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
"""Real SWE-bench benchmark: Evaluate ACO router against SWE-Router traces."""
import json
import random  # NOTE(review): unused in this script — kept to avoid breaking anything
import sys
from collections import defaultdict

from datasets import load_dataset

# Models for which SWE-Router publishes per-instance trace datasets
# (SWE-Router/swebench-verified-<model>).
MODELS = ['claude-opus-4.7', 'gpt-5-mini', 'gpt-5-nano', 'gpt-5.2',
          'gemini-2.5-pro', 'gemini-3-pro', 'deepseek-v3.2', 'deepseek-v4-flash']

# Approximate model tier mapping based on capability (1 = cheapest, 5 = expert).
MODEL_TIER = {
    'deepseek-v4-flash': 1, 'gpt-5-nano': 1,
    'gpt-5-mini': 2, 'deepseek-v3.2': 2,
    'gemini-2.5-pro': 3,
    'claude-opus-4.7': 4, 'gpt-5.2': 4,
    'gemini-3-pro': 5,
}
# NOTE(review): never populated or read anywhere below — dead; kept for interface
# compatibility in case another module imports it.
MODEL_COST_PER_CALL = {}

print("=" * 80)
print("REAL SWE-BENCH BENCHMARK: ACO vs ALWAYS-FRONTIER")
print("=" * 80)
# Load all traces
print("\n[1] Loading SWE-Router traces...")
# traces maps instance_id -> {model_name: per-run record}.
traces = defaultdict(dict)
for model_name in MODELS:
    dataset = load_dataset(f'SWE-Router/swebench-verified-{model_name}', split='test')
    for record in dataset:
        instance = record['instance_id']
        traces[instance][model_name] = {
            'resolved': record['resolved'],
            'cost': float(record['instance_cost']),
            'api_calls': int(record['api_calls']),
            # keep only a prefix of the problem statement for routing
            'problem': record['problem_statement'][:200],
        }
    print(f" {model_name}: loaded")

print(f"\n Total tasks: {len(traces)}")
print(f" Total traces: {sum(len(v) for v in traces.values())}")
# For each task, determine: cheapest successful model, optimal tier, etc.
print("\n[2] Analyzing per-task results...")
task_analysis = []
for iid, model_results in traces.items():
    # Models that actually resolved this instance.
    resolved_models = [(m, r) for m, r in model_results.items() if r['resolved']]
    # (removed unused `failed_models` list — it was computed and never read)
    if resolved_models:
        # Oracle choice: the cheapest model that actually resolved the task.
        cheapest = min(resolved_models, key=lambda x: x[1]['cost'])
        optimal_tier = MODEL_TIER[cheapest[0]]
        optimal_cost = cheapest[1]['cost']
    else:
        # Nothing solved it: attribute the top tier and the cheapest failed attempt.
        optimal_tier = 5
        optimal_cost = min(r['cost'] for r in model_results.values())
    # Cheapest successful frontier-tier (>= 4) run; inf when no frontier model succeeded.
    frontier_models = [(m, r) for m, r in model_results.items()
                       if MODEL_TIER[m] >= 4 and r['resolved']]
    frontier_cost = min(r['cost'] for m, r in frontier_models) if frontier_models else float('inf')
    task_analysis.append({
        'instance_id': iid,
        'optimal_tier': optimal_tier,
        'optimal_cost': optimal_cost,
        'frontier_cost': frontier_cost,
        'n_resolved': len(resolved_models),
        'n_models': len(model_results),
    })

n = len(task_analysis)
opt_tier_dist = defaultdict(int)
for t in task_analysis:
    opt_tier_dist[t['optimal_tier']] += 1

print(" Optimal tier distribution:")
for tier in sorted(opt_tier_dist.keys()):
    print(f" Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/n*100:.1f}%)")
# Simulate routing policies
print("\n[3] Simulating routing policies...")

# For each task, determine what ACO would have routed.
sys.path.insert(0, "/app")
from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.config import ACOConfig

classifier = TaskCostClassifier()
router = ModelCascadeRouter(
    model_path="/app/router_models/router_bundle_v8.pkl",
    task_floor={"coding": 3},
)

# Map ACO tiers to SWE-Router models.
TIER_TO_SWE = {
    1: 'deepseek-v4-flash',  # cheapest available
    2: 'gpt-5-mini',         # cheap cloud
    3: 'deepseek-v3.2',      # medium (close in cost)
    4: 'claude-opus-4.7',    # frontier
    5: 'gemini-3-pro',       # specialist/expert
}
def route_aco(problem_text):
    """Classify a problem statement and ask the ACO router for a choice.

    Returns a tuple (tier, swe_router_model_name, dynamic_difficulty);
    unknown tiers fall back to the frontier model.
    """
    prediction = classifier.classify(problem_text)
    decision = router.route(problem_text, "coding", prediction["difficulty"], prediction)
    chosen_model = TIER_TO_SWE.get(decision.tier, 'claude-opus-4.7')
    return decision.tier, chosen_model, decision.dynamic_difficulty
# Evaluate each policy.
policy_results = defaultdict(lambda: {"success": 0, "cost": 0.0, "n": 0})

FRONTIER_MODEL = 'claude-opus-4.7'
CHEAP_MODEL = 'deepseek-v4-flash'


def _record(policy_name, run):
    """Fold one model run's outcome (resolved flag + cost) into a policy bucket."""
    policy_results[policy_name]['success'] += int(run['resolved'])
    policy_results[policy_name]['cost'] += run['cost']


for t in task_analysis:
    iid = t['instance_id']
    model_results = traces[iid]
    problem = next(iter(model_results.values()))['problem']

    # Policy: always frontier (tier 4).
    # NOTE: n only counts tasks that have a frontier trace (same for always_cheap),
    # whereas aco_v8 counts every task — averages are over slightly different sets.
    if FRONTIER_MODEL in model_results:
        _record('always_frontier', model_results[FRONTIER_MODEL])
        policy_results['always_frontier']['n'] += 1

    # Policy: always cheap (tier 1).
    if CHEAP_MODEL in model_results:
        _record('always_cheap', model_results[CHEAP_MODEL])
        policy_results['always_cheap']['n'] += 1

    # Policy: ACO router, falling back to the frontier model when the routed
    # model has no trace for this instance.
    tier, model, diff = route_aco(problem)
    if model in model_results:
        _record('aco_v8', model_results[model])
    elif FRONTIER_MODEL in model_results:
        _record('aco_v8', model_results[FRONTIER_MODEL])
    policy_results['aco_v8']['n'] += 1

    # Policy: oracle — cheapest successful model; when nothing succeeds,
    # charge the cheapest failed attempt and score 0.
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: x[1]['cost'])
        policy_results['oracle']['success'] += 1
        policy_results['oracle']['cost'] += cheapest[1]['cost']
    else:
        policy_results['oracle']['cost'] += min(r['cost'] for r in model_results.values())
    policy_results['oracle']['n'] += 1
# Print results
print(f"\n\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-" * 50)
fr = policy_results['always_frontier']
# Baseline: average cost of always routing to the frontier model.
fr_cost = fr['cost'] / fr['n']
for name in ['oracle', 'aco_v8', 'always_frontier', 'always_cheap']:
    stats = policy_results[name]
    success_rate = stats['success'] / stats['n']
    avg_cost = stats['cost'] / stats['n']
    cost_reduction = (1 - avg_cost / fr_cost) * 100
    print(f"{name:<20} {success_rate:>10.3f} {avg_cost:>10.4f} {cost_reduction:>9.1f}%")
# v9 with feedback: if ACO routes to tier < 4, try cheap first, escalate if needed
# Simulate: use ACO's initial tier, but if that model fails, try tier+1
policy_v9 = {"success": 0, "cost": 0.0, "n": 0}
for t in task_analysis:
    iid = t['instance_id']
    model_results = traces[iid]
    problem = next(iter(model_results.values()))['problem']
    tier, model, diff = route_aco(problem)

    if model in model_results and model_results[model]['resolved']:
        # ACO's initial choice succeeded.
        policy_v9['success'] += 1
        policy_v9['cost'] += model_results[model]['cost']
    elif tier < 5:
        # First attempt failed OR had no trace for this instance: escalate one tier.
        up_tier = min(tier + 1, 5)
        up_model = TIER_TO_SWE.get(up_tier, 'claude-opus-4.7')
        if up_model in model_results and model_results[up_model]['resolved']:
            policy_v9['success'] += 1
            # Pay for both the failed first attempt and the escalation.
            # BUG FIX: this branch is reachable when `model` has no trace
            # (the guard above is a conjunction), so direct indexing
            # model_results[model] could raise KeyError — use .get() like
            # the failure branch below does.
            policy_v9['cost'] += model_results.get(model, {}).get('cost', 0)
            policy_v9['cost'] += model_results[up_model]['cost']
        else:
            # Both attempts failed: pay for whatever traces exist.
            policy_v9['cost'] += model_results.get(model, {}).get('cost', 0)
            policy_v9['cost'] += model_results.get(up_model, {}).get('cost', 0)
    else:
        # Already at the top tier and it failed: no escalation possible.
        policy_v9['cost'] += model_results.get(model, {}).get('cost', 0)
    policy_v9['n'] += 1

policy_results['aco_v9_feedback'] = policy_v9
# Final comparison
print("\n\nFINAL REAL-WORLD SWE-BENCH RESULTS:")
print(f"{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-" * 50)
for name in ('oracle', 'aco_v9_feedback', 'aco_v8', 'always_frontier', 'always_cheap'):
    bucket = policy_results[name]
    success_rate = bucket['success'] / bucket['n']
    avg_cost = bucket['cost'] / bucket['n']
    # Cost reduction relative to the always-frontier baseline computed above.
    cost_reduction = (1 - avg_cost / fr_cost) * 100
    print(f"{name:<20} {success_rate:>10.3f} {avg_cost:>10.4f} {cost_reduction:>9.1f}%")
# Save
# Summarize each policy as success rate / average cost / sample count.
save_data = {
    name: {
        "success": stats["success"] / stats["n"],
        "avg_cost": stats["cost"] / stats["n"],
        "n": stats["n"],
    }
    for name, stats in policy_results.items()
}
with open("/app/swe_bench_results.json", "w") as f:
    json.dump(save_data, f, indent=2)
print("\nSaved to /app/swe_bench_results.json")
print("DONE!")