| |
| """Real SWE-bench benchmark: Evaluate ACO router against SWE-Router traces.""" |
| import sys,json,random |
| from collections import defaultdict |
| from datasets import load_dataset |
|
|
# Candidate models replayed from the SWE-Router trace datasets.
MODELS = [
    'claude-opus-4.7', 'gpt-5-mini', 'gpt-5-nano', 'gpt-5.2',
    'gemini-2.5-pro', 'gemini-3-pro', 'deepseek-v3.2', 'deepseek-v4-flash',
]

# Capability/cost tier per model: 1 = cheapest, 5 = frontier.
MODEL_TIER = {
    'deepseek-v4-flash': 1,
    'gpt-5-nano': 1,
    'gpt-5-mini': 2,
    'deepseek-v3.2': 2,
    'gemini-2.5-pro': 3,
    'claude-opus-4.7': 4,
    'gpt-5.2': 4,
    'gemini-3-pro': 5,
}
# Placeholder for per-call pricing; not populated anywhere in this script.
MODEL_COST_PER_CALL = {}
|
|
print("=" * 80)
print("REAL SWE-BENCH BENCHMARK: ACO vs ALWAYS-FRONTIER")
print("=" * 80)

# [1] Load per-model SWE-bench Verified traces from the SWE-Router datasets.
# traces maps instance_id -> {model name -> trace summary}.
print("\n[1] Loading SWE-Router traces...")
traces = defaultdict(dict)
for model in MODELS:
    dataset = load_dataset(f'SWE-Router/swebench-verified-{model}', split='test')
    for record in dataset:
        instance = record['instance_id']
        traces[instance][model] = {
            'resolved': record['resolved'],
            'cost': float(record['instance_cost']),
            'api_calls': int(record['api_calls']),
            # Keep only a short prefix of the problem statement as router input.
            'problem': record['problem_statement'][:200],
        }
    print(f" {model}: loaded")

print(f"\n Total tasks: {len(traces)}")
print(f" Total traces: {sum(len(v) for v in traces.values())}")
|
|
| |
# [2] For each task, determine the cheapest successful model (the "oracle"
# choice), its tier, and the cheapest successful frontier-tier (>= 4) cost.
# Fix vs original: the unused local `failed_models` has been removed.
print("\n[2] Analyzing per-task results...")
task_analysis = []
for iid, model_results in traces.items():
    resolved_models = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved_models:
        cheapest = min(resolved_models, key=lambda x: x[1]['cost'])
        optimal_tier = MODEL_TIER[cheapest[0]]
        optimal_cost = cheapest[1]['cost']
    else:
        # No model solved the task: treat it as frontier-hard and charge the
        # cheapest (failed) attempt as the unavoidable cost.
        optimal_tier = 5
        optimal_cost = min(r['cost'] for r in model_results.values())
    # Cheapest successful run among frontier-tier models, inf if none solved it.
    frontier_models = [(m, r) for m, r in model_results.items()
                       if MODEL_TIER[m] >= 4 and r['resolved']]
    frontier_cost = min(r['cost'] for m, r in frontier_models) if frontier_models else float('inf')
    task_analysis.append({
        'instance_id': iid,
        'optimal_tier': optimal_tier,
        'optimal_cost': optimal_cost,
        'frontier_cost': frontier_cost,
        'n_resolved': len(resolved_models),
        'n_models': len(model_results),
    })

n = len(task_analysis)
opt_tier_dist = defaultdict(int)
for t in task_analysis:
    opt_tier_dist[t['optimal_tier']] += 1

print(" Optimal tier distribution:")
for tier in sorted(opt_tier_dist.keys()):
    print(f" Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/n*100:.1f}%)")
|
|
| |
print("\n[3] Simulating routing policies...")

# Make the ACO package importable.
# NOTE(review): paths assume the /app container layout — confirm before
# running this script elsewhere.
sys.path.insert(0,"/app")
from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.config import ACOConfig

# Difficulty classifier for incoming problem statements.
classifier = TaskCostClassifier()
# Cascade router loaded from the pre-trained v8 bundle; task_floor
# presumably pins "coding" tasks to tier >= 3 — verify against the
# ModelCascadeRouter implementation.
router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl",
                            task_floor={"coding":3})
|
|
| |
# Map each ACO cascade tier (1..5) onto one representative SWE-bench model.
TIER_TO_SWE = dict(enumerate(
    ['deepseek-v4-flash', 'gpt-5-mini', 'deepseek-v3.2',
     'claude-opus-4.7', 'gemini-3-pro'],
    start=1,
))
|
|
def route_aco(problem_text):
    """Classify *problem_text* and route it through the ACO cascade.

    Returns a tuple ``(tier, swe_model_name, dynamic_difficulty)`` where the
    model name falls back to 'claude-opus-4.7' for tiers missing from
    TIER_TO_SWE.
    """
    features = classifier.classify(problem_text)
    decision = router.route(problem_text, "coding", features["difficulty"], features)
    chosen = TIER_TO_SWE.get(decision.tier, 'claude-opus-4.7')
    return decision.tier, chosen, decision.dynamic_difficulty
|
|
| |
# Replay the recorded traces under four routing policies.
# policy_results[name] accumulates successes, total cost, and task count.
# Fix vs original: the success/cost tally was duplicated verbatim for three
# policies; it is factored into the _tally helper (behavior unchanged).
policy_results = defaultdict(lambda: {"success":0,"cost":0.0,"n":0})


def _tally(policy_name, trace):
    """Credit one replayed attempt (resolved flag + cost) to *policy_name*."""
    policy_results[policy_name]['success'] += int(trace['resolved'])
    policy_results[policy_name]['cost'] += trace['cost']


for t in task_analysis:
    iid = t['instance_id']
    model_results = traces[iid]
    problem = next(iter(model_results.values()))['problem']

    # Policy 1: always send the task to a fixed frontier model.
    frontier_model = 'claude-opus-4.7'
    if frontier_model in model_results:
        _tally('always_frontier', model_results[frontier_model])
        policy_results['always_frontier']['n'] += 1

    # Policy 2: always send the task to the cheapest model.
    cheap_model = 'deepseek-v4-flash'
    if cheap_model in model_results:
        _tally('always_cheap', model_results[cheap_model])
        policy_results['always_cheap']['n'] += 1

    # Policy 3: ACO v8 router; fall back to the frontier model when the
    # routed model has no recorded trace for this task.
    tier, model, diff = route_aco(problem)
    if model in model_results:
        _tally('aco_v8', model_results[model])
    elif frontier_model in model_results:
        _tally('aco_v8', model_results[frontier_model])
    # NOTE(review): n is incremented even when neither trace exists,
    # matching the original accounting.
    policy_results['aco_v8']['n'] += 1

    # Policy 4: oracle — cheapest model that actually resolved the task.
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: x[1]['cost'])
        policy_results['oracle']['success'] += 1
        policy_results['oracle']['cost'] += cheapest[1]['cost']
    else:
        # Nothing solved it: zero success, charge the cheapest failed run.
        policy_results['oracle']['cost'] += min(r['cost'] for r in model_results.values())
    policy_results['oracle']['n'] += 1
|
|
| |
# First results table; cost reduction is measured against always_frontier.
print(f"\n\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-" * 50)
fr = policy_results['always_frontier']
fr_cost = fr['cost'] / fr['n']  # baseline avg cost, reused by later tables
for name in ['oracle', 'aco_v8', 'always_frontier', 'always_cheap']:
    stats = policy_results[name]
    success_rate = stats['success'] / stats['n']
    avg_cost = stats['cost'] / stats['n']
    reduction = (1 - avg_cost / fr_cost) * 100
    print(f"{name:<20} {success_rate:>10.3f} {avg_cost:>10.4f} {reduction:>9.1f}%")
|
|
| |
| |
# Policy: ACO v9 with one-step feedback — if the routed model fails, retry
# once on the next tier up, paying for both attempts.
policy_v9 = {"success":0,"cost":0.0,"n":0}
for t in task_analysis:
    iid = t['instance_id']
    model_results = traces[iid]
    problem = next(iter(model_results.values()))['problem']
    tier, model, diff = route_aco(problem)
    # Cost of the first attempt; 0 when the routed model has no trace.
    first_cost = model_results.get(model, {}).get('cost', 0)

    if model in model_results and model_results[model]['resolved']:
        # First attempt succeeded: pay only for it.
        policy_v9['success'] += 1
        policy_v9['cost'] += model_results[model]['cost']
    elif tier < 5:
        # First attempt failed (or has no trace): escalate exactly one tier.
        # (tier < 5 guarantees tier + 1 <= 5, so no clamping is needed.)
        up_model = TIER_TO_SWE.get(tier + 1, 'claude-opus-4.7')
        up_result = model_results.get(up_model)
        if up_result is not None and up_result['resolved']:
            policy_v9['success'] += 1
            # BUGFIX: the original indexed model_results[model] here, which
            # raised KeyError whenever the routed model had no trace for this
            # task; charge the first attempt defensively instead.
            policy_v9['cost'] += first_cost
            policy_v9['cost'] += up_result['cost']
        else:
            # Both attempts failed: charge whatever was actually spent.
            policy_v9['cost'] += first_cost
            policy_v9['cost'] += model_results.get(up_model, {}).get('cost', 0)
    else:
        # Already at the top tier and unresolved: charge the attempt.
        policy_v9['cost'] += first_cost
    policy_v9['n'] += 1

policy_results['aco_v9_feedback'] = policy_v9
|
|
| |
# Final summary across all policies, including the v9 feedback variant.
print("\n\nFINAL REAL-WORLD SWE-BENCH RESULTS:")
print(f"{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-" * 50)
for name in ['oracle', 'aco_v9_feedback', 'aco_v8', 'always_frontier', 'always_cheap']:
    stats = policy_results[name]
    success_rate = stats['success'] / stats['n']
    avg_cost = stats['cost'] / stats['n']
    reduction = (1 - avg_cost / fr_cost) * 100
    print(f"{name:<20} {success_rate:>10.3f} {avg_cost:>10.4f} {reduction:>9.1f}%")

# Persist normalized per-policy metrics for downstream comparison.
save_data = {
    name: {
        "success": stats["success"] / stats["n"],
        "avg_cost": stats["cost"] / stats["n"],
        "n": stats["n"],
    }
    for name, stats in policy_results.items()
}
with open("/app/swe_bench_results.json","w") as f:
    json.dump(save_data, f, indent=2)
print("\nSaved to /app/swe_bench_results.json")
print("DONE!")
|
|