#!/usr/bin/env python3
"""Real SWE-bench benchmark: Evaluate ACO router against SWE-Router traces."""
import sys
import json
from collections import defaultdict

from datasets import load_dataset

MODELS = [
    'claude-opus-4.7', 'gpt-5-mini', 'gpt-5-nano', 'gpt-5.2',
    'gemini-2.5-pro', 'gemini-3-pro', 'deepseek-v3.2', 'deepseek-v4-flash',
]

# Approximate model tier mapping based on capability
MODEL_TIER = {
    'deepseek-v4-flash': 1,
    'gpt-5-nano': 1,
    'gpt-5-mini': 2,
    'deepseek-v3.2': 2,
    'gemini-2.5-pro': 3,
    'claude-opus-4.7': 4,
    'gpt-5.2': 4,
    'gemini-3-pro': 5,
}

print("=" * 80)
print("REAL SWE-BENCH BENCHMARK: ACO vs ALWAYS-FRONTIER")
print("=" * 80)

# Load all traces
print("\n[1] Loading SWE-Router traces...")
traces = defaultdict(dict)
for model in MODELS:
    ds = load_dataset(f'SWE-Router/swebench-verified-{model}', split='test')
    for row in ds:
        iid = row['instance_id']
        traces[iid][model] = {
            'resolved': row['resolved'],
            'cost': float(row['instance_cost']),
            'api_calls': int(row['api_calls']),
            'problem': row['problem_statement'][:200],
        }
    print(f"  {model}: loaded")

print(f"\n  Total tasks: {len(traces)}")
print(f"  Total traces: {sum(len(v) for v in traces.values())}")

# For each task, determine the cheapest successful model and its optimal tier
print("\n[2] Analyzing per-task results...")
task_analysis = []
for iid, model_results in traces.items():
    resolved_models = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved_models:
        cheapest = min(resolved_models, key=lambda x: x[1]['cost'])
        optimal_tier = MODEL_TIER[cheapest[0]]
        optimal_cost = cheapest[1]['cost']
    else:
        # No model solved this task: treat it as top-tier; cost is the cheapest attempt
        optimal_tier = 5
        optimal_cost = min(r['cost'] for r in model_results.values())
    frontier_models = [(m, r) for m, r in model_results.items()
                       if MODEL_TIER[m] >= 4 and r['resolved']]
    frontier_cost = (min(r['cost'] for m, r in frontier_models)
                     if frontier_models else float('inf'))
    task_analysis.append({
        'instance_id': iid,
        'optimal_tier': optimal_tier,
        'optimal_cost': optimal_cost,
        'frontier_cost': frontier_cost,
        'n_resolved': len(resolved_models),
        'n_models': len(model_results),
    })

n = len(task_analysis)
opt_tier_dist = defaultdict(int)
for t in task_analysis:
    opt_tier_dist[t['optimal_tier']] += 1
print("  Optimal tier distribution:")
for tier in sorted(opt_tier_dist.keys()):
    print(f"    Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/n*100:.1f}%)")

# Simulate routing policies
print("\n[3] Simulating routing policies...")
# For each task, determine what ACO would have routed
sys.path.insert(0, "/app")
from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter

classifier = TaskCostClassifier()
router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl",
                            task_floor={"coding": 3})

# Map ACO tiers to SWE-Router models
TIER_TO_SWE = {
    1: 'deepseek-v4-flash',  # cheapest available
    2: 'gpt-5-mini',         # cheap cloud
    3: 'deepseek-v3.2',      # medium (close in cost)
    4: 'claude-opus-4.7',    # frontier
    5: 'gemini-3-pro',       # specialist/expert
}

def route_aco(problem_text):
    """Route a problem through ACO. Note: it only sees the first 200 chars stored above."""
    pred = classifier.classify(problem_text)
    r = router.route(problem_text, "coding", pred["difficulty"], pred)
    model = TIER_TO_SWE.get(r.tier, 'claude-opus-4.7')
    return r.tier, model, r.dynamic_difficulty
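# Illustrative smoke test before the full run (the problem text below is
# hypothetical, not drawn from the dataset): shows the (tier, model, difficulty)
# triple that route_aco returns.
_tier, _model, _diff = route_aco("Fix an off-by-one error in the pagination helper.")
print(f"  Example routing: tier {_tier} -> {_model} (dynamic difficulty {_diff})")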
# Evaluate each policy
policy_results = defaultdict(lambda: {"success": 0, "cost": 0.0, "n": 0})
for t in task_analysis:
    iid = t['instance_id']
    model_results = traces[iid]
    problem = next(iter(model_results.values()))['problem']

    # Policy: always frontier (tier 4)
    frontier_model = 'claude-opus-4.7'
    if frontier_model in model_results:
        r = model_results[frontier_model]
        policy_results['always_frontier']['success'] += int(r['resolved'])
        policy_results['always_frontier']['cost'] += r['cost']
        policy_results['always_frontier']['n'] += 1

    # Policy: always cheap (tier 1)
    cheap_model = 'deepseek-v4-flash'
    if cheap_model in model_results:
        r = model_results[cheap_model]
        policy_results['always_cheap']['success'] += int(r['resolved'])
        policy_results['always_cheap']['cost'] += r['cost']
        policy_results['always_cheap']['n'] += 1

    # Policy: ACO router (fall back to frontier when the routed model has no trace)
    tier, model, diff = route_aco(problem)
    if model in model_results:
        r = model_results[model]
        policy_results['aco_v8']['success'] += int(r['resolved'])
        policy_results['aco_v8']['cost'] += r['cost']
    elif frontier_model in model_results:
        r = model_results[frontier_model]
        policy_results['aco_v8']['success'] += int(r['resolved'])
        policy_results['aco_v8']['cost'] += r['cost']
    policy_results['aco_v8']['n'] += 1

    # Policy: oracle (cheapest successful model; cheapest failed attempt if none succeeded)
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: x[1]['cost'])
        policy_results['oracle']['success'] += 1
        policy_results['oracle']['cost'] += cheapest[1]['cost']
    else:
        policy_results['oracle']['cost'] += min(r['cost'] for r in model_results.values())
    policy_results['oracle']['n'] += 1

# Print results
print(f"\n\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-" * 50)
fr = policy_results['always_frontier']
fr_cost = fr['cost'] / fr['n']
for name in ['oracle', 'aco_v8', 'always_frontier', 'always_cheap']:
    r = policy_results[name]
    sr = r['success'] / r['n']
    ac = r['cost'] / r['n']
    cr = (1 - ac / fr_cost) * 100
    print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")

# v9 with feedback: use ACO's initial tier; if that model fails (or has no trace)
# and tier < 5, escalate one tier up and pay for both attempts.
policy_v9 = {"success": 0, "cost": 0.0, "n": 0}
for t in task_analysis:
    iid = t['instance_id']
    model_results = traces[iid]
    problem = next(iter(model_results.values()))['problem']
    tier, model, diff = route_aco(problem)
    if model in model_results and model_results[model]['resolved']:
        # ACO's initial choice succeeded
        policy_v9['success'] += 1
        policy_v9['cost'] += model_results[model]['cost']
    elif tier < 5:
        # Failed: try one tier up, paying for the first attempt (if traced) either way
        up_tier = min(tier + 1, 5)
        up_model = TIER_TO_SWE.get(up_tier, 'claude-opus-4.7')
        policy_v9['cost'] += model_results.get(model, {}).get('cost', 0)
        if up_model in model_results and model_results[up_model]['resolved']:
            policy_v9['success'] += 1
            policy_v9['cost'] += model_results[up_model]['cost']
        else:
            policy_v9['cost'] += model_results.get(up_model, {}).get('cost', 0)
    else:
        policy_v9['cost'] += model_results.get(model, {}).get('cost', 0)
    policy_v9['n'] += 1

policy_results['aco_v9_feedback'] = policy_v9

# Final comparison
print("\n\nFINAL REAL-WORLD SWE-BENCH RESULTS:")
print(f"{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-" * 50)
for name in ['oracle', 'aco_v9_feedback', 'aco_v8', 'always_frontier', 'always_cheap']:
    r = policy_results[name]
    sr = r['success'] / r['n']
    ac = r['cost'] / r['n']
    cr = (1 - ac / fr_cost) * 100
    print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")
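# Illustrative sanity check: per-policy task counts can legitimately differ,
# since always_frontier / always_cheap skip tasks missing that model's trace.
for _name, _r in sorted(policy_results.items()):
    print(f"  {_name}: evaluated on {_r['n']} of {len(task_analysis)} tasks")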
# Save
save_data = {}
for name, r in policy_results.items():
    save_data[name] = {
        "success": r["success"] / r["n"],
        "avg_cost": r["cost"] / r["n"],
        "n": r["n"],
    }
with open("/app/swe_bench_results.json", "w") as f:
    json.dump(save_data, f, indent=2)
print("\nSaved to /app/swe_bench_results.json")
print("DONE!")
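# Optional round-trip check (a minimal sketch): reload the file written above to
# confirm it parses and covers every simulated policy; non-fatal on mismatch.
with open("/app/swe_bench_results.json") as f:
    _reloaded = json.load(f)
if set(_reloaded) != set(save_data):
    print("WARNING: saved policies differ from in-memory results")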