| |
| """Train v10 router on REAL SWE-Router execution data. |
| |
| This is the big one: 500 tasks x 8 models = 4000 real outcomes. |
| We learn which model succeeds on which task, at what cost. |
| """ |
| import sys, json, random, pickle, math |
| from collections import defaultdict |
| from datasets import load_dataset |
| import numpy as np |
|
|
| print("="*80) |
| print("TRAINING v10 ROUTER ON REAL SWE-ROUTER DATA") |
| print("="*80) |
|
|
| |
| MODELS = ['claude-opus-4.7','gpt-5-mini','gpt-5-nano','gpt-5.2', |
| 'gemini-2.5-pro','gemini-3-pro','deepseek-v3.2','deepseek-v4-flash'] |
|
|
| MODEL_TIER = { |
| 'deepseek-v4-flash': 1, 'gpt-5-nano': 1, |
| 'gpt-5-mini': 2, 'deepseek-v3.2': 2, |
| 'gemini-2.5-pro': 3, |
| 'claude-opus-4.7': 4, 'gpt-5.2': 4, |
| 'gemini-3-pro': 5, |
| } |
|
|
| TIER_COST = {1:0.01, 2:0.05, 3:0.15, 4:0.30, 5:0.50} |
|
|
| print("\n[1] Loading SWE-Router traces...") |
| traces = defaultdict(dict) |
| for model in MODELS: |
| ds = load_dataset(f'SWE-Router/swebench-verified-{model}', split='test') |
| for row in ds: |
| iid = row['instance_id'] |
| traces[iid][model] = { |
| 'resolved': row['resolved'], |
| 'cost': float(row['instance_cost']), |
| 'api_calls': int(row['api_calls']), |
| 'problem': row['problem_statement'], |
| } |
| print(f" {model}: loaded") |
|
|
| print(f"\n Total tasks: {len(traces)}") |
| print(f" Total traces: {sum(len(v) for v in traces.values())}") |
|
|
| |
| print("\n[2] Engineering features from problem statements...") |
|
|
| |
| CODE_KW = ["python","javascript","code","function","bug","debug","refactor","implement","test", |
| "compile","runtime","segfault","thread","async","class","module","import","error","traceback"] |
| LEGAL_KW = ["contract","legal","compliance","gdpr","privacy","policy","regulatory","liability"] |
| RESEARCH_KW = ["research","investigate","compare","analyze","survey","paper"] |
| TOOL_KW = ["search","fetch","retrieve","query","api","database","scrape","aggregate"] |
| CRITICAL_KW = ["critical","production","urgent","emergency","live","deployed","safety","security"] |
| SIMPLE_KW = ["typo","simple","quick","brief","minor","small","easy","trivial","just"] |
| LONG_KW = ["plan","project","roadmap","orchestrate","migrate","pipeline","deploy","architecture"] |
| MATH_KW = ["calculate","compute","solve","equation","formula","optimize","probability"] |
|
|
| def extract_features(problem_text): |
| r = problem_text.lower() |
| feats = { |
| 'req_len': len(problem_text), |
| 'num_words': len(problem_text.split()), |
| 'has_code': int(any(k in r for k in CODE_KW)), |
| 'n_code': sum(1 for k in CODE_KW if k in r), |
| 'has_legal': int(any(k in r for k in LEGAL_KW)), |
| 'has_research': int(any(k in r for k in RESEARCH_KW)), |
| 'has_tool': int(any(k in r for k in TOOL_KW)), |
| 'has_critical': int(any(k in r for k in CRITICAL_KW)), |
| 'has_simple': int(any(k in r for k in SIMPLE_KW)), |
| 'has_long': int(any(k in r for k in LONG_KW)), |
| 'has_math': int(any(k in r for k in MATH_KW)), |
| 'has_error_msg': int('error' in r or 'traceback' in r or 'exception' in r), |
| 'has_file_path': int('/' in r and ('.' in r.split('/')[0] if '/' in r else False)), |
| 'n_lines': problem_text.count('\n') + 1, |
| 'has_version': int('version' in r or 'update' in r or 'upgrade' in r), |
| 'has_add': int('add' in r or 'new' in r or 'create' in r), |
| 'has_fix': int('fix' in r or 'bug' in r or 'issue' in r or 'broken' in r), |
| 'has_change': int('change' in r or 'modify' in r or 'update' in r), |
| 'has_remove': int('remove' in r or 'delete' in r or 'drop' in r), |
| 'has_test': int('test' in r or 'spec' in r or 'assert' in r), |
| 'has_doc': int('doc' in r or 'readme' in r or 'comment' in r), |
| |
| 'has_see_also': int('see also' in r or 'related' in r), |
| 'has_steps_to_reproduce': int('steps to reproduce' in r or 'reproduce' in r), |
| } |
| return feats |
|
|
| |
| print("\n[3] Building training data...") |
|
|
| |
| |
| |
|
|
| all_feat_keys = None |
| training_data = [] |
| tier_labels = {1:[],2:[],3:[],4:[],5:[]} |
| cost_labels = [] |
|
|
| for iid, model_results in traces.items(): |
| problem = next(iter(model_results.values()))['problem'] |
| feats = extract_features(problem) |
| |
| if all_feat_keys is None: |
| all_feat_keys = sorted(feats.keys()) |
| |
| feat_vec = [float(feats.get(k, 0.0)) for k in all_feat_keys] |
| |
| |
| tier_success = {} |
| for model, result in model_results.items(): |
| tier = MODEL_TIER[model] |
| if tier not in tier_success: |
| tier_success[tier] = False |
| if result['resolved']: |
| tier_success[tier] = True |
| |
| |
| optimal_tier = 5 |
| for t in range(1, 6): |
| if tier_success.get(t, False): |
| optimal_tier = t |
| break |
| |
| |
| for t in range(1, 6): |
| tier_labels[t].append(int(tier_success.get(t, False))) |
| |
| training_data.append({ |
| 'features': feat_vec, |
| 'optimal_tier': optimal_tier, |
| 'tier_success': tier_success, |
| 'cost': min(r['cost'] for r in model_results.values()), |
| }) |
|
|
| print(f" Training samples: {len(training_data)}") |
| print(f" Features: {len(all_feat_keys)}") |
| print(f" Optimal tier distribution:") |
| opt_dist = defaultdict(int) |
| for t in training_data: |
| opt_dist[t['optimal_tier']] += 1 |
| for tier in sorted(opt_dist.keys()): |
| print(f" Tier {tier}: {opt_dist[tier]} ({opt_dist[tier]/len(training_data)*100:.1f}%)") |
| print(f" Per-tier success rates:") |
| for t in range(1,6): |
| s = sum(tier_labels[t]) |
| print(f" Tier {t}: {s}/{len(training_data)} = {s/len(training_data)*100:.1f}%") |
|
|
| |
| print("\n[4] Training XGBoost per-tier success predictors...") |
|
|
| from xgboost import XGBClassifier |
| from sklearn.calibration import IsotonicRegression |
| from sklearn.model_selection import cross_val_score |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
| X = np.array([t['features'] for t in training_data], dtype=np.float32) |
| y_tier = {t: np.array(tier_labels[t]) for t in range(1,6)} |
| y_optimal = np.array([t['optimal_tier'] for t in training_data]) |
|
|
| tier_clfs = {} |
| tier_calibs = {} |
| tier_cv_scores = {} |
|
|
| for t in range(1, 6): |
| y = y_tier[t] |
| n_pos = y.sum() |
| n_neg = len(y) - n_pos |
| |
| |
| spw = max(1, n_neg / max(n_pos, 1)) |
| |
| clf = XGBClassifier( |
| n_estimators=200, max_depth=5, learning_rate=0.05, |
| subsample=0.8, colsample_bytree=0.8, |
| scale_pos_weight=spw, |
| eval_metric='logloss', use_label_encoder=False, |
| random_state=42, |
| ) |
| |
| |
| try: |
| scores = cross_val_score(clf, X, y, cv=5, scoring='f1') |
| tier_cv_scores[t] = scores.mean() |
| except: |
| tier_cv_scores[t] = 0.0 |
| |
| clf.fit(X, y) |
| |
| |
| p_raw = clf.predict_proba(X)[:, 1] |
| cal = IsotonicRegression(out_of_bounds='clip') |
| cal.fit(p_raw, y) |
| |
| tier_clfs[t] = clf |
| tier_calibs[t] = cal |
| |
| p_cal = cal.transform(p_raw) |
| brier = np.mean((p_cal - y) ** 2) |
| |
| print(f" Tier {t}: n_pos={n_pos}, CV_f1={tier_cv_scores[t]:.3f}, Brier={brier:.4f}") |
|
|
| |
| print("\n[5] Training direct optimal-tier predictor...") |
|
|
| from xgboost import XGBRegressor |
|
|
| opt_clf = XGBClassifier( |
| n_estimators=300, max_depth=6, learning_rate=0.05, |
| subsample=0.8, colsample_bytree=0.8, |
| eval_metric='mlogloss', use_label_encoder=False, |
| random_state=42, num_class=5, |
| ) |
| opt_clf.fit(X, y_optimal - 1) |
| opt_pred = opt_clf.predict(X) + 1 |
| opt_acc = np.mean(opt_pred == y_optimal) |
| print(f" Direct optimal-tier accuracy: {opt_acc:.3f}") |
| print(f" Confusion (predicted vs actual):") |
| from collections import Counter |
| for actual_tier in range(1, 6): |
| mask = y_optimal == actual_tier |
| if mask.sum() > 0: |
| pred_dist = Counter(opt_pred[mask].tolist()) |
| print(f" Actual tier {actual_tier}: {dict(pred_dist)}") |
|
|
| |
| print("\n[6] Evaluating routing policies on SWE-Router...") |
|
|
| from aco.classifier import TaskCostClassifier |
| classifier = TaskCostClassifier() |
|
|
| def route_v10(problem_text): |
| """v10: Real-data trained router.""" |
| feats = extract_features(problem_text) |
| feat_vec = np.array([float(feats.get(k, 0.0)) for k in all_feat_keys], dtype=np.float32).reshape(1,-1) |
| |
| |
| predicted_tier = int(opt_clf.predict(feat_vec)[0]) + 1 |
| |
| |
| tier_probs = {} |
| for t in range(1, 6): |
| p_raw = tier_clfs[t].predict_proba(feat_vec)[0, 1] |
| p_cal = float(tier_calibs[t].transform([p_raw])[0]) |
| tier_probs[t] = p_cal |
| |
| |
| for t in range(1, 6): |
| if tier_probs[t] >= 0.5: |
| cascade_tier = t |
| break |
| else: |
| cascade_tier = 5 |
| |
| return predicted_tier, cascade_tier, tier_probs |
|
|
| |
| TIER_TO_SWE = { |
| 1: 'deepseek-v4-flash', 2: 'gpt-5-mini', |
| 3: 'gemini-2.5-pro', 4: 'claude-opus-4.7', 5: 'gemini-3-pro', |
| } |
|
|
| policies = defaultdict(lambda: {"success":0,"cost":0.0,"n":0}) |
|
|
| for iid, model_results in traces.items(): |
| problem = next(iter(model_results.values()))['problem'] |
| |
| |
| resolved = [(m, r) for m, r in model_results.items() if r['resolved']] |
| if resolved: |
| cheapest = min(resolved, key=lambda x: TIER_COST.get(MODEL_TIER[x[0]], 1.0)) |
| policies['oracle']['success'] += 1 |
| policies['oracle']['cost'] += cheapest[1]['cost'] |
| else: |
| policies['oracle']['cost'] += min(r['cost'] for r in model_results.values()) |
| policies['oracle']['n'] += 1 |
| |
| |
| f_model = 'claude-opus-4.7' |
| if f_model in model_results: |
| policies['frontier']['success'] += int(model_results[f_model]['resolved']) |
| policies['frontier']['cost'] += model_results[f_model]['cost'] |
| policies['frontier']['n'] += 1 |
| |
| |
| pred = classifier.classify(problem) |
| from aco.router import ModelCascadeRouter |
| old_router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl") |
| r8 = old_router.route(problem, "coding", pred["difficulty"], pred) |
| m8 = TIER_TO_SWE.get(r8.tier, 'claude-opus-4.7') |
| if m8 in model_results: |
| policies['v8_synthetic']['success'] += int(model_results[m8]['resolved']) |
| policies['v8_synthetic']['cost'] += model_results[m8]['cost'] |
| policies['v8_synthetic']['n'] += 1 |
| |
| |
| predicted_tier, cascade_tier, tier_probs = route_v10(problem) |
| m10 = TIER_TO_SWE.get(predicted_tier, 'claude-opus-4.7') |
| if m10 in model_results: |
| policies['v10_direct']['success'] += int(model_results[m10]['resolved']) |
| policies['v10_direct']['cost'] += model_results[m10]['cost'] |
| else: |
| |
| policies['v10_direct']['success'] += int(model_results.get('claude-opus-4.7',{}).get('resolved',0)) |
| policies['v10_direct']['cost'] += model_results.get('claude-opus-4.7',{}).get('cost',0.3) |
| policies['v10_direct']['n'] += 1 |
| |
| |
| m10c = TIER_TO_SWE.get(cascade_tier, 'claude-opus-4.7') |
| if m10c in model_results: |
| policies['v10_cascade']['success'] += int(model_results[m10c]['resolved']) |
| policies['v10_cascade']['cost'] += model_results[m10c]['cost'] |
| else: |
| policies['v10_cascade']['success'] += int(model_results.get('claude-opus-4.7',{}).get('resolved',0)) |
| policies['v10_cascade']['cost'] += model_results.get('claude-opus-4.7',{}).get('cost',0.3) |
| policies['v10_cascade']['n'] += 1 |
| |
| |
| c_model = 'deepseek-v4-flash' |
| if c_model in model_results: |
| policies['always_cheap']['success'] += int(model_results[c_model]['resolved']) |
| policies['always_cheap']['cost'] += model_results[c_model]['cost'] |
| policies['always_cheap']['n'] += 1 |
|
|
| |
| print(f"\n\n{'='*80}") |
| print("REAL SWE-BENCH RESULTS WITH v10 REAL-DATA ROUTER") |
| print(f"{'='*80}") |
|
|
| fr_cost = policies['frontier']['cost'] / policies['frontier']['n'] |
| print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}") |
| print("-"*50) |
| for name in ['oracle','v10_direct','v10_cascade','v8_synthetic','frontier','always_cheap']: |
| r = policies[name] |
| sr = r['success']/r['n'] |
| ac = r['cost']/r['n'] |
| cr = (1 - ac/fr_cost)*100 |
| print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%") |
|
|
| |
| |
| print("\n\n[7] v10 + feedback cascade...") |
| policies_hybrid = defaultdict(lambda: {"success":0,"cost":0.0,"n":0}) |
|
|
| for iid, model_results in traces.items(): |
| problem = next(iter(model_results.values()))['problem'] |
| predicted_tier, cascade_tier, tier_probs = route_v10(problem) |
| |
| |
| m_cascade = TIER_TO_SWE.get(cascade_tier, 'claude-opus-4.7') |
| |
| if m_cascade in model_results and model_results[m_cascade]['resolved']: |
| |
| policies_hybrid['v10_feedback']['success'] += 1 |
| policies_hybrid['v10_feedback']['cost'] += model_results[m_cascade]['cost'] |
| elif cascade_tier < 5: |
| |
| up_tier = min(cascade_tier + 1, 5) |
| up_model = TIER_TO_SWE.get(up_tier, 'claude-opus-4.7') |
| if up_model in model_results and model_results[up_model]['resolved']: |
| policies_hybrid['v10_feedback']['success'] += 1 |
| policies_hybrid['v10_feedback']['cost'] += model_results[m_cascade]['cost'] |
| policies_hybrid['v10_feedback']['cost'] += model_results[up_model]['cost'] |
| else: |
| |
| f_model = 'claude-opus-4.7' |
| if f_model in model_results and model_results[f_model]['resolved']: |
| policies_hybrid['v10_feedback']['success'] += 1 |
| policies_hybrid['v10_feedback']['cost'] += model_results[m_cascade]['cost'] |
| policies_hybrid['v10_feedback']['cost'] += model_results[f_model]['cost'] |
| else: |
| policies_hybrid['v10_feedback']['cost'] += model_results[m_cascade]['cost'] |
| else: |
| policies_hybrid['v10_feedback']['cost'] += model_results.get(m_cascade, {}).get('cost', 0.3) |
| policies_hybrid['v10_feedback']['n'] += 1 |
| |
| |
| resolved = [(m, r) for m, r in model_results.items() if r['resolved']] |
| if resolved: |
| cheapest = min(resolved, key=lambda x: TIER_COST.get(MODEL_TIER[x[0]], 1.0)) |
| policies_hybrid['oracle']['success'] += 1 |
| policies_hybrid['oracle']['cost'] += cheapest[1]['cost'] |
| policies_hybrid['oracle']['n'] += 1 |
|
|
| |
| f_model = 'claude-opus-4.7' |
| policies_hybrid['frontier']['success'] += int(model_results[f_model]['resolved']) |
| policies_hybrid['frontier']['cost'] += model_results[f_model]['cost'] |
| policies_hybrid['frontier']['n'] += 1 |
|
|
| fr_cost_h = policies_hybrid['frontier']['cost'] / policies_hybrid['frontier']['n'] |
| print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}") |
| print("-"*50) |
| for name in ['oracle','v10_feedback','frontier']: |
| r = policies_hybrid[name] |
| sr = r['success']/r['n'] |
| ac = r['cost']/r['n'] |
| cr = (1-ac/fr_cost_h)*100 |
| print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%") |
|
|
| |
| v10_bundle = { |
| 'tier_clfs': {str(k):v for k,v in tier_clfs.items()}, |
| 'tier_calibrators': {str(k):v for k,v in tier_calibs.items()}, |
| 'opt_clf': opt_clf, |
| 'feat_keys': all_feat_keys, |
| 'tier_config': {str(k):v for k,v in TIER_COST.items()}, |
| 'version': '10.0', |
| 'description': 'ACO v10: Trained on REAL SWE-Router execution data (500 tasks x 8 models)', |
| 'training_data': 'SWE-Router/swebench-verified-*', |
| 'n_training': len(training_data), |
| 'n_features': len(all_feat_keys), |
| } |
| with open('/app/router_models/router_bundle_v10.pkl', 'wb') as f: |
| pickle.dump(v10_bundle, f) |
| print(f"\nSaved router_bundle_v10.pkl ({os.path.getsize('/app/router_models/router_bundle_v10.pkl')/1024:.0f} KB)") |
|
|
| |
| all_results = {} |
| for name, r in policies.items(): |
| all_results[name] = {"success":r['success']/r['n'],"avg_cost":r['cost']/r['n']} |
| for name, r in policies_hybrid.items(): |
| all_results[f"hybrid_{name}"] = {"success":r['success']/r['n'],"avg_cost":r['cost']/r['n']} |
| all_results['v10_cv_scores'] = tier_cv_scores |
| all_results['v10_opt_acc'] = opt_acc |
| all_results['feat_keys'] = all_feat_keys |
| with open('/app/swe_v10_results.json', 'w') as f: |
| json.dump(all_results, f, indent=2, default=str) |
|
|
| print(f"\nSaved swe_v10_results.json") |
| print("DONE!") |
|
|