Upload training/train_router_real.py with huggingface_hub
Browse files- training/train_router_real.py +435 -0
training/train_router_real.py
ADDED
|
@@ -0,0 +1,435 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""Train v10 router on REAL SWE-Router execution data.

This is the big one: 500 tasks x 8 models = 4000 real outcomes.
We learn which model succeeds on which task, at what cost.
"""
import sys, json, random, pickle, math
import os  # fix: os.path.getsize is used when saving the bundle but os was never imported
from collections import defaultdict
from datasets import load_dataset
import numpy as np

print("="*80)
print("TRAINING v10 ROUTER ON REAL SWE-ROUTER DATA")
print("="*80)

# Load all SWE-Router traces
MODELS = ['claude-opus-4.7','gpt-5-mini','gpt-5-nano','gpt-5.2',
          'gemini-2.5-pro','gemini-3-pro','deepseek-v3.2','deepseek-v4-flash']

# Capability tier per model: 1 = cheapest/weakest ... 5 = most capable.
MODEL_TIER = {
    'deepseek-v4-flash': 1, 'gpt-5-nano': 1,
    'gpt-5-mini': 2, 'deepseek-v3.2': 2,
    'gemini-2.5-pro': 3,
    'claude-opus-4.7': 4, 'gpt-5.2': 4,
    'gemini-3-pro': 5,
}

# Nominal per-task cost (USD-ish) per tier, used only to RANK tiers by
# price (e.g. in the oracle policy); real observed costs come from traces.
TIER_COST = {1:0.01, 2:0.05, 3:0.15, 4:0.30, 5:0.50}
|
| 29 |
+
|
| 30 |
+
print("\n[1] Loading SWE-Router traces...")

# traces: instance_id -> {model_name -> per-run outcome record}
traces = defaultdict(dict)
for model in MODELS:
    ds = load_dataset(f'SWE-Router/swebench-verified-{model}', split='test')
    for row in ds:
        # One record per (task, model) pair: did it resolve, and at what cost.
        traces[row['instance_id']][model] = {
            'resolved': row['resolved'],
            'cost': float(row['instance_cost']),
            'api_calls': int(row['api_calls']),
            'problem': row['problem_statement'],
        }
    print(f" {model}: loaded")

print(f"\n Total tasks: {len(traces)}")
print(f" Total traces: {sum(len(v) for v in traces.values())}")
|
| 46 |
+
|
| 47 |
+
# ─── Feature Engineering ────────────────────────────────────────────────
print("\n[2] Engineering features from problem statements...")

# Keyword sets for feature extraction
CODE_KW = ["python","javascript","code","function","bug","debug","refactor","implement","test",
           "compile","runtime","segfault","thread","async","class","module","import","error","traceback"]
LEGAL_KW = ["contract","legal","compliance","gdpr","privacy","policy","regulatory","liability"]
RESEARCH_KW = ["research","investigate","compare","analyze","survey","paper"]
TOOL_KW = ["search","fetch","retrieve","query","api","database","scrape","aggregate"]
CRITICAL_KW = ["critical","production","urgent","emergency","live","deployed","safety","security"]
SIMPLE_KW = ["typo","simple","quick","brief","minor","small","easy","trivial","just"]
LONG_KW = ["plan","project","roadmap","orchestrate","migrate","pipeline","deploy","architecture"]
MATH_KW = ["calculate","compute","solve","equation","formula","optimize","probability"]

def extract_features(problem_text):
    """Extract a flat dict of numeric features from a problem statement.

    The key set is fixed; downstream code sorts the keys once and builds
    feature vectors in that order, so keys must never change per-call.
    All keyword matching is case-insensitive substring matching.
    """
    r = problem_text.lower()

    def hit(*subs):
        # 1 if any of the given substrings occurs in the lowercased text.
        return int(any(s in r for s in subs))

    return {
        'req_len': len(problem_text),
        'num_words': len(problem_text.split()),
        'has_code': hit(*CODE_KW),
        'n_code': sum(1 for k in CODE_KW if k in r),
        'has_legal': hit(*LEGAL_KW),
        'has_research': hit(*RESEARCH_KW),
        'has_tool': hit(*TOOL_KW),
        'has_critical': hit(*CRITICAL_KW),
        'has_simple': hit(*SIMPLE_KW),
        'has_long': hit(*LONG_KW),
        'has_math': hit(*MATH_KW),
        'has_error_msg': hit('error', 'traceback', 'exception'),
        # Rough heuristic: there is a slash AND a dot somewhere in the text
        # BEFORE the first slash (whole-text prefix, not a per-token path
        # check). Kept as-is to match the trained feature.
        'has_file_path': int('/' in r and '.' in r.split('/', 1)[0]),
        'n_lines': problem_text.count('\n') + 1,
        'has_version': hit('version', 'update', 'upgrade'),
        'has_add': hit('add', 'new', 'create'),
        'has_fix': hit('fix', 'bug', 'issue', 'broken'),
        'has_change': hit('change', 'modify', 'update'),
        'has_remove': hit('remove', 'delete', 'drop'),
        'has_test': hit('test', 'spec', 'assert'),
        'has_doc': hit('doc', 'readme', 'comment'),
        # SWE-specific features
        'has_see_also': hit('see also', 'related'),
        'has_steps_to_reproduce': hit('steps to reproduce', 'reproduce'),
    }
|
| 90 |
+
|
| 91 |
+
# ─── Build Training Data ────────────────────────────────────────────────
print("\n[3] Building training data...")

# For each task, we know which models succeeded.
# Ground truth: optimal_tier = cheapest tier where at least one model succeeded
# Features: extracted from problem statement

all_feat_keys = None
training_data = []
tier_labels = {1:[],2:[],3:[],4:[],5:[]}
cost_labels = []

for iid, model_results in traces.items():
    # Every model saw the same problem statement; grab it from any record.
    problem = next(iter(model_results.values()))['problem']
    feats = extract_features(problem)

    # Fix the feature ordering once, from the first sample's sorted keys.
    if all_feat_keys is None:
        all_feat_keys = sorted(feats.keys())

    feat_vec = [float(feats.get(k, 0.0)) for k in all_feat_keys]

    # tier -> True if ANY model in that tier resolved the task.
    tier_success = {}
    for model, result in model_results.items():
        tier = MODEL_TIER[model]
        tier_success[tier] = tier_success.get(tier, False) or bool(result['resolved'])

    # Optimal tier = cheapest tier that succeeded; 5 if nothing did.
    optimal_tier = next((t for t in range(1, 6) if tier_success.get(t, False)), 5)

    # Per-tier binary success labels.
    for t in range(1, 6):
        tier_labels[t].append(int(tier_success.get(t, False)))

    training_data.append({
        'features': feat_vec,
        'optimal_tier': optimal_tier,
        'tier_success': tier_success,
        'cost': min(r['cost'] for r in model_results.values()),
    })

print(f" Training samples: {len(training_data)}")
print(f" Features: {len(all_feat_keys)}")
print(f" Optimal tier distribution:")
opt_dist = defaultdict(int)
for t in training_data:
    opt_dist[t['optimal_tier']] += 1
for tier in sorted(opt_dist.keys()):
    print(f" Tier {tier}: {opt_dist[tier]} ({opt_dist[tier]/len(training_data)*100:.1f}%)")
print(f" Per-tier success rates:")
for t in range(1,6):
    s = sum(tier_labels[t])
    print(f" Tier {t}: {s}/{len(training_data)} = {s/len(training_data)*100:.1f}%")
|
| 151 |
+
|
| 152 |
+
# ─── Train XGBoost Models ────────────────────────────────────────────────
print("\n[4] Training XGBoost per-tier success predictors...")

from xgboost import XGBClassifier
from sklearn.calibration import IsotonicRegression
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

X = np.array([t['features'] for t in training_data], dtype=np.float32)
y_tier = {t: np.array(tier_labels[t]) for t in range(1,6)}
y_optimal = np.array([t['optimal_tier'] for t in training_data])

tier_clfs = {}       # tier -> fitted XGBClassifier
tier_calibs = {}     # tier -> isotonic calibrator over raw P(success)
tier_cv_scores = {}  # tier -> mean 5-fold CV F1 (0.0 when CV fails)

for t in range(1, 6):
    y = y_tier[t]
    n_pos = y.sum()
    n_neg = len(y) - n_pos

    # Scale pos weight for imbalanced data (guard against n_pos == 0).
    spw = max(1, n_neg / max(n_pos, 1))

    clf = XGBClassifier(
        n_estimators=200, max_depth=5, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        scale_pos_weight=spw,
        eval_metric='logloss', use_label_encoder=False,
        random_state=42,
    )

    # CV score. Single-class folds / degenerate tiers can make CV raise;
    # record 0.0 instead of aborting the run.
    try:
        scores = cross_val_score(clf, X, y, cv=5, scoring='f1')
        tier_cv_scores[t] = scores.mean()
    except Exception:  # fix: was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit
        tier_cv_scores[t] = 0.0

    clf.fit(X, y)

    # Calibrate raw probabilities with isotonic regression.
    # NOTE(review): the calibrator is fit on in-sample predictions, so the
    # Brier score printed below is optimistic — consider held-out calibration.
    p_raw = clf.predict_proba(X)[:, 1]
    cal = IsotonicRegression(out_of_bounds='clip')
    cal.fit(p_raw, y)

    tier_clfs[t] = clf
    tier_calibs[t] = cal

    p_cal = cal.transform(p_raw)
    brier = np.mean((p_cal - y) ** 2)

    print(f" Tier {t}: n_pos={n_pos}, CV_f1={tier_cv_scores[t]:.3f}, Brier={brier:.4f}")
|
| 206 |
+
|
| 207 |
+
# ─── Train Direct Optimal-Tier Predictor ────────────────────────────────
print("\n[5] Training direct optimal-tier predictor...")

# fix: removed unused `from xgboost import XGBRegressor` (never referenced
# anywhere in this file).

# Multiclass classifier that predicts the optimal tier (1..5) directly.
opt_clf = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='mlogloss', use_label_encoder=False,
    random_state=42, num_class=5,
)
opt_clf.fit(X, y_optimal - 1)  # labels shifted to 0-indexed for xgboost
opt_pred = opt_clf.predict(X) + 1  # shift back to 1-indexed tiers
# NOTE(review): this is in-sample (training) accuracy, not held-out.
opt_acc = np.mean(opt_pred == y_optimal)
print(f" Direct optimal-tier accuracy: {opt_acc:.3f}")
print(f" Confusion (predicted vs actual):")
from collections import Counter
for actual_tier in range(1, 6):
    mask = y_optimal == actual_tier
    if mask.sum() > 0:
        pred_dist = Counter(opt_pred[mask].tolist())
        print(f" Actual tier {actual_tier}: {dict(pred_dist)}")
|
| 229 |
+
|
| 230 |
+
# ─── Evaluate on SWE-Router data ────────────────────────────────────────
print("\n[6] Evaluating routing policies on SWE-Router...")

from aco.classifier import TaskCostClassifier
classifier = TaskCostClassifier()

def route_v10(problem_text):
    """v10: Real-data trained router.

    Returns (predicted_tier, cascade_tier, tier_probs):
      - predicted_tier: direct multiclass prediction of the optimal tier
      - cascade_tier: cheapest tier whose calibrated P(success) >= 0.5
        (falls back to tier 5 when no tier clears the threshold)
      - tier_probs: tier -> calibrated P(success)
    """
    feats = extract_features(problem_text)
    vec = np.array([float(feats.get(k, 0.0)) for k in all_feat_keys],
                   dtype=np.float32).reshape(1, -1)

    # Method 1: direct optimal-tier prediction (model is 0-indexed).
    predicted_tier = int(opt_clf.predict(vec)[0]) + 1

    # Method 2: calibrated per-tier success probabilities.
    tier_probs = {
        t: float(tier_calibs[t].transform([tier_clfs[t].predict_proba(vec)[0, 1]])[0])
        for t in range(1, 6)
    }

    # Cheapest tier clearing the 50% success threshold; tier 5 otherwise.
    cascade_tier = next((t for t in range(1, 6) if tier_probs[t] >= 0.5), 5)

    return predicted_tier, cascade_tier, tier_probs
|
| 260 |
+
|
| 261 |
+
# Evaluate
# Representative model per tier when simulating a tier choice on the traces.
TIER_TO_SWE = {
    1: 'deepseek-v4-flash', 2: 'gpt-5-mini',
    3: 'gemini-2.5-pro', 4: 'claude-opus-4.7', 5: 'gemini-3-pro',
}

policies = defaultdict(lambda: {"success":0,"cost":0.0,"n":0})

# fix: the v8 router was imported and constructed INSIDE the per-task loop,
# re-loading the pickled bundle for every task. Hoisted out — construction
# is loop-invariant (assumes .route() is stateless across calls, as each
# call previously got a fresh instance; TODO confirm).
from aco.router import ModelCascadeRouter
old_router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")

for iid, model_results in traces.items():
    problem = next(iter(model_results.values()))['problem']

    # Oracle: cheapest successful model; if none succeeded, the task counts
    # as a failure but still charges the cheapest attempt.
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: TIER_COST.get(MODEL_TIER[x[0]], 1.0))
        policies['oracle']['success'] += 1
        policies['oracle']['cost'] += cheapest[1]['cost']
    else:
        policies['oracle']['cost'] += min(r['cost'] for r in model_results.values())
    policies['oracle']['n'] += 1

    # Always frontier (tier 4)
    f_model = 'claude-opus-4.7'
    if f_model in model_results:
        policies['frontier']['success'] += int(model_results[f_model]['resolved'])
        policies['frontier']['cost'] += model_results[f_model]['cost']
        policies['frontier']['n'] += 1

    # v8 (old synthetic-trained router)
    pred = classifier.classify(problem)
    r8 = old_router.route(problem, "coding", pred["difficulty"], pred)
    m8 = TIER_TO_SWE.get(r8.tier, 'claude-opus-4.7')
    if m8 in model_results:
        policies['v8_synthetic']['success'] += int(model_results[m8]['resolved'])
        policies['v8_synthetic']['cost'] += model_results[m8]['cost']
        policies['v8_synthetic']['n'] += 1

    # v10 direct optimal-tier
    predicted_tier, cascade_tier, tier_probs = route_v10(problem)
    m10 = TIER_TO_SWE.get(predicted_tier, 'claude-opus-4.7')
    if m10 in model_results:
        policies['v10_direct']['success'] += int(model_results[m10]['resolved'])
        policies['v10_direct']['cost'] += model_results[m10]['cost']
    else:
        # Fallback to frontier; 0.3 is tier 4's nominal cost (TIER_COST[4]).
        policies['v10_direct']['success'] += int(model_results.get('claude-opus-4.7',{}).get('resolved',0))
        policies['v10_direct']['cost'] += model_results.get('claude-opus-4.7',{}).get('cost',0.3)
    policies['v10_direct']['n'] += 1

    # v10 cascade (per-tier P(success) > 0.5)
    m10c = TIER_TO_SWE.get(cascade_tier, 'claude-opus-4.7')
    if m10c in model_results:
        policies['v10_cascade']['success'] += int(model_results[m10c]['resolved'])
        policies['v10_cascade']['cost'] += model_results[m10c]['cost']
    else:
        policies['v10_cascade']['success'] += int(model_results.get('claude-opus-4.7',{}).get('resolved',0))
        policies['v10_cascade']['cost'] += model_results.get('claude-opus-4.7',{}).get('cost',0.3)
    policies['v10_cascade']['n'] += 1

    # Always cheap (tier 1)
    c_model = 'deepseek-v4-flash'
    if c_model in model_results:
        policies['always_cheap']['success'] += int(model_results[c_model]['resolved'])
        policies['always_cheap']['cost'] += model_results[c_model]['cost']
        policies['always_cheap']['n'] += 1

# Print results
print(f"\n\n{'='*80}")
print("REAL SWE-BENCH RESULTS WITH v10 REAL-DATA ROUTER")
print(f"{'='*80}")

# Cost reduction is measured relative to the always-frontier baseline.
fr_cost = policies['frontier']['cost'] / policies['frontier']['n']
print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-"*50)
for name in ['oracle','v10_direct','v10_cascade','v8_synthetic','frontier','always_cheap']:
    r = policies[name]
    sr = r['success']/r['n']
    ac = r['cost']/r['n']
    cr = (1 - ac/fr_cost)*100
    print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")
|
| 343 |
+
|
| 344 |
+
# Also try hybrid v10 + feedback
# v10 routes to cheap model; if it fails, escalate
print("\n\n[7] v10 + feedback cascade...")
policies_hybrid = defaultdict(lambda: {"success":0,"cost":0.0,"n":0})

for iid, model_results in traces.items():
    problem = next(iter(model_results.values()))['problem']
    predicted_tier, cascade_tier, tier_probs = route_v10(problem)

    # Start with cascade_tier (more conservative than direct)
    m_cascade = TIER_TO_SWE.get(cascade_tier, 'claude-opus-4.7')
    # fix: the escalation branches below indexed model_results[m_cascade]
    # directly, which would KeyError when the initial model's trace is
    # missing (the final else already defended with .get). Compute the
    # initial-attempt cost once, with the same 0.3 fallback convention.
    base_cost = model_results.get(m_cascade, {}).get('cost', 0.3)

    if m_cascade in model_results and model_results[m_cascade]['resolved']:
        # Initial model succeeded
        policies_hybrid['v10_feedback']['success'] += 1
        policies_hybrid['v10_feedback']['cost'] += model_results[m_cascade]['cost']
    elif cascade_tier < 5:
        # Failed: escalate one tier up
        up_tier = min(cascade_tier + 1, 5)
        up_model = TIER_TO_SWE.get(up_tier, 'claude-opus-4.7')
        if up_model in model_results and model_results[up_model]['resolved']:
            policies_hybrid['v10_feedback']['success'] += 1
            policies_hybrid['v10_feedback']['cost'] += base_cost
            policies_hybrid['v10_feedback']['cost'] += model_results[up_model]['cost']
        else:
            # Try tier 4 (frontier) as last resort.
            # NOTE(review): the failed intermediate escalation's cost is not
            # charged — this mirrors the original accounting; confirm intent.
            f_model = 'claude-opus-4.7'
            if f_model in model_results and model_results[f_model]['resolved']:
                policies_hybrid['v10_feedback']['success'] += 1
                policies_hybrid['v10_feedback']['cost'] += base_cost
                policies_hybrid['v10_feedback']['cost'] += model_results[f_model]['cost']
            else:
                policies_hybrid['v10_feedback']['cost'] += base_cost
    else:
        policies_hybrid['v10_feedback']['cost'] += base_cost
    policies_hybrid['v10_feedback']['n'] += 1

    # Also track oracle (same definition as section [6])
    resolved = [(m, r) for m, r in model_results.items() if r['resolved']]
    if resolved:
        cheapest = min(resolved, key=lambda x: TIER_COST.get(MODEL_TIER[x[0]], 1.0))
        policies_hybrid['oracle']['success'] += 1
        policies_hybrid['oracle']['cost'] += cheapest[1]['cost']
    else:
        # fix: was missing here — section [6]'s oracle charges the cheapest
        # attempt when nothing resolves; made consistent.
        policies_hybrid['oracle']['cost'] += min(r['cost'] for r in model_results.values())
    policies_hybrid['oracle']['n'] += 1

    # Frontier
    # fix: guarded like section [6]; the unguarded index could KeyError
    # when the frontier trace is missing for a task.
    f_model = 'claude-opus-4.7'
    if f_model in model_results:
        policies_hybrid['frontier']['success'] += int(model_results[f_model]['resolved'])
        policies_hybrid['frontier']['cost'] += model_results[f_model]['cost']
        policies_hybrid['frontier']['n'] += 1

fr_cost_h = policies_hybrid['frontier']['cost'] / policies_hybrid['frontier']['n']
print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-"*50)
for name in ['oracle','v10_feedback','frontier']:
    r = policies_hybrid[name]
    sr = r['success']/r['n']
    ac = r['cost']/r['n']
    cr = (1-ac/fr_cost_h)*100
    print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")
|
| 404 |
+
|
| 405 |
+
# Save v10 bundle
import os  # fix: os.path.getsize below raised NameError — os was never imported

# Everything needed to route at inference time: per-tier classifiers,
# their calibrators, the direct optimal-tier model, and the feature order.
v10_bundle = {
    'tier_clfs': {str(k):v for k,v in tier_clfs.items()},
    'tier_calibrators': {str(k):v for k,v in tier_calibs.items()},
    'opt_clf': opt_clf,
    'feat_keys': all_feat_keys,
    'tier_config': {str(k):v for k,v in TIER_COST.items()},
    'version': '10.0',
    'description': 'ACO v10: Trained on REAL SWE-Router execution data (500 tasks x 8 models)',
    'training_data': 'SWE-Router/swebench-verified-*',
    'n_training': len(training_data),
    'n_features': len(all_feat_keys),
}
bundle_path = '/app/router_models/router_bundle_v10.pkl'
with open(bundle_path, 'wb') as f:
    pickle.dump(v10_bundle, f)
print(f"\nSaved router_bundle_v10.pkl ({os.path.getsize(bundle_path)/1024:.0f} KB)")

# Save results
all_results = {}
for name, r in policies.items():
    all_results[name] = {"success":r['success']/r['n'],"avg_cost":r['cost']/r['n']}
for name, r in policies_hybrid.items():
    all_results[f"hybrid_{name}"] = {"success":r['success']/r['n'],"avg_cost":r['cost']/r['n']}
all_results['v10_cv_scores'] = tier_cv_scores
all_results['v10_opt_acc'] = opt_acc
all_results['feat_keys'] = all_feat_keys
with open('/app/swe_v10_results.json', 'w') as f:
    # default=str: numpy scalars (e.g. opt_acc) aren't JSON-serializable;
    # note this stringifies them rather than storing numbers.
    json.dump(all_results, f, indent=2, default=str)

print(f"\nSaved swe_v10_results.json")
print("DONE!")
|