| |
"""v10 Router: Fixed regularization for 500-sample training set.

Problem: XGBoost with 23 features and 500 samples overfits (100% train acc).
Solution: Heavy regularization + fewer estimators + stratified CV.
"""
| import sys, json, random, pickle, numpy as np |
| from collections import defaultdict |
| from datasets import load_dataset |
| import warnings |
| from collections import Counter |
| warnings.filterwarnings('ignore') |
|
|
| from xgboost import XGBClassifier |
| from sklearn.calibration import IsotonicRegression |
| from sklearn.model_selection import cross_val_score |
|
|
# Banner so the run log clearly marks which router version produced it.
print("=" * 80)
print("v10 ROUTER: FIXED REGULARIZATION")
print("=" * 80)

# Candidate models and their capability tiers
# (1 = cheapest/weakest ... 5 = most expensive/strongest).
MODELS = [
    'claude-opus-4.7',
    'gpt-5-mini',
    'gpt-5-nano',
    'gpt-5.2',
    'gemini-2.5-pro',
    'gemini-3-pro',
    'deepseek-v3.2',
    'deepseek-v4-flash',
]
MODEL_TIER = {
    'deepseek-v4-flash': 1,
    'gpt-5-nano': 1,
    'gpt-5-mini': 2,
    'deepseek-v3.2': 2,
    'gemini-2.5-pro': 3,
    'claude-opus-4.7': 4,
    'gpt-5.2': 4,
    'gemini-3-pro': 5,
}
# Nominal per-task cost (USD) per tier; used as a fallback when a trace
# has no recorded cost for the routed model.
TIER_COST = {1: 0.01, 2: 0.05, 3: 0.15, 4: 0.30, 5: 0.50}
# The single representative model actually dispatched for each tier.
TIER_TO_MODEL = {
    1: 'deepseek-v4-flash',
    2: 'gpt-5-mini',
    3: 'gemini-2.5-pro',
    4: 'claude-opus-4.7',
    5: 'gemini-3-pro',
}
|
|
| |
# Keyword triggers used as cheap lexical features over the lowercased request.
CODE_KW = [
    "python", "code", "function", "bug", "debug", "refactor",
    "implement", "test", "error", "traceback", "import",
]
CRITICAL_KW = [
    "critical", "production", "urgent", "emergency", "live",
    "deployed", "safety", "security",
]
SIMPLE_KW = [
    "typo", "simple", "quick", "brief", "minor", "small", "easy",
    "trivial", "just",
]

# Canonical (sorted) feature ordering shared by training and inference.
FEAT_KEYS = sorted([
    'req_len', 'num_words', 'has_code', 'n_code', 'has_critical',
    'has_simple', 'has_error_msg', 'has_file_path', 'n_lines',
    'has_fix', 'has_add', 'has_change', 'has_test', 'has_doc',
])


def extract_features(text):
    """Turn a raw problem statement into the flat numeric feature dict.

    All keyword checks are plain substring tests on the lowercased text;
    lengths and line counts come from the original string.
    """
    low = text.lower()

    def any_of(*needles):
        # 1 if any needle occurs as a substring of the lowercased text, else 0.
        return int(any(needle in low for needle in needles))

    code_hits = sum(kw in low for kw in CODE_KW)
    return {
        'req_len': len(text),
        'num_words': len(text.split()),
        'has_code': int(code_hits > 0),
        'n_code': code_hits,
        'has_critical': any_of(*CRITICAL_KW),
        'has_simple': any_of(*SIMPLE_KW),
        'has_error_msg': any_of('error', 'traceback', 'exception'),
        'has_file_path': any_of('/'),
        'n_lines': text.count('\n') + 1,
        'has_fix': any_of('fix', 'bug', 'issue'),
        'has_add': any_of('add', 'new', 'create'),
        'has_change': any_of('change', 'modify', 'update'),
        'has_test': any_of('test', 'spec'),
        'has_doc': any_of('doc', 'readme'),
    }
|
|
print("\n[1] Loading traces...")
# traces[instance_id][model] -> {'resolved', 'cost', 'problem'}
traces = defaultdict(dict)
for model in MODELS:
    dataset = load_dataset(f'SWE-Router/swebench-verified-{model}', split='test')
    for record in dataset:
        traces[record['instance_id']][model] = {
            'resolved': record['resolved'],
            'cost': float(record['instance_cost']),
            'problem': record['problem_statement'],
        }
print(f" {len(traces)} tasks loaded")
|
|
print("\n[2] Building features...")
X = []                                       # one feature row per task
tier_labels = {t: [] for t in range(1, 6)}   # per-tier binary success labels
optimal_tiers = []                           # cheapest solving tier per task

for iid, model_results in traces.items():
    # All models share the same problem statement; grab it from any trace.
    problem = next(iter(model_results.values()))['problem']
    feats = extract_features(problem)
    X.append([float(feats.get(k, 0.0)) for k in FEAT_KEYS])

    # A tier "succeeds" on a task when any of its models resolved it.
    tier_ok = {}
    for model, result in model_results.items():
        t = MODEL_TIER[model]
        tier_ok[t] = tier_ok.get(t, False) or bool(result['resolved'])

    for t in range(1, 6):
        tier_labels[t].append(int(tier_ok.get(t, False)))

    # Cheapest successful tier; defaults to 5 when no tier succeeded.
    cheapest = next((t for t in range(1, 6) if tier_ok.get(t, False)), 5)
    optimal_tiers.append(cheapest)

X = np.array(X, dtype=np.float32)
print(f" X shape: {X.shape}")
print(f" Optimal tier dist: {Counter(optimal_tiers)}")
|
|
| |
print("\n[3] Training with heavy regularization...")
tier_clfs = {}    # tier -> fitted XGBClassifier
tier_calibs = {}  # tier -> fitted isotonic probability calibrator

for t in range(1, 6):
    y = np.array(tier_labels[t])
    n_pos = y.sum()
    # Rebalance positives vs. negatives; max(n_pos, 1) guards a no-positive tier.
    spw = max(1, (len(y) - n_pos) / max(n_pos, 1))

    # Heavy regularization for the small (~500 sample) training set:
    # shallow trees, few estimators, strong L1/L2, large min_child_weight.
    clf = XGBClassifier(
        n_estimators=50,
        max_depth=3,
        learning_rate=0.1,
        subsample=0.7,
        colsample_bytree=0.6,
        min_child_weight=10,
        gamma=1.0,
        reg_alpha=1.0,
        reg_lambda=5.0,
        scale_pos_weight=spw,
        eval_metric='logloss',
        random_state=42,
    )

    # CV F1 is advisory only; a degenerate split (e.g. a fold with a single
    # class) must not abort training. Catch Exception -- not a bare except --
    # so KeyboardInterrupt / SystemExit still propagate.
    try:
        cv_f1 = cross_val_score(clf, X, y, cv=5, scoring='f1').mean()
    except Exception:
        cv_f1 = 0.0

    clf.fit(X, y)

    # NOTE(review): accuracy is measured on the fit data itself, so it only
    # diagnoses over/underfitting, not generalization.
    train_pred = clf.predict(X)
    train_acc = np.mean(train_pred == y)

    # Isotonic calibration of P(success); out_of_bounds='clip' keeps raw
    # scores outside the fitted range from producing NaNs at inference.
    p_raw = clf.predict_proba(X)[:, 1]
    cal = IsotonicRegression(out_of_bounds='clip')
    cal.fit(p_raw, y)
    p_cal = cal.transform(p_raw)

    p_min, p_max = p_cal.min(), p_cal.max()
    p_mean = p_cal.mean()

    tier_clfs[t] = clf
    tier_calibs[t] = cal
    print(f" Tier {t}: cv_f1={cv_f1:.3f}, train_acc={train_acc:.3f}, "
          f"P(success) range=[{p_min:.3f},{p_max:.3f}], mean={p_mean:.3f}")
|
|
| from collections import Counter |
|
|
| |
print("\n[4] Evaluating with threshold sweep...")
best_thr = None
best_score = -999


def _route_tier(feat_vec, thr):
    """Return the cheapest tier whose calibrated P(success) >= thr.

    Falls back to tier 5 (strongest) when no cheaper tier clears the
    threshold. Stops probing as soon as a tier qualifies -- the original
    code kept predicting for the remaining tiers into an unused dict.
    """
    for t in range(1, 6):
        p_raw = tier_clfs[t].predict_proba(feat_vec)[0, 1]
        p_cal = float(tier_calibs[t].transform([p_raw])[0])
        if p_cal >= thr:
            return t
    return 5


# Feature vectors do not depend on the threshold; compute them once per
# task instead of once per (threshold, task) pair.
_feat_cache = {}
for iid, model_results in traces.items():
    problem = next(iter(model_results.values()))['problem']
    feats = extract_features(problem)
    _feat_cache[iid] = np.array(
        [float(feats.get(k, 0.0)) for k in FEAT_KEYS], dtype=np.float32
    ).reshape(1, -1)

for thr in [0.60, 0.65, 0.70, 0.75, 0.80, 0.85]:
    succ = 0
    cost = 0.0
    for iid, model_results in traces.items():
        selected_tier = _route_tier(_feat_cache[iid], thr)

        model = TIER_TO_MODEL.get(selected_tier, 'claude-opus-4.7')
        if model in model_results and model_results[model]['resolved']:
            succ += 1
            cost += model_results[model]['cost']
        else:
            # A failed (or missing) run still costs money; fall back to the
            # tier's nominal cost when the trace has no recorded one.
            cost += model_results.get(model, {}).get('cost', TIER_COST[selected_tier])

    sr = succ / len(traces)
    ac = cost / len(traces)
    # NOTE(review): 0.3167 looks like the always-use-best baseline per-task
    # cost -- confirm against how the baseline was measured.
    cr = (1 - ac / 0.3167) * 100
    # Selection score trades success (weight 20) against cost (weight 10);
    # it is used only to pick best_thr, not printed.
    score = sr * 20 - ac * 10
    print(f" thr={thr:.2f}: success={sr:.3f}, cost=${ac:.4f}, costRed={cr:.1f}%")
    if score > best_score:
        best_score = score
        best_thr = thr

print(f"\n Best threshold: {best_thr}")
|
|
| |
print("\n[5] v10 + feedback evaluation...")
for thr in [0.70, 0.75, 0.80]:
    succ = 0
    cost = 0.0
    escalated = 0
    for iid, model_results in traces.items():
        problem = next(iter(model_results.values()))['problem']
        feats = extract_features(problem)
        feat_vec = np.array(
            [float(feats.get(k, 0.0)) for k in FEAT_KEYS], dtype=np.float32
        ).reshape(1, -1)

        # Pick the cheapest tier whose calibrated P(success) clears thr;
        # default to tier 5 when none does.
        selected_tier = 5
        for t in range(1, 6):
            p_raw = tier_clfs[t].predict_proba(feat_vec)[0, 1]
            p_cal = float(tier_calibs[t].transform([p_raw])[0])
            if p_cal >= thr:
                selected_tier = t
                break

        model = TIER_TO_MODEL.get(selected_tier, 'claude-opus-4.7')
        # Cost of the first attempt; nominal tier cost when the trace lacks it.
        first_cost = model_results.get(model, {}).get('cost', TIER_COST[selected_tier])

        if model in model_results and model_results[model]['resolved']:
            succ += 1
            cost += model_results[model]['cost']
        elif selected_tier < 5:
            # First attempt failed: escalate one tier and pay for both runs.
            # BUGFIX: the original indexed model_results[model]['cost'] directly
            # here, which raises KeyError when the first-choice model has no
            # trace for this task (the non-escalation branch already guarded
            # this with .get()).
            up_tier = min(selected_tier + 1, 5)
            up_model = TIER_TO_MODEL.get(up_tier, 'claude-opus-4.7')
            escalated += 1
            up_cost = model_results.get(up_model, {}).get('cost', TIER_COST[up_tier])
            if up_model in model_results and model_results[up_model]['resolved']:
                succ += 1
            cost += first_cost + up_cost
        else:
            cost += first_cost

    sr = succ / len(traces)
    ac = cost / len(traces)
    # NOTE(review): 0.3167 looks like the always-use-best baseline per-task
    # cost -- confirm against how the baseline was measured.
    cr = (1 - ac / 0.3167) * 100
    print(f" v10_feedback(thr={thr:.2f}): success={sr:.3f}, cost=${ac:.4f}, costRed={cr:.1f}%, escalated={escalated}")
|
|
| |
# Persist everything inference needs: per-tier classifiers and calibrators,
# the canonical feature ordering, tier costs, and the sweep-selected threshold.
v10_fixed = {
    'tier_clfs': {str(tier): model for tier, model in tier_clfs.items()},
    'tier_calibrators': {str(tier): calib for tier, calib in tier_calibs.items()},
    'feat_keys': FEAT_KEYS,
    'tier_config': {str(tier): usd for tier, usd in TIER_COST.items()},
    'version': '10.1',
    'description': 'ACO v10.1: Regularized XGBoost on SWE-Router data',
    'best_threshold': best_thr,
}
with open('/app/router_models/router_bundle_v10_fixed.pkl', 'wb') as f:
    pickle.dump(v10_fixed, f)
print(f"\nSaved v10.1 bundle")
print("DONE!")
|
|