narcolepticchicken
/

agent-cost-optimizer

Safetensors

Model card Files Files and versions

xet

Community

narcolepticchicken committed about 18 hours ago

Commit

8d7e0ef

verified ·

1 Parent(s): 946830d

Upload training/train_v10_fixed.py with huggingface_hub

Browse files

Files changed (1) hide show

training/train_v10_fixed.py +250 -0

training/train_v10_fixed.py ADDED Viewed

	@@ -0,0 +1,250 @@

+#!/usr/bin/env python3
+"""v10 Router: Fixed regularization for 500-sample training set.
+
+Problem: XGBoost with 23 features and 500 samples overfits (100% train acc).
+Solution: Heavy regularization + fewer estimators + stratified CV.
+"""
+import sys, json, random, pickle, numpy as np
+from collections import defaultdict
+from datasets import load_dataset
+import warnings
+from collections import Counter
+warnings.filterwarnings('ignore')
+from xgboost import XGBClassifier
+from sklearn.calibration import IsotonicRegression
+from sklearn.model_selection import cross_val_score
# Run banner, so this stage is easy to spot in captured logs.
print("=" * 80, "v10 ROUTER: FIXED REGULARIZATION", "=" * 80, sep="\n")

# Models whose SWE-Router traces are loaded below.
MODELS = [
    'claude-opus-4.7',
    'gpt-5-mini',
    'gpt-5-nano',
    'gpt-5.2',
    'gemini-2.5-pro',
    'gemini-3-pro',
    'deepseek-v3.2',
    'deepseek-v4-flash',
]

# Routing tier of every model; TIER_COST below rises with the tier number.
MODEL_TIER = {
    'deepseek-v4-flash': 1,
    'gpt-5-nano': 1,
    'gpt-5-mini': 2,
    'deepseek-v3.2': 2,
    'gemini-2.5-pro': 3,
    'claude-opus-4.7': 4,
    'gpt-5.2': 4,
    'gemini-3-pro': 5,
}

# Nominal per-task cost of a tier, used when a trace carries no recorded cost.
TIER_COST = {
    1: 0.01,
    2: 0.05,
    3: 0.15,
    4: 0.30,
    5: 0.50,
}

# The single model the router actually calls for each tier.
TIER_TO_MODEL = {
    1: 'deepseek-v4-flash',
    2: 'gpt-5-mini',
    3: 'gemini-2.5-pro',
    4: 'claude-opus-4.7',
    5: 'gemini-3-pro',
}
# Feature extraction (same as before)
# Keyword lists; every entry is matched as a substring of the lower-cased text.
CODE_KW = ["python", "code", "function", "bug", "debug", "refactor",
           "implement", "test", "error", "traceback", "import"]
CRITICAL_KW = ["critical", "production", "urgent", "emergency", "live",
               "deployed", "safety", "security"]
SIMPLE_KW = ["typo", "simple", "quick", "brief", "minor", "small", "easy",
             "trivial", "just"]
# Feature names in sorted order; feature vectors are built by walking this list.
FEAT_KEYS = sorted([
    'req_len', 'num_words', 'has_code', 'n_code', 'has_critical', 'has_simple',
    'has_error_msg', 'has_file_path', 'n_lines', 'has_fix', 'has_add',
    'has_change', 'has_test', 'has_doc',
])


def extract_features(text):
    """Compute the length and keyword features of one request text.

    Keywords are matched as plain substrings of the lower-cased text (so
    ``'test'`` also fires on ``'latest'``). That matching is kept unchanged
    so the features stay identical to what the existing code produced.

    Args:
        text: Request / problem statement. ``None`` is treated as an empty
            string instead of raising ``AttributeError``.

    Returns:
        A dict with exactly the keys listed in ``FEAT_KEYS``; every value is
        an int (lengths and counts, or 0/1 flags).
    """
    text = text or ""
    lowered = text.lower()

    def flag(*needles):
        # 1 if any needle occurs in the lower-cased text, else 0.
        return int(any(needle in lowered for needle in needles))

    # One pass over CODE_KW yields both the count and the presence flag.
    n_code = sum(1 for kw in CODE_KW if kw in lowered)
    return {
        'req_len': len(text),
        'num_words': len(text.split()),
        'has_code': int(n_code > 0),
        'n_code': n_code,
        'has_critical': flag(*CRITICAL_KW),
        'has_simple': flag(*SIMPLE_KW),
        'has_error_msg': flag('error', 'traceback', 'exception'),
        'has_file_path': flag('/'),
        # An empty text still counts as one line.
        'n_lines': text.count('\n') + 1,
        'has_fix': flag('fix', 'bug', 'issue'),
        'has_add': flag('add', 'new', 'create'),
        'has_change': flag('change', 'modify', 'update'),
        'has_test': flag('test', 'spec'),
        'has_doc': flag('doc', 'readme'),
    }
print("\n[1] Loading traces...")
# traces[instance_id][model name] -> that model's outcome on the task.
traces = defaultdict(dict)
for model_name in MODELS:
    # One dataset per model; only its 'test' split is read.
    split = load_dataset(f'SWE-Router/swebench-verified-{model_name}', split='test')
    for record in split:
        outcome = dict(
            resolved=record['resolved'],
            cost=float(record['instance_cost']),
            problem=record['problem_statement'],
        )
        traces[record['instance_id']][model_name] = outcome
print(f"  {len(traces)} tasks loaded")
print("\n[2] Building features...")
feature_rows = []
# tier_labels[t][i] is 1 when at least one tier-t model resolved task i.
tier_labels = {t: [] for t in range(1, 6)}
# Cheapest tier that resolved each task (5 when no tier did).
optimal_tiers = []
for model_results in traces.values():
    # The problem text is read from the first model's trace for the task.
    first_trace = next(iter(model_results.values()))
    feats = extract_features(first_trace['problem'])
    feature_rows.append([float(feats.get(k, 0.0)) for k in FEAT_KEYS])

    # Collect the tiers in which some model resolved this task.
    solved_tiers = set()
    for name, outcome in model_results.items():
        tier = MODEL_TIER[name]
        if outcome['resolved']:
            solved_tiers.add(tier)

    for t in range(1, 6):
        tier_labels[t].append(int(t in solved_tiers))
    optimal_tiers.append(
        next((t for t in range(1, 6) if t in solved_tiers), 5)
    )
X = np.array(feature_rows, dtype=np.float32)
print(f"  X shape: {X.shape}")
print(f"  Optimal tier dist: {Counter(optimal_tiers)}")
# Train with HEAVY regularization
print("\n[3] Training with heavy regularization...")
# One binary "does this tier resolve the task?" classifier per tier, plus an
# isotonic calibrator mapping its raw score to a calibrated probability.
tier_clfs = {}
tier_calibs = {}
for t in range(1,6):
    y = np.array(tier_labels[t])
    n_pos = y.sum()
    # Weight positives by the negative:positive ratio, never below 1.
    spw = max(1, (len(y)-n_pos)/max(n_pos,1))
    # Heavy regularization to prevent overfitting on 500 samples
    clf = XGBClassifier(
        n_estimators=50,        # Reduced from 200
        max_depth=3,            # Reduced from 5
        learning_rate=0.1,
        subsample=0.7,
        colsample_bytree=0.6,
        min_child_weight=10,    # Prevent memorization
        gamma=1.0,              # Require significant splits
        reg_alpha=1.0,          # L1 regularization
        reg_lambda=5.0,         # L2 regularization
        scale_pos_weight=spw,
        eval_metric='logloss',
        random_state=42,
    )
    # Cross-validate. This is a best-effort diagnostic: a failure (e.g. a
    # fold containing a single class) is reported and scored 0.0 rather than
    # aborting training. Catching Exception (not a bare except) keeps
    # KeyboardInterrupt / SystemExit working.
    try:
        scores = cross_val_score(clf, X, y, cv=5, scoring='f1')
        cv_f1 = scores.mean()
    except Exception as exc:
        print(f"  Tier {t}: cross-validation failed "
              f"({type(exc).__name__}: {exc}); reporting cv_f1=0.0")
        cv_f1 = 0.0
    clf.fit(X, y)
    # Check train accuracy
    train_pred = clf.predict(X)
    train_acc = np.mean(train_pred == y)
    # Calibrate
    # NOTE(review): the calibrator is fitted on the classifier's predictions
    # for its own training rows, so the calibrated probabilities are likely
    # optimistic; consider out-of-fold predictions or a held-out split.
    p_raw = clf.predict_proba(X)[:,1]
    cal = IsotonicRegression(out_of_bounds='clip')
    cal.fit(p_raw, y)
    p_cal = cal.transform(p_raw)
    # Check calibration range
    p_min, p_max = p_cal.min(), p_cal.max()
    p_mean = p_cal.mean()
    tier_clfs[t] = clf
    tier_calibs[t] = cal
    print(f"  Tier {t}: cv_f1={cv_f1:.3f}, train_acc={train_acc:.3f}, "
          f"P(success) range=[{p_min:.3f},{p_max:.3f}], mean={p_mean:.3f}")
+from collections import Counter
# Evaluate with different thresholds
print("\n[4] Evaluating with threshold sweep...")
# NOTE(review): the sweep scores the router on the same tasks the tier
# classifiers were fitted on, so these figures are in-sample.

# The calibrated P(success) of a task at a tier does not depend on the
# threshold, so score every task once (batched per tier) before sweeping
# instead of once per (threshold, task, tier).
sweep_tasks = list(traces.values())
sweep_rows = []
for model_results in sweep_tasks:
    feats = extract_features(next(iter(model_results.values()))['problem'])
    sweep_rows.append([float(feats.get(k, 0.0)) for k in FEAT_KEYS])
sweep_X = np.array(sweep_rows, dtype=np.float32).reshape(-1, len(FEAT_KEYS))
sweep_probs = {}
for t in range(1, 6):
    p_raw = tier_clfs[t].predict_proba(sweep_X)[:, 1]
    # .tolist() yields plain Python floats, so the threshold comparison below
    # is done in double precision exactly like float(p) >= thr.
    sweep_probs[t] = np.asarray(tier_calibs[t].transform(p_raw)).tolist()

best_thr = None
best_score = -999
for thr in [0.60, 0.65, 0.70, 0.75, 0.80, 0.85]:
    succ = 0
    cost = 0.0
    for i, model_results in enumerate(sweep_tasks):
        # Route: cheapest tier with P(success) >= thr, else the top tier.
        selected_tier = next(
            (t for t in range(1, 6) if sweep_probs[t][i] >= thr), 5
        )
        model = TIER_TO_MODEL.get(selected_tier, 'claude-opus-4.7')
        trace = model_results.get(model)
        if trace is not None and trace['resolved']:
            succ += 1
        # The attempt is paid for whether or not it succeeded; fall back to
        # the nominal tier cost when the model has no trace for this task.
        cost += (trace or {}).get('cost', TIER_COST[selected_tier])
    sr = succ/len(traces)
    ac = cost/len(traces)
    # 0.3167 is the hard-coded baseline average cost the reduction is
    # measured against (its derivation is not shown in this script).
    cr = (1-ac/0.3167)*100
    score = sr*20 - ac*10  # weighted score
    print(f"  thr={thr:.2f}: success={sr:.3f}, cost=${ac:.4f}, costRed={cr:.1f}%")
    if score > best_score:
        best_score = score
        best_thr = thr
print(f"\n  Best threshold: {best_thr}")
# v10 + feedback: route cheap, escalate on failure
print("\n[5] v10 + feedback evaluation...")


def _attempt_cost(model_results, model, tier):
    """Return the recorded cost of `model` on a task, or the nominal tier cost.

    A model without a trace for the task has no recorded cost; using the
    TIER_COST fallback everywhere avoids a KeyError on the escalation path.
    """
    return model_results.get(model, {}).get('cost', TIER_COST[tier])


def _resolved(model_results, model):
    """Return True when `model` has a trace for the task and resolved it."""
    return model in model_results and bool(model_results[model]['resolved'])


# Calibrated probabilities are threshold-independent: compute them once,
# batched per tier, instead of once per (threshold, task, tier).
fb_tasks = list(traces.values())
fb_rows = []
for model_results in fb_tasks:
    feats = extract_features(next(iter(model_results.values()))['problem'])
    fb_rows.append([float(feats.get(k, 0.0)) for k in FEAT_KEYS])
fb_X = np.array(fb_rows, dtype=np.float32).reshape(-1, len(FEAT_KEYS))
fb_probs = {}
for t in range(1, 6):
    p_raw = tier_clfs[t].predict_proba(fb_X)[:, 1]
    # Plain Python floats keep the threshold comparison in double precision.
    fb_probs[t] = np.asarray(tier_calibs[t].transform(p_raw)).tolist()

for thr in [0.70, 0.75, 0.80]:
    succ = 0
    cost = 0.0
    escalated = 0
    for i, model_results in enumerate(fb_tasks):
        # Cheapest tier with P(success) >= thr, else the top tier.
        selected_tier = next(
            (t for t in range(1, 6) if fb_probs[t][i] >= thr), 5
        )
        model = TIER_TO_MODEL.get(selected_tier, 'claude-opus-4.7')
        first_cost = _attempt_cost(model_results, model, selected_tier)
        # Try cheap model first
        if _resolved(model_results, model):
            succ += 1
            cost += first_cost
        elif selected_tier < 5:
            # Escalate exactly one tier; both attempts are paid for.
            up_tier = min(selected_tier+1, 5)
            up_model = TIER_TO_MODEL.get(up_tier, 'claude-opus-4.7')
            escalated += 1
            if _resolved(model_results, up_model):
                succ += 1
            cost += first_cost + _attempt_cost(model_results, up_model, up_tier)
        else:
            # Already at the top tier: nothing to escalate to.
            cost += first_cost
    sr = succ/len(traces)
    ac = cost/len(traces)
    # 0.3167 is the hard-coded baseline average cost the reduction is
    # measured against (its derivation is not shown in this script).
    cr = (1-ac/0.3167)*100
    print(f"  v10_feedback(thr={thr:.2f}): success={sr:.3f}, cost=${ac:.4f}, costRed={cr:.1f}%, escalated={escalated}")
# Save fixed bundle
import os  # local to this step: only needed to create the output directory

bundle_path = '/app/router_models/router_bundle_v10_fixed.pkl'
# Keys are stringified tier numbers; the values are the fitted objects.
v10_fixed = {
    'tier_clfs': {str(k):v for k,v in tier_clfs.items()},
    'tier_calibrators': {str(k):v for k,v in tier_calibs.items()},
    'feat_keys': FEAT_KEYS,
    'tier_config': {str(k):v for k,v in TIER_COST.items()},
    'version': '10.1',
    'description': 'ACO v10.1: Regularized XGBoost on SWE-Router data',
    'best_threshold': best_thr,
}
# Create the target directory first so a missing folder cannot throw away the
# whole training run at the very last step.
os.makedirs(os.path.dirname(bundle_path), exist_ok=True)
# NOTE: the bundle is a pickle; only load it from a trusted location, since
# unpickling executes arbitrary code.
with open(bundle_path, 'wb') as f:
    pickle.dump(v10_fixed, f)
print("\nSaved v10.1 bundle")
print("DONE!")