narcolepticchicken committed on
Commit
8d7e0ef
·
verified ·
1 Parent(s): 946830d

Upload training/train_v10_fixed.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. training/train_v10_fixed.py +250 -0
training/train_v10_fixed.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""v10 Router: Fixed regularization for 500-sample training set.

Problem: XGBoost with 23 features and 500 samples overfits (100% train acc).
Solution: Heavy regularization + fewer estimators + stratified CV.

NOTE(review): FEAT_KEYS in this script lists 14 features; the "23 features"
figure above presumably describes an earlier feature set -- confirm.
"""
8
+ import sys, json, random, pickle, numpy as np
9
+ from collections import defaultdict
10
+ from datasets import load_dataset
11
+ import warnings
12
+ from collections import Counter
13
+ warnings.filterwarnings('ignore')
14
+
15
+ from xgboost import XGBClassifier
16
+ from sklearn.calibration import IsotonicRegression
17
+ from sklearn.model_selection import cross_val_score
18
+
19
# Startup banner: rule, title, rule (three lines on stdout).
print("=" * 80, "v10 ROUTER: FIXED REGULARIZATION", "=" * 80, sep="\n")

# Models whose SWE-Router traces are loaded further down in this script.
MODELS = [
    'claude-opus-4.7',
    'gpt-5-mini',
    'gpt-5-nano',
    'gpt-5.2',
    'gemini-2.5-pro',
    'gemini-3-pro',
    'deepseek-v3.2',
    'deepseek-v4-flash',
]

# Model name -> routing tier (1 is the cheapest tier, 5 the most expensive,
# according to TIER_COST below).
MODEL_TIER = {
    'deepseek-v4-flash': 1,
    'gpt-5-nano': 1,
    'gpt-5-mini': 2,
    'deepseek-v3.2': 2,
    'gemini-2.5-pro': 3,
    'claude-opus-4.7': 4,
    'gpt-5.2': 4,
    'gemini-3-pro': 5,
}

# Fallback per-task cost for a tier, used when a trace carries no cost.
TIER_COST = {
    1: 0.01,
    2: 0.05,
    3: 0.15,
    4: 0.30,
    5: 0.50,
}

# The single representative model that a routing decision for a tier maps to.
TIER_TO_MODEL = {
    1: 'deepseek-v4-flash',
    2: 'gpt-5-mini',
    3: 'gemini-2.5-pro',
    4: 'claude-opus-4.7',
    5: 'gemini-3-pro',
}
32
+
33
# Lexical feature extraction. The original author's note says this is the
# same as in the previous router version, so the heuristics are kept as-is.
CODE_KW = [
    "python", "code", "function", "bug", "debug", "refactor",
    "implement", "test", "error", "traceback", "import",
]
CRITICAL_KW = [
    "critical", "production", "urgent", "emergency",
    "live", "deployed", "safety", "security",
]
SIMPLE_KW = [
    "typo", "simple", "quick", "brief", "minor",
    "small", "easy", "trivial", "just",
]

# Column order of the feature matrix: feature names in sorted order.
FEAT_KEYS = sorted((
    'req_len', 'num_words', 'has_code', 'n_code', 'has_critical',
    'has_simple', 'has_error_msg', 'has_file_path', 'n_lines', 'has_fix',
    'has_add', 'has_change', 'has_test', 'has_doc',
))


def extract_features(text):
    """Return a dict of simple lexical features for a task description.

    All keyword checks are plain substring tests on the lower-cased text
    (so "latest" counts as containing "test"); the size features are
    measured on the text as given.

    Args:
        text: The task / problem statement as a string.

    Returns:
        A dict with one integer value per name in FEAT_KEYS.
    """
    lowered = text.lower()

    def mentions(*needles):
        # 1 if any needle occurs as a substring of the lower-cased text.
        return int(any(needle in lowered for needle in needles))

    code_hits = [kw for kw in CODE_KW if kw in lowered]

    features = {}
    features['req_len'] = len(text)
    features['num_words'] = len(text.split())
    features['has_code'] = int(bool(code_hits))
    features['n_code'] = len(code_hits)
    features['has_critical'] = mentions(*CRITICAL_KW)
    features['has_simple'] = mentions(*SIMPLE_KW)
    features['has_error_msg'] = mentions('error', 'traceback', 'exception')
    features['has_file_path'] = mentions('/')
    features['n_lines'] = 1 + text.count('\n')
    features['has_fix'] = mentions('fix', 'bug', 'issue')
    features['has_add'] = mentions('add', 'new', 'create')
    features['has_change'] = mentions('change', 'modify', 'update')
    features['has_test'] = mentions('test', 'spec')
    features['has_doc'] = mentions('doc', 'readme')
    return features
61
+
62
print("\n[1] Loading traces...")

# instance_id -> {model name -> {'resolved', 'cost', 'problem'}}
traces = defaultdict(dict)
for model_name in MODELS:
    # One Hugging Face dataset per model; only its 'test' split is read.
    dataset = load_dataset('SWE-Router/swebench-verified-' + model_name, split='test')
    for record in dataset:
        outcome = {
            'resolved': record['resolved'],
            'cost': float(record['instance_cost']),
            'problem': record['problem_statement'],
        }
        traces[record['instance_id']][model_name] = outcome
print(f" {len(traces)} tasks loaded")
72
+
73
print("\n[2] Building features...")

feature_rows = []                              # one feature vector per task
tier_labels = {tier: [] for tier in range(1, 6)}   # tier -> 0/1 label per task
optimal_tiers = []                             # cheapest solving tier per task

for iid, per_model in traces.items():
    # Every model's record carries the problem text, so read it from the first.
    first_record = next(iter(per_model.values()))
    feats = extract_features(first_record['problem'])
    feature_rows.append([float(feats.get(key, 0.0)) for key in FEAT_KEYS])

    # A tier counts as solved when at least one of its models resolved the task.
    solved_tiers = set()
    for model_name, outcome in per_model.items():
        model_tier = MODEL_TIER[model_name]
        if outcome['resolved']:
            solved_tiers.add(model_tier)

    for tier in range(1, 6):
        tier_labels[tier].append(int(tier in solved_tiers))

    # Lowest solved tier; 5 also stands in for "no tier solved it".
    optimal_tiers.append(min(solved_tiers, default=5))

X = np.array(feature_rows, dtype=np.float32)
print(f" X shape: {X.shape}")
print(f" Optimal tier dist: {Counter(optimal_tiers)}")
101
+
102
# Train one binary classifier per tier ("does this tier resolve the task?")
# with HEAVY regularization, then fit an isotonic calibrator on its scores.
print("\n[3] Training with heavy regularization...")
tier_clfs = {}
tier_calibs = {}

for t in range(1,6):
    y = np.array(tier_labels[t])
    n_pos = y.sum()
    # Weight positives by the negative:positive ratio, but never below 1.
    spw = max(1, (len(y)-n_pos)/max(n_pos,1))

    # Heavy regularization to prevent overfitting on 500 samples
    clf = XGBClassifier(
        n_estimators=50,        # Reduced from 200
        max_depth=3,            # Reduced from 5
        learning_rate=0.1,
        subsample=0.7,
        colsample_bytree=0.6,
        min_child_weight=10,    # Prevent memorization
        gamma=1.0,              # Require significant splits
        reg_alpha=1.0,          # L1 regularization
        reg_lambda=5.0,         # L2 regularization
        scale_pos_weight=spw,
        eval_metric='logloss',
        random_state=42,
    )

    # Cross-validate (an integer cv with a classifier is stratified k-fold in
    # scikit-learn). CV is diagnostic only, so a failure must not abort
    # training -- but it is reported instead of being swallowed silently, and
    # KeyboardInterrupt/SystemExit are no longer caught.
    try:
        scores = cross_val_score(clf, X, y, cv=5, scoring='f1')
        cv_f1 = scores.mean()
    except Exception as exc:
        print(f" Tier {t}: cross-validation failed "
              f"({type(exc).__name__}: {exc}); reporting cv_f1=0.000")
        cv_f1 = 0.0

    clf.fit(X, y)

    # Check train accuracy
    train_pred = clf.predict(X)
    train_acc = np.mean(train_pred == y)

    # Calibrate
    # NOTE(review): the calibrator is fit on predictions for the same rows
    # the classifier was trained on, so the calibrated probabilities are
    # in-sample; consider out-of-fold predictions if they look optimistic.
    p_raw = clf.predict_proba(X)[:,1]
    cal = IsotonicRegression(out_of_bounds='clip')
    cal.fit(p_raw, y)
    p_cal = cal.transform(p_raw)

    # Check calibration range
    p_min, p_max = p_cal.min(), p_cal.max()
    p_mean = p_cal.mean()

    tier_clfs[t] = clf
    tier_calibs[t] = cal
    print(f" Tier {t}: cv_f1={cv_f1:.3f}, train_acc={train_acc:.3f}, "
          f"P(success) range=[{p_min:.3f},{p_max:.3f}], mean={p_mean:.3f}")
154
+
155
+ from collections import Counter
156
+
157
# Evaluate with different thresholds: for each threshold, route every task to
# the cheapest tier whose calibrated P(success) reaches it, and keep the
# threshold with the best success/cost trade-off.
# NOTE(review): this scores the router on the same tasks it was trained on.
print("\n[4] Evaluating with threshold sweep...")
best_thr = None
best_score = -999

# The calibrated per-tier probabilities do not depend on the threshold, so
# compute them once per task instead of once per (threshold, task) pair.
sweep_probs = {}
for iid, model_results in traces.items():
    problem = next(iter(model_results.values()))['problem']
    feats = extract_features(problem)
    feat_vec = np.array([float(feats.get(k,0.0)) for k in FEAT_KEYS], dtype=np.float32).reshape(1,-1)
    task_probs = {}
    for t in range(1,6):
        p_raw = tier_clfs[t].predict_proba(feat_vec)[0,1]
        task_probs[t] = float(tier_calibs[t].transform([p_raw])[0])
    sweep_probs[iid] = task_probs

for thr in [0.60, 0.65, 0.70, 0.75, 0.80, 0.85]:
    succ = 0
    cost = 0.0
    for iid, model_results in traces.items():
        task_probs = sweep_probs[iid]

        # Route: cheapest tier with P(success) >= thr; tier 5 if none qualifies.
        selected_tier = next((t for t in range(1,6) if task_probs[t] >= thr), 5)

        model = TIER_TO_MODEL.get(selected_tier, 'claude-opus-4.7')
        if model in model_results and model_results[model]['resolved']:
            succ += 1
            cost += model_results[model]['cost']
        else:
            # Failed (or no trace for this model): charge the recorded cost,
            # falling back to the nominal tier cost.
            cost += model_results.get(model,{}).get('cost', TIER_COST[selected_tier])

    sr = succ/len(traces)
    ac = cost/len(traces)
    # 0.3167 is presumably the average per-task cost of the baseline the
    # reduction is measured against -- TODO confirm.
    cr = (1-ac/0.3167)*100
    score = sr*20 - ac*10 # weighted score
    print(f" thr={thr:.2f}: success={sr:.3f}, cost=${ac:.4f}, costRed={cr:.1f}%")
    # Strict '>' keeps the lowest threshold on ties.
    if score > best_score:
        best_score = score
        best_thr = thr

print(f"\n Best threshold: {best_thr}")
196
+
197
# v10 + feedback: route cheap, escalate exactly one tier on failure.
# NOTE(review): like the threshold sweep, this is scored on the training tasks.
print("\n[5] v10 + feedback evaluation...")

# Calibrated per-tier probabilities are threshold-independent; compute them
# once per task rather than once per (threshold, task) pair.
feedback_probs = {}
for iid, model_results in traces.items():
    problem = next(iter(model_results.values()))['problem']
    feats = extract_features(problem)
    feat_vec = np.array([float(feats.get(k,0.0)) for k in FEAT_KEYS], dtype=np.float32).reshape(1,-1)
    task_probs = {}
    for t in range(1,6):
        p_raw = tier_clfs[t].predict_proba(feat_vec)[0,1]
        task_probs[t] = float(tier_calibs[t].transform([p_raw])[0])
    feedback_probs[iid] = task_probs

for thr in [0.70, 0.75, 0.80]:
    succ = 0
    cost = 0.0
    escalated = 0
    for iid, model_results in traces.items():
        task_probs = feedback_probs[iid]

        # Cheapest tier with P(success) >= thr; tier 5 if none qualifies.
        selected_tier = next((t for t in range(1,6) if task_probs[t] >= thr), 5)
        model = TIER_TO_MODEL.get(selected_tier, 'claude-opus-4.7')

        # Cost of the first attempt. Looked up defensively: the escalation
        # branch below is also reached when this task has no trace for
        # `model`, where direct indexing would raise KeyError.
        first_cost = model_results.get(model,{}).get('cost', TIER_COST[selected_tier])

        # Try cheap model first
        if model in model_results and model_results[model]['resolved']:
            succ += 1
            cost += first_cost
        elif selected_tier < 5:
            # Escalate: pay for the failed attempt plus the next tier up.
            up_tier = min(selected_tier+1, 5)
            up_model = TIER_TO_MODEL.get(up_tier, 'claude-opus-4.7')
            escalated += 1
            up_cost = model_results.get(up_model,{}).get('cost', TIER_COST[up_tier])
            if up_model in model_results and model_results[up_model]['resolved']:
                succ += 1
            cost += first_cost + up_cost
        else:
            # Already at the top tier: nothing to escalate to.
            cost += first_cost

    sr = succ/len(traces)
    ac = cost/len(traces)
    # 0.3167 is presumably the baseline average per-task cost -- TODO confirm.
    cr = (1-ac/0.3167)*100
    print(f" v10_feedback(thr={thr:.2f}): success={sr:.3f}, cost=${ac:.4f}, costRed={cr:.1f}%, escalated={escalated}")
236
+
237
# Save fixed bundle: the per-tier classifiers and calibrators plus the
# metadata needed to rebuild feature vectors and apply the chosen threshold.
import os  # local import: only this final step needs filesystem helpers

v10_fixed = {
    # Tier keys are stringified before pickling.
    'tier_clfs': {str(k):v for k,v in tier_clfs.items()},
    'tier_calibrators': {str(k):v for k,v in tier_calibs.items()},
    'feat_keys': FEAT_KEYS,
    'tier_config': {str(k):v for k,v in TIER_COST.items()},
    'version': '10.1',
    'description': 'ACO v10.1: Regularized XGBoost on SWE-Router data',
    'best_threshold': best_thr,
}

bundle_path = '/app/router_models/router_bundle_v10_fixed.pkl'
# Create the output directory first so a missing folder does not throw away
# the freshly trained models with a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(bundle_path), exist_ok=True)
with open(bundle_path, 'wb') as f:
    pickle.dump(v10_fixed, f)
print("\nSaved v10.1 bundle")
print("DONE!")