muthuk1 committed on
Commit
0764371
Β·
verified Β·
1 Parent(s): 16f6fe0

Upload training/train_v2.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. training/train_v2.py +438 -0
training/train_v2.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ALWAS ML Models v2 β€” Retrained with overfitting fixes:
3
+ 1. Hours: stronger regularization (lower depth, higher min_child_weight)
4
+ 2. Complexity: reduced tree depth + stronger L1/L2
5
+ 3. Bottleneck: removed leaky features
6
+ 4. Completion: group-aware split
7
+ """
8
+ import numpy as np
9
+ import pandas as pd
10
+ import json
11
+ import joblib
12
+ import os
13
+ from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
14
+ from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
15
+ from sklearn.metrics import (
16
+ mean_absolute_error, mean_squared_error, r2_score,
17
+ classification_report, accuracy_score, f1_score
18
+ )
19
+ from sklearn.calibration import CalibratedClassifierCV
20
+ import xgboost as xgb
21
+ import lightgbm as lgb
22
+
23
+ df = pd.read_csv('/app/alwas_blocks_dataset.csv')
24
+
25
+ # Encode
26
+ tech_node_encoder = LabelEncoder()
27
+ block_type_encoder = LabelEncoder()
28
+ priority_encoder = OrdinalEncoder(categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']])
29
+
30
+ df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node'])
31
+ df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type'])
32
+ df['priority_encoded'] = priority_encoder.fit_transform(df[['priority']]).astype(int).flatten()
33
+ df['type_node_interaction'] = df['tech_node_encoded'] * 10 + df['block_type_encoded']
34
+ df['complexity_score'] = df['constraint_complexity'] * df['transistor_count_log']
35
+ df['size_priority_interaction'] = df['transistor_count_log'] * df['priority_numeric']
36
+
37
+ complexity_encoder = LabelEncoder()
38
+ df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity'])
39
+ bottleneck_encoder = LabelEncoder()
40
+ df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk'])
41
+
42
+ # Safe derived features for bottleneck
43
+ df['hours_budget_pct'] = df['hours_logged'] / df['estimated_hours'].clip(lower=1) * 100
44
+ df['stage_velocity'] = df['hours_logged'] / df['current_stage_idx'].clip(lower=1)
45
+
46
+ completed = df[df['is_completed'] == 1].copy()
47
+
48
+ all_metrics = {}
49
+
50
# =====================================================================
# MODEL 1: Hours Estimator — REGULARIZED
# =====================================================================
# XGBoost regressor predicting total actual hours for a block, trained only
# on completed blocks. v2 change vs v1: stronger regularization (shallower
# trees, higher min_child_weight, L1/L2, gamma) to close the train/test gap.
print("=" * 60)
print("MODEL 1: Hours Estimator (regularized)")
print("=" * 60)

HOURS_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'engineer_skill_factor', 'type_node_interaction', 'complexity_score',
    'size_priority_interaction'
]

X_h = completed[HOURS_FEATURES]
y_h = completed['actual_hours']
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, random_state=42)

hours_model = xgb.XGBRegressor(
    n_estimators=300,  # reduced from 500
    learning_rate=0.05,
    max_depth=4,  # reduced from 7
    subsample=0.7,  # reduced from 0.8
    colsample_bytree=0.7,  # reduced from 0.8
    min_child_weight=10,  # increased from 3
    reg_alpha=1.0,  # increased from 0.1
    reg_lambda=5.0,  # increased from 1.0
    gamma=0.5,  # added: min split loss
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    early_stopping_rounds=30,
)
# NOTE(review): the held-out test split doubles as the early-stopping eval set,
# so the test metrics below are slightly optimistic — consider a separate
# validation split.
hours_model.fit(X_train_h, y_train_h, eval_set=[(X_test_h, y_test_h)], verbose=False)

# Train-vs-test metrics; the gap is the overfitting indicator tracked in v2.
train_r2 = r2_score(y_train_h, hours_model.predict(X_train_h))
test_r2 = r2_score(y_test_h, hours_model.predict(X_test_h))
train_mae = mean_absolute_error(y_train_h, hours_model.predict(X_train_h))
test_mae = mean_absolute_error(y_test_h, hours_model.predict(X_test_h))
# Fresh, un-fitted clone with the same hyperparameters (minus early stopping)
# for 5-fold CV — cross_val_score needs a model it can refit per fold.
cv_model_h = xgb.XGBRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=4, subsample=0.7,
    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
    gamma=0.5, tree_method='hist', random_state=42,
)
cv = cross_val_score(cv_model_h, X_h, y_h, cv=5, scoring='r2')

print(f" Train RΒ²: {train_r2:.4f} Test RΒ²: {test_r2:.4f} Gap: {train_r2-test_r2:.4f}")
print(f" Train MAE: {train_mae:.2f} Test MAE: {test_mae:.2f}")
print(f" CV RΒ²: {cv.mean():.4f} Β± {cv.std():.4f}")

all_metrics['hours_estimation'] = {
    'train_r2': round(train_r2, 4), 'test_r2': round(test_r2, 4),
    'train_mae': round(train_mae, 2), 'test_mae': round(test_mae, 2),
    'gap': round(train_r2 - test_r2, 4),
    'cv_r2_mean': round(cv.mean(), 4), 'cv_r2_std': round(cv.std(), 4),
}
# =====================================================================
# MODEL 2: Complexity Classifier — REGULARIZED
# =====================================================================
# 3-class complexity classification via an XGBoost + LightGBM soft-voting
# ensemble (probabilities averaged). v2 change vs v1: shallower trees,
# fewer leaves, stronger L1/L2.
print("\n" + "=" * 60)
print("MODEL 2: Complexity Classifier (regularized)")
print("=" * 60)

COMPLEXITY_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'type_node_interaction', 'complexity_score', 'size_priority_interaction'
]

X_c = completed[COMPLEXITY_FEATURES]
y_c = completed['complexity_encoded']
# Stratified split keeps class proportions equal across train/test.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42, stratify=y_c)

xgb_clf = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,  # reduced from 6
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,  # increased
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    objective='multi:softprob',
    num_class=3,
    tree_method='hist',
    random_state=42,
    early_stopping_rounds=30,
)
# NOTE(review): test split is reused as the early-stopping eval set (see Model 1).
xgb_clf.fit(X_train_c, y_train_c, eval_set=[(X_test_c, y_test_c)], verbose=False)

lgb_clf = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=15,  # reduced from 63
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_samples=20,  # increased
    reg_alpha=1.0,
    reg_lambda=5.0,
    random_state=42,
    verbose=-1,  # silence LightGBM logging
)
lgb_clf.fit(X_train_c, y_train_c)

# Per-model accuracies (train vs test gap = overfitting indicator).
train_xgb = accuracy_score(y_train_c, xgb_clf.predict(X_train_c))
test_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
train_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c))
test_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))

# Ensemble: average the two probability matrices, then argmax per row.
xgb_p = xgb_clf.predict_proba(X_test_c)
lgb_p = lgb_clf.predict_proba(X_test_c)
ens_p = (xgb_p + lgb_p) / 2
y_pred_ens = np.argmax(ens_p, axis=1)
ens_acc = accuracy_score(y_test_c, y_pred_ens)
ens_f1 = f1_score(y_test_c, y_pred_ens, average='weighted')

# Fresh clone (no early stopping) for 5-fold CV accuracy.
cv_model_c = xgb.XGBClassifier(
    n_estimators=200, learning_rate=0.05, max_depth=4, subsample=0.7,
    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
    gamma=0.5, tree_method='hist', random_state=42,
)
cv_c = cross_val_score(cv_model_c, X_c, y_c, cv=5, scoring='accuracy')

print(f" XGB Train: {train_xgb:.4f} Test: {test_xgb:.4f} Gap: {train_xgb-test_xgb:.4f}")
print(f" LGB Train: {train_lgb:.4f} Test: {test_lgb:.4f} Gap: {train_lgb-test_lgb:.4f}")
print(f" Ensemble Test Acc: {ens_acc:.4f} F1: {ens_f1:.4f}")
print(f" CV Acc: {cv_c.mean():.4f} Β± {cv_c.std():.4f}")

all_metrics['complexity_classification'] = {
    'xgb_train': round(train_xgb, 4), 'xgb_test': round(test_xgb, 4), 'xgb_gap': round(train_xgb-test_xgb, 4),
    'lgb_train': round(train_lgb, 4), 'lgb_test': round(test_lgb, 4), 'lgb_gap': round(train_lgb-test_lgb, 4),
    'ensemble_accuracy': round(ens_acc, 4), 'ensemble_f1': round(ens_f1, 4),
    'cv_accuracy_mean': round(cv_c.mean(), 4), 'cv_accuracy_std': round(cv_c.std(), 4),
}
# =====================================================================
# MODEL 3: Bottleneck — LEAKAGE-FREE
# =====================================================================
# 3-class bottleneck-risk classifier trained on ALL blocks (not just
# completed ones). v2 change vs v1: restricted to features available
# mid-project, removing the leaky ones that made v1 report 99.6%.
print("\n" + "=" * 60)
print("MODEL 3: Bottleneck Predictor (leakage-free)")
print("=" * 60)

SAFE_BOTTLENECK_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'hours_logged',
    'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
    'current_stage_idx', 'engineer_skill_factor', 'complexity_score',
    'hours_budget_pct', 'stage_velocity'
]

X_b = df[SAFE_BOTTLENECK_FEATURES]
y_b = df['bottleneck_encoded']
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, y_b, test_size=0.2, random_state=42, stratify=y_b)

base_bn = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    objective='multi:softprob',
    num_class=3,
    tree_method='hist',
    random_state=42,
)
# Wrap in isotonic probability calibration (3-fold internal CV) so the
# saved model's predict_proba outputs are usable as risk scores.
bn_model = CalibratedClassifierCV(base_bn, cv=3, method='isotonic')
bn_model.fit(X_train_b, y_train_b)

train_bn = accuracy_score(y_train_b, bn_model.predict(X_train_b))
test_bn = accuracy_score(y_test_b, bn_model.predict(X_test_b))
test_f1_bn = f1_score(y_test_b, bn_model.predict(X_test_b), average='weighted')
# CV runs on the uncalibrated base estimator (calibration doesn't change
# argmax accuracy materially and is expensive to refit per fold).
cv_bn = cross_val_score(base_bn, X_b, y_b, cv=5, scoring='accuracy')

print(f" Train Acc: {train_bn:.4f} Test Acc: {test_bn:.4f} Gap: {train_bn-test_bn:.4f}")
print(f" F1 (weighted): {test_f1_bn:.4f}")
print(f"\n Classification Report:")
print(f" CV Acc: {cv_bn.mean():.4f} Β± {cv_bn.std():.4f}")
print(classification_report(y_test_b, bn_model.predict(X_test_b),
                            target_names=bottleneck_encoder.classes_))

all_metrics['bottleneck_prediction'] = {
    'train_accuracy': round(train_bn, 4), 'test_accuracy': round(test_bn, 4),
    'gap': round(train_bn - test_bn, 4),
    'f1_weighted': round(test_f1_bn, 4),
    'cv_accuracy_mean': round(cv_bn.mean(), 4), 'cv_accuracy_std': round(cv_bn.std(), 4),
    'features_used': 'SAFE (no leaky features)',
}
# =====================================================================
# MODEL 4: Completion — GROUP-AWARE SPLIT
# =====================================================================
# Predicts remaining hours for an in-flight block. Each completed block
# contributes one training sample per stage transition, so samples from
# the same block are strongly correlated — hence the block-level
# (group-aware) train/test split and GroupKFold CV below (the v1 version
# split at sample level and leaked blocks across the split).
print("\n" + "=" * 60)
print("MODEL 4: Completion Predictor (group-aware split)")
print("=" * 60)

COMPLETION_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
    'drc_iterations', 'current_stage_idx', 'cumulative_hours',
    'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
    'hours_vs_estimate_ratio', 'stages_completed',
    'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far'
]

# Build one sample per stage transition, keyed by block_id for grouping.
training_samples = []
for _, row in completed.iterrows():
    try:
        # 'transitions' is assumed to be a JSON-encoded list of per-stage
        # dicts — TODO confirm against the dataset generator.
        transitions = json.loads(row['transitions'])
    except (TypeError, ValueError):
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Only skip rows whose transition log is missing
        # (NaN -> TypeError) or malformed (json.JSONDecodeError is a
        # ValueError subclass).
        continue
    total_actual_hours = row['actual_hours']
    cumulative_hours = 0
    cumulative_days = 0
    cumulative_drc = 0
    cumulative_lvs = 0
    for i, t in enumerate(transitions):
        if i == 0:
            # First entry has no accumulated work yet; nothing to learn from.
            continue
        cumulative_hours += t.get('hours_in_stage', 0)
        cumulative_days += t.get('days_in_stage', 0)
        cumulative_drc += t.get('drc_violations', 0)
        cumulative_lvs += t.get('lvs_mismatches', 0)
        # Regression target: hours still to go, floored at 0 for blocks that
        # overran their recorded total.
        remaining = max(0, total_actual_hours - cumulative_hours)
        training_samples.append({
            'block_id': row['block_id'],
            'tech_node_encoded': row.get('tech_node_encoded', 0),
            'block_type_encoded': row.get('block_type_encoded', 0),
            'priority_numeric': row['priority_numeric'],
            'transistor_count_log': row['transistor_count_log'],
            'has_dependencies': row['has_dependencies'],
            'num_dependencies': row['num_dependencies'],
            'constraint_complexity': row['constraint_complexity'],
            'estimated_hours': row['estimated_hours'],
            'engineer_skill_factor': row['engineer_skill_factor'],
            'drc_iterations': row['drc_iterations'],
            'current_stage_idx': i,
            'cumulative_hours': cumulative_hours,
            'cumulative_days': cumulative_days,
            'cumulative_drc_violations': cumulative_drc,
            'cumulative_lvs_mismatches': cumulative_lvs,
            # max(..., 1) guards against zero/negative estimates.
            'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
            'stages_completed': i,  # duplicates current_stage_idx by construction
            'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
            'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
            'remaining_hours': remaining,
        })

train_df = pd.DataFrame(training_samples)

# Group-aware split: every sample from a given block lands on the same side,
# so the model is evaluated on blocks it has never seen.
unique_blocks = train_df['block_id'].unique()
rng = np.random.RandomState(42)
rng.shuffle(unique_blocks)
split_idx = int(len(unique_blocks) * 0.8)
train_blocks = set(unique_blocks[:split_idx])
test_blocks = set(unique_blocks[split_idx:])

train_mask = train_df['block_id'].isin(train_blocks)
test_mask = train_df['block_id'].isin(test_blocks)

X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
y_train_g = train_df.loc[train_mask, 'remaining_hours']
X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
y_test_g = train_df.loc[test_mask, 'remaining_hours']

completion_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=5,  # reduced from 8
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    early_stopping_rounds=30,
)
# NOTE(review): the held-out test fold doubles as the early-stopping eval set,
# so the test metrics below are slightly optimistic — consider carving a
# separate validation set of blocks.
completion_model.fit(X_train_g, y_train_g, eval_set=[(X_test_g, y_test_g)], verbose=False)

train_r2_g = r2_score(y_train_g, completion_model.predict(X_train_g))
test_r2_g = r2_score(y_test_g, completion_model.predict(X_test_g))
train_mae_g = mean_absolute_error(y_train_g, completion_model.predict(X_train_g))
test_mae_g = mean_absolute_error(y_test_g, completion_model.predict(X_test_g))

# GroupKFold CV over all samples, grouped by block_id, with a fresh clone
# (no early stopping) so each fold refits from scratch.
groups = train_df['block_id'].values
gkf = GroupKFold(n_splits=5)
cv_model = xgb.XGBRegressor(
    n_estimators=500, learning_rate=0.03, max_depth=5, subsample=0.7,
    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
    gamma=0.5, tree_method='hist', random_state=42
)
cv_scores_g = cross_val_score(cv_model, train_df[COMPLETION_FEATURES],
                              train_df['remaining_hours'], cv=gkf, groups=groups, scoring='r2')

print(f" Train samples: {len(X_train_g)} from {len(train_blocks)} blocks")
print(f" Test samples: {len(X_test_g)} from {len(test_blocks)} blocks")
print(f" Train RΒ²: {train_r2_g:.4f} Test RΒ²: {test_r2_g:.4f} Gap: {train_r2_g-test_r2_g:.4f}")
print(f" Train MAE: {train_mae_g:.2f} Test MAE: {test_mae_g:.2f}")
print(f" GroupKFold CV RΒ²: {cv_scores_g.mean():.4f} Β± {cv_scores_g.std():.4f}")

all_metrics['completion_prediction'] = {
    'train_r2': round(train_r2_g, 4), 'test_r2': round(test_r2_g, 4),
    'gap': round(train_r2_g - test_r2_g, 4),
    'train_mae': round(train_mae_g, 2), 'test_mae': round(test_mae_g, 2),
    'group_cv_r2_mean': round(cv_scores_g.mean(), 4),
    'group_cv_r2_std': round(cv_scores_g.std(), 4),
    'split_type': 'group-aware (block-level)',
}
# =====================================================================
# SAVE ALL v2 MODELS
# =====================================================================
print("\n" + "=" * 60)
print("SAVING v2 MODELS")
print("=" * 60)

os.makedirs('/app/models_v2', exist_ok=True)

# Fitted models.
joblib.dump(hours_model, '/app/models_v2/hours_estimator.joblib')
joblib.dump(xgb_clf, '/app/models_v2/complexity_xgb.joblib')
joblib.dump(lgb_clf, '/app/models_v2/complexity_lgb.joblib')
joblib.dump(bn_model, '/app/models_v2/bottleneck_predictor.joblib')
joblib.dump(completion_model, '/app/models_v2/completion_predictor.joblib')

# Encoders — required at inference time to reproduce the integer encodings.
joblib.dump(tech_node_encoder, '/app/models_v2/tech_node_encoder.joblib')
joblib.dump(block_type_encoder, '/app/models_v2/block_type_encoder.joblib')
joblib.dump(priority_encoder, '/app/models_v2/priority_encoder.joblib')
joblib.dump(complexity_encoder, '/app/models_v2/complexity_encoder.joblib')
joblib.dump(bottleneck_encoder, '/app/models_v2/bottleneck_encoder.joblib')

# Feature config — inference code must feed each model its feature list in
# exactly this order.
feature_config = {
    'hours_features': HOURS_FEATURES,
    'complexity_features': COMPLEXITY_FEATURES,
    'bottleneck_features': SAFE_BOTTLENECK_FEATURES,
    'completion_features': COMPLETION_FEATURES,
    'tech_nodes': list(tech_node_encoder.classes_),
    'block_types': list(block_type_encoder.classes_),
    'priorities': ['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical'],
    'complexity_classes': list(complexity_encoder.classes_),
    'bottleneck_classes': list(bottleneck_encoder.classes_),
}
with open('/app/models_v2/feature_config.json', 'w') as f:
    json.dump(feature_config, f, indent=2)

# Metrics collected throughout the script, plus dataset-size bookkeeping.
all_metrics['training_data'] = {
    'total_samples': len(df),
    'completed_blocks': int(df['is_completed'].sum()),
    'in_progress_blocks': int((~df['is_completed'].astype(bool)).sum()),
    'completion_train_samples': len(X_train_g),
}
with open('/app/models_v2/metrics.json', 'w') as f:
    json.dump(all_metrics, f, indent=2)

print("All v2 models saved to /app/models_v2/")

# Final summary
print("\n" + "=" * 60)
print("v1 vs v2 COMPARISON")
print("=" * 60)
# NOTE(review): the box-drawing table below looks mojibake-encoded (UTF-8
# read as Latin-1, e.g. "RΒ²" for "R²") — confirm the source file encoding
# before trusting this console output. v1 figures are hard-coded baselines.
print(f"""
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ Model β”‚ v1 (overfit) β”‚ v2 (fixed) β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ Hours Estimator β”‚ RΒ²=0.881 (gap 0.113) β”‚ RΒ²={test_r2:.3f} (gap {train_r2-test_r2:.3f}) β”‚
β”‚ Complexity Classifier β”‚ Acc=92.3% (gap 5.9%) β”‚ Acc={test_xgb*100:.1f}% (gap {(train_xgb-test_xgb)*100:.1f}%) β”‚
β”‚ Bottleneck Predictor β”‚ 99.6% (DATA LEAKAGE) β”‚ {test_bn*100:.1f}% (honest) β”‚
β”‚ Completion Predictor β”‚ RΒ²=0.945 (GROUP LEAK) β”‚ RΒ²={test_r2_g:.3f} (grouped) β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
""")