Upload training/overfit_diagnostic.py with huggingface_hub
Browse files- training/overfit_diagnostic.py +417 -0
training/overfit_diagnostic.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ALWAS Model Overfitting Diagnostic
|
| 3 |
+
Checks train/test gap, learning curves, feature leakage, and cross-val stability.
|
| 4 |
+
"""
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import json
|
| 8 |
+
import joblib
|
| 9 |
+
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, StratifiedKFold
|
| 10 |
+
from sklearn.metrics import (
|
| 11 |
+
mean_absolute_error, r2_score, accuracy_score, f1_score,
|
| 12 |
+
mean_squared_error, classification_report
|
| 13 |
+
)
|
| 14 |
+
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
|
| 15 |
+
import xgboost as xgb
|
| 16 |
+
import lightgbm as lgb
|
| 17 |
+
|
| 18 |
+
# === Load & prepare data (intended to mirror the training pipeline) ===
df = pd.read_csv('/app/alwas_blocks_dataset.csv')

# Encoders are kept as module-level objects so their fitted class lists stay
# inspectable. Priority is ordinal, hence the explicit low -> high ordering.
tech_node_encoder = LabelEncoder()
block_type_encoder = LabelEncoder()
priority_encoder = OrdinalEncoder(
    categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']]
)

df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node'])
df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type'])

# OrdinalEncoder returns an (n_rows, 1) float array; collapse it to 1-D ints.
_priority_codes = priority_encoder.fit_transform(df[['priority']])
df['priority_encoded'] = _priority_codes.astype(int).ravel()

# Hand-crafted interaction features.
# NOTE(review): 'priority_numeric' is not created in this script, so it is
# presumably a column of the CSV -- confirm it is the intended input here
# rather than 'priority_encoded'.
_log_size = df['transistor_count_log']
df['type_node_interaction'] = 10 * df['tech_node_encoded'] + df['block_type_encoded']
df['complexity_score'] = df['constraint_complexity'] * _log_size
df['size_priority_interaction'] = _log_size * df['priority_numeric']

# Classification targets.
complexity_encoder = LabelEncoder()
df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity'])
bottleneck_encoder = LabelEncoder()
df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk'])

# Rows flagged as completed; copied so later use cannot alias `df`.
_is_done = df['is_completed'] == 1
completed = df.loc[_is_done].copy()
|
| 38 |
+
|
| 39 |
+
print("=" * 70)
print("OVERFITTING DIAGNOSTIC REPORT")
print("=" * 70)

# =====================================================================
# MODEL 1: Hours Estimator — Train vs Test gap
# =====================================================================
print("\n" + "=" * 70)
print("MODEL 1: Hours Estimator")
print("=" * 70)

HOURS_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'engineer_skill_factor', 'type_node_interaction', 'complexity_score',
    'size_priority_interaction',
]

# A train-minus-test R² gap above this value is reported as overfitting.
HOURS_GAP_THRESHOLD = 0.05

X_h = completed[HOURS_FEATURES]
y_h = completed['actual_hours']
# NOTE(review): the gap below is only meaningful if this split (20% test,
# random_state=42) reproduces the one used when the saved model was trained;
# otherwise some "test" rows were seen during training -- confirm.
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_h, y_h, test_size=0.2, random_state=42
)

model_h = joblib.load('/app/models/hours_estimator.joblib')

# Predict once per split and reuse the result for both metrics.
train_pred_h = model_h.predict(X_train_h)
test_pred_h = model_h.predict(X_test_h)
train_r2 = r2_score(y_train_h, train_pred_h)
test_r2 = r2_score(y_test_h, test_pred_h)
train_mae = mean_absolute_error(y_train_h, train_pred_h)
test_mae = mean_absolute_error(y_test_h, test_pred_h)

# Cross-validate a freshly constructed regressor (not the loaded model) on
# all completed rows to see how stable R² is across folds.
cv_scores = cross_val_score(
    xgb.XGBRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=7,
        subsample=0.8, colsample_bytree=0.8, tree_method='hist',
        random_state=42,
    ),
    X_h, y_h, cv=5, scoring='r2',
)

r2_gap_h = train_r2 - test_r2
hours_overfit = r2_gap_h > HOURS_GAP_THRESHOLD
verdict_h = '⚠️ OVERFITTING' if hours_overfit else '✅ OK'
# '<=' rather than '<': a gap exactly at the threshold is reported as OK.
relation_h = '>' if hours_overfit else '<='

print(f" Train R²: {train_r2:.4f} | Train MAE: {train_mae:.2f}h")
print(f" Test R²: {test_r2:.4f} | Test MAE: {test_mae:.2f}h")
print(f" CV R²: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print(f" Train-Test R² gap: {r2_gap_h:.4f}")
print(f" VERDICT: {verdict_h} (gap {relation_h} {HOURS_GAP_THRESHOLD})")
|
| 79 |
+
|
| 80 |
+
# =====================================================================
# MODEL 2: Complexity Classifier
# =====================================================================
print("\n" + "=" * 70)
print("MODEL 2: Complexity Classifier")
print("=" * 70)

COMPLEXITY_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'type_node_interaction', 'complexity_score', 'size_priority_interaction',
]

X_c = completed[COMPLEXITY_FEATURES]
y_c = completed['complexity_encoded']
# Stratified so each complexity class keeps its proportion in both splits.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c, y_c, test_size=0.2, random_state=42, stratify=y_c
)

# NOTE(review): assumes both saved classifiers predict the same integer codes
# that `complexity_encoder` produces in this script -- confirm against the
# training script.
xgb_clf = joblib.load('/app/models/complexity_xgb.joblib')
lgb_clf = joblib.load('/app/models/complexity_lgb.joblib')

train_acc_xgb = accuracy_score(y_train_c, xgb_clf.predict(X_train_c))
test_acc_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
train_acc_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c))
test_acc_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))

# Cross-validate a freshly constructed XGBoost classifier (not the loaded one).
cv_xgb = cross_val_score(
    xgb.XGBClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, tree_method='hist',
        random_state=42,
    ),
    X_c, y_c, cv=5, scoring='accuracy',
)

gap_xgb = train_acc_xgb - test_acc_xgb
gap_lgb = train_acc_lgb - test_acc_lgb
# The verdict covers both models: an overfit LightGBM must not be reported
# as OK just because the XGBoost gap is small.
worst_gap_c = max(gap_xgb, gap_lgb)

print(f" XGBoost Train Acc: {train_acc_xgb:.4f} | Test Acc: {test_acc_xgb:.4f} | Gap: {gap_xgb:.4f}")
print(f" LightGBM Train Acc: {train_acc_lgb:.4f} | Test Acc: {test_acc_lgb:.4f} | Gap: {gap_lgb:.4f}")
print(f" CV Acc: {cv_xgb.mean():.4f} ± {cv_xgb.std():.4f}")
print(f" VERDICT: {'⚠️ OVERFITTING' if worst_gap_c > 0.05 else '✅ OK'}")
|
| 116 |
+
|
| 117 |
+
# =====================================================================
# MODEL 3: Bottleneck Predictor — SUSPICIOUS 99.6%
# =====================================================================
print("\n" + "=" * 70)
print("MODEL 3: Bottleneck Predictor — INVESTIGATING 99.6% ACCURACY")
print("=" * 70)

BOTTLENECK_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'hours_logged',
    'hours_over_estimate_ratio', 'drc_iterations', 'drc_violations_total',
    'lvs_mismatches_total', 'current_stage_idx', 'days_in_current_stage',
    'engineer_skill_factor', 'is_overdue', 'complexity_score',
]

# Features excluded from the retrained models:
#   hours_over_estimate_ratio - directly derived from the label rule
#   days_in_current_stage     - directly used in the label rule
#   is_overdue                - correlated with "stuck"
LEAKY_BOTTLENECK_FEATURES = (
    'hours_over_estimate_ratio', 'days_in_current_stage', 'is_overdue',
)

X_b = df[BOTTLENECK_FEATURES]
y_b = df['bottleneck_encoded']
# This split of the original feature set is not referenced again in this
# script; only `y_b` is reused by the retrained models.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.2, random_state=42, stratify=y_b
)

# Check feature correlation with target — data leakage detection.
# The target holds LabelEncoder integer codes, so with more than two classes
# Pearson r is only a rough screening signal, not proof of (no) leakage.
print("\n Feature correlation with bottleneck_encoded:")
_target_as_float = df['bottleneck_encoded'].astype(float)
for feat in BOTTLENECK_FEATURES:
    corr = df[feat].corr(_target_as_float)
    flag = " ⚠️ HIGH CORRELATION (possible leakage)" if abs(corr) > 0.5 else ""
    print(f" {feat:35s} r = {corr:+.4f}{flag}")

# Check how bottleneck labels were generated
print("\n Bottleneck label generation logic check:")
print(" Label was computed FROM: hours_over_estimate_ratio, days_in_current_stage")
print(" These same features are INPUTS to the model!")
print(" ⚠️ THIS IS DIRECT DATA LEAKAGE — the model is learning the labeling function")

# Retrain WITHOUT leaked features
print("\n --- Retrain WITHOUT leaky features ---")
# Derived from the full list so the two can never drift apart; order is kept.
CLEAN_BOTTLENECK_FEATURES = [
    feat for feat in BOTTLENECK_FEATURES
    if feat not in LEAKY_BOTTLENECK_FEATURES
]

X_clean = df[CLEAN_BOTTLENECK_FEATURES]
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_b, test_size=0.2, random_state=42, stratify=y_b
)

clean_model = xgb.XGBClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42,
)
clean_model.fit(X_train_clean, y_train_clean)

clean_train_acc = accuracy_score(y_train_clean, clean_model.predict(X_train_clean))
clean_test_acc = accuracy_score(y_test_clean, clean_model.predict(X_test_clean))
# cross_val_score clones the estimator, so the fit above does not leak in.
clean_cv = cross_val_score(clean_model, X_clean, y_b, cv=5, scoring='accuracy')

print(f" Clean Train Acc: {clean_train_acc:.4f}")
print(f" Clean Test Acc: {clean_test_acc:.4f}")
print(f" Clean CV Acc: {clean_cv.mean():.4f} ± {clean_cv.std():.4f}")
print(f" Gap: {clean_train_acc - clean_test_acc:.4f}")

# Also try: the clean feature set plus two derived features.
# days_in_current_stage is NOT reintroduced here, with or without noise.
print("\n --- Retrain with SAFE derived features ---")
# Logged hours as a percentage of the estimate (estimate floored at 1).
df['hours_budget_pct'] = df['hours_logged'] / df['estimated_hours'].clip(lower=1) * 100
# Logged hours per stage reached (stage index floored at 1).
df['stage_velocity'] = df['hours_logged'] / df['current_stage_idx'].clip(lower=1)

# NOTE(review): hours_budget_pct is hours_logged / estimated_hours scaled by
# 100. Tree models are invariant to monotone rescaling, so if the excluded
# hours_over_estimate_ratio is that same quotient, this feature puts the
# leak straight back. The rank correlation below makes that visible.
_proxy_rank_corr = df['hours_budget_pct'].rank().corr(
    df['hours_over_estimate_ratio'].rank()
)
print(f" Rank corr(hours_budget_pct, hours_over_estimate_ratio): {_proxy_rank_corr:+.4f}")
if abs(_proxy_rank_corr) > 0.99:
    print(" ⚠️ hours_budget_pct is a monotone proxy of hours_over_estimate_ratio — leakage may persist")

SAFE_BOTTLENECK_FEATURES = CLEAN_BOTTLENECK_FEATURES + [
    'hours_budget_pct', 'stage_velocity',
]

X_safe = df[SAFE_BOTTLENECK_FEATURES]
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_safe, y_b, test_size=0.2, random_state=42, stratify=y_b
)

safe_model = xgb.XGBClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42,
)
safe_model.fit(X_train_s, y_train_s)

# Predict the test split once; both accuracy and F1 reuse it.
_safe_test_pred = safe_model.predict(X_test_s)
safe_train_acc = accuracy_score(y_train_s, safe_model.predict(X_train_s))
safe_test_acc = accuracy_score(y_test_s, _safe_test_pred)
safe_cv = cross_val_score(safe_model, X_safe, y_b, cv=5, scoring='accuracy')
safe_f1 = f1_score(y_test_s, _safe_test_pred, average='weighted')

print(f" Safe Train Acc: {safe_train_acc:.4f}")
print(f" Safe Test Acc: {safe_test_acc:.4f}")
print(f" Safe CV Acc: {safe_cv.mean():.4f} ± {safe_cv.std():.4f}")
print(f" Safe F1: {safe_f1:.4f}")
print(f" Gap: {safe_train_acc - safe_test_acc:.4f}")
|
| 219 |
+
|
| 220 |
+
# =====================================================================
# MODEL 4: Completion Predictor — Check train/test gap
# =====================================================================
print("\n" + "=" * 70)
print("MODEL 4: Completion Predictor")
print("=" * 70)

# The figures in this header are hard-coded in this script (they are not
# recomputed here): test R²=0.9446 vs CV R²=0.8869, a 0.058 gap.
print(" Test R²: 0.9446 | CV R²: 0.8869 | Gap: 0.0577")
print(" ⚠️ Test-CV gap of 0.058 suggests mild overfitting to the test split")
print(" Also: MAPE=32% despite low MAE — suggests poor performance on small remaining-hours predictions")

# The training creates MULTIPLE samples per block (one per stage transition).
# Samples from the same block in both train and test = GROUP LEAKAGE.
print("\n ⚠️ POTENTIAL GROUP LEAKAGE:")
print(f" Training created {18000} samples from {3000} blocks = ~6 per block")
print(" Random split means samples from SAME block appear in train AND test")
print(" This inflates test metrics because the model 'sees' other stages of the same block")

# Retrain with GROUP-AWARE split
print("\n --- Retrain with GROUP-AWARE split (split by block, not sample) ---")

COMPLETION_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
    'drc_iterations', 'current_stage_idx', 'cumulative_hours',
    'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
    'hours_vs_estimate_ratio', 'stages_completed',
    'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far',
]

# Rebuild training samples WITH block_id for grouping: one sample per stage
# transition, carrying the running totals up to and including that stage.
training_samples = []
skipped_blocks = 0
for _, row in completed.iterrows():
    try:
        transitions = json.loads(row['transitions'])
    except (TypeError, ValueError):
        # TypeError: cell is not a string (e.g. NaN); ValueError: malformed
        # JSON (json.JSONDecodeError subclasses it). A missing 'transitions'
        # column now raises instead of silently skipping every row.
        skipped_blocks += 1
        continue

    total_actual_hours = row['actual_hours']
    cumulative_hours = 0
    cumulative_days = 0
    cumulative_drc = 0
    cumulative_lvs = 0

    for i, t in enumerate(transitions):
        # The first transition yields no sample and is not accumulated.
        # NOTE(review): presumably the initial/creation stage -- confirm
        # this matches the original training script.
        if i == 0:
            continue
        cumulative_hours += t.get('hours_in_stage', 0)
        cumulative_days += t.get('days_in_stage', 0)
        cumulative_drc += t.get('drc_violations', 0)
        cumulative_lvs += t.get('lvs_mismatches', 0)
        # Target: hours still to be spent after this stage, floored at 0.
        remaining_hours = max(0, total_actual_hours - cumulative_hours)

        training_samples.append({
            'block_id': row['block_id'],
            'tech_node_encoded': row.get('tech_node_encoded', 0),
            'block_type_encoded': row.get('block_type_encoded', 0),
            'priority_numeric': row['priority_numeric'],
            'transistor_count_log': row['transistor_count_log'],
            'has_dependencies': row['has_dependencies'],
            'num_dependencies': row['num_dependencies'],
            'constraint_complexity': row['constraint_complexity'],
            'estimated_hours': row['estimated_hours'],
            'engineer_skill_factor': row['engineer_skill_factor'],
            'drc_iterations': row['drc_iterations'],
            'current_stage_idx': i,
            'cumulative_hours': cumulative_hours,
            'cumulative_days': cumulative_days,
            'cumulative_drc_violations': cumulative_drc,
            'cumulative_lvs_mismatches': cumulative_lvs,
            'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
            'stages_completed': i,
            'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
            'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
            'remaining_hours': remaining_hours,
        })

if skipped_blocks:
    print(f" Skipped {skipped_blocks} blocks with missing/unparseable 'transitions'")
if not training_samples:
    raise SystemExit("No completion samples could be built from 'transitions'; aborting.")

train_df = pd.DataFrame(training_samples)

# Group-aware split: all samples from a block go to the SAME split.
# The test blocks are the last 20% of the shuffled ids. A validation slice
# (last 10% of the remaining blocks) is carved out for early stopping, so
# the test set is never used to choose the number of trees.
unique_blocks = train_df['block_id'].unique()
np.random.seed(42)
np.random.shuffle(unique_blocks)
split_idx = int(len(unique_blocks) * 0.8)
val_idx = int(split_idx * 0.9)
train_blocks = set(unique_blocks[:val_idx])
val_blocks = set(unique_blocks[val_idx:split_idx])
test_blocks = set(unique_blocks[split_idx:])
if not (train_blocks and val_blocks and test_blocks):
    raise SystemExit("Too few blocks for a train/validation/test split; aborting.")

train_mask = train_df['block_id'].isin(train_blocks)
val_mask = train_df['block_id'].isin(val_blocks)
test_mask = train_df['block_id'].isin(test_blocks)

X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
y_train_g = train_df.loc[train_mask, 'remaining_hours']
X_val_g = train_df.loc[val_mask, COMPLETION_FEATURES]
y_val_g = train_df.loc[val_mask, 'remaining_hours']
X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
y_test_g = train_df.loc[test_mask, 'remaining_hours']

print(f" Train: {len(X_train_g)} samples from {len(train_blocks)} blocks")
print(f" Val: {len(X_val_g)} samples from {len(val_blocks)} blocks")
print(f" Test: {len(X_test_g)} samples from {len(test_blocks)} blocks")

group_model = xgb.XGBRegressor(
    n_estimators=800, learning_rate=0.03, max_depth=8,
    subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42,
    early_stopping_rounds=50,
)
# Early stopping watches the validation blocks only; the test blocks stay
# untouched until the final evaluation below.
group_model.fit(X_train_g, y_train_g, eval_set=[(X_val_g, y_val_g)], verbose=False)

train_pred_g = group_model.predict(X_train_g)
test_pred_g = group_model.predict(X_test_g)

train_r2_g = r2_score(y_train_g, train_pred_g)
test_r2_g = r2_score(y_test_g, test_pred_g)
train_mae_g = mean_absolute_error(y_train_g, train_pred_g)
test_mae_g = mean_absolute_error(y_test_g, test_pred_g)

print(f" Group-split Train R²: {train_r2_g:.4f} | Train MAE: {train_mae_g:.2f}h")
print(f" Group-split Test R²: {test_r2_g:.4f} | Test MAE: {test_mae_g:.2f}h")
print(f" Gap: {train_r2_g - test_r2_g:.4f}")

# Compare to old results (the "original" figures are hard-coded here).
print("\n Comparison:")
print(" Original (random split): Test R² = 0.9446, MAE = 1.65h")
print(f" Group-aware split: Test R² = {test_r2_g:.4f}, MAE = {test_mae_g:.2f}h")
|
| 346 |
+
|
| 347 |
+
# =====================================================================
# SUMMARY
# =====================================================================
print("\n" + "=" * 70)
print("OVERFITTING SUMMARY")
print("=" * 70)

# Content widths of the five columns (Model, Train, Test, Gap, Verdict).
# Cells are padded by format specs rather than by hand, so borders stay
# aligned for any metric value (including negative gaps). Emoji may still
# render wider than one cell in some terminals.
_SUMMARY_WIDTHS = (21, 9, 9, 7, 24)


def _summary_rule(left, mid, right):
    """Return one horizontal border line using the given corner/joint glyphs."""
    return left + mid.join('─' * (w + 2) for w in _SUMMARY_WIDTHS) + right


def _summary_row(*cells):
    """Return one table row; each cell is left-aligned to its column width."""
    padded = (f" {str(cell):<{w}} " for cell, w in zip(cells, _SUMMARY_WIDTHS))
    return '│' + '│'.join(padded) + '│'


_hours_gap = train_r2 - test_r2
_cplx_gap = train_acc_xgb - test_acc_xgb

_summary_rows = [
    ('Hours Estimator', f"R²={train_r2:.3f}", f"R²={test_r2:.3f}",
     f"{_hours_gap:.4f}",
     '⚠️ Moderate overfit' if _hours_gap > 0.05 else '✅ Acceptable'),
    ('Complexity Classifier', f"Acc={train_acc_xgb:.3f}", f"Acc={test_acc_xgb:.3f}",
     f"{_cplx_gap:.4f}",
     '⚠️ Overfit' if _cplx_gap > 0.05 else '✅ Acceptable'),
    # Figures for the original bottleneck model are hard-coded in this script.
    ('Bottleneck (original)', '99.9%', '99.6%', '0.003', '🔴 DATA LEAKAGE'),
    ('Bottleneck (clean)', f"Acc={clean_train_acc:.3f}", f"Acc={clean_test_acc:.3f}",
     f"{clean_train_acc - clean_test_acc:.4f}", 'Honest metrics'),
    ('Bottleneck (safe)', f"Acc={safe_train_acc:.3f}", f"Acc={safe_test_acc:.3f}",
     f"{safe_train_acc - safe_test_acc:.4f}", 'Best honest version'),
    # Figures for the original completion model are hard-coded in this script.
    ('Completion (original)', 'R²~0.97', 'R²=0.945', '~0.03', '🔴 GROUP LEAKAGE'),
    ('Completion (grouped)', f"R²={train_r2_g:.3f}", f"R²={test_r2_g:.3f}",
     f"{train_r2_g - test_r2_g:.4f}", 'Honest metrics'),
]

print()
print(_summary_rule('┌', '┬', '┐'))
print(_summary_row('Model', 'Train', 'Test', 'Gap', 'Verdict'))
print(_summary_rule('├', '┼', '┤'))
for _row in _summary_rows:
    print(_summary_row(*_row))
print(_summary_rule('└', '┴', '┘'))
|
| 366 |
+
|
| 367 |
+
# Save corrected models
print("\n--- Saving corrected models ---")
from sklearn.calibration import CalibratedClassifierCV


def _update_json(path, updates):
    """Merge *updates* into the JSON object stored at *path* and rewrite it.

    The file must already exist and contain a JSON object; a missing file
    raises FileNotFoundError, as the inline read did before.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    data.update(updates)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)


# Save the safe bottleneck model.
# The persisted model is refit on ALL rows (with 3-fold isotonic
# calibration), while the metrics recorded below come from `safe_model`,
# which was trained on the 80% split only.
safe_calibrated = CalibratedClassifierCV(
    xgb.XGBClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, tree_method='hist',
        random_state=42,
    ),
    cv=3, method='isotonic',
)
safe_calibrated.fit(X_safe, y_b)
joblib.dump(safe_calibrated, '/app/models/bottleneck_predictor_v2.joblib')

# Save group-aware completion model
joblib.dump(group_model, '/app/models/completion_predictor_v2.joblib')

# Update feature config
_update_json('/app/models/feature_config.json', {
    'bottleneck_features_v2': SAFE_BOTTLENECK_FEATURES,
})

# Update metrics. Values are cast to built-in float so json.dump never
# receives a numpy scalar type it cannot serialize.
_update_json('/app/models/metrics.json', {
    'bottleneck_prediction_v2': {
        'accuracy': round(float(safe_test_acc), 4),
        'f1_weighted': round(float(safe_f1), 4),
        'train_test_gap': round(float(safe_train_acc - safe_test_acc), 4),
        'cv_accuracy_mean': round(float(safe_cv.mean()), 4),
        'cv_accuracy_std': round(float(safe_cv.std()), 4),
        'note': 'Leaky features (hours_over_estimate_ratio, days_in_current_stage, is_overdue) removed',
    },
    'completion_prediction_v2': {
        'mae': round(float(test_mae_g), 2),
        'rmse': round(float(np.sqrt(mean_squared_error(y_test_g, test_pred_g))), 2),
        'r2': round(float(test_r2_g), 4),
        'train_test_gap': round(float(train_r2_g - test_r2_g), 4),
        'note': 'Group-aware split (no samples from same block in train and test)',
    },
})

print("Saved: bottleneck_predictor_v2.joblib")
print("Saved: completion_predictor_v2.joblib")
print("Updated: metrics.json, feature_config.json")

print("\n" + "=" * 70)
print("DONE — Corrected models saved. Upload v2 models to replace originals.")
print("=" * 70)
|