Upload training/train_v2.py with huggingface_hub
Browse files- training/train_v2.py +438 -0
training/train_v2.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ALWAS ML Models v2 — Retrained with overfitting fixes:
|
| 3 |
+
1. Hours: stronger regularization (lower depth, higher min_child_weight)
|
| 4 |
+
2. Complexity: reduced tree depth + stronger L1/L2
|
| 5 |
+
3. Bottleneck: removed leaky features
|
| 6 |
+
4. Completion: group-aware split
|
| 7 |
+
"""
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import json
|
| 11 |
+
import joblib
|
| 12 |
+
import os
|
| 13 |
+
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
|
| 14 |
+
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
|
| 15 |
+
from sklearn.metrics import (
|
| 16 |
+
mean_absolute_error, mean_squared_error, r2_score,
|
| 17 |
+
classification_report, accuracy_score, f1_score
|
| 18 |
+
)
|
| 19 |
+
from sklearn.calibration import CalibratedClassifierCV
|
| 20 |
+
import xgboost as xgb
|
| 21 |
+
import lightgbm as lgb
|
| 22 |
+
|
| 23 |
+
# Load the raw block-level dataset.
df = pd.read_csv('/app/alwas_blocks_dataset.csv')

# Categorical encoders. Priority gets an explicit category order so that the
# resulting integer codes run 0..3 from 'P4-Low' up to 'P1-Critical'.
tech_node_encoder = LabelEncoder()
block_type_encoder = LabelEncoder()
priority_encoder = OrdinalEncoder(
    categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']]
)

df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node'])
df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type'])
priority_codes = priority_encoder.fit_transform(df[['priority']])
df['priority_encoded'] = priority_codes.astype(int).flatten()

# Interaction features built from the encoded columns and columns that are
# already present in the CSV.
log_transistors = df['transistor_count_log']
df['type_node_interaction'] = (
    df['tech_node_encoded'] * 10 + df['block_type_encoded']
)
df['complexity_score'] = df['constraint_complexity'] * log_transistors
df['size_priority_interaction'] = log_transistors * df['priority_numeric']

# Integer-encode the two classification targets.
complexity_encoder = LabelEncoder()
df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity'])
bottleneck_encoder = LabelEncoder()
df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk'])

# Derived features for the bottleneck model. Both denominators are clamped to
# a minimum of 1 so zero values cannot cause a division by zero.
hours_budget = df['estimated_hours'].clip(lower=1)
df['hours_budget_pct'] = df['hours_logged'] / hours_budget * 100
stage_count = df['current_stage_idx'].clip(lower=1)
df['stage_velocity'] = df['hours_logged'] / stage_count

# Rows flagged as completed, copied so later use cannot alias `df`.
completed = df.loc[df['is_completed'] == 1].copy()

# Collects the per-model metric dictionaries filled in by the sections below.
all_metrics = {}
|
| 49 |
+
|
| 50 |
+
# =====================================================================
# MODEL 1: Hours Estimator — REGULARIZED
# =====================================================================
print("=" * 60)
print("MODEL 1: Hours Estimator (regularized)")
print("=" * 60)

HOURS_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'engineer_skill_factor', 'type_node_interaction', 'complexity_score',
    'size_priority_interaction',
]

# Hyper-parameters shared by the final model and the cross-validation model,
# kept in one place so the two cannot drift apart.
HOURS_XGB_PARAMS = dict(
    n_estimators=300,        # reduced from 500
    learning_rate=0.05,
    max_depth=4,             # reduced from 7
    subsample=0.7,           # reduced from 0.8
    colsample_bytree=0.7,    # reduced from 0.8
    min_child_weight=10,     # increased from 3
    reg_alpha=1.0,           # increased from 0.1
    reg_lambda=5.0,          # increased from 1.0
    gamma=0.5,               # added: min split loss
    tree_method='hist',
    random_state=42,
)

X_h = completed[HOURS_FEATURES]
y_h = completed['actual_hours']
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_h, y_h, test_size=0.2, random_state=42
)

# Early stopping picks the number of trees by watching an evaluation set.
# Watching the test split would tune the model on the very rows used to
# report test metrics, so a separate validation slice is carved out of the
# training partition and the test split stays untouched until scoring.
X_fit_h, X_val_h, y_fit_h, y_val_h = train_test_split(
    X_train_h, y_train_h, test_size=0.2, random_state=42
)

hours_model = xgb.XGBRegressor(
    **HOURS_XGB_PARAMS,
    objective='reg:squarederror',
    early_stopping_rounds=30,
)
hours_model.fit(X_fit_h, y_fit_h, eval_set=[(X_val_h, y_val_h)], verbose=False)

# Predict once per split and reuse for both metrics.
fit_pred_h = hours_model.predict(X_fit_h)
test_pred_h = hours_model.predict(X_test_h)

train_r2 = r2_score(y_fit_h, fit_pred_h)
test_r2 = r2_score(y_test_h, test_pred_h)
train_mae = mean_absolute_error(y_fit_h, fit_pred_h)
test_mae = mean_absolute_error(y_test_h, test_pred_h)

# Cross-validation uses a model without early stopping: cross_val_score does
# not supply an eval_set, so every fold trains the full n_estimators.
cv_model_h = xgb.XGBRegressor(**HOURS_XGB_PARAMS)
cv = cross_val_score(cv_model_h, X_h, y_h, cv=5, scoring='r2')

print(f" Train R²: {train_r2:.4f} Test R²: {test_r2:.4f} Gap: {train_r2-test_r2:.4f}")
print(f" Train MAE: {train_mae:.2f} Test MAE: {test_mae:.2f}")
print(f" CV R²: {cv.mean():.4f} ± {cv.std():.4f}")

all_metrics['hours_estimation'] = {
    'train_r2': round(train_r2, 4), 'test_r2': round(test_r2, 4),
    'train_mae': round(train_mae, 2), 'test_mae': round(test_mae, 2),
    'gap': round(train_r2 - test_r2, 4),
    'cv_r2_mean': round(cv.mean(), 4), 'cv_r2_std': round(cv.std(), 4),
}
|
| 107 |
+
|
| 108 |
+
# =====================================================================
# MODEL 2: Complexity Classifier — REGULARIZED
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 2: Complexity Classifier (regularized)")
print("=" * 60)

COMPLEXITY_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'type_node_interaction', 'complexity_score', 'size_priority_interaction',
]

# Hyper-parameters shared by the final XGBoost model and the CV model.
COMPLEXITY_XGB_PARAMS = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,             # reduced from 6
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,     # increased
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    tree_method='hist',
    random_state=42,
)

X_c = completed[COMPLEXITY_FEATURES]
y_c = completed['complexity_encoded']
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c, y_c, test_size=0.2, random_state=42, stratify=y_c
)

# Early stopping must not monitor the test split (that would select the tree
# count on the rows used for the reported test accuracy). A stratified
# validation slice of the training partition is used instead.
X_fit_c, X_val_c, y_fit_c, y_val_c = train_test_split(
    X_train_c, y_train_c, test_size=0.2, random_state=42, stratify=y_train_c
)

xgb_clf = xgb.XGBClassifier(
    **COMPLEXITY_XGB_PARAMS,
    objective='multi:softprob',
    num_class=3,
    early_stopping_rounds=30,
)
xgb_clf.fit(X_fit_c, y_fit_c, eval_set=[(X_val_c, y_val_c)], verbose=False)

lgb_clf = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=15,           # reduced from 63
    max_depth=4,
    subsample=0.7,
    subsample_freq=1,        # LightGBM ignores `subsample` unless the bagging frequency is non-zero
    colsample_bytree=0.7,
    min_child_samples=20,    # increased
    reg_alpha=1.0,
    reg_lambda=5.0,
    random_state=42,
    verbose=-1,
)
# Fit on the same rows as the XGBoost model so the two are comparable.
lgb_clf.fit(X_fit_c, y_fit_c)

train_xgb = accuracy_score(y_fit_c, xgb_clf.predict(X_fit_c))
test_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
train_lgb = accuracy_score(y_fit_c, lgb_clf.predict(X_fit_c))
test_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))

# Ensemble: average the two models' class probabilities and take the argmax.
xgb_p = xgb_clf.predict_proba(X_test_c)
lgb_p = lgb_clf.predict_proba(X_test_c)
ens_p = (xgb_p + lgb_p) / 2
y_pred_ens = np.argmax(ens_p, axis=1)
ens_acc = accuracy_score(y_test_c, y_pred_ens)
ens_f1 = f1_score(y_test_c, y_pred_ens, average='weighted')

# CV model has no early stopping because cross_val_score passes no eval_set.
cv_model_c = xgb.XGBClassifier(**COMPLEXITY_XGB_PARAMS)
cv_c = cross_val_score(cv_model_c, X_c, y_c, cv=5, scoring='accuracy')

print(f" XGB Train: {train_xgb:.4f} Test: {test_xgb:.4f} Gap: {train_xgb-test_xgb:.4f}")
print(f" LGB Train: {train_lgb:.4f} Test: {test_lgb:.4f} Gap: {train_lgb-test_lgb:.4f}")
print(f" Ensemble Test Acc: {ens_acc:.4f} F1: {ens_f1:.4f}")
print(f" CV Acc: {cv_c.mean():.4f} ± {cv_c.std():.4f}")

all_metrics['complexity_classification'] = {
    'xgb_train': round(train_xgb, 4), 'xgb_test': round(test_xgb, 4), 'xgb_gap': round(train_xgb-test_xgb, 4),
    'lgb_train': round(train_lgb, 4), 'lgb_test': round(test_lgb, 4), 'lgb_gap': round(train_lgb-test_lgb, 4),
    'ensemble_accuracy': round(ens_acc, 4), 'ensemble_f1': round(ens_f1, 4),
    'cv_accuracy_mean': round(cv_c.mean(), 4), 'cv_accuracy_std': round(cv_c.std(), 4),
}
|
| 190 |
+
|
| 191 |
+
# =====================================================================
# MODEL 3: Bottleneck — LEAKAGE-FREE
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 3: Bottleneck Predictor (leakage-free)")
print("=" * 60)

SAFE_BOTTLENECK_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'hours_logged',
    'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
    'current_stage_idx', 'engineer_skill_factor', 'complexity_score',
    'hours_budget_pct', 'stage_velocity',
]

# Unlike models 1 and 2, this model is trained on every row of `df`, not only
# the completed blocks.
X_b = df[SAFE_BOTTLENECK_FEATURES]
y_b = df['bottleneck_encoded']
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.2, random_state=42, stratify=y_b
)

base_bn = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    objective='multi:softprob',
    num_class=3,
    tree_method='hist',
    random_state=42,
)
# Wrap the booster in 3-fold isotonic probability calibration.
bn_model = CalibratedClassifierCV(base_bn, cv=3, method='isotonic')
bn_model.fit(X_train_b, y_train_b)

# Predict the test split once and reuse it for accuracy, F1 and the report.
test_pred_b = bn_model.predict(X_test_b)

train_bn = accuracy_score(y_train_b, bn_model.predict(X_train_b))
test_bn = accuracy_score(y_test_b, test_pred_b)
test_f1_bn = f1_score(y_test_b, test_pred_b, average='weighted')
# NOTE: cross-validation scores the uncalibrated base estimator, whereas the
# train/test figures above come from the calibrated wrapper.
cv_bn = cross_val_score(base_bn, X_b, y_b, cv=5, scoring='accuracy')

print(f" Train Acc: {train_bn:.4f} Test Acc: {test_bn:.4f} Gap: {train_bn-test_bn:.4f}")
print(f" F1 (weighted): {test_f1_bn:.4f}")
print(f" CV Acc: {cv_bn.mean():.4f} ± {cv_bn.std():.4f}")
print(f"\n Classification Report:")
print(classification_report(y_test_b, test_pred_b,
                            target_names=bottleneck_encoder.classes_))

all_metrics['bottleneck_prediction'] = {
    'train_accuracy': round(train_bn, 4), 'test_accuracy': round(test_bn, 4),
    'gap': round(train_bn - test_bn, 4),
    'f1_weighted': round(test_f1_bn, 4),
    'cv_accuracy_mean': round(cv_bn.mean(), 4), 'cv_accuracy_std': round(cv_bn.std(), 4),
    'features_used': 'SAFE (no leaky features)',
}
|
| 248 |
+
|
| 249 |
+
# =====================================================================
# MODEL 4: Completion — GROUP-AWARE SPLIT
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 4: Completion Predictor (group-aware split)")
print("=" * 60)

COMPLETION_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
    'drc_iterations', 'current_stage_idx', 'cumulative_hours',
    'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
    'hours_vs_estimate_ratio', 'stages_completed',
    'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far',
]

# Per-block columns copied unchanged onto every per-stage sample.
_STATIC_COMPLETION_COLUMNS = (
    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
    'drc_iterations',
)

# Hyper-parameters shared by the final model and the GroupKFold CV model.
COMPLETION_XGB_PARAMS = dict(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=5,             # reduced from 8
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    tree_method='hist',
    random_state=42,
)


def _build_completion_samples(blocks):
    """Expand each block into one training sample per stage transition.

    For every row, the JSON list in its 'transitions' column is walked in
    order while hours, days, DRC violations and LVS mismatches are
    accumulated. Each transition after the first yields one sample whose
    target, 'remaining_hours', is the block's 'actual_hours' minus the hours
    accumulated so far (floored at 0). The first entry (index 0) is skipped
    and contributes nothing to the running totals.

    Args:
        blocks: DataFrame with 'block_id', 'transitions', 'actual_hours' and
            the columns listed in ``_STATIC_COMPLETION_COLUMNS``.

    Returns:
        Tuple ``(samples, skipped)``: the list of sample dicts and the number
        of rows dropped because 'transitions' was missing or not valid JSON.
    """
    samples = []
    skipped = 0
    for _, row in blocks.iterrows():
        try:
            transitions = json.loads(row['transitions'])
        except (TypeError, ValueError):
            # TypeError: value is not a string (e.g. NaN for a missing cell).
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            skipped += 1
            continue

        static = {name: row[name] for name in _STATIC_COMPLETION_COLUMNS}
        total_actual_hours = row['actual_hours']
        estimate = max(row['estimated_hours'], 1)  # avoid dividing by zero
        cumulative_hours = 0
        cumulative_days = 0
        cumulative_drc = 0
        cumulative_lvs = 0

        for i, t in enumerate(transitions):
            if i == 0:
                continue
            cumulative_hours += t.get('hours_in_stage', 0)
            cumulative_days += t.get('days_in_stage', 0)
            cumulative_drc += t.get('drc_violations', 0)
            cumulative_lvs += t.get('lvs_mismatches', 0)
            samples.append({
                'block_id': row['block_id'],
                **static,
                'current_stage_idx': i,
                'cumulative_hours': cumulative_hours,
                'cumulative_days': cumulative_days,
                'cumulative_drc_violations': cumulative_drc,
                'cumulative_lvs_mismatches': cumulative_lvs,
                'hours_vs_estimate_ratio': cumulative_hours / estimate,
                'stages_completed': i,
                # i >= 1 here, so it is a safe divisor.
                'avg_hours_per_stage_so_far': cumulative_hours / i,
                'avg_days_per_stage_so_far': cumulative_days / i,
                'remaining_hours': max(0, total_actual_hours - cumulative_hours),
            })
    return samples, skipped


# Build samples with block_id
training_samples, skipped_blocks = _build_completion_samples(completed)
if skipped_blocks:
    print(f" Skipped {skipped_blocks} blocks with missing or invalid transitions")
if not training_samples:
    raise ValueError(
        "No completion training samples could be built; "
        "check the 'transitions' column of the completed blocks."
    )

train_df = pd.DataFrame(training_samples)

# Group-aware split: all samples of one block land in the same partition, so
# the model is never scored on a block it has already seen at another stage.
# Blocks are shuffled once and cut 64% fit / 16% validation / 20% test.
unique_blocks = train_df['block_id'].unique()
rng = np.random.RandomState(42)
rng.shuffle(unique_blocks)
split_idx = int(len(unique_blocks) * 0.8)
val_idx = int(split_idx * 0.8)
train_blocks = set(unique_blocks[:val_idx])
val_blocks = set(unique_blocks[val_idx:split_idx])
test_blocks = set(unique_blocks[split_idx:])

train_mask = train_df['block_id'].isin(train_blocks)
val_mask = train_df['block_id'].isin(val_blocks)
test_mask = train_df['block_id'].isin(test_blocks)

X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
y_train_g = train_df.loc[train_mask, 'remaining_hours']
X_val_g = train_df.loc[val_mask, COMPLETION_FEATURES]
y_val_g = train_df.loc[val_mask, 'remaining_hours']
X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
y_test_g = train_df.loc[test_mask, 'remaining_hours']

completion_model = xgb.XGBRegressor(
    **COMPLETION_XGB_PARAMS,
    objective='reg:squarederror',
    early_stopping_rounds=30,
)
# Early stopping watches the validation blocks, never the test blocks, so the
# reported test metrics stay independent of model selection.
completion_model.fit(X_train_g, y_train_g, eval_set=[(X_val_g, y_val_g)], verbose=False)

train_pred_g = completion_model.predict(X_train_g)
test_pred_g = completion_model.predict(X_test_g)
train_r2_g = r2_score(y_train_g, train_pred_g)
test_r2_g = r2_score(y_test_g, test_pred_g)
train_mae_g = mean_absolute_error(y_train_g, train_pred_g)
test_mae_g = mean_absolute_error(y_test_g, test_pred_g)

# GroupKFold CV over all samples, grouped by block.
groups = train_df['block_id'].values
gkf = GroupKFold(n_splits=5)
cv_model = xgb.XGBRegressor(**COMPLETION_XGB_PARAMS)
cv_scores_g = cross_val_score(cv_model, train_df[COMPLETION_FEATURES],
                              train_df['remaining_hours'], cv=gkf, groups=groups, scoring='r2')

print(f" Train samples: {len(X_train_g)} from {len(train_blocks)} blocks")
print(f" Validation samples: {len(X_val_g)} from {len(val_blocks)} blocks")
print(f" Test samples: {len(X_test_g)} from {len(test_blocks)} blocks")
print(f" Train R²: {train_r2_g:.4f} Test R²: {test_r2_g:.4f} Gap: {train_r2_g-test_r2_g:.4f}")
print(f" Train MAE: {train_mae_g:.2f} Test MAE: {test_mae_g:.2f}")
print(f" GroupKFold CV R²: {cv_scores_g.mean():.4f} ± {cv_scores_g.std():.4f}")

all_metrics['completion_prediction'] = {
    'train_r2': round(train_r2_g, 4), 'test_r2': round(test_r2_g, 4),
    'gap': round(train_r2_g - test_r2_g, 4),
    'train_mae': round(train_mae_g, 2), 'test_mae': round(test_mae_g, 2),
    'group_cv_r2_mean': round(cv_scores_g.mean(), 4),
    'group_cv_r2_std': round(cv_scores_g.std(), 4),
    'split_type': 'group-aware (block-level)',
}
|
| 375 |
+
|
| 376 |
+
# =====================================================================
# SAVE ALL v2 MODELS
# =====================================================================
print("\n" + "=" * 60)
print("SAVING v2 MODELS")
print("=" * 60)

MODEL_DIR = '/app/models_v2'
os.makedirs(MODEL_DIR, exist_ok=True)

# Models and encoders, keyed by the file stem they are written under.
artifacts = {
    'hours_estimator': hours_model,
    'complexity_xgb': xgb_clf,
    'complexity_lgb': lgb_clf,
    'bottleneck_predictor': bn_model,
    'completion_predictor': completion_model,
    # Encoders
    'tech_node_encoder': tech_node_encoder,
    'block_type_encoder': block_type_encoder,
    'priority_encoder': priority_encoder,
    'complexity_encoder': complexity_encoder,
    'bottleneck_encoder': bottleneck_encoder,
}
for stem, obj in artifacts.items():
    joblib.dump(obj, f'{MODEL_DIR}/{stem}.joblib')

# Feature config. `.tolist()` converts NumPy scalars to native Python values;
# plain `list(...)` would keep e.g. numpy.int64 entries, which json.dump
# rejects if a categorical column happens to be numeric.
feature_config = {
    'hours_features': HOURS_FEATURES,
    'complexity_features': COMPLEXITY_FEATURES,
    'bottleneck_features': SAFE_BOTTLENECK_FEATURES,
    'completion_features': COMPLETION_FEATURES,
    'tech_nodes': tech_node_encoder.classes_.tolist(),
    'block_types': block_type_encoder.classes_.tolist(),
    # Read the order back from the fitted encoder instead of repeating the
    # literal, so the saved config always matches what was used for encoding.
    'priorities': priority_encoder.categories_[0].tolist(),
    'complexity_classes': complexity_encoder.classes_.tolist(),
    'bottleneck_classes': bottleneck_encoder.classes_.tolist(),
}
with open(f'{MODEL_DIR}/feature_config.json', 'w', encoding='utf-8') as f:
    json.dump(feature_config, f, indent=2)

# Metrics
all_metrics['training_data'] = {
    'total_samples': len(df),
    'completed_blocks': int(df['is_completed'].sum()),
    'in_progress_blocks': int((~df['is_completed'].astype(bool)).sum()),
    'completion_train_samples': len(X_train_g),
}
with open(f'{MODEL_DIR}/metrics.json', 'w', encoding='utf-8') as f:
    json.dump(all_metrics, f, indent=2)

print(f"All v2 models saved to {MODEL_DIR}/")
|
| 424 |
+
|
| 425 |
+
# Final summary: side-by-side table of the hard-coded v1 results against the
# v2 metrics computed above.
print("\n" + "=" * 60)
print("v1 vs v2 COMPARISON")
print("=" * 60)

summary_header = ("Model", "v1 (overfit)", "v2 (fixed)")
summary_rows = [
    ("Hours Estimator", "R²=0.881 (gap 0.113)",
     f"R²={test_r2:.3f} (gap {train_r2-test_r2:.3f})"),
    ("Complexity Classifier", "Acc=92.3% (gap 5.9%)",
     f"Acc={test_xgb*100:.1f}% (gap {(train_xgb-test_xgb)*100:.1f}%)"),
    ("Bottleneck Predictor", "99.6% (DATA LEAKAGE)",
     f"{test_bn*100:.1f}% (honest)"),
    ("Completion Predictor", "R²=0.945 (GROUP LEAK)",
     f"R²={test_r2_g:.3f} (grouped)"),
]

# Column widths come from the rendered cells, so the borders stay aligned
# whatever width the formatted v2 numbers end up with.
summary_widths = [
    max(len(cells[col]) for cells in (summary_header, *summary_rows))
    for col in range(len(summary_header))
]


def _summary_rule(left, joint, right):
    """Return a horizontal border line using the given corner/joint glyphs."""
    return left + joint.join("─" * (w + 2) for w in summary_widths) + right


def _summary_line(cells):
    """Return one table row with every cell left-aligned to its column width."""
    padded = (cell.ljust(w) for cell, w in zip(cells, summary_widths))
    return "│ " + " │ ".join(padded) + " │"


summary_lines = [
    _summary_rule("┌", "┬", "┐"),
    _summary_line(summary_header),
    _summary_rule("├", "┼", "┤"),
    *(_summary_line(cells) for cells in summary_rows),
    _summary_rule("└", "┴", "┘"),
]
# Blank line before and after the table.
print("\n" + "\n".join(summary_lines) + "\n")
|