"""
ALWAS ML Models v2 - retrained with overfitting fixes:
1. Hours: stronger regularization (lower depth, higher min_child_weight)
2. Complexity: reduced tree depth + stronger L1/L2
3. Bottleneck: removed leaky features
4. Completion: group-aware split
"""
import numpy as np
import pandas as pd
import json
import joblib
import os
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import (
mean_absolute_error, mean_squared_error, r2_score,
classification_report, accuracy_score, f1_score
)
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
import lightgbm as lgb
# Load the block-level dataset
df = pd.read_csv('/app/alwas_blocks_dataset.csv')
# Encode categorical features
tech_node_encoder = LabelEncoder()
block_type_encoder = LabelEncoder()
priority_encoder = OrdinalEncoder(categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']])
df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node'])
df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type'])
df['priority_encoded'] = priority_encoder.fit_transform(df[['priority']]).astype(int).flatten()
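# With the explicit category order above, OrdinalEncoder maps
# P4-Low -> 0, P3-Medium -> 1, P2-High -> 2, P1-Critical -> 3,
# so larger encoded values mean higher priority.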
df['type_node_interaction'] = df['tech_node_encoded'] * 10 + df['block_type_encoded']
df['complexity_score'] = df['constraint_complexity'] * df['transistor_count_log']
df['size_priority_interaction'] = df['transistor_count_log'] * df['priority_numeric']
complexity_encoder = LabelEncoder()
df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity'])
bottleneck_encoder = LabelEncoder()
df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk'])
# Safe derived features for bottleneck
df['hours_budget_pct'] = df['hours_logged'] / df['estimated_hours'].clip(lower=1) * 100
df['stage_velocity'] = df['hours_logged'] / df['current_stage_idx'].clip(lower=1)
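# Sanity check (added sketch): the clip(lower=1) guards above should keep both
# ratios finite; warn rather than fail if the source CSV ever violates that.
if not np.isfinite(df[['hours_budget_pct', 'stage_velocity']].to_numpy()).all():
    print("WARNING: non-finite values in derived bottleneck features")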
# Only completed blocks carry ground-truth outcomes (actual_hours, final complexity)
completed = df[df['is_completed'] == 1].copy()
all_metrics = {}
# =====================================================================
# MODEL 1: Hours Estimator - REGULARIZED
# =====================================================================
print("=" * 60)
print("MODEL 1: Hours Estimator (regularized)")
print("=" * 60)
HOURS_FEATURES = [
'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
'transistor_count', 'transistor_count_log', 'has_dependencies',
'num_dependencies', 'constraint_complexity', 'drc_iterations',
'engineer_skill_factor', 'type_node_interaction', 'complexity_score',
'size_priority_interaction'
]
X_h = completed[HOURS_FEATURES]
y_h = completed['actual_hours']
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, random_state=42)
hours_model = xgb.XGBRegressor(
n_estimators=300, # reduced from 500
learning_rate=0.05,
max_depth=4, # reduced from 7
subsample=0.7, # reduced from 0.8
colsample_bytree=0.7, # reduced from 0.8
min_child_weight=10, # increased from 3
reg_alpha=1.0, # increased from 0.1
reg_lambda=5.0, # increased from 1.0
gamma=0.5, # added: min split loss
objective='reg:squarederror',
tree_method='hist',
random_state=42,
early_stopping_rounds=30,
)
hours_model.fit(X_train_h, y_train_h, eval_set=[(X_test_h, y_test_h)], verbose=False)
train_r2 = r2_score(y_train_h, hours_model.predict(X_train_h))
test_r2 = r2_score(y_test_h, hours_model.predict(X_test_h))
train_mae = mean_absolute_error(y_train_h, hours_model.predict(X_train_h))
test_mae = mean_absolute_error(y_test_h, hours_model.predict(X_test_h))
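# A clone without early_stopping_rounds for cross-validation, since
# cross_val_score does not supply a per-fold eval_set.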
cv_model_h = xgb.XGBRegressor(
n_estimators=300, learning_rate=0.05, max_depth=4, subsample=0.7,
colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
gamma=0.5, tree_method='hist', random_state=42,
)
cv = cross_val_score(cv_model_h, X_h, y_h, cv=5, scoring='r2')
print(f" Train RΒ²: {train_r2:.4f} Test RΒ²: {test_r2:.4f} Gap: {train_r2-test_r2:.4f}")
print(f" Train MAE: {train_mae:.2f} Test MAE: {test_mae:.2f}")
print(f" CV RΒ²: {cv.mean():.4f} Β± {cv.std():.4f}")
all_metrics['hours_estimation'] = {
'train_r2': round(train_r2, 4), 'test_r2': round(test_r2, 4),
'train_mae': round(train_mae, 2), 'test_mae': round(test_mae, 2),
'gap': round(train_r2 - test_r2, 4),
'cv_r2_mean': round(cv.mean(), 4), 'cv_r2_std': round(cv.std(), 4),
}
# =====================================================================
# MODEL 2: Complexity Classifier - REGULARIZED
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 2: Complexity Classifier (regularized)")
print("=" * 60)
COMPLEXITY_FEATURES = [
'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
'transistor_count', 'transistor_count_log', 'has_dependencies',
'num_dependencies', 'constraint_complexity', 'drc_iterations',
'type_node_interaction', 'complexity_score', 'size_priority_interaction'
]
X_c = completed[COMPLEXITY_FEATURES]
y_c = completed['complexity_encoded']
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42, stratify=y_c)
xgb_clf = xgb.XGBClassifier(
n_estimators=200,
learning_rate=0.05,
max_depth=4, # reduced from 6
subsample=0.7,
colsample_bytree=0.7,
min_child_weight=10, # increased
reg_alpha=1.0,
reg_lambda=5.0,
gamma=0.5,
objective='multi:softprob',
num_class=3,
tree_method='hist',
random_state=42,
early_stopping_rounds=30,
)
xgb_clf.fit(X_train_c, y_train_c, eval_set=[(X_test_c, y_test_c)], verbose=False)
lgb_clf = lgb.LGBMClassifier(
n_estimators=200,
learning_rate=0.05,
num_leaves=15, # reduced from 63
max_depth=4,
subsample=0.7,
colsample_bytree=0.7,
min_child_samples=20, # increased
reg_alpha=1.0,
reg_lambda=5.0,
random_state=42,
verbose=-1,
)
lgb_clf.fit(X_train_c, y_train_c)
train_xgb = accuracy_score(y_train_c, xgb_clf.predict(X_train_c))
test_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
train_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c))
test_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))
# Soft-voting ensemble: average the two models' class probabilities, then argmax
xgb_p = xgb_clf.predict_proba(X_test_c)
lgb_p = lgb_clf.predict_proba(X_test_c)
ens_p = (xgb_p + lgb_p) / 2
y_pred_ens = np.argmax(ens_p, axis=1)
ens_acc = accuracy_score(y_test_c, y_pred_ens)
ens_f1 = f1_score(y_test_c, y_pred_ens, average='weighted')
cv_model_c = xgb.XGBClassifier(
n_estimators=200, learning_rate=0.05, max_depth=4, subsample=0.7,
colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
gamma=0.5, tree_method='hist', random_state=42,
)
cv_c = cross_val_score(cv_model_c, X_c, y_c, cv=5, scoring='accuracy')
print(f" XGB Train: {train_xgb:.4f} Test: {test_xgb:.4f} Gap: {train_xgb-test_xgb:.4f}")
print(f" LGB Train: {train_lgb:.4f} Test: {test_lgb:.4f} Gap: {train_lgb-test_lgb:.4f}")
print(f" Ensemble Test Acc: {ens_acc:.4f} F1: {ens_f1:.4f}")
print(f" CV Acc: {cv_c.mean():.4f} Β± {cv_c.std():.4f}")
all_metrics['complexity_classification'] = {
'xgb_train': round(train_xgb, 4), 'xgb_test': round(test_xgb, 4), 'xgb_gap': round(train_xgb-test_xgb, 4),
'lgb_train': round(train_lgb, 4), 'lgb_test': round(test_lgb, 4), 'lgb_gap': round(train_lgb-test_lgb, 4),
'ensemble_accuracy': round(ens_acc, 4), 'ensemble_f1': round(ens_f1, 4),
'cv_accuracy_mean': round(cv_c.mean(), 4), 'cv_accuracy_std': round(cv_c.std(), 4),
}
# =====================================================================
# MODEL 3: Bottleneck - LEAKAGE-FREE
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 3: Bottleneck Predictor (leakage-free)")
print("=" * 60)
SAFE_BOTTLENECK_FEATURES = [
'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
'transistor_count_log', 'has_dependencies', 'num_dependencies',
'constraint_complexity', 'estimated_hours', 'hours_logged',
'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
'current_stage_idx', 'engineer_skill_factor', 'complexity_score',
'hours_budget_pct', 'stage_velocity'
]
X_b = df[SAFE_BOTTLENECK_FEATURES]
y_b = df['bottleneck_encoded']
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, y_b, test_size=0.2, random_state=42, stratify=y_b)
base_bn = xgb.XGBClassifier(
n_estimators=300,
learning_rate=0.05,
max_depth=4,
subsample=0.7,
colsample_bytree=0.7,
min_child_weight=10,
reg_alpha=1.0,
reg_lambda=5.0,
gamma=0.5,
objective='multi:softprob',
num_class=3,
tree_method='hist',
random_state=42,
)
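# Wrap the base model in isotonic calibration over 3 internal folds so that
# bn_model.predict_proba returns calibrated class probabilities.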
bn_model = CalibratedClassifierCV(base_bn, cv=3, method='isotonic')
bn_model.fit(X_train_b, y_train_b)
y_pred_bn = bn_model.predict(X_test_b)
train_bn = accuracy_score(y_train_b, bn_model.predict(X_train_b))
test_bn = accuracy_score(y_test_b, y_pred_bn)
test_f1_bn = f1_score(y_test_b, y_pred_bn, average='weighted')
cv_bn = cross_val_score(base_bn, X_b, y_b, cv=5, scoring='accuracy')
print(f" Train Acc: {train_bn:.4f} Test Acc: {test_bn:.4f} Gap: {train_bn-test_bn:.4f}")
print(f" F1 (weighted): {test_f1_bn:.4f}")
print(f" CV Acc: {cv_bn.mean():.4f} Β± {cv_bn.std():.4f}")
print(f"\n Classification Report:")
print(classification_report(y_test_b, bn_model.predict(X_test_b),
target_names=bottleneck_encoder.classes_))
all_metrics['bottleneck_prediction'] = {
'train_accuracy': round(train_bn, 4), 'test_accuracy': round(test_bn, 4),
'gap': round(train_bn - test_bn, 4),
'f1_weighted': round(test_f1_bn, 4),
'cv_accuracy_mean': round(cv_bn.mean(), 4), 'cv_accuracy_std': round(cv_bn.std(), 4),
'features_used': 'SAFE (no leaky features)',
}
# =====================================================================
# MODEL 4: Completion - GROUP-AWARE SPLIT
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 4: Completion Predictor (group-aware split)")
print("=" * 60)
COMPLETION_FEATURES = [
'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
'transistor_count_log', 'has_dependencies', 'num_dependencies',
'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
'drc_iterations', 'current_stage_idx', 'cumulative_hours',
'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
'hours_vs_estimate_ratio', 'stages_completed',
'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far'
]
# Build one training sample per completed stage transition, tagged with
# block_id so the split below can keep whole blocks together
training_samples = []
for _, row in completed.iterrows():
try:
transitions = json.loads(row['transitions'])
    except (TypeError, ValueError):  # missing or malformed transitions JSON
continue
total_actual_hours = row['actual_hours']
cumulative_hours = 0
cumulative_days = 0
cumulative_drc = 0
cumulative_lvs = 0
for i, t in enumerate(transitions):
        if i == 0:
            # The first transition marks entry into the initial stage before any
            # effort accrues (assumption inferred from the accumulation below).
            continue
cumulative_hours += t.get('hours_in_stage', 0)
cumulative_days += t.get('days_in_stage', 0)
cumulative_drc += t.get('drc_violations', 0)
cumulative_lvs += t.get('lvs_mismatches', 0)
remaining = max(0, total_actual_hours - cumulative_hours)
training_samples.append({
'block_id': row['block_id'],
'tech_node_encoded': row.get('tech_node_encoded', 0),
'block_type_encoded': row.get('block_type_encoded', 0),
'priority_numeric': row['priority_numeric'],
'transistor_count_log': row['transistor_count_log'],
'has_dependencies': row['has_dependencies'],
'num_dependencies': row['num_dependencies'],
'constraint_complexity': row['constraint_complexity'],
'estimated_hours': row['estimated_hours'],
'engineer_skill_factor': row['engineer_skill_factor'],
'drc_iterations': row['drc_iterations'],
'current_stage_idx': i,
'cumulative_hours': cumulative_hours,
'cumulative_days': cumulative_days,
'cumulative_drc_violations': cumulative_drc,
'cumulative_lvs_mismatches': cumulative_lvs,
'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
'stages_completed': i,
'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
'remaining_hours': remaining,
})
train_df = pd.DataFrame(training_samples)
# Group-aware split
unique_blocks = train_df['block_id'].unique()
rng = np.random.RandomState(42)
rng.shuffle(unique_blocks)
split_idx = int(len(unique_blocks) * 0.8)
train_blocks = set(unique_blocks[:split_idx])
test_blocks = set(unique_blocks[split_idx:])
train_mask = train_df['block_id'].isin(train_blocks)
test_mask = train_df['block_id'].isin(test_blocks)
X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
y_train_g = train_df.loc[train_mask, 'remaining_hours']
X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
y_test_g = train_df.loc[test_mask, 'remaining_hours']
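# Sanity check (added sketch): a group-aware split means no block_id may land
# in both partitions - the leak the v1 row-level split allowed.
assert train_blocks.isdisjoint(test_blocks), "block_id leaked across train/test split"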
completion_model = xgb.XGBRegressor(
n_estimators=500,
learning_rate=0.03,
max_depth=5, # reduced from 8
subsample=0.7,
colsample_bytree=0.7,
min_child_weight=10,
reg_alpha=1.0,
reg_lambda=5.0,
gamma=0.5,
objective='reg:squarederror',
tree_method='hist',
random_state=42,
early_stopping_rounds=30,
)
completion_model.fit(X_train_g, y_train_g, eval_set=[(X_test_g, y_test_g)], verbose=False)
train_r2_g = r2_score(y_train_g, completion_model.predict(X_train_g))
test_r2_g = r2_score(y_test_g, completion_model.predict(X_test_g))
train_mae_g = mean_absolute_error(y_train_g, completion_model.predict(X_train_g))
test_mae_g = mean_absolute_error(y_test_g, completion_model.predict(X_test_g))
# GroupKFold CV
groups = train_df['block_id'].values
gkf = GroupKFold(n_splits=5)
cv_model = xgb.XGBRegressor(
n_estimators=500, learning_rate=0.03, max_depth=5, subsample=0.7,
colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
gamma=0.5, tree_method='hist', random_state=42
)
cv_scores_g = cross_val_score(cv_model, train_df[COMPLETION_FEATURES],
train_df['remaining_hours'], cv=gkf, groups=groups, scoring='r2')
print(f" Train samples: {len(X_train_g)} from {len(train_blocks)} blocks")
print(f" Test samples: {len(X_test_g)} from {len(test_blocks)} blocks")
print(f" Train RΒ²: {train_r2_g:.4f} Test RΒ²: {test_r2_g:.4f} Gap: {train_r2_g-test_r2_g:.4f}")
print(f" Train MAE: {train_mae_g:.2f} Test MAE: {test_mae_g:.2f}")
print(f" GroupKFold CV RΒ²: {cv_scores_g.mean():.4f} Β± {cv_scores_g.std():.4f}")
all_metrics['completion_prediction'] = {
'train_r2': round(train_r2_g, 4), 'test_r2': round(test_r2_g, 4),
'gap': round(train_r2_g - test_r2_g, 4),
'train_mae': round(train_mae_g, 2), 'test_mae': round(test_mae_g, 2),
'group_cv_r2_mean': round(cv_scores_g.mean(), 4),
'group_cv_r2_std': round(cv_scores_g.std(), 4),
'split_type': 'group-aware (block-level)',
}
# =====================================================================
# SAVE ALL v2 MODELS
# =====================================================================
print("\n" + "=" * 60)
print("SAVING v2 MODELS")
print("=" * 60)
os.makedirs('/app/models_v2', exist_ok=True)
joblib.dump(hours_model, '/app/models_v2/hours_estimator.joblib')
joblib.dump(xgb_clf, '/app/models_v2/complexity_xgb.joblib')
joblib.dump(lgb_clf, '/app/models_v2/complexity_lgb.joblib')
joblib.dump(bn_model, '/app/models_v2/bottleneck_predictor.joblib')
joblib.dump(completion_model, '/app/models_v2/completion_predictor.joblib')
# Encoders
joblib.dump(tech_node_encoder, '/app/models_v2/tech_node_encoder.joblib')
joblib.dump(block_type_encoder, '/app/models_v2/block_type_encoder.joblib')
joblib.dump(priority_encoder, '/app/models_v2/priority_encoder.joblib')
joblib.dump(complexity_encoder, '/app/models_v2/complexity_encoder.joblib')
joblib.dump(bottleneck_encoder, '/app/models_v2/bottleneck_encoder.joblib')
# Feature config
feature_config = {
'hours_features': HOURS_FEATURES,
'complexity_features': COMPLEXITY_FEATURES,
'bottleneck_features': SAFE_BOTTLENECK_FEATURES,
'completion_features': COMPLETION_FEATURES,
'tech_nodes': list(tech_node_encoder.classes_),
'block_types': list(block_type_encoder.classes_),
'priorities': ['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical'],
'complexity_classes': list(complexity_encoder.classes_),
'bottleneck_classes': list(bottleneck_encoder.classes_),
}
with open('/app/models_v2/feature_config.json', 'w') as f:
json.dump(feature_config, f, indent=2)
# Metrics
all_metrics['training_data'] = {
'total_samples': len(df),
'completed_blocks': int(df['is_completed'].sum()),
'in_progress_blocks': int((~df['is_completed'].astype(bool)).sum()),
'completion_train_samples': len(X_train_g),
}
with open('/app/models_v2/metrics.json', 'w') as f:
json.dump(all_metrics, f, indent=2)
print("All v2 models saved to /app/models_v2/")
# Final summary
print("\n" + "=" * 60)
print("v1 vs v2 COMPARISON")
print("=" * 60)
print(f"""
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ Model β”‚ v1 (overfit) β”‚ v2 (fixed) β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ Hours Estimator β”‚ RΒ²=0.881 (gap 0.113) β”‚ RΒ²={test_r2:.3f} (gap {train_r2-test_r2:.3f}) β”‚
β”‚ Complexity Classifier β”‚ Acc=92.3% (gap 5.9%) β”‚ Acc={test_xgb*100:.1f}% (gap {(train_xgb-test_xgb)*100:.1f}%) β”‚
β”‚ Bottleneck Predictor β”‚ 99.6% (DATA LEAKAGE) β”‚ {test_bn*100:.1f}% (honest) β”‚
β”‚ Completion Predictor β”‚ RΒ²=0.945 (GROUP LEAK) β”‚ RΒ²={test_r2_g:.3f} (grouped) β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
""")