""" ALWAS ML Models v2 — Retrained with overfitting fixes: 1. Hours: stronger regularization (lower depth, higher min_child_weight) 2. Complexity: reduced tree depth + stronger L1/L2 3. Bottleneck: removed leaky features 4. Completion: group-aware split """ import numpy as np import pandas as pd import json import joblib import os from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold from sklearn.preprocessing import LabelEncoder, OrdinalEncoder from sklearn.metrics import ( mean_absolute_error, mean_squared_error, r2_score, classification_report, accuracy_score, f1_score ) from sklearn.calibration import CalibratedClassifierCV import xgboost as xgb import lightgbm as lgb df = pd.read_csv('/app/alwas_blocks_dataset.csv') # Encode tech_node_encoder = LabelEncoder() block_type_encoder = LabelEncoder() priority_encoder = OrdinalEncoder(categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']]) df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node']) df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type']) df['priority_encoded'] = priority_encoder.fit_transform(df[['priority']]).astype(int).flatten() df['type_node_interaction'] = df['tech_node_encoded'] * 10 + df['block_type_encoded'] df['complexity_score'] = df['constraint_complexity'] * df['transistor_count_log'] df['size_priority_interaction'] = df['transistor_count_log'] * df['priority_numeric'] complexity_encoder = LabelEncoder() df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity']) bottleneck_encoder = LabelEncoder() df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk']) # Safe derived features for bottleneck df['hours_budget_pct'] = df['hours_logged'] / df['estimated_hours'].clip(lower=1) * 100 df['stage_velocity'] = df['hours_logged'] / df['current_stage_idx'].clip(lower=1) completed = df[df['is_completed'] == 1].copy() all_metrics = {} # ===================================================================== # MODEL 1: Hours Estimator — REGULARIZED # ===================================================================== print("=" * 60) print("MODEL 1: Hours Estimator (regularized)") print("=" * 60) HOURS_FEATURES = [ 'tech_node_encoded', 'block_type_encoded', 'priority_encoded', 'transistor_count', 'transistor_count_log', 'has_dependencies', 'num_dependencies', 'constraint_complexity', 'drc_iterations', 'engineer_skill_factor', 'type_node_interaction', 'complexity_score', 'size_priority_interaction' ] X_h = completed[HOURS_FEATURES] y_h = completed['actual_hours'] X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, random_state=42) hours_model = xgb.XGBRegressor( n_estimators=300, # reduced from 500 learning_rate=0.05, max_depth=4, # reduced from 7 subsample=0.7, # reduced from 0.8 colsample_bytree=0.7, # reduced from 0.8 min_child_weight=10, # increased from 3 reg_alpha=1.0, # increased from 0.1 reg_lambda=5.0, # increased from 1.0 gamma=0.5, # added: min split loss objective='reg:squarederror', tree_method='hist', random_state=42, early_stopping_rounds=30, ) hours_model.fit(X_train_h, y_train_h, eval_set=[(X_test_h, y_test_h)], verbose=False) train_r2 = r2_score(y_train_h, hours_model.predict(X_train_h)) test_r2 = r2_score(y_test_h, hours_model.predict(X_test_h)) train_mae = mean_absolute_error(y_train_h, hours_model.predict(X_train_h)) test_mae = mean_absolute_error(y_test_h, hours_model.predict(X_test_h)) cv_model_h = xgb.XGBRegressor( n_estimators=300, learning_rate=0.05, 
# =====================================================================
# MODEL 2: Complexity Classifier — REGULARIZED
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 2: Complexity Classifier (regularized)")
print("=" * 60)

COMPLEXITY_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'type_node_interaction', 'complexity_score', 'size_priority_interaction'
]

X_c = completed[COMPLEXITY_FEATURES]
y_c = completed['complexity_encoded']
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c, y_c, test_size=0.2, random_state=42, stratify=y_c)

xgb_clf = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,           # reduced from 6
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,   # increased
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    objective='multi:softprob',
    num_class=3,
    tree_method='hist',
    random_state=42,
    early_stopping_rounds=30,
)
xgb_clf.fit(X_train_c, y_train_c, eval_set=[(X_test_c, y_test_c)], verbose=False)

lgb_clf = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=15,         # reduced from 63
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_samples=20,  # increased
    reg_alpha=1.0,
    reg_lambda=5.0,
    random_state=42,
    verbose=-1,
)
lgb_clf.fit(X_train_c, y_train_c)

train_xgb = accuracy_score(y_train_c, xgb_clf.predict(X_train_c))
test_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
train_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c))
test_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))

# Soft-vote ensemble: average the two models' class probabilities
xgb_p = xgb_clf.predict_proba(X_test_c)
lgb_p = lgb_clf.predict_proba(X_test_c)
ens_p = (xgb_p + lgb_p) / 2
y_pred_ens = np.argmax(ens_p, axis=1)
ens_acc = accuracy_score(y_test_c, y_pred_ens)
ens_f1 = f1_score(y_test_c, y_pred_ens, average='weighted')

cv_model_c = xgb.XGBClassifier(
    n_estimators=200, learning_rate=0.05, max_depth=4, subsample=0.7,
    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0,
    reg_lambda=5.0, gamma=0.5, tree_method='hist', random_state=42,
)
cv_c = cross_val_score(cv_model_c, X_c, y_c, cv=5, scoring='accuracy')

print(f"  XGB Train: {train_xgb:.4f}  Test: {test_xgb:.4f}  Gap: {train_xgb-test_xgb:.4f}")
print(f"  LGB Train: {train_lgb:.4f}  Test: {test_lgb:.4f}  Gap: {train_lgb-test_lgb:.4f}")
print(f"  Ensemble Test Acc: {ens_acc:.4f}  F1: {ens_f1:.4f}")
print(f"  CV Acc: {cv_c.mean():.4f} ± {cv_c.std():.4f}")

all_metrics['complexity_classification'] = {
    'xgb_train': round(train_xgb, 4),
    'xgb_test': round(test_xgb, 4),
    'xgb_gap': round(train_xgb - test_xgb, 4),
    'lgb_train': round(train_lgb, 4),
    'lgb_test': round(test_lgb, 4),
    'lgb_gap': round(train_lgb - test_lgb, 4),
    'ensemble_accuracy': round(ens_acc, 4),
    'ensemble_f1': round(ens_f1, 4),
    'cv_accuracy_mean': round(cv_c.mean(), 4),
    'cv_accuracy_std': round(cv_c.std(), 4),
}
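# --- Optional diagnostic (not in the original pipeline): per-class breakdown
# --- of the soft-vote ensemble, decoded via the complexity encoder fitted above.
print(classification_report(y_test_c, y_pred_ens, target_names=complexity_encoder.classes_))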
# =====================================================================
# MODEL 3: Bottleneck — LEAKAGE-FREE
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 3: Bottleneck Predictor (leakage-free)")
print("=" * 60)

SAFE_BOTTLENECK_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'hours_logged',
    'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
    'current_stage_idx', 'engineer_skill_factor', 'complexity_score',
    'hours_budget_pct', 'stage_velocity'
]

X_b = df[SAFE_BOTTLENECK_FEATURES]
y_b = df['bottleneck_encoded']
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.2, random_state=42, stratify=y_b)

base_bn = xgb.XGBClassifier(
    n_estimators=300, learning_rate=0.05, max_depth=4, subsample=0.7,
    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0,
    reg_lambda=5.0, gamma=0.5, objective='multi:softprob', num_class=3,
    tree_method='hist', random_state=42,
)
# Isotonic calibration so the predicted probabilities are usable as risk scores
bn_model = CalibratedClassifierCV(base_bn, cv=3, method='isotonic')
bn_model.fit(X_train_b, y_train_b)

train_bn = accuracy_score(y_train_b, bn_model.predict(X_train_b))
test_bn = accuracy_score(y_test_b, bn_model.predict(X_test_b))
test_f1_bn = f1_score(y_test_b, bn_model.predict(X_test_b), average='weighted')
cv_bn = cross_val_score(base_bn, X_b, y_b, cv=5, scoring='accuracy')

print(f"  Train Acc: {train_bn:.4f}  Test Acc: {test_bn:.4f}  Gap: {train_bn-test_bn:.4f}")
print(f"  F1 (weighted): {test_f1_bn:.4f}")
print(f"  CV Acc: {cv_bn.mean():.4f} ± {cv_bn.std():.4f}")
print("\n  Classification Report:")
print(classification_report(y_test_b, bn_model.predict(X_test_b),
                            target_names=bottleneck_encoder.classes_))

all_metrics['bottleneck_prediction'] = {
    'train_accuracy': round(train_bn, 4),
    'test_accuracy': round(test_bn, 4),
    'gap': round(train_bn - test_bn, 4),
    'f1_weighted': round(test_f1_bn, 4),
    'cv_accuracy_mean': round(cv_bn.mean(), 4),
    'cv_accuracy_std': round(cv_bn.std(), 4),
    'features_used': 'SAFE (no leaky features)',
}
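# --- Optional sanity check (a sketch, not in the original pipeline): compare
# --- the log loss of the raw booster against the isotonic-calibrated wrapper;
# --- lower means better-calibrated probabilities. `raw_bn` is a hypothetical
# --- local name.
from sklearn.base import clone
from sklearn.metrics import log_loss

raw_bn = clone(base_bn).fit(X_train_b, y_train_b)
print(f"  Log loss — raw: {log_loss(y_test_b, raw_bn.predict_proba(X_test_b)):.4f}  "
      f"calibrated: {log_loss(y_test_b, bn_model.predict_proba(X_test_b)):.4f}")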
# =====================================================================
# MODEL 4: Completion — GROUP-AWARE SPLIT
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 4: Completion Predictor (group-aware split)")
print("=" * 60)

COMPLETION_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
    'drc_iterations', 'current_stage_idx', 'cumulative_hours',
    'cumulative_days', 'cumulative_drc_violations',
    'cumulative_lvs_mismatches', 'hours_vs_estimate_ratio',
    'stages_completed', 'avg_hours_per_stage_so_far',
    'avg_days_per_stage_so_far'
]

# Build one sample per observed stage transition, tagged with block_id so the
# split can keep every sample from a block on the same side.
training_samples = []
for _, row in completed.iterrows():
    try:
        transitions = json.loads(row['transitions'])
    except (TypeError, json.JSONDecodeError):
        continue
    total_actual_hours = row['actual_hours']
    cumulative_hours = 0
    cumulative_days = 0
    cumulative_drc = 0
    cumulative_lvs = 0
    for i, t in enumerate(transitions):
        if i == 0:
            continue
        cumulative_hours += t.get('hours_in_stage', 0)
        cumulative_days += t.get('days_in_stage', 0)
        cumulative_drc += t.get('drc_violations', 0)
        cumulative_lvs += t.get('lvs_mismatches', 0)
        remaining = max(0, total_actual_hours - cumulative_hours)
        training_samples.append({
            'block_id': row['block_id'],
            'tech_node_encoded': row.get('tech_node_encoded', 0),
            'block_type_encoded': row.get('block_type_encoded', 0),
            'priority_numeric': row['priority_numeric'],
            'transistor_count_log': row['transistor_count_log'],
            'has_dependencies': row['has_dependencies'],
            'num_dependencies': row['num_dependencies'],
            'constraint_complexity': row['constraint_complexity'],
            'estimated_hours': row['estimated_hours'],
            'engineer_skill_factor': row['engineer_skill_factor'],
            'drc_iterations': row['drc_iterations'],
            'current_stage_idx': i,
            'cumulative_hours': cumulative_hours,
            'cumulative_days': cumulative_days,
            'cumulative_drc_violations': cumulative_drc,
            'cumulative_lvs_mismatches': cumulative_lvs,
            'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
            'stages_completed': i,
            'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
            'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
            'remaining_hours': remaining,
        })

train_df = pd.DataFrame(training_samples)

# Group-aware split: all samples from a block land on the same side, so the
# model never sees a test block's earlier stages during training.
unique_blocks = train_df['block_id'].unique()
rng = np.random.RandomState(42)
rng.shuffle(unique_blocks)
split_idx = int(len(unique_blocks) * 0.8)
train_blocks = set(unique_blocks[:split_idx])
test_blocks = set(unique_blocks[split_idx:])
train_mask = train_df['block_id'].isin(train_blocks)
test_mask = train_df['block_id'].isin(test_blocks)
X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
y_train_g = train_df.loc[train_mask, 'remaining_hours']
X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
y_test_g = train_df.loc[test_mask, 'remaining_hours']

completion_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=5,           # reduced from 8
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
    early_stopping_rounds=30,
)
completion_model.fit(X_train_g, y_train_g, eval_set=[(X_test_g, y_test_g)], verbose=False)

train_r2_g = r2_score(y_train_g, completion_model.predict(X_train_g))
test_r2_g = r2_score(y_test_g, completion_model.predict(X_test_g))
train_mae_g = mean_absolute_error(y_train_g, completion_model.predict(X_train_g))
test_mae_g = mean_absolute_error(y_test_g, completion_model.predict(X_test_g))

# GroupKFold CV: the folds also respect block boundaries
groups = train_df['block_id'].values
gkf = GroupKFold(n_splits=5)
cv_model = xgb.XGBRegressor(
    n_estimators=500, learning_rate=0.03, max_depth=5, subsample=0.7,
    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0,
    reg_lambda=5.0, gamma=0.5, tree_method='hist', random_state=42,
)
cv_scores_g = cross_val_score(cv_model, train_df[COMPLETION_FEATURES],
                              train_df['remaining_hours'], cv=gkf,
                              groups=groups, scoring='r2')

print(f"  Train samples: {len(X_train_g)} from {len(train_blocks)} blocks")
print(f"  Test samples: {len(X_test_g)} from {len(test_blocks)} blocks")
print(f"  Train R²: {train_r2_g:.4f}  Test R²: {test_r2_g:.4f}  Gap: {train_r2_g-test_r2_g:.4f}")
print(f"  Train MAE: {train_mae_g:.2f}  Test MAE: {test_mae_g:.2f}")
print(f"  GroupKFold CV R²: {cv_scores_g.mean():.4f} ± {cv_scores_g.std():.4f}")

all_metrics['completion_prediction'] = {
    'train_r2': round(train_r2_g, 4),
    'test_r2': round(test_r2_g, 4),
    'gap': round(train_r2_g - test_r2_g, 4),
    'train_mae': round(train_mae_g, 2),
    'test_mae': round(test_mae_g, 2),
    'group_cv_r2_mean': round(cv_scores_g.mean(), 4),
    'group_cv_r2_std': round(cv_scores_g.std(), 4),
    'split_type': 'group-aware (block-level)',
}
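# --- Equivalent idiom (a sketch, not the pipeline's code path): sklearn's
# --- GroupShuffleSplit performs the same block-level 80/20 split. Its RNG
# --- differs from the manual shuffle above, so the chosen blocks will not
# --- match exactly; `gss`, `tr_idx`, `te_idx` are hypothetical local names.
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
tr_idx, te_idx = next(gss.split(train_df[COMPLETION_FEATURES],
                                train_df['remaining_hours'],
                                groups=train_df['block_id']))
print(f"  GroupShuffleSplit: {len(tr_idx)} train / {len(te_idx)} test samples")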
# =====================================================================
# SAVE ALL v2 MODELS
# =====================================================================
print("\n" + "=" * 60)
print("SAVING v2 MODELS")
print("=" * 60)

os.makedirs('/app/models_v2', exist_ok=True)
joblib.dump(hours_model, '/app/models_v2/hours_estimator.joblib')
joblib.dump(xgb_clf, '/app/models_v2/complexity_xgb.joblib')
joblib.dump(lgb_clf, '/app/models_v2/complexity_lgb.joblib')
joblib.dump(bn_model, '/app/models_v2/bottleneck_predictor.joblib')
joblib.dump(completion_model, '/app/models_v2/completion_predictor.joblib')

# Encoders
joblib.dump(tech_node_encoder, '/app/models_v2/tech_node_encoder.joblib')
joblib.dump(block_type_encoder, '/app/models_v2/block_type_encoder.joblib')
joblib.dump(priority_encoder, '/app/models_v2/priority_encoder.joblib')
joblib.dump(complexity_encoder, '/app/models_v2/complexity_encoder.joblib')
joblib.dump(bottleneck_encoder, '/app/models_v2/bottleneck_encoder.joblib')

# Feature config
feature_config = {
    'hours_features': HOURS_FEATURES,
    'complexity_features': COMPLEXITY_FEATURES,
    'bottleneck_features': SAFE_BOTTLENECK_FEATURES,
    'completion_features': COMPLETION_FEATURES,
    'tech_nodes': list(tech_node_encoder.classes_),
    'block_types': list(block_type_encoder.classes_),
    'priorities': ['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical'],
    'complexity_classes': list(complexity_encoder.classes_),
    'bottleneck_classes': list(bottleneck_encoder.classes_),
}
with open('/app/models_v2/feature_config.json', 'w') as f:
    json.dump(feature_config, f, indent=2)

# Metrics
all_metrics['training_data'] = {
    'total_samples': len(df),
    'completed_blocks': int(df['is_completed'].sum()),
    'in_progress_blocks': int((~df['is_completed'].astype(bool)).sum()),
    'completion_train_samples': len(X_train_g),
}
with open('/app/models_v2/metrics.json', 'w') as f:
    json.dump(all_metrics, f, indent=2)
print("All v2 models saved to /app/models_v2/")

# Final summary
print("\n" + "=" * 60)
print("v1 vs v2 COMPARISON")
print("=" * 60)
print(f"""
┌───────────────────────┬────────────────────────┬────────────────────────┐
│ Model                 │ v1 (overfit)           │ v2 (fixed)             │
├───────────────────────┼────────────────────────┼────────────────────────┤
│ Hours Estimator       │ R²=0.881 (gap 0.113)   │ R²={test_r2:.3f} (gap {train_r2-test_r2:.3f})    │
│ Complexity Classifier │ Acc=92.3% (gap 5.9%)   │ Acc={test_xgb*100:.1f}% (gap {(train_xgb-test_xgb)*100:.1f}%)    │
│ Bottleneck Predictor  │ 99.6% (DATA LEAKAGE)   │ {test_bn*100:.1f}% (honest)         │
│ Completion Predictor  │ R²=0.945 (GROUP LEAK)  │ R²={test_r2_g:.3f} (grouped)       │
└───────────────────────┴────────────────────────┴────────────────────────┘
""")
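# --- Optional smoke test (a sketch, not part of the original script): reload
# --- the saved hours estimator plus feature_config.json and re-score one
# --- held-out block, confirming the artifacts round-trip through joblib/JSON.
# --- `loaded_hours`, `cfg`, and `sample` are hypothetical local names.
loaded_hours = joblib.load('/app/models_v2/hours_estimator.joblib')
with open('/app/models_v2/feature_config.json') as f:
    cfg = json.load(f)
sample = X_test_h.iloc[[0]][cfg['hours_features']]
print(f"  Reloaded model predicts {loaded_hours.predict(sample)[0]:.1f} h "
      f"vs actual {y_test_h.iloc[0]:.1f} h")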