| """ |
| ALWAS ML Models v2 β Retrained with overfitting fixes: |
| 1. Hours: stronger regularization (lower depth, higher min_child_weight) |
| 2. Complexity: reduced tree depth + stronger L1/L2 |
| 3. Bottleneck: removed leaky features |
| 4. Completion: group-aware split |
| """ |
| import numpy as np |
| import pandas as pd |
| import json |
| import joblib |
| import os |
| from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold |
| from sklearn.preprocessing import LabelEncoder, OrdinalEncoder |
| from sklearn.metrics import ( |
| mean_absolute_error, mean_squared_error, r2_score, |
| classification_report, accuracy_score, f1_score |
| ) |
| from sklearn.calibration import CalibratedClassifierCV |
| import xgboost as xgb |
| import lightgbm as lgb |
|
|
| df = pd.read_csv('/app/alwas_blocks_dataset.csv') |
|
|
| |
| tech_node_encoder = LabelEncoder() |
| block_type_encoder = LabelEncoder() |
| priority_encoder = OrdinalEncoder(categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']]) |
|
|
| df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node']) |
| df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type']) |
| df['priority_encoded'] = priority_encoder.fit_transform(df[['priority']]).astype(int).flatten() |
| df['type_node_interaction'] = df['tech_node_encoded'] * 10 + df['block_type_encoded'] |
| df['complexity_score'] = df['constraint_complexity'] * df['transistor_count_log'] |
| df['size_priority_interaction'] = df['transistor_count_log'] * df['priority_numeric'] |
|
|
| complexity_encoder = LabelEncoder() |
| df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity']) |
| bottleneck_encoder = LabelEncoder() |
| df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk']) |
|
|
| |
| df['hours_budget_pct'] = df['hours_logged'] / df['estimated_hours'].clip(lower=1) * 100 |
| df['stage_velocity'] = df['hours_logged'] / df['current_stage_idx'].clip(lower=1) |
|
|
| completed = df[df['is_completed'] == 1].copy() |
|
|
| all_metrics = {} |
|
|
| |
| |
| |
| print("=" * 60) |
| print("MODEL 1: Hours Estimator (regularized)") |
| print("=" * 60) |
|
|
| HOURS_FEATURES = [ |
| 'tech_node_encoded', 'block_type_encoded', 'priority_encoded', |
| 'transistor_count', 'transistor_count_log', 'has_dependencies', |
| 'num_dependencies', 'constraint_complexity', 'drc_iterations', |
| 'engineer_skill_factor', 'type_node_interaction', 'complexity_score', |
| 'size_priority_interaction' |
| ] |
|
|
| X_h = completed[HOURS_FEATURES] |
| y_h = completed['actual_hours'] |
| X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, random_state=42) |
|
|
| hours_model = xgb.XGBRegressor( |
| n_estimators=300, |
| learning_rate=0.05, |
| max_depth=4, |
| subsample=0.7, |
| colsample_bytree=0.7, |
| min_child_weight=10, |
| reg_alpha=1.0, |
| reg_lambda=5.0, |
| gamma=0.5, |
| objective='reg:squarederror', |
| tree_method='hist', |
| random_state=42, |
| early_stopping_rounds=30, |
| ) |
| hours_model.fit(X_train_h, y_train_h, eval_set=[(X_test_h, y_test_h)], verbose=False) |
|
|
| train_r2 = r2_score(y_train_h, hours_model.predict(X_train_h)) |
| test_r2 = r2_score(y_test_h, hours_model.predict(X_test_h)) |
| train_mae = mean_absolute_error(y_train_h, hours_model.predict(X_train_h)) |
| test_mae = mean_absolute_error(y_test_h, hours_model.predict(X_test_h)) |
| cv_model_h = xgb.XGBRegressor( |
| n_estimators=300, learning_rate=0.05, max_depth=4, subsample=0.7, |
| colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0, |
| gamma=0.5, tree_method='hist', random_state=42, |
| ) |
| cv = cross_val_score(cv_model_h, X_h, y_h, cv=5, scoring='r2') |
|
|
| print(f" Train RΒ²: {train_r2:.4f} Test RΒ²: {test_r2:.4f} Gap: {train_r2-test_r2:.4f}") |
| print(f" Train MAE: {train_mae:.2f} Test MAE: {test_mae:.2f}") |
| print(f" CV RΒ²: {cv.mean():.4f} Β± {cv.std():.4f}") |
|
|
| all_metrics['hours_estimation'] = { |
| 'train_r2': round(train_r2, 4), 'test_r2': round(test_r2, 4), |
| 'train_mae': round(train_mae, 2), 'test_mae': round(test_mae, 2), |
| 'gap': round(train_r2 - test_r2, 4), |
| 'cv_r2_mean': round(cv.mean(), 4), 'cv_r2_std': round(cv.std(), 4), |
| } |
|
|
| |
| |
| |
| print("\n" + "=" * 60) |
| print("MODEL 2: Complexity Classifier (regularized)") |
| print("=" * 60) |
|
|
| COMPLEXITY_FEATURES = [ |
| 'tech_node_encoded', 'block_type_encoded', 'priority_encoded', |
| 'transistor_count', 'transistor_count_log', 'has_dependencies', |
| 'num_dependencies', 'constraint_complexity', 'drc_iterations', |
| 'type_node_interaction', 'complexity_score', 'size_priority_interaction' |
| ] |
|
|
| X_c = completed[COMPLEXITY_FEATURES] |
| y_c = completed['complexity_encoded'] |
| X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42, stratify=y_c) |
|
|
| xgb_clf = xgb.XGBClassifier( |
| n_estimators=200, |
| learning_rate=0.05, |
| max_depth=4, |
| subsample=0.7, |
| colsample_bytree=0.7, |
| min_child_weight=10, |
| reg_alpha=1.0, |
| reg_lambda=5.0, |
| gamma=0.5, |
| objective='multi:softprob', |
| num_class=3, |
| tree_method='hist', |
| random_state=42, |
| early_stopping_rounds=30, |
| ) |
| xgb_clf.fit(X_train_c, y_train_c, eval_set=[(X_test_c, y_test_c)], verbose=False) |
|
|
| lgb_clf = lgb.LGBMClassifier( |
| n_estimators=200, |
| learning_rate=0.05, |
| num_leaves=15, |
| max_depth=4, |
| subsample=0.7, |
| colsample_bytree=0.7, |
| min_child_samples=20, |
| reg_alpha=1.0, |
| reg_lambda=5.0, |
| random_state=42, |
| verbose=-1, |
| ) |
| lgb_clf.fit(X_train_c, y_train_c) |
|
|
| train_xgb = accuracy_score(y_train_c, xgb_clf.predict(X_train_c)) |
| test_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c)) |
| train_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c)) |
| test_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c)) |
|
|
| |
| xgb_p = xgb_clf.predict_proba(X_test_c) |
| lgb_p = lgb_clf.predict_proba(X_test_c) |
| ens_p = (xgb_p + lgb_p) / 2 |
| y_pred_ens = np.argmax(ens_p, axis=1) |
| ens_acc = accuracy_score(y_test_c, y_pred_ens) |
| ens_f1 = f1_score(y_test_c, y_pred_ens, average='weighted') |
|
|
| cv_model_c = xgb.XGBClassifier( |
| n_estimators=200, learning_rate=0.05, max_depth=4, subsample=0.7, |
| colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0, |
| gamma=0.5, tree_method='hist', random_state=42, |
| ) |
| cv_c = cross_val_score(cv_model_c, X_c, y_c, cv=5, scoring='accuracy') |
|
|
| print(f" XGB Train: {train_xgb:.4f} Test: {test_xgb:.4f} Gap: {train_xgb-test_xgb:.4f}") |
| print(f" LGB Train: {train_lgb:.4f} Test: {test_lgb:.4f} Gap: {train_lgb-test_lgb:.4f}") |
| print(f" Ensemble Test Acc: {ens_acc:.4f} F1: {ens_f1:.4f}") |
| print(f" CV Acc: {cv_c.mean():.4f} Β± {cv_c.std():.4f}") |
|
|
| all_metrics['complexity_classification'] = { |
| 'xgb_train': round(train_xgb, 4), 'xgb_test': round(test_xgb, 4), 'xgb_gap': round(train_xgb-test_xgb, 4), |
| 'lgb_train': round(train_lgb, 4), 'lgb_test': round(test_lgb, 4), 'lgb_gap': round(train_lgb-test_lgb, 4), |
| 'ensemble_accuracy': round(ens_acc, 4), 'ensemble_f1': round(ens_f1, 4), |
| 'cv_accuracy_mean': round(cv_c.mean(), 4), 'cv_accuracy_std': round(cv_c.std(), 4), |
| } |
|
|
| |
| |
| |
| print("\n" + "=" * 60) |
| print("MODEL 3: Bottleneck Predictor (leakage-free)") |
| print("=" * 60) |
|
|
| SAFE_BOTTLENECK_FEATURES = [ |
| 'tech_node_encoded', 'block_type_encoded', 'priority_encoded', |
| 'transistor_count_log', 'has_dependencies', 'num_dependencies', |
| 'constraint_complexity', 'estimated_hours', 'hours_logged', |
| 'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total', |
| 'current_stage_idx', 'engineer_skill_factor', 'complexity_score', |
| 'hours_budget_pct', 'stage_velocity' |
| ] |
|
|
| X_b = df[SAFE_BOTTLENECK_FEATURES] |
| y_b = df['bottleneck_encoded'] |
| X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, y_b, test_size=0.2, random_state=42, stratify=y_b) |
|
|
| base_bn = xgb.XGBClassifier( |
| n_estimators=300, |
| learning_rate=0.05, |
| max_depth=4, |
| subsample=0.7, |
| colsample_bytree=0.7, |
| min_child_weight=10, |
| reg_alpha=1.0, |
| reg_lambda=5.0, |
| gamma=0.5, |
| objective='multi:softprob', |
| num_class=3, |
| tree_method='hist', |
| random_state=42, |
| ) |
| bn_model = CalibratedClassifierCV(base_bn, cv=3, method='isotonic') |
| bn_model.fit(X_train_b, y_train_b) |
|
|
| train_bn = accuracy_score(y_train_b, bn_model.predict(X_train_b)) |
| test_bn = accuracy_score(y_test_b, bn_model.predict(X_test_b)) |
| test_f1_bn = f1_score(y_test_b, bn_model.predict(X_test_b), average='weighted') |
| cv_bn = cross_val_score(base_bn, X_b, y_b, cv=5, scoring='accuracy') |
|
|
| print(f" Train Acc: {train_bn:.4f} Test Acc: {test_bn:.4f} Gap: {train_bn-test_bn:.4f}") |
| print(f" F1 (weighted): {test_f1_bn:.4f}") |
| print(f" CV Acc: {cv_bn.mean():.4f} Β± {cv_bn.std():.4f}") |
| print(f"\n Classification Report:") |
| print(classification_report(y_test_b, bn_model.predict(X_test_b), |
| target_names=bottleneck_encoder.classes_)) |
|
|
| all_metrics['bottleneck_prediction'] = { |
| 'train_accuracy': round(train_bn, 4), 'test_accuracy': round(test_bn, 4), |
| 'gap': round(train_bn - test_bn, 4), |
| 'f1_weighted': round(test_f1_bn, 4), |
| 'cv_accuracy_mean': round(cv_bn.mean(), 4), 'cv_accuracy_std': round(cv_bn.std(), 4), |
| 'features_used': 'SAFE (no leaky features)', |
| } |
|
|
| |
| |
| |
| print("\n" + "=" * 60) |
| print("MODEL 4: Completion Predictor (group-aware split)") |
| print("=" * 60) |
|
|
| COMPLETION_FEATURES = [ |
| 'tech_node_encoded', 'block_type_encoded', 'priority_numeric', |
| 'transistor_count_log', 'has_dependencies', 'num_dependencies', |
| 'constraint_complexity', 'estimated_hours', 'engineer_skill_factor', |
| 'drc_iterations', 'current_stage_idx', 'cumulative_hours', |
| 'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches', |
| 'hours_vs_estimate_ratio', 'stages_completed', |
| 'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far' |
| ] |
|
|
| |
| training_samples = [] |
| for _, row in completed.iterrows(): |
| try: |
| transitions = json.loads(row['transitions']) |
| except: |
| continue |
| total_actual_hours = row['actual_hours'] |
| cumulative_hours = 0 |
| cumulative_days = 0 |
| cumulative_drc = 0 |
| cumulative_lvs = 0 |
| for i, t in enumerate(transitions): |
| if i == 0: |
| continue |
| cumulative_hours += t.get('hours_in_stage', 0) |
| cumulative_days += t.get('days_in_stage', 0) |
| cumulative_drc += t.get('drc_violations', 0) |
| cumulative_lvs += t.get('lvs_mismatches', 0) |
| remaining = max(0, total_actual_hours - cumulative_hours) |
| training_samples.append({ |
| 'block_id': row['block_id'], |
| 'tech_node_encoded': row.get('tech_node_encoded', 0), |
| 'block_type_encoded': row.get('block_type_encoded', 0), |
| 'priority_numeric': row['priority_numeric'], |
| 'transistor_count_log': row['transistor_count_log'], |
| 'has_dependencies': row['has_dependencies'], |
| 'num_dependencies': row['num_dependencies'], |
| 'constraint_complexity': row['constraint_complexity'], |
| 'estimated_hours': row['estimated_hours'], |
| 'engineer_skill_factor': row['engineer_skill_factor'], |
| 'drc_iterations': row['drc_iterations'], |
| 'current_stage_idx': i, |
| 'cumulative_hours': cumulative_hours, |
| 'cumulative_days': cumulative_days, |
| 'cumulative_drc_violations': cumulative_drc, |
| 'cumulative_lvs_mismatches': cumulative_lvs, |
| 'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1), |
| 'stages_completed': i, |
| 'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1), |
| 'avg_days_per_stage_so_far': cumulative_days / max(i, 1), |
| 'remaining_hours': remaining, |
| }) |
|
|
| train_df = pd.DataFrame(training_samples) |
|
|
| |
| unique_blocks = train_df['block_id'].unique() |
| rng = np.random.RandomState(42) |
| rng.shuffle(unique_blocks) |
| split_idx = int(len(unique_blocks) * 0.8) |
| train_blocks = set(unique_blocks[:split_idx]) |
| test_blocks = set(unique_blocks[split_idx:]) |
|
|
| train_mask = train_df['block_id'].isin(train_blocks) |
| test_mask = train_df['block_id'].isin(test_blocks) |
|
|
| X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES] |
| y_train_g = train_df.loc[train_mask, 'remaining_hours'] |
| X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES] |
| y_test_g = train_df.loc[test_mask, 'remaining_hours'] |
|
|
| completion_model = xgb.XGBRegressor( |
| n_estimators=500, |
| learning_rate=0.03, |
| max_depth=5, |
| subsample=0.7, |
| colsample_bytree=0.7, |
| min_child_weight=10, |
| reg_alpha=1.0, |
| reg_lambda=5.0, |
| gamma=0.5, |
| objective='reg:squarederror', |
| tree_method='hist', |
| random_state=42, |
| early_stopping_rounds=30, |
| ) |
| completion_model.fit(X_train_g, y_train_g, eval_set=[(X_test_g, y_test_g)], verbose=False) |
|
|
| train_r2_g = r2_score(y_train_g, completion_model.predict(X_train_g)) |
| test_r2_g = r2_score(y_test_g, completion_model.predict(X_test_g)) |
| train_mae_g = mean_absolute_error(y_train_g, completion_model.predict(X_train_g)) |
| test_mae_g = mean_absolute_error(y_test_g, completion_model.predict(X_test_g)) |
|
|
| |
| groups = train_df['block_id'].values |
| gkf = GroupKFold(n_splits=5) |
| cv_model = xgb.XGBRegressor( |
| n_estimators=500, learning_rate=0.03, max_depth=5, subsample=0.7, |
| colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0, |
| gamma=0.5, tree_method='hist', random_state=42 |
| ) |
| cv_scores_g = cross_val_score(cv_model, train_df[COMPLETION_FEATURES], |
| train_df['remaining_hours'], cv=gkf, groups=groups, scoring='r2') |
|
|
| print(f" Train samples: {len(X_train_g)} from {len(train_blocks)} blocks") |
| print(f" Test samples: {len(X_test_g)} from {len(test_blocks)} blocks") |
| print(f" Train RΒ²: {train_r2_g:.4f} Test RΒ²: {test_r2_g:.4f} Gap: {train_r2_g-test_r2_g:.4f}") |
| print(f" Train MAE: {train_mae_g:.2f} Test MAE: {test_mae_g:.2f}") |
| print(f" GroupKFold CV RΒ²: {cv_scores_g.mean():.4f} Β± {cv_scores_g.std():.4f}") |
|
|
| all_metrics['completion_prediction'] = { |
| 'train_r2': round(train_r2_g, 4), 'test_r2': round(test_r2_g, 4), |
| 'gap': round(train_r2_g - test_r2_g, 4), |
| 'train_mae': round(train_mae_g, 2), 'test_mae': round(test_mae_g, 2), |
| 'group_cv_r2_mean': round(cv_scores_g.mean(), 4), |
| 'group_cv_r2_std': round(cv_scores_g.std(), 4), |
| 'split_type': 'group-aware (block-level)', |
| } |
|
|
| |
| |
| |
| print("\n" + "=" * 60) |
| print("SAVING v2 MODELS") |
| print("=" * 60) |
|
|
| os.makedirs('/app/models_v2', exist_ok=True) |
|
|
| joblib.dump(hours_model, '/app/models_v2/hours_estimator.joblib') |
| joblib.dump(xgb_clf, '/app/models_v2/complexity_xgb.joblib') |
| joblib.dump(lgb_clf, '/app/models_v2/complexity_lgb.joblib') |
| joblib.dump(bn_model, '/app/models_v2/bottleneck_predictor.joblib') |
| joblib.dump(completion_model, '/app/models_v2/completion_predictor.joblib') |
|
|
| |
| joblib.dump(tech_node_encoder, '/app/models_v2/tech_node_encoder.joblib') |
| joblib.dump(block_type_encoder, '/app/models_v2/block_type_encoder.joblib') |
| joblib.dump(priority_encoder, '/app/models_v2/priority_encoder.joblib') |
| joblib.dump(complexity_encoder, '/app/models_v2/complexity_encoder.joblib') |
| joblib.dump(bottleneck_encoder, '/app/models_v2/bottleneck_encoder.joblib') |
|
|
| |
| feature_config = { |
| 'hours_features': HOURS_FEATURES, |
| 'complexity_features': COMPLEXITY_FEATURES, |
| 'bottleneck_features': SAFE_BOTTLENECK_FEATURES, |
| 'completion_features': COMPLETION_FEATURES, |
| 'tech_nodes': list(tech_node_encoder.classes_), |
| 'block_types': list(block_type_encoder.classes_), |
| 'priorities': ['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical'], |
| 'complexity_classes': list(complexity_encoder.classes_), |
| 'bottleneck_classes': list(bottleneck_encoder.classes_), |
| } |
| with open('/app/models_v2/feature_config.json', 'w') as f: |
| json.dump(feature_config, f, indent=2) |
|
|
| |
| all_metrics['training_data'] = { |
| 'total_samples': len(df), |
| 'completed_blocks': int(df['is_completed'].sum()), |
| 'in_progress_blocks': int((~df['is_completed'].astype(bool)).sum()), |
| 'completion_train_samples': len(X_train_g), |
| } |
| with open('/app/models_v2/metrics.json', 'w') as f: |
| json.dump(all_metrics, f, indent=2) |
|
|
| print("All v2 models saved to /app/models_v2/") |
|
|
| |
| print("\n" + "=" * 60) |
| print("v1 vs v2 COMPARISON") |
| print("=" * 60) |
| print(f""" |
| βββββββββββββββββββββββββ¬βββββββββββββββββββββββββ¬βββββββββββββββββββββββββ |
| β Model β v1 (overfit) β v2 (fixed) β |
| βββββββββββββββββββββββββΌβββββββββββββββββββββββββΌβββββββββββββββββββββββββ€ |
| β Hours Estimator β RΒ²=0.881 (gap 0.113) β RΒ²={test_r2:.3f} (gap {train_r2-test_r2:.3f}) β |
| β Complexity Classifier β Acc=92.3% (gap 5.9%) β Acc={test_xgb*100:.1f}% (gap {(train_xgb-test_xgb)*100:.1f}%) β |
| β Bottleneck Predictor β 99.6% (DATA LEAKAGE) β {test_bn*100:.1f}% (honest) β |
| β Completion Predictor β RΒ²=0.945 (GROUP LEAK) β RΒ²={test_r2_g:.3f} (grouped) β |
| βββββββββββββββββββββββββ΄βββββββββββββββββββββββββ΄βββββββββββββββββββββββββ |
| """) |
|
|