muthuk1
/

alwas-ml-models

Joblib

Model card Files Files and versions

xet

Community

muthuk1 committed about 1 month ago

Commit

4120b19

verified ·

1 Parent(s): 0764371

Upload training/overfit_diagnostic.py with huggingface_hub

Browse files

Files changed (1) hide show

training/overfit_diagnostic.py +417 -0

training/overfit_diagnostic.py ADDED Viewed

	@@ -0,0 +1,417 @@

+"""
+ALWAS Model Overfitting Diagnostic
+Checks train/test gap, learning curves, feature leakage, and cross-val stability.
+"""
+import numpy as np
+import pandas as pd
+import json
+import joblib
+from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, StratifiedKFold
+from sklearn.metrics import (
+    mean_absolute_error, r2_score, accuracy_score, f1_score,
+    mean_squared_error, classification_report
+)
+from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
+import xgboost as xgb
+import lightgbm as lgb
+# === Load & Prep Data (same as training) ===
+df = pd.read_csv('/app/alwas_blocks_dataset.csv')
+tech_node_encoder = LabelEncoder()
+block_type_encoder = LabelEncoder()
+priority_encoder = OrdinalEncoder(categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']])
+df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node'])
+df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type'])
+df['priority_encoded'] = priority_encoder.fit_transform(df[['priority']]).astype(int).flatten()
+df['type_node_interaction'] = df['tech_node_encoded'] * 10 + df['block_type_encoded']
+df['complexity_score'] = df['constraint_complexity'] * df['transistor_count_log']
+df['size_priority_interaction'] = df['transistor_count_log'] * df['priority_numeric']
+complexity_encoder = LabelEncoder()
+df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity'])
+bottleneck_encoder = LabelEncoder()
+df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk'])
+completed = df[df['is_completed'] == 1].copy()
+print("=" * 70)
+print("OVERFITTING DIAGNOSTIC REPORT")
+print("=" * 70)
+# =====================================================================
+# MODEL 1: Hours Estimator — Train vs Test gap
+# =====================================================================
+print("\n" + "=" * 70)
+print("MODEL 1: Hours Estimator")
+print("=" * 70)
+HOURS_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
+    'transistor_count', 'transistor_count_log', 'has_dependencies',
+    'num_dependencies', 'constraint_complexity', 'drc_iterations',
+    'engineer_skill_factor', 'type_node_interaction', 'complexity_score',
+    'size_priority_interaction'
+]
+X_h = completed[HOURS_FEATURES]
+y_h = completed['actual_hours']
+X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, random_state=42)
+model_h = joblib.load('/app/models/hours_estimator.joblib')
+train_r2 = r2_score(y_train_h, model_h.predict(X_train_h))
+test_r2 = r2_score(y_test_h, model_h.predict(X_test_h))
+train_mae = mean_absolute_error(y_train_h, model_h.predict(X_train_h))
+test_mae = mean_absolute_error(y_test_h, model_h.predict(X_test_h))
+cv_scores = cross_val_score(
+    xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=7,
+                      subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42),
+    X_h, y_h, cv=5, scoring='r2'
+)
+print(f"  Train R²:  {train_r2:.4f}   |  Train MAE: {train_mae:.2f}h")
+print(f"  Test R²:   {test_r2:.4f}   |  Test MAE:  {test_mae:.2f}h")
+print(f"  CV R²:     {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
+print(f"  Train-Test R² gap: {train_r2 - test_r2:.4f}")
+print(f"  VERDICT: {'⚠️ OVERFITTING' if (train_r2 - test_r2) > 0.05 else '✅ OK'} (gap {'>' if (train_r2-test_r2)>0.05 else '<'} 0.05)")
+# =====================================================================
+# MODEL 2: Complexity Classifier
+# =====================================================================
+print("\n" + "=" * 70)
+print("MODEL 2: Complexity Classifier")
+print("=" * 70)
+COMPLEXITY_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
+    'transistor_count', 'transistor_count_log', 'has_dependencies',
+    'num_dependencies', 'constraint_complexity', 'drc_iterations',
+    'type_node_interaction', 'complexity_score', 'size_priority_interaction'
+]
+X_c = completed[COMPLEXITY_FEATURES]
+y_c = completed['complexity_encoded']
+X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42, stratify=y_c)
+xgb_clf = joblib.load('/app/models/complexity_xgb.joblib')
+lgb_clf = joblib.load('/app/models/complexity_lgb.joblib')
+train_acc_xgb = accuracy_score(y_train_c, xgb_clf.predict(X_train_c))
+test_acc_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
+train_acc_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c))
+test_acc_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))
+cv_xgb = cross_val_score(
+    xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
+                       subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42),
+    X_c, y_c, cv=5, scoring='accuracy'
+)
+print(f"  XGBoost  Train Acc: {train_acc_xgb:.4f}  |  Test Acc: {test_acc_xgb:.4f}  |  Gap: {train_acc_xgb-test_acc_xgb:.4f}")
+print(f"  LightGBM Train Acc: {train_acc_lgb:.4f}  |  Test Acc: {test_acc_lgb:.4f}  |  Gap: {train_acc_lgb-test_acc_lgb:.4f}")
+print(f"  CV Acc: {cv_xgb.mean():.4f} ± {cv_xgb.std():.4f}")
+print(f"  VERDICT: {'⚠️ OVERFITTING' if (train_acc_xgb - test_acc_xgb) > 0.05 else '✅ OK'}")
+# =====================================================================
+# MODEL 3: Bottleneck Predictor — SUSPICIOUS 99.6%
+# =====================================================================
+print("\n" + "=" * 70)
+print("MODEL 3: Bottleneck Predictor — INVESTIGATING 99.6% ACCURACY")
+print("=" * 70)
+BOTTLENECK_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
+    'transistor_count_log', 'has_dependencies', 'num_dependencies',
+    'constraint_complexity', 'estimated_hours', 'hours_logged',
+    'hours_over_estimate_ratio', 'drc_iterations', 'drc_violations_total',
+    'lvs_mismatches_total', 'current_stage_idx', 'days_in_current_stage',
+    'engineer_skill_factor', 'is_overdue', 'complexity_score'
+]
+X_b = df[BOTTLENECK_FEATURES]
+y_b = df['bottleneck_encoded']
+X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, y_b, test_size=0.2, random_state=42, stratify=y_b)
+# Check feature correlation with target — data leakage detection
+print("\n  Feature correlation with bottleneck_encoded:")
+for feat in BOTTLENECK_FEATURES:
+    corr = df[feat].corr(df['bottleneck_encoded'].astype(float))
+    flag = " ⚠️ HIGH CORRELATION (possible leakage)" if abs(corr) > 0.5 else ""
+    print(f"    {feat:35s} r = {corr:+.4f}{flag}")
+# Check how bottleneck labels were generated
+print("\n  Bottleneck label generation logic check:")
+print("    Label was computed FROM: hours_over_estimate_ratio, days_in_current_stage")
+print("    These same features are INPUTS to the model!")
+print("    ⚠️ THIS IS DIRECT DATA LEAKAGE — the model is learning the labeling function")
+# Retrain WITHOUT leaked features
+print("\n  --- Retrain WITHOUT leaky features ---")
+CLEAN_BOTTLENECK_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
+    'transistor_count_log', 'has_dependencies', 'num_dependencies',
+    'constraint_complexity', 'estimated_hours', 'hours_logged',
+    'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
+    'current_stage_idx', 'engineer_skill_factor', 'complexity_score'
+]
+# Remove: hours_over_estimate_ratio (directly derived from label rule),
+#          days_in_current_stage (directly used in label rule),
+#          is_overdue (correlated with stuck)
+X_clean = df[CLEAN_BOTTLENECK_FEATURES]
+X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
+    X_clean, y_b, test_size=0.2, random_state=42, stratify=y_b
+)
+clean_model = xgb.XGBClassifier(
+    n_estimators=500, learning_rate=0.05, max_depth=6,
+    subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42
+)
+clean_model.fit(X_train_clean, y_train_clean)
+clean_train_acc = accuracy_score(y_train_clean, clean_model.predict(X_train_clean))
+clean_test_acc = accuracy_score(y_test_clean, clean_model.predict(X_test_clean))
+clean_cv = cross_val_score(clean_model, X_clean, y_b, cv=5, scoring='accuracy')
+print(f"  Clean Train Acc: {clean_train_acc:.4f}")
+print(f"  Clean Test Acc:  {clean_test_acc:.4f}")
+print(f"  Clean CV Acc:    {clean_cv.mean():.4f} ± {clean_cv.std():.4f}")
+print(f"  Gap: {clean_train_acc - clean_test_acc:.4f}")
+# Also try: include hours_logged and estimated_hours but NOT the ratio
+# and include days_in_current_stage but with noise
+print("\n  --- Retrain with SAFE derived features ---")
+df['hours_budget_pct'] = df['hours_logged'] / df['estimated_hours'].clip(lower=1) * 100  # percentage, not ratio threshold
+df['stage_velocity'] = df['hours_logged'] / df['current_stage_idx'].clip(lower=1)  # hours per stage
+SAFE_BOTTLENECK_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
+    'transistor_count_log', 'has_dependencies', 'num_dependencies',
+    'constraint_complexity', 'estimated_hours', 'hours_logged',
+    'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
+    'current_stage_idx', 'engineer_skill_factor', 'complexity_score',
+    'hours_budget_pct', 'stage_velocity'
+]
+X_safe = df[SAFE_BOTTLENECK_FEATURES]
+X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
+    X_safe, y_b, test_size=0.2, random_state=42, stratify=y_b
+)
+safe_model = xgb.XGBClassifier(
+    n_estimators=500, learning_rate=0.05, max_depth=6,
+    subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42
+)
+safe_model.fit(X_train_s, y_train_s)
+safe_train_acc = accuracy_score(y_train_s, safe_model.predict(X_train_s))
+safe_test_acc = accuracy_score(y_test_s, safe_model.predict(X_test_s))
+safe_cv = cross_val_score(safe_model, X_safe, y_b, cv=5, scoring='accuracy')
+safe_f1 = f1_score(y_test_s, safe_model.predict(X_test_s), average='weighted')
+print(f"  Safe Train Acc: {safe_train_acc:.4f}")
+print(f"  Safe Test Acc:  {safe_test_acc:.4f}")
+print(f"  Safe CV Acc:    {safe_cv.mean():.4f} ± {safe_cv.std():.4f}")
+print(f"  Safe F1:        {safe_f1:.4f}")
+print(f"  Gap: {safe_train_acc - safe_test_acc:.4f}")
+# =====================================================================
+# MODEL 4: Completion Predictor — Check train/test gap
+# =====================================================================
+print("\n" + "=" * 70)
+print("MODEL 4: Completion Predictor")
+print("=" * 70)
+# Check R² gap: test=0.9446 vs CV=0.8869 — that's a 0.058 gap
+print(f"  Test R²: 0.9446  |  CV R²: 0.8869  |  Gap: 0.0577")
+print(f"  ⚠️ Test-CV gap of 0.058 suggests mild overfitting to the test split")
+print(f"  Also: MAPE=32% despite low MAE — suggests poor performance on small remaining-hours predictions")
+# The training creates MULTIPLE samples per block (one per stage transition)
+# Samples from the same block in both train and test = GROUP LEAKAGE
+print(f"\n  ⚠️ POTENTIAL GROUP LEAKAGE:")
+print(f"  Training created {18000} samples from {3000} blocks = ~6 per block")
+print(f"  Random split means samples from SAME block appear in train AND test")
+print(f"  This inflates test metrics because the model 'sees' other stages of the same block")
+# Retrain with GROUP-AWARE split
+print("\n  --- Retrain with GROUP-AWARE split (split by block, not sample) ---")
+COMPLETION_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
+    'transistor_count_log', 'has_dependencies', 'num_dependencies',
+    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
+    'drc_iterations', 'current_stage_idx', 'cumulative_hours',
+    'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
+    'hours_vs_estimate_ratio', 'stages_completed',
+    'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far'
+]
+# Rebuild training samples WITH block_id for grouping
+training_samples = []
+for _, row in completed.iterrows():
+    try:
+        transitions = json.loads(row['transitions'])
+    except:
+        continue
+    total_actual_hours = row['actual_hours']
+    cumulative_hours = 0
+    cumulative_days = 0
+    cumulative_drc = 0
+    cumulative_lvs = 0
+    for i, t in enumerate(transitions):
+        if i == 0:
+            continue
+        stage_hours = t.get('hours_in_stage', 0)
+        stage_days = t.get('days_in_stage', 0)
+        cumulative_hours += stage_hours
+        cumulative_days += stage_days
+        cumulative_drc += t.get('drc_violations', 0)
+        cumulative_lvs += t.get('lvs_mismatches', 0)
+        remaining_hours = max(0, total_actual_hours - cumulative_hours)
+        sample = {
+            'block_id': row['block_id'],
+            'tech_node_encoded': row.get('tech_node_encoded', 0),
+            'block_type_encoded': row.get('block_type_encoded', 0),
+            'priority_numeric': row['priority_numeric'],
+            'transistor_count_log': row['transistor_count_log'],
+            'has_dependencies': row['has_dependencies'],
+            'num_dependencies': row['num_dependencies'],
+            'constraint_complexity': row['constraint_complexity'],
+            'estimated_hours': row['estimated_hours'],
+            'engineer_skill_factor': row['engineer_skill_factor'],
+            'drc_iterations': row['drc_iterations'],
+            'current_stage_idx': i,
+            'cumulative_hours': cumulative_hours,
+            'cumulative_days': cumulative_days,
+            'cumulative_drc_violations': cumulative_drc,
+            'cumulative_lvs_mismatches': cumulative_lvs,
+            'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
+            'stages_completed': i,
+            'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
+            'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
+            'remaining_hours': remaining_hours,
+        }
+        training_samples.append(sample)
+train_df = pd.DataFrame(training_samples)
+# Group-aware split: all samples from a block go to SAME split
+unique_blocks = train_df['block_id'].unique()
+np.random.seed(42)
+np.random.shuffle(unique_blocks)
+split_idx = int(len(unique_blocks) * 0.8)
+train_blocks = set(unique_blocks[:split_idx])
+test_blocks = set(unique_blocks[split_idx:])
+train_mask = train_df['block_id'].isin(train_blocks)
+test_mask = train_df['block_id'].isin(test_blocks)
+X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
+y_train_g = train_df.loc[train_mask, 'remaining_hours']
+X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
+y_test_g = train_df.loc[test_mask, 'remaining_hours']
+print(f"  Train: {len(X_train_g)} samples from {len(train_blocks)} blocks")
+print(f"  Test:  {len(X_test_g)} samples from {len(test_blocks)} blocks")
+group_model = xgb.XGBRegressor(
+    n_estimators=800, learning_rate=0.03, max_depth=8,
+    subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42,
+    early_stopping_rounds=50,
+)
+group_model.fit(X_train_g, y_train_g, eval_set=[(X_test_g, y_test_g)], verbose=False)
+train_pred_g = group_model.predict(X_train_g)
+test_pred_g = group_model.predict(X_test_g)
+train_r2_g = r2_score(y_train_g, train_pred_g)
+test_r2_g = r2_score(y_test_g, test_pred_g)
+train_mae_g = mean_absolute_error(y_train_g, train_pred_g)
+test_mae_g = mean_absolute_error(y_test_g, test_pred_g)
+print(f"  Group-split Train R²: {train_r2_g:.4f}  |  Train MAE: {train_mae_g:.2f}h")
+print(f"  Group-split Test R²:  {test_r2_g:.4f}  |  Test MAE:  {test_mae_g:.2f}h")
+print(f"  Gap: {train_r2_g - test_r2_g:.4f}")
+# Compare to old results
+print(f"\n  Comparison:")
+print(f"    Original (random split): Test R² = 0.9446, MAE = 1.65h")
+print(f"    Group-aware split:       Test R² = {test_r2_g:.4f}, MAE = {test_mae_g:.2f}h")
+# =====================================================================
+# SUMMARY
+# =====================================================================
+print("\n" + "=" * 70)
+print("OVERFITTING SUMMARY")
+print("=" * 70)
+print("""
+┌───────────────────────┬──────────┬──────────┬────────┬──────────────────────────┐
+│ Model                 │ Train    │ Test     │ Gap    │ Verdict                  │
+├───────────────────────┼──────────┼──────────┼────────┼──────────────────────────┤""")
+print(f"│ Hours Estimator       │ R²={train_r2:.3f} │ R²={test_r2:.3f} │ {train_r2-test_r2:.4f} │ {'⚠️  Moderate overfit' if (train_r2-test_r2) > 0.05 else '✅ Acceptable'}       │")
+print(f"│ Complexity Classifier │ Acc={train_acc_xgb:.3f}│ Acc={test_acc_xgb:.3f}│ {train_acc_xgb-test_acc_xgb:.4f} │ {'⚠️  Overfit' if (train_acc_xgb-test_acc_xgb) > 0.05 else '✅ Acceptable'}            │")
+print(f"│ Bottleneck (original) │  99.9%   │  99.6%   │ 0.003  │ 🔴 DATA LEAKAGE         │")
+print(f"│ Bottleneck (clean)    │ Acc={clean_train_acc:.3f}│ Acc={clean_test_acc:.3f}│ {clean_train_acc-clean_test_acc:.4f} │ Honest metrics           │")
+print(f"│ Bottleneck (safe)     │ Acc={safe_train_acc:.3f}│ Acc={safe_test_acc:.3f}│ {safe_train_acc-safe_test_acc:.4f} │ Best honest version      │")
+print(f"│ Completion (original) │ R²~0.97  │ R²=0.945 │ ~0.03  │ 🔴 GROUP LEAKAGE         │")
+print(f"│ Completion (grouped)  │ R²={train_r2_g:.3f} │ R²={test_r2_g:.3f} │ {train_r2_g-test_r2_g:.4f} │ Honest metrics           │")
+print("└───────────────────────┴──────────┴──────────┴────────┴──────────────────────────┘")
+# Save corrected models
+print("\n--- Saving corrected models ---")
+from sklearn.calibration import CalibratedClassifierCV
+# Save the safe bottleneck model
+safe_calibrated = CalibratedClassifierCV(
+    xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
+                       subsample=0.8, colsample_bytree=0.8, tree_method='hist', random_state=42),
+    cv=3, method='isotonic'
+)
+safe_calibrated.fit(X_safe, y_b)
+joblib.dump(safe_calibrated, '/app/models/bottleneck_predictor_v2.joblib')
+# Save group-aware completion model
+joblib.dump(group_model, '/app/models/completion_predictor_v2.joblib')
+# Update feature config
+with open('/app/models/feature_config.json', 'r') as f:
+    config = json.load(f)
+config['bottleneck_features_v2'] = SAFE_BOTTLENECK_FEATURES
+with open('/app/models/feature_config.json', 'w') as f:
+    json.dump(config, f, indent=2)
+# Update metrics
+with open('/app/models/metrics.json', 'r') as f:
+    metrics = json.load(f)
+metrics['bottleneck_prediction_v2'] = {
+    'accuracy': round(safe_test_acc, 4),
+    'f1_weighted': round(safe_f1, 4),
+    'train_test_gap': round(safe_train_acc - safe_test_acc, 4),
+    'cv_accuracy_mean': round(safe_cv.mean(), 4),
+    'cv_accuracy_std': round(safe_cv.std(), 4),
+    'note': 'Leaky features (hours_over_estimate_ratio, days_in_current_stage, is_overdue) removed'
+}
+metrics['completion_prediction_v2'] = {
+    'mae': round(test_mae_g, 2),
+    'rmse': round(np.sqrt(mean_squared_error(y_test_g, test_pred_g)), 2),
+    'r2': round(test_r2_g, 4),
+    'train_test_gap': round(train_r2_g - test_r2_g, 4),
+    'note': 'Group-aware split (no samples from same block in train and test)'
+}
+with open('/app/models/metrics.json', 'w') as f:
+    json.dump(metrics, f, indent=2)
+print(f"Saved: bottleneck_predictor_v2.joblib")
+print(f"Saved: completion_predictor_v2.joblib")
+print(f"Updated: metrics.json, feature_config.json")
+print("\n" + "=" * 70)
+print("DONE — Corrected models saved. Upload v2 models to replace originals.")
+print("=" * 70)