muthuk1
/

alwas-ml-models

Joblib

Model card Files Files and versions

xet

Community

muthuk1 committed 14 days ago

Commit

0764371

verified ·

1 Parent(s): 16f6fe0

Upload training/train_v2.py with huggingface_hub

Browse files

Files changed (1) hide show

training/train_v2.py +438 -0

training/train_v2.py ADDED Viewed

	@@ -0,0 +1,438 @@

+"""
+ALWAS ML Models v2 — Retrained with overfitting fixes:
+1. Hours: stronger regularization (lower depth, higher min_child_weight)
+2. Complexity: reduced tree depth + stronger L1/L2
+3. Bottleneck: removed leaky features
+4. Completion: group-aware split
+"""
+import numpy as np
+import pandas as pd
+import json
+import joblib
+import os
+from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
+from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
+from sklearn.metrics import (
+    mean_absolute_error, mean_squared_error, r2_score,
+    classification_report, accuracy_score, f1_score
+)
+from sklearn.calibration import CalibratedClassifierCV
+import xgboost as xgb
+import lightgbm as lgb
+df = pd.read_csv('/app/alwas_blocks_dataset.csv')  # dataset is read from a fixed absolute path
+# Encode categorical columns: LabelEncoder for tech_node / block_type, OrdinalEncoder with an explicit P4-Low < ... < P1-Critical order for priority
+tech_node_encoder = LabelEncoder()
+block_type_encoder = LabelEncoder()
+priority_encoder = OrdinalEncoder(categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']])
+df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node'])
+df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type'])
+df['priority_encoded'] = priority_encoder.fit_transform(df[['priority']]).astype(int).flatten()  # 2-D encoder output flattened to one int per row
+df['type_node_interaction'] = df['tech_node_encoded'] * 10 + df['block_type_encoded']  # NOTE(review): pairs stay distinct only while block_type_encoded < 10 - confirm class count
+df['complexity_score'] = df['constraint_complexity'] * df['transistor_count_log']
+df['size_priority_interaction'] = df['transistor_count_log'] * df['priority_numeric']  # NOTE(review): priority_numeric is not created in this script (expected in the CSV); priority_encoded above is not used here - confirm intended
+complexity_encoder = LabelEncoder()
+df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity'])  # classification target for Model 2
+bottleneck_encoder = LabelEncoder()
+df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk'])  # classification target for Model 3
+# Safe derived features for bottleneck: ratios of hours_logged; denominators are floored at 1 to avoid division by zero
+df['hours_budget_pct'] = df['hours_logged'] / df['estimated_hours'].clip(lower=1) * 100  # logged hours as a percentage of the estimate
+df['stage_velocity'] = df['hours_logged'] / df['current_stage_idx'].clip(lower=1)  # logged hours per stage index reached
+completed = df[df['is_completed'] == 1].copy()  # independent copy of the completed rows; Models 1, 2 and 4 use only these
+all_metrics = {}  # one entry per model is added below; written to metrics.json at the end
+# =====================================================================
+# MODEL 1: Hours Estimator — REGULARIZED
+# =====================================================================
+print("=" * 60)
+print("MODEL 1: Hours Estimator (regularized)")
+print("=" * 60)
+HOURS_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
+    'transistor_count', 'transistor_count_log', 'has_dependencies',
+    'num_dependencies', 'constraint_complexity', 'drc_iterations',
+    'engineer_skill_factor', 'type_node_interaction', 'complexity_score',
+    'size_priority_interaction'
+]
+X_h = completed[HOURS_FEATURES]
+y_h = completed['actual_hours']
+X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, random_state=42)
+hours_model = xgb.XGBRegressor(
+    n_estimators=300,       # reduced from 500
+    learning_rate=0.05,
+    max_depth=4,            # reduced from 7
+    subsample=0.7,          # reduced from 0.8
+    colsample_bytree=0.7,   # reduced from 0.8
+    min_child_weight=10,    # increased from 3
+    reg_alpha=1.0,          # increased from 0.1
+    reg_lambda=5.0,         # increased from 1.0
+    gamma=0.5,              # added: min split loss
+    objective='reg:squarederror',
+    tree_method='hist',
+    random_state=42,
+    early_stopping_rounds=30,
+)
+hours_model.fit(X_train_h, y_train_h, eval_set=[(X_test_h, y_test_h)], verbose=False)
+train_r2 = r2_score(y_train_h, hours_model.predict(X_train_h))
+test_r2 = r2_score(y_test_h, hours_model.predict(X_test_h))
+train_mae = mean_absolute_error(y_train_h, hours_model.predict(X_train_h))
+test_mae = mean_absolute_error(y_test_h, hours_model.predict(X_test_h))
+cv_model_h = xgb.XGBRegressor(
+    n_estimators=300, learning_rate=0.05, max_depth=4, subsample=0.7,
+    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
+    gamma=0.5, tree_method='hist', random_state=42,
+)
+cv = cross_val_score(cv_model_h, X_h, y_h, cv=5, scoring='r2')
+print(f"  Train R²: {train_r2:.4f}  Test R²: {test_r2:.4f}  Gap: {train_r2-test_r2:.4f}")
+print(f"  Train MAE: {train_mae:.2f}  Test MAE: {test_mae:.2f}")
+print(f"  CV R²: {cv.mean():.4f} ± {cv.std():.4f}")
+all_metrics['hours_estimation'] = {
+    'train_r2': round(train_r2, 4), 'test_r2': round(test_r2, 4),
+    'train_mae': round(train_mae, 2), 'test_mae': round(test_mae, 2),
+    'gap': round(train_r2 - test_r2, 4),
+    'cv_r2_mean': round(cv.mean(), 4), 'cv_r2_std': round(cv.std(), 4),
+}
+# =====================================================================
+# MODEL 2: Complexity Classifier — REGULARIZED
+# =====================================================================
+print("\n" + "=" * 60)
+print("MODEL 2: Complexity Classifier (regularized)")
+print("=" * 60)
+COMPLEXITY_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
+    'transistor_count', 'transistor_count_log', 'has_dependencies',
+    'num_dependencies', 'constraint_complexity', 'drc_iterations',
+    'type_node_interaction', 'complexity_score', 'size_priority_interaction'
+]
+X_c = completed[COMPLEXITY_FEATURES]
+y_c = completed['complexity_encoded']
+X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42, stratify=y_c)
+xgb_clf = xgb.XGBClassifier(
+    n_estimators=200,
+    learning_rate=0.05,
+    max_depth=4,            # reduced from 6
+    subsample=0.7,
+    colsample_bytree=0.7,
+    min_child_weight=10,    # increased
+    reg_alpha=1.0,
+    reg_lambda=5.0,
+    gamma=0.5,
+    objective='multi:softprob',
+    num_class=3,
+    tree_method='hist',
+    random_state=42,
+    early_stopping_rounds=30,
+)
+xgb_clf.fit(X_train_c, y_train_c, eval_set=[(X_test_c, y_test_c)], verbose=False)
+lgb_clf = lgb.LGBMClassifier(
+    n_estimators=200,
+    learning_rate=0.05,
+    num_leaves=15,          # reduced from 63
+    max_depth=4,
+    subsample=0.7,
+    colsample_bytree=0.7,
+    min_child_samples=20,   # increased
+    reg_alpha=1.0,
+    reg_lambda=5.0,
+    random_state=42,
+    verbose=-1,
+)
+lgb_clf.fit(X_train_c, y_train_c)
+train_xgb = accuracy_score(y_train_c, xgb_clf.predict(X_train_c))
+test_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
+train_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c))
+test_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))
+# Ensemble
+xgb_p = xgb_clf.predict_proba(X_test_c)
+lgb_p = lgb_clf.predict_proba(X_test_c)
+ens_p = (xgb_p + lgb_p) / 2
+y_pred_ens = np.argmax(ens_p, axis=1)
+ens_acc = accuracy_score(y_test_c, y_pred_ens)
+ens_f1 = f1_score(y_test_c, y_pred_ens, average='weighted')
+cv_model_c = xgb.XGBClassifier(
+    n_estimators=200, learning_rate=0.05, max_depth=4, subsample=0.7,
+    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
+    gamma=0.5, tree_method='hist', random_state=42,
+)
+cv_c = cross_val_score(cv_model_c, X_c, y_c, cv=5, scoring='accuracy')
+print(f"  XGB  Train: {train_xgb:.4f}  Test: {test_xgb:.4f}  Gap: {train_xgb-test_xgb:.4f}")
+print(f"  LGB  Train: {train_lgb:.4f}  Test: {test_lgb:.4f}  Gap: {train_lgb-test_lgb:.4f}")
+print(f"  Ensemble Test Acc: {ens_acc:.4f}  F1: {ens_f1:.4f}")
+print(f"  CV Acc: {cv_c.mean():.4f} ± {cv_c.std():.4f}")
+all_metrics['complexity_classification'] = {
+    'xgb_train': round(train_xgb, 4), 'xgb_test': round(test_xgb, 4), 'xgb_gap': round(train_xgb-test_xgb, 4),
+    'lgb_train': round(train_lgb, 4), 'lgb_test': round(test_lgb, 4), 'lgb_gap': round(train_lgb-test_lgb, 4),
+    'ensemble_accuracy': round(ens_acc, 4), 'ensemble_f1': round(ens_f1, 4),
+    'cv_accuracy_mean': round(cv_c.mean(), 4), 'cv_accuracy_std': round(cv_c.std(), 4),
+}
+# =====================================================================
+# MODEL 3: Bottleneck — LEAKAGE-FREE
+# =====================================================================
+print("\n" + "=" * 60)
+print("MODEL 3: Bottleneck Predictor (leakage-free)")
+print("=" * 60)
+SAFE_BOTTLENECK_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
+    'transistor_count_log', 'has_dependencies', 'num_dependencies',
+    'constraint_complexity', 'estimated_hours', 'hours_logged',
+    'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
+    'current_stage_idx', 'engineer_skill_factor', 'complexity_score',
+    'hours_budget_pct', 'stage_velocity'
+]
+X_b = df[SAFE_BOTTLENECK_FEATURES]
+y_b = df['bottleneck_encoded']
+X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, y_b, test_size=0.2, random_state=42, stratify=y_b)
+base_bn = xgb.XGBClassifier(
+    n_estimators=300,
+    learning_rate=0.05,
+    max_depth=4,
+    subsample=0.7,
+    colsample_bytree=0.7,
+    min_child_weight=10,
+    reg_alpha=1.0,
+    reg_lambda=5.0,
+    gamma=0.5,
+    objective='multi:softprob',
+    num_class=3,
+    tree_method='hist',
+    random_state=42,
+)
+bn_model = CalibratedClassifierCV(base_bn, cv=3, method='isotonic')
+bn_model.fit(X_train_b, y_train_b)
+train_bn = accuracy_score(y_train_b, bn_model.predict(X_train_b))
+test_bn = accuracy_score(y_test_b, bn_model.predict(X_test_b))
+test_f1_bn = f1_score(y_test_b, bn_model.predict(X_test_b), average='weighted')
+cv_bn = cross_val_score(base_bn, X_b, y_b, cv=5, scoring='accuracy')
+print(f"  Train Acc: {train_bn:.4f}  Test Acc: {test_bn:.4f}  Gap: {train_bn-test_bn:.4f}")
+print(f"  F1 (weighted): {test_f1_bn:.4f}")
+print(f"  CV Acc: {cv_bn.mean():.4f} ± {cv_bn.std():.4f}")
+print(f"\n  Classification Report:")
+print(classification_report(y_test_b, bn_model.predict(X_test_b),
+                            target_names=bottleneck_encoder.classes_))
+all_metrics['bottleneck_prediction'] = {
+    'train_accuracy': round(train_bn, 4), 'test_accuracy': round(test_bn, 4),
+    'gap': round(train_bn - test_bn, 4),
+    'f1_weighted': round(test_f1_bn, 4),
+    'cv_accuracy_mean': round(cv_bn.mean(), 4), 'cv_accuracy_std': round(cv_bn.std(), 4),
+    'features_used': 'SAFE (no leaky features)',
+}
+# =====================================================================
+# MODEL 4: Completion — GROUP-AWARE SPLIT
+# =====================================================================
+print("\n" + "=" * 60)
+print("MODEL 4: Completion Predictor (group-aware split)")
+print("=" * 60)
+COMPLETION_FEATURES = [
+    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
+    'transistor_count_log', 'has_dependencies', 'num_dependencies',
+    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
+    'drc_iterations', 'current_stage_idx', 'cumulative_hours',
+    'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
+    'hours_vs_estimate_ratio', 'stages_completed',
+    'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far'
+]
+# Build samples with block_id
+training_samples = []
+for _, row in completed.iterrows():
+    try:
+        transitions = json.loads(row['transitions'])
+    except:
+        continue
+    total_actual_hours = row['actual_hours']
+    cumulative_hours = 0
+    cumulative_days = 0
+    cumulative_drc = 0
+    cumulative_lvs = 0
+    for i, t in enumerate(transitions):
+        if i == 0:
+            continue
+        cumulative_hours += t.get('hours_in_stage', 0)
+        cumulative_days += t.get('days_in_stage', 0)
+        cumulative_drc += t.get('drc_violations', 0)
+        cumulative_lvs += t.get('lvs_mismatches', 0)
+        remaining = max(0, total_actual_hours - cumulative_hours)
+        training_samples.append({
+            'block_id': row['block_id'],
+            'tech_node_encoded': row.get('tech_node_encoded', 0),
+            'block_type_encoded': row.get('block_type_encoded', 0),
+            'priority_numeric': row['priority_numeric'],
+            'transistor_count_log': row['transistor_count_log'],
+            'has_dependencies': row['has_dependencies'],
+            'num_dependencies': row['num_dependencies'],
+            'constraint_complexity': row['constraint_complexity'],
+            'estimated_hours': row['estimated_hours'],
+            'engineer_skill_factor': row['engineer_skill_factor'],
+            'drc_iterations': row['drc_iterations'],
+            'current_stage_idx': i,
+            'cumulative_hours': cumulative_hours,
+            'cumulative_days': cumulative_days,
+            'cumulative_drc_violations': cumulative_drc,
+            'cumulative_lvs_mismatches': cumulative_lvs,
+            'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
+            'stages_completed': i,
+            'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
+            'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
+            'remaining_hours': remaining,
+        })
+train_df = pd.DataFrame(training_samples)
+# Group-aware split: partition by block_id (first 80% of the shuffled ids train, the rest test) so every row of a block lands on the same side
+unique_blocks = train_df['block_id'].unique()
+rng = np.random.RandomState(42)  # fixed seed keeps the block assignment reproducible
+rng.shuffle(unique_blocks)  # in-place shuffle of the id array
+split_idx = int(len(unique_blocks) * 0.8)
+train_blocks = set(unique_blocks[:split_idx])
+test_blocks = set(unique_blocks[split_idx:])
+train_mask = train_df['block_id'].isin(train_blocks)  # row-level masks derived from the block-level assignment
+test_mask = train_df['block_id'].isin(test_blocks)
+X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
+y_train_g = train_df.loc[train_mask, 'remaining_hours']
+X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
+y_test_g = train_df.loc[test_mask, 'remaining_hours']
+completion_model = xgb.XGBRegressor(
+    n_estimators=500,
+    learning_rate=0.03,
+    max_depth=5,            # reduced from 8
+    subsample=0.7,
+    colsample_bytree=0.7,
+    min_child_weight=10,
+    reg_alpha=1.0,
+    reg_lambda=5.0,
+    gamma=0.5,
+    objective='reg:squarederror',
+    tree_method='hist',
+    random_state=42,
+    early_stopping_rounds=30,
+)
+completion_model.fit(X_train_g, y_train_g, eval_set=[(X_test_g, y_test_g)], verbose=False)
+train_r2_g = r2_score(y_train_g, completion_model.predict(X_train_g))
+test_r2_g = r2_score(y_test_g, completion_model.predict(X_test_g))
+train_mae_g = mean_absolute_error(y_train_g, completion_model.predict(X_train_g))
+test_mae_g = mean_absolute_error(y_test_g, completion_model.predict(X_test_g))
+# GroupKFold CV
+groups = train_df['block_id'].values
+gkf = GroupKFold(n_splits=5)
+cv_model = xgb.XGBRegressor(
+    n_estimators=500, learning_rate=0.03, max_depth=5, subsample=0.7,
+    colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
+    gamma=0.5, tree_method='hist', random_state=42
+)
+cv_scores_g = cross_val_score(cv_model, train_df[COMPLETION_FEATURES],
+                               train_df['remaining_hours'], cv=gkf, groups=groups, scoring='r2')
+print(f"  Train samples: {len(X_train_g)} from {len(train_blocks)} blocks")
+print(f"  Test samples:  {len(X_test_g)} from {len(test_blocks)} blocks")
+print(f"  Train R²: {train_r2_g:.4f}  Test R²: {test_r2_g:.4f}  Gap: {train_r2_g-test_r2_g:.4f}")
+print(f"  Train MAE: {train_mae_g:.2f}  Test MAE: {test_mae_g:.2f}")
+print(f"  GroupKFold CV R²: {cv_scores_g.mean():.4f} ± {cv_scores_g.std():.4f}")
+all_metrics['completion_prediction'] = {
+    'train_r2': round(train_r2_g, 4), 'test_r2': round(test_r2_g, 4),
+    'gap': round(train_r2_g - test_r2_g, 4),
+    'train_mae': round(train_mae_g, 2), 'test_mae': round(test_mae_g, 2),
+    'group_cv_r2_mean': round(cv_scores_g.mean(), 4),
+    'group_cv_r2_std': round(cv_scores_g.std(), 4),
+    'split_type': 'group-aware (block-level)',
+}
+# =====================================================================
+# SAVE ALL v2 MODELS
+# =====================================================================
+print("\n" + "=" * 60)
+print("SAVING v2 MODELS")
+print("=" * 60)
+os.makedirs('/app/models_v2', exist_ok=True)
+joblib.dump(hours_model, '/app/models_v2/hours_estimator.joblib')
+joblib.dump(xgb_clf, '/app/models_v2/complexity_xgb.joblib')
+joblib.dump(lgb_clf, '/app/models_v2/complexity_lgb.joblib')
+joblib.dump(bn_model, '/app/models_v2/bottleneck_predictor.joblib')
+joblib.dump(completion_model, '/app/models_v2/completion_predictor.joblib')
+# Encoders
+joblib.dump(tech_node_encoder, '/app/models_v2/tech_node_encoder.joblib')
+joblib.dump(block_type_encoder, '/app/models_v2/block_type_encoder.joblib')
+joblib.dump(priority_encoder, '/app/models_v2/priority_encoder.joblib')
+joblib.dump(complexity_encoder, '/app/models_v2/complexity_encoder.joblib')
+joblib.dump(bottleneck_encoder, '/app/models_v2/bottleneck_encoder.joblib')
+# Feature config
+feature_config = {
+    'hours_features': HOURS_FEATURES,
+    'complexity_features': COMPLEXITY_FEATURES,
+    'bottleneck_features': SAFE_BOTTLENECK_FEATURES,
+    'completion_features': COMPLETION_FEATURES,
+    'tech_nodes': list(tech_node_encoder.classes_),
+    'block_types': list(block_type_encoder.classes_),
+    'priorities': ['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical'],
+    'complexity_classes': list(complexity_encoder.classes_),
+    'bottleneck_classes': list(bottleneck_encoder.classes_),
+}
+with open('/app/models_v2/feature_config.json', 'w') as f:
+    json.dump(feature_config, f, indent=2)
+# Metrics
+all_metrics['training_data'] = {
+    'total_samples': len(df),
+    'completed_blocks': int(df['is_completed'].sum()),
+    'in_progress_blocks': int((~df['is_completed'].astype(bool)).sum()),
+    'completion_train_samples': len(X_train_g),
+}
+with open('/app/models_v2/metrics.json', 'w') as f:
+    json.dump(all_metrics, f, indent=2)
+print("All v2 models saved to /app/models_v2/")
+# Final summary: print a v1-vs-v2 table; the v1 column is hard-coded text, the v2 column is filled from the test metrics computed above
+print("\n" + "=" * 60)
+print("v1 vs v2 COMPARISON")
+print("=" * 60)
+print(f"""
+┌───────────────────────┬────────────────────────┬────────────────────────┐
+│ Model                 │ v1 (overfit)           │ v2 (fixed)             │
+├───────────────────────┼────────────────────────┼────────────────────────┤
+│ Hours Estimator       │ R²=0.881 (gap 0.113)   │ R²={test_r2:.3f} (gap {train_r2-test_r2:.3f})   │
+│ Complexity Classifier │ Acc=92.3% (gap 5.9%)   │ Acc={test_xgb*100:.1f}% (gap {(train_xgb-test_xgb)*100:.1f}%)   │
+│ Bottleneck Predictor  │ 99.6% (DATA LEAKAGE)   │ {test_bn*100:.1f}% (honest)       │
+│ Completion Predictor  │ R²=0.945 (GROUP LEAK)  │ R²={test_r2_g:.3f} (grouped)     │
+└───────────────────────┴────────────────────────┴────────────────────────┘
+""")