# File size: 18,552 Bytes
"""
ALWAS ML Models v2 — Retrained with overfitting fixes:
1. Hours: stronger regularization (lower depth, higher min_child_weight)
2. Complexity: reduced tree depth + stronger L1/L2
3. Bottleneck: removed leaky features
4. Completion: group-aware split
"""
import numpy as np
import pandas as pd
import json
import joblib
import os
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    classification_report, accuracy_score, f1_score
)
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
import lightgbm as lgb

# ---------------------------------------------------------------------
# Load the dataset and build the encoded / derived columns shared by
# all four models below.
# ---------------------------------------------------------------------
df = pd.read_csv('/app/alwas_blocks_dataset.csv')

# Categorical encoders. Priority is ordinal, so its category order is
# spelled out explicitly (lowest -> highest) instead of being inferred.
tech_node_encoder = LabelEncoder()
block_type_encoder = LabelEncoder()
priority_encoder = OrdinalEncoder(
    categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']]
)

df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node'])
df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type'])
# OrdinalEncoder works on 2-D input and returns a 2-D float array;
# cast to int and collapse to 1-D before storing it as a column.
priority_codes = priority_encoder.fit_transform(df[['priority']])
df['priority_encoded'] = priority_codes.astype(int).ravel()

# Interaction features.
df['type_node_interaction'] = (
    df['tech_node_encoded'].mul(10).add(df['block_type_encoded'])
)
df['complexity_score'] = df['constraint_complexity'].mul(df['transistor_count_log'])
# NOTE(review): this uses the dataset's own `priority_numeric` column rather
# than `priority_encoded` computed above — confirm that is intentional.
df['size_priority_interaction'] = df['transistor_count_log'].mul(df['priority_numeric'])

# Label encoders for the two classification targets.
complexity_encoder = LabelEncoder()
df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity'])
bottleneck_encoder = LabelEncoder()
df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk'])

# Safe derived features for bottleneck.
# Denominators are clipped to >= 1 so zero estimates / stage index 0
# cannot cause a division by zero.
estimated_hours_floor = df['estimated_hours'].clip(lower=1)
stage_idx_floor = df['current_stage_idx'].clip(lower=1)
df['hours_budget_pct'] = df['hours_logged'].div(estimated_hours_floor).mul(100)
df['stage_velocity'] = df['hours_logged'].div(stage_idx_floor)

# Rows flagged as completed; used by Models 1, 2 and 4.
completed = df.loc[df['is_completed'] == 1].copy()

# Collects the per-model metric dicts that are written to metrics.json.
all_metrics = {}

# =====================================================================
# MODEL 1: Hours Estimator — REGULARIZED
# =====================================================================
# Regresses `actual_hours` of completed blocks on HOURS_FEATURES.
# Publishes: HOURS_FEATURES, hours_model, train_r2, test_r2 and
# all_metrics['hours_estimation'] for the save/summary section.
print("=" * 60)
print("MODEL 1: Hours Estimator (regularized)")
print("=" * 60)

HOURS_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'engineer_skill_factor', 'type_node_interaction', 'complexity_score',
    'size_priority_interaction'
]

X_h = completed[HOURS_FEATURES]
y_h = completed['actual_hours']
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_h, y_h, test_size=0.2, random_state=42
)
# Early stopping picks the number of boosting rounds by watching eval_set.
# Watching the test set would tune the model on the very data used to report
# test metrics, so a separate validation split is carved out of the training
# portion and the test set stays untouched until scoring.
X_fit_h, X_val_h, y_fit_h, y_val_h = train_test_split(
    X_train_h, y_train_h, test_size=0.2, random_state=42
)

# Hyper-parameters shared by the final model and the cross-validation model,
# kept in one place so the two cannot drift apart.
_hours_params = dict(
    n_estimators=300,       # reduced from 500
    learning_rate=0.05,
    max_depth=4,            # reduced from 7
    subsample=0.7,          # reduced from 0.8
    colsample_bytree=0.7,   # reduced from 0.8
    min_child_weight=10,    # increased from 3
    reg_alpha=1.0,          # increased from 0.1
    reg_lambda=5.0,         # increased from 1.0
    gamma=0.5,              # added: min split loss
    tree_method='hist',
    random_state=42,
)

hours_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    early_stopping_rounds=30,
    **_hours_params,
)
hours_model.fit(X_fit_h, y_fit_h, eval_set=[(X_val_h, y_val_h)], verbose=False)

# Predict once per split and reuse for both R² and MAE.
# "Train" metrics are computed on the rows the trees were actually fit on.
fit_pred_h = hours_model.predict(X_fit_h)
test_pred_h = hours_model.predict(X_test_h)
train_r2 = r2_score(y_fit_h, fit_pred_h)
test_r2 = r2_score(y_test_h, test_pred_h)
train_mae = mean_absolute_error(y_fit_h, fit_pred_h)
test_mae = mean_absolute_error(y_test_h, test_pred_h)

# 5-fold CV on all completed blocks; no early stopping here because
# cross_val_score does not supply an eval_set.
cv_model_h = xgb.XGBRegressor(**_hours_params)
cv = cross_val_score(cv_model_h, X_h, y_h, cv=5, scoring='r2')

print(f"  Train R²: {train_r2:.4f}  Test R²: {test_r2:.4f}  Gap: {train_r2-test_r2:.4f}")
print(f"  Train MAE: {train_mae:.2f}  Test MAE: {test_mae:.2f}")
print(f"  CV R²: {cv.mean():.4f} ± {cv.std():.4f}")

all_metrics['hours_estimation'] = {
    'train_r2': round(train_r2, 4), 'test_r2': round(test_r2, 4),
    'train_mae': round(train_mae, 2), 'test_mae': round(test_mae, 2),
    'gap': round(train_r2 - test_r2, 4),
    'cv_r2_mean': round(cv.mean(), 4), 'cv_r2_std': round(cv.std(), 4),
}

# =====================================================================
# MODEL 2: Complexity Classifier — REGULARIZED
# =====================================================================
# Classifies `complexity_encoded` of completed blocks with an XGBoost and a
# LightGBM model, and scores their averaged-probability ensemble.
# Publishes: COMPLEXITY_FEATURES, xgb_clf, lgb_clf, train_xgb, test_xgb and
# all_metrics['complexity_classification'] for the save/summary section.
print("\n" + "=" * 60)
print("MODEL 2: Complexity Classifier (regularized)")
print("=" * 60)

COMPLEXITY_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count', 'transistor_count_log', 'has_dependencies',
    'num_dependencies', 'constraint_complexity', 'drc_iterations',
    'type_node_interaction', 'complexity_score', 'size_priority_interaction'
]

X_c = completed[COMPLEXITY_FEATURES]
y_c = completed['complexity_encoded']
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c, y_c, test_size=0.2, random_state=42, stratify=y_c
)
# Early stopping must not monitor the test set (that would tune the number of
# boosting rounds on the data used for the reported test accuracy), so a
# stratified validation split is taken from the training portion instead.
X_fit_c, X_val_c, y_fit_c, y_val_c = train_test_split(
    X_train_c, y_train_c, test_size=0.2, random_state=42, stratify=y_train_c
)

# Hyper-parameters shared by the final XGBoost model and its CV counterpart.
_complexity_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,            # reduced from 6
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,    # increased
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    tree_method='hist',
    random_state=42,
)

xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    early_stopping_rounds=30,
    **_complexity_params,
)
xgb_clf.fit(X_fit_c, y_fit_c, eval_set=[(X_val_c, y_val_c)], verbose=False)

# LightGBM runs a fixed number of rounds (no early stopping), so it needs no
# validation set and is trained on the whole training portion.
lgb_clf = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=15,          # reduced from 63
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_samples=20,   # increased
    reg_alpha=1.0,
    reg_lambda=5.0,
    random_state=42,
    verbose=-1,
)
lgb_clf.fit(X_train_c, y_train_c)

# Each model's "train" accuracy is measured on the rows it was fit on.
train_xgb = accuracy_score(y_fit_c, xgb_clf.predict(X_fit_c))
test_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
train_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c))
test_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))

# Ensemble: average the two models' class probabilities, take the argmax.
xgb_p = xgb_clf.predict_proba(X_test_c)
lgb_p = lgb_clf.predict_proba(X_test_c)
ens_p = (xgb_p + lgb_p) / 2
y_pred_ens = np.argmax(ens_p, axis=1)
ens_acc = accuracy_score(y_test_c, y_pred_ens)
ens_f1 = f1_score(y_test_c, y_pred_ens, average='weighted')

# 5-fold CV of the XGBoost configuration on all completed blocks
# (no early stopping: cross_val_score does not supply an eval_set).
cv_model_c = xgb.XGBClassifier(**_complexity_params)
cv_c = cross_val_score(cv_model_c, X_c, y_c, cv=5, scoring='accuracy')

print(f"  XGB  Train: {train_xgb:.4f}  Test: {test_xgb:.4f}  Gap: {train_xgb-test_xgb:.4f}")
print(f"  LGB  Train: {train_lgb:.4f}  Test: {test_lgb:.4f}  Gap: {train_lgb-test_lgb:.4f}")
print(f"  Ensemble Test Acc: {ens_acc:.4f}  F1: {ens_f1:.4f}")
print(f"  CV Acc: {cv_c.mean():.4f} ± {cv_c.std():.4f}")

all_metrics['complexity_classification'] = {
    'xgb_train': round(train_xgb, 4), 'xgb_test': round(test_xgb, 4), 'xgb_gap': round(train_xgb-test_xgb, 4),
    'lgb_train': round(train_lgb, 4), 'lgb_test': round(test_lgb, 4), 'lgb_gap': round(train_lgb-test_lgb, 4),
    'ensemble_accuracy': round(ens_acc, 4), 'ensemble_f1': round(ens_f1, 4),
    'cv_accuracy_mean': round(cv_c.mean(), 4), 'cv_accuracy_std': round(cv_c.std(), 4),
}

# =====================================================================
# MODEL 3: Bottleneck — LEAKAGE-FREE
# =====================================================================
# Classifies `bottleneck_encoded` for every row of df (completed or not)
# with an isotonic-calibrated XGBoost classifier.
# Publishes: SAFE_BOTTLENECK_FEATURES, bn_model, test_bn and
# all_metrics['bottleneck_prediction'] for the save/summary section.
print("\n" + "=" * 60)
print("MODEL 3: Bottleneck Predictor (leakage-free)")
print("=" * 60)

SAFE_BOTTLENECK_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'hours_logged',
    'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
    'current_stage_idx', 'engineer_skill_factor', 'complexity_score',
    'hours_budget_pct', 'stage_velocity'
]

X_b = df[SAFE_BOTTLENECK_FEATURES]
y_b = df['bottleneck_encoded']
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.2, random_state=42, stratify=y_b
)

_bottleneck_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 4,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 10,
    'reg_alpha': 1.0,
    'reg_lambda': 5.0,
    'gamma': 0.5,
    'objective': 'multi:softprob',
    'num_class': 3,
    'tree_method': 'hist',
    'random_state': 42,
}
base_bn = xgb.XGBClassifier(**_bottleneck_params)

# The saved model is the calibrated wrapper (3-fold, isotonic), not base_bn.
bn_model = CalibratedClassifierCV(base_bn, cv=3, method='isotonic')
bn_model.fit(X_train_b, y_train_b)

# Predict once per split; the results feed every metric and the report.
train_pred_b = bn_model.predict(X_train_b)
test_pred_b = bn_model.predict(X_test_b)

train_bn = accuracy_score(y_train_b, train_pred_b)
test_bn = accuracy_score(y_test_b, test_pred_b)
test_f1_bn = f1_score(y_test_b, test_pred_b, average='weighted')
# Cross-validation scores the uncalibrated base estimator on all rows.
cv_bn = cross_val_score(base_bn, X_b, y_b, cv=5, scoring='accuracy')

print(f"  Train Acc: {train_bn:.4f}  Test Acc: {test_bn:.4f}  Gap: {train_bn-test_bn:.4f}")
print(f"  F1 (weighted): {test_f1_bn:.4f}")
print(f"  CV Acc: {cv_bn.mean():.4f} ± {cv_bn.std():.4f}")
print(f"\n  Classification Report:")
bn_report = classification_report(
    y_test_b, test_pred_b, target_names=bottleneck_encoder.classes_
)
print(bn_report)

all_metrics['bottleneck_prediction'] = {
    'train_accuracy': round(train_bn, 4),
    'test_accuracy': round(test_bn, 4),
    'gap': round(train_bn - test_bn, 4),
    'f1_weighted': round(test_f1_bn, 4),
    'cv_accuracy_mean': round(cv_bn.mean(), 4),
    'cv_accuracy_std': round(cv_bn.std(), 4),
    'features_used': 'SAFE (no leaky features)',
}

# =====================================================================
# MODEL 4: Completion — GROUP-AWARE SPLIT
# =====================================================================
# Regresses the hours remaining until completion. Each completed block is
# expanded into one sample per stage transition (cumulative effort so far ->
# remaining hours). All samples of a block stay on the same side of every
# split, so the model is never scored on a block it has already seen.
# Publishes: COMPLETION_FEATURES, completion_model, X_train_g, test_r2_g and
# all_metrics['completion_prediction'] for the save/summary section.
print("\n" + "=" * 60)
print("MODEL 4: Completion Predictor (group-aware split)")
print("=" * 60)

COMPLETION_FEATURES = [
    'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
    'transistor_count_log', 'has_dependencies', 'num_dependencies',
    'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
    'drc_iterations', 'current_stage_idx', 'cumulative_hours',
    'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
    'hours_vs_estimate_ratio', 'stages_completed',
    'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far'
]

# Build samples with block_id
training_samples = []
skipped_blocks = 0
for _, row in completed.iterrows():
    try:
        transitions = json.loads(row['transitions'])
    except (TypeError, ValueError):
        # TypeError: the cell is not a string (e.g. NaN for a missing value).
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        # Anything else (KeyboardInterrupt, a missing column) must propagate.
        skipped_blocks += 1
        continue
    total_actual_hours = row['actual_hours']
    cumulative_hours = 0
    cumulative_days = 0
    cumulative_drc = 0
    cumulative_lvs = 0
    for i, t in enumerate(transitions):
        # The first transition is skipped: it contributes neither a sample
        # nor any hours/days/violations to the running totals.
        if i == 0:
            continue
        cumulative_hours += t.get('hours_in_stage', 0)
        cumulative_days += t.get('days_in_stage', 0)
        cumulative_drc += t.get('drc_violations', 0)
        cumulative_lvs += t.get('lvs_mismatches', 0)
        # Clamp at zero in case logged stage hours exceed the recorded total.
        remaining = max(0, total_actual_hours - cumulative_hours)
        training_samples.append({
            'block_id': row['block_id'],
            # Indexed directly (no silent default): a missing encoded column
            # should fail loudly rather than train on constant zeros.
            'tech_node_encoded': row['tech_node_encoded'],
            'block_type_encoded': row['block_type_encoded'],
            'priority_numeric': row['priority_numeric'],
            'transistor_count_log': row['transistor_count_log'],
            'has_dependencies': row['has_dependencies'],
            'num_dependencies': row['num_dependencies'],
            'constraint_complexity': row['constraint_complexity'],
            'estimated_hours': row['estimated_hours'],
            'engineer_skill_factor': row['engineer_skill_factor'],
            'drc_iterations': row['drc_iterations'],
            'current_stage_idx': i,
            'cumulative_hours': cumulative_hours,
            'cumulative_days': cumulative_days,
            'cumulative_drc_violations': cumulative_drc,
            'cumulative_lvs_mismatches': cumulative_lvs,
            'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
            'stages_completed': i,
            'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
            'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
            'remaining_hours': remaining,
        })

if skipped_blocks:
    print(f"  WARNING: skipped {skipped_blocks} blocks with unparseable transitions")
if not training_samples:
    raise ValueError("No completion training samples could be built from completed blocks")

train_df = pd.DataFrame(training_samples)

# Group-aware split: shuffle block ids, then cut by position so that
# the first 80% are training blocks and the last 20% are test blocks.
unique_blocks = train_df['block_id'].unique()
rng = np.random.RandomState(42)
rng.shuffle(unique_blocks)
split_idx = int(len(unique_blocks) * 0.8)
# Early stopping must not monitor the test blocks (that would tune the number
# of boosting rounds on the data used for the reported test metrics), so the
# last 20% of the *training* blocks are held out as a validation set.
val_idx = int(split_idx * 0.8)
fit_blocks = set(unique_blocks[:val_idx])
val_blocks = set(unique_blocks[val_idx:split_idx])
train_blocks = fit_blocks | val_blocks
test_blocks = set(unique_blocks[split_idx:])

train_mask = train_df['block_id'].isin(train_blocks)
test_mask = train_df['block_id'].isin(test_blocks)
fit_mask = train_df['block_id'].isin(fit_blocks)
val_mask = train_df['block_id'].isin(val_blocks)

# X_train_g / y_train_g cover fit + validation blocks; they are kept for
# the sample counts reported here and in metrics.json.
X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
y_train_g = train_df.loc[train_mask, 'remaining_hours']
X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
y_test_g = train_df.loc[test_mask, 'remaining_hours']
X_fit_g = train_df.loc[fit_mask, COMPLETION_FEATURES]
y_fit_g = train_df.loc[fit_mask, 'remaining_hours']
X_val_g = train_df.loc[val_mask, COMPLETION_FEATURES]
y_val_g = train_df.loc[val_mask, 'remaining_hours']

# Hyper-parameters shared by the final model and the GroupKFold CV model.
_completion_params = dict(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=5,            # reduced from 8
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=10,
    reg_alpha=1.0,
    reg_lambda=5.0,
    gamma=0.5,
    tree_method='hist',
    random_state=42,
)

completion_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    early_stopping_rounds=30,
    **_completion_params,
)
completion_model.fit(X_fit_g, y_fit_g, eval_set=[(X_val_g, y_val_g)], verbose=False)

# Predict once per split and reuse for both R² and MAE.
# "Train" metrics are computed on the rows the trees were actually fit on.
fit_pred_g = completion_model.predict(X_fit_g)
test_pred_g = completion_model.predict(X_test_g)
train_r2_g = r2_score(y_fit_g, fit_pred_g)
test_r2_g = r2_score(y_test_g, test_pred_g)
train_mae_g = mean_absolute_error(y_fit_g, fit_pred_g)
test_mae_g = mean_absolute_error(y_test_g, test_pred_g)

# GroupKFold CV over all samples, grouped by block so folds never share one.
groups = train_df['block_id'].values
gkf = GroupKFold(n_splits=5)
cv_model = xgb.XGBRegressor(**_completion_params)
cv_scores_g = cross_val_score(
    cv_model, train_df[COMPLETION_FEATURES], train_df['remaining_hours'],
    cv=gkf, groups=groups, scoring='r2',
)

print(f"  Train samples: {len(X_train_g)} from {len(train_blocks)} blocks")
print(f"  Test samples:  {len(X_test_g)} from {len(test_blocks)} blocks")
print(f"  Early-stopping validation: {len(X_val_g)} samples from {len(val_blocks)} of the train blocks")
print(f"  Train R²: {train_r2_g:.4f}  Test R²: {test_r2_g:.4f}  Gap: {train_r2_g-test_r2_g:.4f}")
print(f"  Train MAE: {train_mae_g:.2f}  Test MAE: {test_mae_g:.2f}")
print(f"  GroupKFold CV R²: {cv_scores_g.mean():.4f} ± {cv_scores_g.std():.4f}")

all_metrics['completion_prediction'] = {
    'train_r2': round(train_r2_g, 4), 'test_r2': round(test_r2_g, 4),
    'gap': round(train_r2_g - test_r2_g, 4),
    'train_mae': round(train_mae_g, 2), 'test_mae': round(test_mae_g, 2),
    'group_cv_r2_mean': round(cv_scores_g.mean(), 4),
    'group_cv_r2_std': round(cv_scores_g.std(), 4),
    'split_type': 'group-aware (block-level)',
}

# =====================================================================
# SAVE ALL v2 MODELS
# =====================================================================
# Persists every fitted model and encoder with joblib, writes the feature
# configuration and collected metrics as JSON, then prints a v1-vs-v2 table.
print("\n" + "=" * 60)
print("SAVING v2 MODELS")
print("=" * 60)

os.makedirs('/app/models_v2', exist_ok=True)

# (object, destination) pairs, dumped in order: models first, then encoders.
_artifacts = [
    (hours_model, '/app/models_v2/hours_estimator.joblib'),
    (xgb_clf, '/app/models_v2/complexity_xgb.joblib'),
    (lgb_clf, '/app/models_v2/complexity_lgb.joblib'),
    (bn_model, '/app/models_v2/bottleneck_predictor.joblib'),
    (completion_model, '/app/models_v2/completion_predictor.joblib'),
    (tech_node_encoder, '/app/models_v2/tech_node_encoder.joblib'),
    (block_type_encoder, '/app/models_v2/block_type_encoder.joblib'),
    (priority_encoder, '/app/models_v2/priority_encoder.joblib'),
    (complexity_encoder, '/app/models_v2/complexity_encoder.joblib'),
    (bottleneck_encoder, '/app/models_v2/bottleneck_encoder.joblib'),
]
for _artifact, _destination in _artifacts:
    joblib.dump(_artifact, _destination)

# Feature config: the feature lists each model was trained on plus the
# category vocabularies of the encoders.
feature_config = {
    'hours_features': HOURS_FEATURES,
    'complexity_features': COMPLEXITY_FEATURES,
    'bottleneck_features': SAFE_BOTTLENECK_FEATURES,
    'completion_features': COMPLETION_FEATURES,
    'tech_nodes': list(tech_node_encoder.classes_),
    'block_types': list(block_type_encoder.classes_),
    'priorities': ['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical'],
    'complexity_classes': list(complexity_encoder.classes_),
    'bottleneck_classes': list(bottleneck_encoder.classes_),
}
with open('/app/models_v2/feature_config.json', 'w') as f:
    f.write(json.dumps(feature_config, indent=2))

# Metrics: add dataset-size bookkeeping, then write everything collected
# by the model sections above.
_not_completed = ~df['is_completed'].astype(bool)
all_metrics['training_data'] = {
    'total_samples': len(df),
    'completed_blocks': int(df['is_completed'].sum()),
    'in_progress_blocks': int(_not_completed.sum()),
    'completion_train_samples': len(X_train_g),
}
with open('/app/models_v2/metrics.json', 'w') as f:
    f.write(json.dumps(all_metrics, indent=2))

print("All v2 models saved to /app/models_v2/")

# Final summary. The v1 column holds fixed reference figures; the v2 column
# is filled in from the metrics computed in this run.
print("\n" + "=" * 60)
print("v1 vs v2 COMPARISON")
print("=" * 60)
print(f"""
┌───────────────────────┬────────────────────────┬────────────────────────┐
│ Model                 │ v1 (overfit)           │ v2 (fixed)             │
├───────────────────────┼────────────────────────┼────────────────────────┤
│ Hours Estimator       │ R²=0.881 (gap 0.113)   │ R²={test_r2:.3f} (gap {train_r2-test_r2:.3f})   │
│ Complexity Classifier │ Acc=92.3% (gap 5.9%)   │ Acc={test_xgb*100:.1f}% (gap {(train_xgb-test_xgb)*100:.1f}%)   │
│ Bottleneck Predictor  │ 99.6% (DATA LEAKAGE)   │ {test_bn*100:.1f}% (honest)       │
│ Completion Predictor  │ R²=0.945 (GROUP LEAK)  │ R²={test_r2_g:.3f} (grouped)     │
└───────────────────────┴────────────────────────┴────────────────────────┘
""")