"""
ALWAS ML Models v2 - retrained with overfitting fixes:
1. Hours: stronger regularization (lower depth, higher min_child_weight)
2. Complexity: reduced tree depth + stronger L1/L2
3. Bottleneck: removed leaky features
4. Completion: group-aware split
"""
import numpy as np
import pandas as pd
import json
import joblib
import os
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import (
mean_absolute_error, mean_squared_error, r2_score,
classification_report, accuracy_score, f1_score
)
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
import lightgbm as lgb
# Load the block-level dataset
df = pd.read_csv('/app/alwas_blocks_dataset.csv')
# Encode categorical features
tech_node_encoder = LabelEncoder()
block_type_encoder = LabelEncoder()
priority_encoder = OrdinalEncoder(categories=[['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical']])
df['tech_node_encoded'] = tech_node_encoder.fit_transform(df['tech_node'])
df['block_type_encoded'] = block_type_encoder.fit_transform(df['block_type'])
df['priority_encoded'] = priority_encoder.fit_transform(df[['priority']]).astype(int).flatten()
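# With the explicit category order above, OrdinalEncoder maps
# P4-Low -> 0, P3-Medium -> 1, P2-High -> 2, P1-Critical -> 3,
# so larger encoded values mean higher priority.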
df['type_node_interaction'] = df['tech_node_encoded'] * 10 + df['block_type_encoded']
df['complexity_score'] = df['constraint_complexity'] * df['transistor_count_log']
df['size_priority_interaction'] = df['transistor_count_log'] * df['priority_numeric']
complexity_encoder = LabelEncoder()
df['complexity_encoded'] = complexity_encoder.fit_transform(df['complexity'])
bottleneck_encoder = LabelEncoder()
df['bottleneck_encoded'] = bottleneck_encoder.fit_transform(df['bottleneck_risk'])
# Safe derived features for bottleneck
df['hours_budget_pct'] = df['hours_logged'] / df['estimated_hours'].clip(lower=1) * 100
df['stage_velocity'] = df['hours_logged'] / df['current_stage_idx'].clip(lower=1)
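# Sanity check (added sketch): the clip(lower=1) guards above should keep both
# ratios finite; warn rather than fail if the source CSV ever violates that.
if not np.isfinite(df[['hours_budget_pct', 'stage_velocity']].to_numpy()).all():
    print("WARNING: non-finite values in derived bottleneck features")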
# Only completed blocks carry ground-truth outcomes (actual_hours, final complexity)
completed = df[df['is_completed'] == 1].copy()
all_metrics = {}
# =====================================================================
# MODEL 1: Hours Estimator - REGULARIZED
# =====================================================================
print("=" * 60)
print("MODEL 1: Hours Estimator (regularized)")
print("=" * 60)
HOURS_FEATURES = [
'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
'transistor_count', 'transistor_count_log', 'has_dependencies',
'num_dependencies', 'constraint_complexity', 'drc_iterations',
'engineer_skill_factor', 'type_node_interaction', 'complexity_score',
'size_priority_interaction'
]
X_h = completed[HOURS_FEATURES]
y_h = completed['actual_hours']
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, random_state=42)
hours_model = xgb.XGBRegressor(
n_estimators=300, # reduced from 500
learning_rate=0.05,
max_depth=4, # reduced from 7
subsample=0.7, # reduced from 0.8
colsample_bytree=0.7, # reduced from 0.8
min_child_weight=10, # increased from 3
reg_alpha=1.0, # increased from 0.1
reg_lambda=5.0, # increased from 1.0
gamma=0.5, # added: min split loss
objective='reg:squarederror',
tree_method='hist',
random_state=42,
early_stopping_rounds=30,
)
hours_model.fit(X_train_h, y_train_h, eval_set=[(X_test_h, y_test_h)], verbose=False)
train_r2 = r2_score(y_train_h, hours_model.predict(X_train_h))
test_r2 = r2_score(y_test_h, hours_model.predict(X_test_h))
train_mae = mean_absolute_error(y_train_h, hours_model.predict(X_train_h))
test_mae = mean_absolute_error(y_test_h, hours_model.predict(X_test_h))
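# A clone without early_stopping_rounds for cross-validation, since
# cross_val_score does not supply a per-fold eval_set.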
cv_model_h = xgb.XGBRegressor(
n_estimators=300, learning_rate=0.05, max_depth=4, subsample=0.7,
colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
gamma=0.5, tree_method='hist', random_state=42,
)
cv = cross_val_score(cv_model_h, X_h, y_h, cv=5, scoring='r2')
print(f" Train RΒ²: {train_r2:.4f} Test RΒ²: {test_r2:.4f} Gap: {train_r2-test_r2:.4f}")
print(f" Train MAE: {train_mae:.2f} Test MAE: {test_mae:.2f}")
print(f" CV RΒ²: {cv.mean():.4f} Β± {cv.std():.4f}")
all_metrics['hours_estimation'] = {
'train_r2': round(train_r2, 4), 'test_r2': round(test_r2, 4),
'train_mae': round(train_mae, 2), 'test_mae': round(test_mae, 2),
'gap': round(train_r2 - test_r2, 4),
'cv_r2_mean': round(cv.mean(), 4), 'cv_r2_std': round(cv.std(), 4),
}
# =====================================================================
# MODEL 2: Complexity Classifier - REGULARIZED
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 2: Complexity Classifier (regularized)")
print("=" * 60)
COMPLEXITY_FEATURES = [
'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
'transistor_count', 'transistor_count_log', 'has_dependencies',
'num_dependencies', 'constraint_complexity', 'drc_iterations',
'type_node_interaction', 'complexity_score', 'size_priority_interaction'
]
X_c = completed[COMPLEXITY_FEATURES]
y_c = completed['complexity_encoded']
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42, stratify=y_c)
xgb_clf = xgb.XGBClassifier(
n_estimators=200,
learning_rate=0.05,
max_depth=4, # reduced from 6
subsample=0.7,
colsample_bytree=0.7,
min_child_weight=10, # increased
reg_alpha=1.0,
reg_lambda=5.0,
gamma=0.5,
objective='multi:softprob',
num_class=3,
tree_method='hist',
random_state=42,
early_stopping_rounds=30,
)
xgb_clf.fit(X_train_c, y_train_c, eval_set=[(X_test_c, y_test_c)], verbose=False)
lgb_clf = lgb.LGBMClassifier(
n_estimators=200,
learning_rate=0.05,
num_leaves=15, # reduced from 63
max_depth=4,
subsample=0.7,
colsample_bytree=0.7,
min_child_samples=20, # increased
reg_alpha=1.0,
reg_lambda=5.0,
random_state=42,
verbose=-1,
)
lgb_clf.fit(X_train_c, y_train_c)
train_xgb = accuracy_score(y_train_c, xgb_clf.predict(X_train_c))
test_xgb = accuracy_score(y_test_c, xgb_clf.predict(X_test_c))
train_lgb = accuracy_score(y_train_c, lgb_clf.predict(X_train_c))
test_lgb = accuracy_score(y_test_c, lgb_clf.predict(X_test_c))
# Soft-voting ensemble: average the two models' class probabilities, then argmax
xgb_p = xgb_clf.predict_proba(X_test_c)
lgb_p = lgb_clf.predict_proba(X_test_c)
ens_p = (xgb_p + lgb_p) / 2
y_pred_ens = np.argmax(ens_p, axis=1)
ens_acc = accuracy_score(y_test_c, y_pred_ens)
ens_f1 = f1_score(y_test_c, y_pred_ens, average='weighted')
cv_model_c = xgb.XGBClassifier(
n_estimators=200, learning_rate=0.05, max_depth=4, subsample=0.7,
colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
gamma=0.5, tree_method='hist', random_state=42,
)
cv_c = cross_val_score(cv_model_c, X_c, y_c, cv=5, scoring='accuracy')
print(f" XGB Train: {train_xgb:.4f} Test: {test_xgb:.4f} Gap: {train_xgb-test_xgb:.4f}")
print(f" LGB Train: {train_lgb:.4f} Test: {test_lgb:.4f} Gap: {train_lgb-test_lgb:.4f}")
print(f" Ensemble Test Acc: {ens_acc:.4f} F1: {ens_f1:.4f}")
print(f" CV Acc: {cv_c.mean():.4f} Β± {cv_c.std():.4f}")
all_metrics['complexity_classification'] = {
'xgb_train': round(train_xgb, 4), 'xgb_test': round(test_xgb, 4), 'xgb_gap': round(train_xgb-test_xgb, 4),
'lgb_train': round(train_lgb, 4), 'lgb_test': round(test_lgb, 4), 'lgb_gap': round(train_lgb-test_lgb, 4),
'ensemble_accuracy': round(ens_acc, 4), 'ensemble_f1': round(ens_f1, 4),
'cv_accuracy_mean': round(cv_c.mean(), 4), 'cv_accuracy_std': round(cv_c.std(), 4),
}
# =====================================================================
# MODEL 3: Bottleneck - LEAKAGE-FREE
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 3: Bottleneck Predictor (leakage-free)")
print("=" * 60)
SAFE_BOTTLENECK_FEATURES = [
'tech_node_encoded', 'block_type_encoded', 'priority_encoded',
'transistor_count_log', 'has_dependencies', 'num_dependencies',
'constraint_complexity', 'estimated_hours', 'hours_logged',
'drc_iterations', 'drc_violations_total', 'lvs_mismatches_total',
'current_stage_idx', 'engineer_skill_factor', 'complexity_score',
'hours_budget_pct', 'stage_velocity'
]
X_b = df[SAFE_BOTTLENECK_FEATURES]
y_b = df['bottleneck_encoded']
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, y_b, test_size=0.2, random_state=42, stratify=y_b)
base_bn = xgb.XGBClassifier(
n_estimators=300,
learning_rate=0.05,
max_depth=4,
subsample=0.7,
colsample_bytree=0.7,
min_child_weight=10,
reg_alpha=1.0,
reg_lambda=5.0,
gamma=0.5,
objective='multi:softprob',
num_class=3,
tree_method='hist',
random_state=42,
)
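# Wrap the base model in isotonic calibration over 3 internal folds so that
# bn_model.predict_proba returns calibrated class probabilities.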
bn_model = CalibratedClassifierCV(base_bn, cv=3, method='isotonic')
bn_model.fit(X_train_b, y_train_b)
y_pred_bn = bn_model.predict(X_test_b)
train_bn = accuracy_score(y_train_b, bn_model.predict(X_train_b))
test_bn = accuracy_score(y_test_b, y_pred_bn)
test_f1_bn = f1_score(y_test_b, y_pred_bn, average='weighted')
cv_bn = cross_val_score(base_bn, X_b, y_b, cv=5, scoring='accuracy')
print(f" Train Acc: {train_bn:.4f} Test Acc: {test_bn:.4f} Gap: {train_bn-test_bn:.4f}")
print(f" F1 (weighted): {test_f1_bn:.4f}")
print(f" CV Acc: {cv_bn.mean():.4f} Β± {cv_bn.std():.4f}")
print(f"\n Classification Report:")
print(classification_report(y_test_b, bn_model.predict(X_test_b),
target_names=bottleneck_encoder.classes_))
all_metrics['bottleneck_prediction'] = {
'train_accuracy': round(train_bn, 4), 'test_accuracy': round(test_bn, 4),
'gap': round(train_bn - test_bn, 4),
'f1_weighted': round(test_f1_bn, 4),
'cv_accuracy_mean': round(cv_bn.mean(), 4), 'cv_accuracy_std': round(cv_bn.std(), 4),
'features_used': 'SAFE (no leaky features)',
}
# =====================================================================
# MODEL 4: Completion - GROUP-AWARE SPLIT
# =====================================================================
print("\n" + "=" * 60)
print("MODEL 4: Completion Predictor (group-aware split)")
print("=" * 60)
COMPLETION_FEATURES = [
'tech_node_encoded', 'block_type_encoded', 'priority_numeric',
'transistor_count_log', 'has_dependencies', 'num_dependencies',
'constraint_complexity', 'estimated_hours', 'engineer_skill_factor',
'drc_iterations', 'current_stage_idx', 'cumulative_hours',
'cumulative_days', 'cumulative_drc_violations', 'cumulative_lvs_mismatches',
'hours_vs_estimate_ratio', 'stages_completed',
'avg_hours_per_stage_so_far', 'avg_days_per_stage_so_far'
]
# Build one training sample per completed stage transition, tagged with
# block_id so the split below can keep whole blocks together
training_samples = []
for _, row in completed.iterrows():
try:
transitions = json.loads(row['transitions'])
    except (TypeError, ValueError):  # missing or malformed transitions JSON
continue
total_actual_hours = row['actual_hours']
cumulative_hours = 0
cumulative_days = 0
cumulative_drc = 0
cumulative_lvs = 0
for i, t in enumerate(transitions):
        if i == 0:
            # The first transition marks entry into the initial stage before any
            # effort accrues (assumption inferred from the accumulation below).
            continue
cumulative_hours += t.get('hours_in_stage', 0)
cumulative_days += t.get('days_in_stage', 0)
cumulative_drc += t.get('drc_violations', 0)
cumulative_lvs += t.get('lvs_mismatches', 0)
remaining = max(0, total_actual_hours - cumulative_hours)
training_samples.append({
'block_id': row['block_id'],
'tech_node_encoded': row.get('tech_node_encoded', 0),
'block_type_encoded': row.get('block_type_encoded', 0),
'priority_numeric': row['priority_numeric'],
'transistor_count_log': row['transistor_count_log'],
'has_dependencies': row['has_dependencies'],
'num_dependencies': row['num_dependencies'],
'constraint_complexity': row['constraint_complexity'],
'estimated_hours': row['estimated_hours'],
'engineer_skill_factor': row['engineer_skill_factor'],
'drc_iterations': row['drc_iterations'],
'current_stage_idx': i,
'cumulative_hours': cumulative_hours,
'cumulative_days': cumulative_days,
'cumulative_drc_violations': cumulative_drc,
'cumulative_lvs_mismatches': cumulative_lvs,
'hours_vs_estimate_ratio': cumulative_hours / max(row['estimated_hours'], 1),
'stages_completed': i,
'avg_hours_per_stage_so_far': cumulative_hours / max(i, 1),
'avg_days_per_stage_so_far': cumulative_days / max(i, 1),
'remaining_hours': remaining,
})
train_df = pd.DataFrame(training_samples)
# Group-aware split
unique_blocks = train_df['block_id'].unique()
rng = np.random.RandomState(42)
rng.shuffle(unique_blocks)
split_idx = int(len(unique_blocks) * 0.8)
train_blocks = set(unique_blocks[:split_idx])
test_blocks = set(unique_blocks[split_idx:])
train_mask = train_df['block_id'].isin(train_blocks)
test_mask = train_df['block_id'].isin(test_blocks)
X_train_g = train_df.loc[train_mask, COMPLETION_FEATURES]
y_train_g = train_df.loc[train_mask, 'remaining_hours']
X_test_g = train_df.loc[test_mask, COMPLETION_FEATURES]
y_test_g = train_df.loc[test_mask, 'remaining_hours']
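# Sanity check (added sketch): a group-aware split means no block_id may land
# in both partitions - the leak the v1 row-level split allowed.
assert train_blocks.isdisjoint(test_blocks), "block_id leaked across train/test split"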
completion_model = xgb.XGBRegressor(
n_estimators=500,
learning_rate=0.03,
max_depth=5, # reduced from 8
subsample=0.7,
colsample_bytree=0.7,
min_child_weight=10,
reg_alpha=1.0,
reg_lambda=5.0,
gamma=0.5,
objective='reg:squarederror',
tree_method='hist',
random_state=42,
early_stopping_rounds=30,
)
completion_model.fit(X_train_g, y_train_g, eval_set=[(X_test_g, y_test_g)], verbose=False)
train_r2_g = r2_score(y_train_g, completion_model.predict(X_train_g))
test_r2_g = r2_score(y_test_g, completion_model.predict(X_test_g))
train_mae_g = mean_absolute_error(y_train_g, completion_model.predict(X_train_g))
test_mae_g = mean_absolute_error(y_test_g, completion_model.predict(X_test_g))
# GroupKFold CV
groups = train_df['block_id'].values
gkf = GroupKFold(n_splits=5)
cv_model = xgb.XGBRegressor(
n_estimators=500, learning_rate=0.03, max_depth=5, subsample=0.7,
colsample_bytree=0.7, min_child_weight=10, reg_alpha=1.0, reg_lambda=5.0,
gamma=0.5, tree_method='hist', random_state=42
)
cv_scores_g = cross_val_score(cv_model, train_df[COMPLETION_FEATURES],
train_df['remaining_hours'], cv=gkf, groups=groups, scoring='r2')
print(f" Train samples: {len(X_train_g)} from {len(train_blocks)} blocks")
print(f" Test samples: {len(X_test_g)} from {len(test_blocks)} blocks")
print(f" Train RΒ²: {train_r2_g:.4f} Test RΒ²: {test_r2_g:.4f} Gap: {train_r2_g-test_r2_g:.4f}")
print(f" Train MAE: {train_mae_g:.2f} Test MAE: {test_mae_g:.2f}")
print(f" GroupKFold CV RΒ²: {cv_scores_g.mean():.4f} Β± {cv_scores_g.std():.4f}")
all_metrics['completion_prediction'] = {
'train_r2': round(train_r2_g, 4), 'test_r2': round(test_r2_g, 4),
'gap': round(train_r2_g - test_r2_g, 4),
'train_mae': round(train_mae_g, 2), 'test_mae': round(test_mae_g, 2),
'group_cv_r2_mean': round(cv_scores_g.mean(), 4),
'group_cv_r2_std': round(cv_scores_g.std(), 4),
'split_type': 'group-aware (block-level)',
}
# =====================================================================
# SAVE ALL v2 MODELS
# =====================================================================
print("\n" + "=" * 60)
print("SAVING v2 MODELS")
print("=" * 60)
os.makedirs('/app/models_v2', exist_ok=True)
joblib.dump(hours_model, '/app/models_v2/hours_estimator.joblib')
joblib.dump(xgb_clf, '/app/models_v2/complexity_xgb.joblib')
joblib.dump(lgb_clf, '/app/models_v2/complexity_lgb.joblib')
joblib.dump(bn_model, '/app/models_v2/bottleneck_predictor.joblib')
joblib.dump(completion_model, '/app/models_v2/completion_predictor.joblib')
# Encoders
joblib.dump(tech_node_encoder, '/app/models_v2/tech_node_encoder.joblib')
joblib.dump(block_type_encoder, '/app/models_v2/block_type_encoder.joblib')
joblib.dump(priority_encoder, '/app/models_v2/priority_encoder.joblib')
joblib.dump(complexity_encoder, '/app/models_v2/complexity_encoder.joblib')
joblib.dump(bottleneck_encoder, '/app/models_v2/bottleneck_encoder.joblib')
# Feature config
feature_config = {
'hours_features': HOURS_FEATURES,
'complexity_features': COMPLEXITY_FEATURES,
'bottleneck_features': SAFE_BOTTLENECK_FEATURES,
'completion_features': COMPLETION_FEATURES,
'tech_nodes': list(tech_node_encoder.classes_),
'block_types': list(block_type_encoder.classes_),
'priorities': ['P4-Low', 'P3-Medium', 'P2-High', 'P1-Critical'],
'complexity_classes': list(complexity_encoder.classes_),
'bottleneck_classes': list(bottleneck_encoder.classes_),
}
with open('/app/models_v2/feature_config.json', 'w') as f:
json.dump(feature_config, f, indent=2)
# Metrics
all_metrics['training_data'] = {
'total_samples': len(df),
'completed_blocks': int(df['is_completed'].sum()),
'in_progress_blocks': int((~df['is_completed'].astype(bool)).sum()),
'completion_train_samples': len(X_train_g),
}
with open('/app/models_v2/metrics.json', 'w') as f:
json.dump(all_metrics, f, indent=2)
print("All v2 models saved to /app/models_v2/")
# Final summary
print("\n" + "=" * 60)
print("v1 vs v2 COMPARISON")
print("=" * 60)
print(f"""
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ Model β”‚ v1 (overfit) β”‚ v2 (fixed) β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ Hours Estimator β”‚ RΒ²=0.881 (gap 0.113) β”‚ RΒ²={test_r2:.3f} (gap {train_r2-test_r2:.3f}) β”‚
β”‚ Complexity Classifier β”‚ Acc=92.3% (gap 5.9%) β”‚ Acc={test_xgb*100:.1f}% (gap {(train_xgb-test_xgb)*100:.1f}%) β”‚
β”‚ Bottleneck Predictor β”‚ 99.6% (DATA LEAKAGE) β”‚ {test_bn*100:.1f}% (honest) β”‚
β”‚ Completion Predictor β”‚ RΒ²=0.945 (GROUP LEAK) β”‚ RΒ²={test_r2_g:.3f} (grouped) β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
""")