""" Module 3: Model Training - Optimized for speed Train all models: LR, RF, XGBoost, LightGBM, MLP, Autoencoder, Voting Ensemble. Hyperparameter tuning with Optuna. """ import os, sys sys.path.insert(0, '/app/fraud_detection') import numpy as np import pandas as pd import joblib import optuna optuna.logging.set_verbosity(optuna.logging.WARNING) import warnings warnings.filterwarnings('ignore') from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier, VotingClassifier from sklearn.neural_network import MLPClassifier from sklearn.metrics import f1_score, roc_auc_score, average_precision_score import xgboost as xgb import lightgbm as lgb from config import DATA_DIR, MODELS_DIR, SEED # Load data data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib")) X_train = data['X_train'] X_val = data['X_val'] X_test = data['X_test'] y_train = data['y_train'] y_val = data['y_val'] y_test = data['y_test'] X_train_smote = data['X_train_smote'] y_train_smote = data['y_train_smote'] class_weights = data['class_weights'] scale_pos_weight = class_weights[1] / class_weights[0] print(f"Data loaded. Train: {X_train.shape}, Val: {X_val.shape}") models = {} # === 1. Logistic Regression === print("\n[1/8] Logistic Regression...") lr = LogisticRegression(class_weight=class_weights, max_iter=1000, random_state=SEED, C=0.1, solver='lbfgs') lr.fit(X_train, y_train) models['Logistic_Regression'] = lr p = lr.predict_proba(X_val)[:, 1] print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") # === 2. Random Forest === print("\n[2/8] Random Forest...") rf = RandomForestClassifier(n_estimators=150, max_depth=12, class_weight=class_weights, random_state=SEED, n_jobs=-1) rf.fit(X_train, y_train) models['Random_Forest'] = rf p = rf.predict_proba(X_val)[:, 1] print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") # === 3. XGBoost === print("\n[3/8] XGBoost...") xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, scale_pos_weight=scale_pos_weight, subsample=0.8, colsample_bytree=0.8, random_state=SEED, eval_metric='aucpr', n_jobs=-1, tree_method='hist') xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False) models['XGBoost'] = xgb_model p = xgb_model.predict_proba(X_val)[:, 1] print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") # === 4. LightGBM === print("\n[4/8] LightGBM...") lgbm_model = lgb.LGBMClassifier(n_estimators=200, max_depth=8, learning_rate=0.05, scale_pos_weight=scale_pos_weight, subsample=0.8, colsample_bytree=0.8, random_state=SEED, n_jobs=-1, verbose=-1) lgbm_model.fit(X_train, y_train, eval_set=[(X_val, y_val)]) models['LightGBM'] = lgbm_model p = lgbm_model.predict_proba(X_val)[:, 1] print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") # === 5. MLP === print("\n[5/8] MLP Neural Network...") mlp = MLPClassifier(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', alpha=0.001, batch_size=256, learning_rate='adaptive', max_iter=200, random_state=SEED, early_stopping=True, n_iter_no_change=10) mlp.fit(X_train_smote, y_train_smote) models['MLP'] = mlp p = mlp.predict_proba(X_val)[:, 1] print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") # === 6. 
# === 6. Autoencoder ===
print("\n[6/8] Autoencoder...")
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Train on legitimate transactions only: the autoencoder learns to reconstruct
# normal behaviour, so fraud surfaces as high reconstruction error
X_train_legit = X_train[y_train == 0]
X_train_np = X_train_legit.values if isinstance(X_train_legit, pd.DataFrame) else X_train_legit
input_dim = X_train_np.shape[1]


class Autoencoder(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(d, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 16), nn.ReLU())
        self.decoder = nn.Sequential(
            nn.Linear(16, 32), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(32, 64), nn.ReLU(),
            nn.Linear(64, d))

    def forward(self, x):
        return self.decoder(self.encoder(x))


ae_model = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(ae_model.parameters(), lr=0.001, weight_decay=1e-5)
train_loader = DataLoader(TensorDataset(torch.FloatTensor(X_train_np), torch.FloatTensor(X_train_np)),
                          batch_size=256, shuffle=True)

ae_model.train()
for epoch in range(50):
    eloss = 0
    for bx, _ in train_loader:
        optimizer.zero_grad()
        out = ae_model(bx)
        loss = criterion(out, bx)
        loss.backward()
        optimizer.step()
        eloss += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f"  Epoch {epoch + 1}/50, Loss: {eloss / len(train_loader):.6f}")

ae_model.eval()
X_val_np = X_val.values if isinstance(X_val, pd.DataFrame) else X_val
with torch.no_grad():
    val_out = ae_model(torch.FloatTensor(X_val_np))
    recon_error = torch.mean((val_out - torch.FloatTensor(X_val_np)) ** 2, dim=1).numpy()
print(f"  ROC-AUC: {roc_auc_score(y_val, recon_error):.4f}, PR-AUC: {average_precision_score(y_val, recon_error):.4f}")


# Sklearn-style wrapper so the autoencoder can sit alongside the classifiers
class AutoencoderWrapper:
    def __init__(self, model):
        self.model = model
        self.classes_ = np.array([0, 1])

    def predict_proba(self, X):
        self.model.eval()
        Xn = X.values if isinstance(X, pd.DataFrame) else X
        with torch.no_grad():
            Xt = torch.FloatTensor(Xn)
            out = self.model(Xt)
            re = torch.mean((out - Xt) ** 2, dim=1).numpy()
        # Squash reconstruction error into (0, 1) around the batch median:
        # a heuristic score, not a calibrated probability
        scores = 1 / (1 + np.exp(-10 * (re - np.median(re))))
        return np.column_stack([1 - scores, scores])

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)


models['Autoencoder'] = AutoencoderWrapper(ae_model)
torch.save(ae_model.state_dict(), os.path.join(MODELS_DIR, "autoencoder.pt"))
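# Usage sketch (illustrative addition): the wrapper gives the autoencoder a
# sklearn-like surface, so downstream evaluation can treat it like any other
# model. Note the sigmoid-over-median mapping is monotone in reconstruction
# error, so ranking metrics (ROC-AUC, PR-AUC) match the raw-error scores above;
# only the hard labels from predict() depend on the chosen threshold.
ae_preds = models['Autoencoder'].predict(X_val, threshold=0.9)
print(f"  Flagged {ae_preds.sum()} of {len(ae_preds)} validation rows at threshold 0.9")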
# === 7. Optuna Tuning ===
print("\n[7/8] Optuna Tuning...")

# XGBoost tuning (15 trials)
print("  Tuning XGBoost (15 trials)...")

def xgb_obj(trial):
    m = xgb.XGBClassifier(
        n_estimators=trial.suggest_int('n_estimators', 100, 250),
        max_depth=trial.suggest_int('max_depth', 4, 9),
        learning_rate=trial.suggest_float('lr', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('ss', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('csb', 0.6, 1.0),
        reg_alpha=trial.suggest_float('ra', 1e-4, 10, log=True),
        reg_lambda=trial.suggest_float('rl', 1e-4, 10, log=True),
        min_child_weight=trial.suggest_int('mcw', 1, 8),
        scale_pos_weight=scale_pos_weight, random_state=SEED,
        eval_metric='aucpr', n_jobs=-1, tree_method='hist')
    m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    return average_precision_score(y_val, m.predict_proba(X_val)[:, 1])

s = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
s.optimize(xgb_obj, n_trials=15)
print(f"  Best PR-AUC: {s.best_value:.4f}")

bp = s.best_params
xgb_best = xgb.XGBClassifier(
    n_estimators=bp['n_estimators'], max_depth=bp['max_depth'], learning_rate=bp['lr'],
    subsample=bp['ss'], colsample_bytree=bp['csb'], reg_alpha=bp['ra'], reg_lambda=bp['rl'],
    min_child_weight=bp['mcw'], scale_pos_weight=scale_pos_weight, random_state=SEED,
    eval_metric='aucpr', n_jobs=-1, tree_method='hist')
xgb_best.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
models['XGBoost_Tuned'] = xgb_best
xgb_tune_params = s.best_params

# LightGBM tuning (15 trials)
print("  Tuning LightGBM (15 trials)...")

def lgb_obj(trial):
    m = lgb.LGBMClassifier(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        max_depth=trial.suggest_int('max_depth', 4, 10),
        learning_rate=trial.suggest_float('lr', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('ss', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('csb', 0.6, 1.0),
        reg_alpha=trial.suggest_float('ra', 1e-4, 10, log=True),
        reg_lambda=trial.suggest_float('rl', 1e-4, 10, log=True),
        num_leaves=trial.suggest_int('nl', 15, 100),
        scale_pos_weight=scale_pos_weight, random_state=SEED, n_jobs=-1, verbose=-1)
    m.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    return average_precision_score(y_val, m.predict_proba(X_val)[:, 1])

s2 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
s2.optimize(lgb_obj, n_trials=15)
print(f"  Best PR-AUC: {s2.best_value:.4f}")

bp2 = s2.best_params
lgb_best = lgb.LGBMClassifier(
    n_estimators=bp2['n_estimators'], max_depth=bp2['max_depth'], learning_rate=bp2['lr'],
    subsample=bp2['ss'], colsample_bytree=bp2['csb'], reg_alpha=bp2['ra'], reg_lambda=bp2['rl'],
    num_leaves=bp2['nl'], scale_pos_weight=scale_pos_weight, random_state=SEED, n_jobs=-1, verbose=-1)
lgb_best.fit(X_train, y_train, eval_set=[(X_val, y_val)])
models['LightGBM_Tuned'] = lgb_best
lgb_tune_params = s2.best_params

# Random Forest tuning (5 trials - fast)
print("  Tuning Random Forest (5 trials)...")

def rf_obj(trial):
    m = RandomForestClassifier(
        n_estimators=trial.suggest_int('ne', 100, 200),
        max_depth=trial.suggest_int('md', 8, 15),
        min_samples_split=trial.suggest_int('mss', 2, 10),
        min_samples_leaf=trial.suggest_int('msl', 1, 5),
        class_weight=class_weights, random_state=SEED, n_jobs=-1)
    m.fit(X_train, y_train)
    return average_precision_score(y_val, m.predict_proba(X_val)[:, 1])

s3 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
s3.optimize(rf_obj, n_trials=5)
print(f"  Best PR-AUC: {s3.best_value:.4f}")
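# Optional diagnostic (an addition, not in the original pipeline): Optuna can
# estimate which hyperparameters drove each objective via
# optuna.importance.get_param_importances. It needs several completed trials,
# so the 5-trial RF study may yield noisy or unavailable numbers.
for study_name, study in [('XGBoost', s), ('LightGBM', s2), ('Random Forest', s3)]:
    try:
        imp = optuna.importance.get_param_importances(study)
        top = sorted(imp.items(), key=lambda kv: -kv[1])[:3]
        print(f"  {study_name} top params: " + ", ".join(f"{k}={v:.2f}" for k, v in top))
    except Exception as e:
        print(f"  {study_name} importance unavailable: {e}")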
bp3 = s3.best_params
rf_best = RandomForestClassifier(
    n_estimators=bp3['ne'], max_depth=bp3['md'], min_samples_split=bp3['mss'],
    min_samples_leaf=bp3['msl'], class_weight=class_weights, random_state=SEED, n_jobs=-1)
rf_best.fit(X_train, y_train)
models['Random_Forest_Tuned'] = rf_best
rf_tune_params = s3.best_params

tuning_results = {'xgboost': xgb_tune_params, 'lightgbm': lgb_tune_params,
                  'random_forest': rf_tune_params}
joblib.dump(tuning_results, os.path.join(MODELS_DIR, "tuning_results.joblib"))

# === 8. Voting Ensemble ===
print("\n[8/8] Voting Ensemble...")
ensemble_members = [('XGBoost_Tuned', models['XGBoost_Tuned']),
                    ('LightGBM_Tuned', models['LightGBM_Tuned']),
                    ('Random_Forest_Tuned', models['Random_Forest_Tuned'])]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
voting_clf.fit(X_train, y_train)
models['Voting_Ensemble'] = voting_clf
p = voting_clf.predict_proba(X_val)[:, 1]
print(f"  ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}")

# Save all models; the Autoencoder wrapper holds a torch module, so it is kept
# out of the plain bundle consumed by modules that may not have torch installed
joblib.dump(models, os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))
save_models = {k: v for k, v in models.items() if k != 'Autoencoder'}
joblib.dump(save_models, os.path.join(MODELS_DIR, "all_models.joblib"))

print("\n=== ALL TRAINING COMPLETE ===")
print(f"Models: {list(models.keys())}")
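# Round-trip sanity check (illustrative addition, not in the original script):
# reload the just-saved bundle and confirm it reproduces the in-memory
# ensemble's validation scores before the evaluation module consumes it.
reloaded = joblib.load(os.path.join(MODELS_DIR, "all_models.joblib"))
p_check = reloaded['Voting_Ensemble'].predict_proba(X_val)[:, 1]
print(f"Round-trip check: max |diff| vs in-memory ensemble = {np.max(np.abs(p_check - p)):.2e}")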