fraud-detection-system / train_all.py
rajvivan's picture
Complete fraud detection system: code, figures, models, paper
408a9b2 verified
"""
Module 3: Model Training - Optimized for speed
Train all models: LR, RF, XGBoost, LightGBM, MLP, Autoencoder, Voting Ensemble.
Hyperparameter tuning with Optuna.
"""
import os, sys
sys.path.insert(0, '/app/fraud_detection')
import numpy as np
import pandas as pd
import joblib
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
import xgboost as xgb
import lightgbm as lgb
from config import DATA_DIR, MODELS_DIR, SEED
# Load data
# Everything below reads from a single joblib artifact located in DATA_DIR.
data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib"))

# Train / validation / test partitions (features, then labels).
X_train, X_val, X_test = data['X_train'], data['X_val'], data['X_test']
y_train, y_val, y_test = data['y_train'], data['y_val'], data['y_test']

# Alternative training set stored under the '*_smote' keys
# (presumably SMOTE-oversampled, judging by the key names).
X_train_smote, y_train_smote = data['X_train_smote'], data['y_train_smote']

# Per-class weights; the class-1 / class-0 ratio is what the gradient
# boosters below receive as `scale_pos_weight`.
class_weights = data['class_weights']
scale_pos_weight = class_weights[1] / class_weights[0]

print(f"Data loaded. Train: {X_train.shape}, Val: {X_val.shape}")

# Registry of fitted models, keyed by display name.
models = {}
# === 1. Logistic Regression ===
print("\n[1/8] Logistic Regression...")

# Class-weighted logistic regression fitted on the original training split.
lr = LogisticRegression(
    C=0.1,
    solver='lbfgs',
    max_iter=1000,
    class_weight=class_weights,
    random_state=SEED,
)
lr.fit(X_train, y_train)
models['Logistic_Regression'] = lr

# Score the validation split using the probability of class 1.
p = lr.predict_proba(X_val)[:, 1]
lr_roc_auc = roc_auc_score(y_val, p)
lr_pr_auc = average_precision_score(y_val, p)
print(f" ROC-AUC: {lr_roc_auc:.4f}, PR-AUC: {lr_pr_auc:.4f}")
# === 2. Random Forest ===
print("\n[2/8] Random Forest...")

# Class-weighted random forest fitted on the original training split.
rf_settings = {
    'n_estimators': 150,
    'max_depth': 12,
    'class_weight': class_weights,
    'random_state': SEED,
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train, y_train)
models['Random_Forest'] = rf

# Score the validation split using the probability of class 1.
p = rf.predict_proba(X_val)[:, 1]
rf_roc_auc = roc_auc_score(y_val, p)
rf_pr_auc = average_precision_score(y_val, p)
print(f" ROC-AUC: {rf_roc_auc:.4f}, PR-AUC: {rf_pr_auc:.4f}")
# === 3. XGBoost ===
print("\n[3/8] XGBoost...")

# Baseline XGBoost with fixed hyperparameters; class imbalance is handled
# through `scale_pos_weight`.
xgb_settings = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.1,
    'scale_pos_weight': scale_pos_weight,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': SEED,
    'eval_metric': 'aucpr',
    'n_jobs': -1,
    'tree_method': 'hist',
}
xgb_model = xgb.XGBClassifier(**xgb_settings)
# The validation split is passed as an evaluation set only.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
models['XGBoost'] = xgb_model

# Score the validation split using the probability of class 1.
p = xgb_model.predict_proba(X_val)[:, 1]
xgb_roc_auc = roc_auc_score(y_val, p)
xgb_pr_auc = average_precision_score(y_val, p)
print(f" ROC-AUC: {xgb_roc_auc:.4f}, PR-AUC: {xgb_pr_auc:.4f}")
# === 4. LightGBM ===
print("\n[4/8] LightGBM...")

# Baseline LightGBM with fixed hyperparameters; class imbalance is handled
# through `scale_pos_weight`.
lgbm_model = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    scale_pos_weight=scale_pos_weight,
    subsample=0.8,
    # LightGBM only performs row subsampling (bagging) when subsample_freq > 0;
    # with the default of 0 the `subsample=0.8` setting above is silently
    # ignored. Re-sample at every boosting iteration so it takes effect.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=SEED,
    n_jobs=-1,
    verbose=-1,
)
# The validation split is passed as an evaluation set only.
lgbm_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
models['LightGBM'] = lgbm_model

# Score the validation split using the probability of class 1.
p = lgbm_model.predict_proba(X_val)[:, 1]
print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}")
# === 5. MLP ===
print("\n[5/8] MLP Neural Network...")

# Three-hidden-layer perceptron. Unlike the models above it is fitted on the
# '*_smote' training set and takes no class weights.
mlp_settings = {
    'hidden_layer_sizes': (128, 64, 32),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'batch_size': 256,
    'learning_rate': 'adaptive',
    'max_iter': 200,
    'random_state': SEED,
    'early_stopping': True,
    'n_iter_no_change': 10,
}
mlp = MLPClassifier(**mlp_settings)
mlp.fit(X_train_smote, y_train_smote)
models['MLP'] = mlp

# Score the validation split using the probability of class 1.
p = mlp.predict_proba(X_val)[:, 1]
mlp_roc_auc = roc_auc_score(y_val, p)
mlp_pr_auc = average_precision_score(y_val, p)
print(f" ROC-AUC: {mlp_roc_auc:.4f}, PR-AUC: {mlp_pr_auc:.4f}")
# === 6. Autoencoder ===
print("\n[6/8] Autoencoder...")
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# The autoencoder is trained only on rows whose label is 0
# (presumably the legitimate transactions, per the variable name).
legit_mask = (y_train == 0)
X_train_legit = X_train[legit_mask]

# Work with a plain array whether the features arrived as a DataFrame or not.
if isinstance(X_train_legit, pd.DataFrame):
    X_train_np = X_train_legit.values
else:
    X_train_np = X_train_legit

# Number of input features = width of the network's input and output layers.
input_dim = X_train_np.shape[1]
class Autoencoder(nn.Module):
    """Fully connected autoencoder: d -> 64 -> 32 -> 16 -> 32 -> 64 -> d.

    ReLU follows every linear layer except the final output layer, and a
    Dropout(0.2) follows the first activation of both encoder and decoder.
    """

    def __init__(self, d):
        """Build the encoder and decoder stacks for `d` input features."""
        super().__init__()
        # Layers are created encoder-first and in positional order so the
        # parameter names inside each Sequential stay 0, 3 and 5.
        encoder_layers = [
            nn.Linear(d, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 16), nn.ReLU(),
        ]
        decoder_layers = [
            nn.Linear(16, 32), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(32, 64), nn.ReLU(),
            nn.Linear(64, d),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of `x` (same trailing dimension as `x`)."""
        latent = self.encoder(x)
        return self.decoder(latent)
# Seed torch's global RNG before the model is built: weight initialisation,
# DataLoader shuffling and dropout all draw from it. Without this the
# autoencoder is the only model in this script whose result is not tied to SEED.
torch.manual_seed(SEED)

ae_model = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(ae_model.parameters(), lr=0.001, weight_decay=1e-5)

# Input and target are the same rows, so a single tensor serves as both.
train_tensor = torch.FloatTensor(X_train_np)
train_loader = DataLoader(TensorDataset(train_tensor, train_tensor), batch_size=256, shuffle=True)

# Train for 50 epochs to minimise the mean squared reconstruction error.
n_epochs = 50
ae_model.train()
for epoch in range(n_epochs):
    eloss = 0
    for bx, _ in train_loader:
        optimizer.zero_grad()
        out = ae_model(bx)
        loss = criterion(out, bx)
        loss.backward()
        optimizer.step()
        eloss += loss.item()
    # Report the mean per-batch loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f" Epoch {epoch+1}/{n_epochs}, Loss: {eloss/len(train_loader):.6f}")

# Validation: the per-row mean squared reconstruction error is used directly
# as the anomaly score (higher error = more anomalous).
ae_model.eval()
X_val_np = X_val.values if isinstance(X_val, pd.DataFrame) else X_val
val_tensor = torch.FloatTensor(X_val_np)
with torch.no_grad():
    val_out = ae_model(val_tensor)
    recon_error = torch.mean((val_out - val_tensor) ** 2, dim=1).numpy()
print(f" ROC-AUC: {roc_auc_score(y_val, recon_error):.4f}, PR-AUC: {average_precision_score(y_val, recon_error):.4f}")
# Autoencoder wrapper
class AutoencoderWrapper:
    """Give the autoencoder a classifier-style `predict_proba` / `predict` API.

    The per-row mean squared reconstruction error is squashed through a
    sigmoid, ``1 / (1 + exp(-10 * (error - center)))``, to obtain a
    pseudo-probability for class 1.

    Args:
        model: Module that maps a float tensor to its reconstruction.
        center: Reconstruction error that maps to a score of exactly 0.5.
            When ``None`` (the default, and the original behaviour) the median
            error of the batch being scored is used. In that mode a row's score
            depends on which other rows are scored with it, and a single row
            always scores 0.5, so pass a fixed value for stable scores.
    """

    # Class-level default so that instances lacking the attribute (for example
    # ones created by older code) fall back to the batch-median behaviour.
    center = None

    def __init__(self, model, center=None):
        self.model = model
        self.classes_ = np.array([0, 1])
        self.center = center

    def reconstruction_error(self, X):
        """Return the per-row mean squared reconstruction error as a NumPy array."""
        self.model.eval()  # disable dropout while scoring
        Xn = X.values if isinstance(X, pd.DataFrame) else X
        with torch.no_grad():
            Xt = torch.FloatTensor(Xn)
            out = self.model(Xt)
            return torch.mean((out - Xt) ** 2, dim=1).numpy()

    def predict_proba(self, X):
        """Return an (n_rows, 2) array of [1 - score, score] per row."""
        re = self.reconstruction_error(X)
        center = np.median(re) if self.center is None else self.center
        scores = 1 / (1 + np.exp(-10 * (re - center)))
        return np.column_stack([1 - scores, scores])

    def predict(self, X, threshold=0.5):
        """Return 1 where the class-1 score is at least `threshold`, else 0."""
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)


# Fix the sigmoid centre at the median reconstruction error of the rows the
# autoencoder was trained on, so scores no longer depend on the scored batch.
_ae_train_errors = AutoencoderWrapper(ae_model).reconstruction_error(X_train_np)
models['Autoencoder'] = AutoencoderWrapper(ae_model, center=float(np.median(_ae_train_errors)))

# This is the first write into MODELS_DIR; make sure the directory exists so
# the training done so far is not lost to a missing-path error.
os.makedirs(MODELS_DIR, exist_ok=True)
torch.save(ae_model.state_dict(), os.path.join(MODELS_DIR, "autoencoder.pt"))
# === 7. Optuna Tuning ===
print("\n[7/8] Optuna Tuning...")
# XGBoost tuning (15 trials)
print(" Tuning XGBoost (15 trials)...")


def _make_xgb(hp):
    """Build an XGBClassifier from a dict keyed by the Optuna parameter names."""
    return xgb.XGBClassifier(
        n_estimators=hp['n_estimators'],
        max_depth=hp['max_depth'],
        learning_rate=hp['lr'],
        subsample=hp['ss'],
        colsample_bytree=hp['csb'],
        reg_alpha=hp['ra'],
        reg_lambda=hp['rl'],
        min_child_weight=hp['mcw'],
        scale_pos_weight=scale_pos_weight,
        random_state=SEED,
        eval_metric='aucpr',
        n_jobs=-1,
        tree_method='hist',
    )


def xgb_obj(trial):
    """Optuna objective: validation PR-AUC of an XGBoost candidate."""
    # Suggestions are drawn in a fixed order so the seeded sampler yields the
    # same sequence of candidates on every run.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 250),
        'max_depth': trial.suggest_int('max_depth', 4, 9),
        'lr': trial.suggest_float('lr', 0.01, 0.3, log=True),
        'ss': trial.suggest_float('ss', 0.6, 1.0),
        'csb': trial.suggest_float('csb', 0.6, 1.0),
        'ra': trial.suggest_float('ra', 1e-4, 10, log=True),
        'rl': trial.suggest_float('rl', 1e-4, 10, log=True),
        'mcw': trial.suggest_int('mcw', 1, 8),
    }
    candidate = _make_xgb(hp)
    candidate.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    val_scores = candidate.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, val_scores)


s = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
s.optimize(xgb_obj, n_trials=15)
print(f" Best PR-AUC: {s.best_value:.4f}")

# Refit a fresh model with the winning configuration and register it.
bp = s.best_params
xgb_best = _make_xgb(bp)
xgb_best.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
models['XGBoost_Tuned'] = xgb_best
xgb_tune_params = s.best_params
# LightGBM tuning (15 trials)
print(" Tuning LightGBM (15 trials)...")


def _make_lgbm(hp):
    """Build an LGBMClassifier from a dict keyed by the Optuna parameter names."""
    return lgb.LGBMClassifier(
        n_estimators=hp['n_estimators'],
        max_depth=hp['max_depth'],
        learning_rate=hp['lr'],
        subsample=hp['ss'],
        # LightGBM ignores `subsample` unless subsample_freq > 0 (default 0),
        # which would make the tuned 'ss' dimension a no-op. Re-sample rows at
        # every boosting iteration so the searched value actually matters.
        # This is a fixed setting and is therefore not part of best_params.
        subsample_freq=1,
        colsample_bytree=hp['csb'],
        reg_alpha=hp['ra'],
        reg_lambda=hp['rl'],
        num_leaves=hp['nl'],
        scale_pos_weight=scale_pos_weight,
        random_state=SEED,
        n_jobs=-1,
        verbose=-1,
    )


def lgb_obj(trial):
    """Optuna objective: validation PR-AUC of a LightGBM candidate."""
    # Suggestions are drawn in a fixed order so the seeded sampler yields the
    # same sequence of candidates on every run.
    hp = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'lr': trial.suggest_float('lr', 0.01, 0.3, log=True),
        'ss': trial.suggest_float('ss', 0.6, 1.0),
        'csb': trial.suggest_float('csb', 0.6, 1.0),
        'ra': trial.suggest_float('ra', 1e-4, 10, log=True),
        'rl': trial.suggest_float('rl', 1e-4, 10, log=True),
        'nl': trial.suggest_int('nl', 15, 100),
    }
    m = _make_lgbm(hp)
    m.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    return average_precision_score(y_val, m.predict_proba(X_val)[:, 1])


s2 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
s2.optimize(lgb_obj, n_trials=15)
print(f" Best PR-AUC: {s2.best_value:.4f}")

# Refit a fresh model with the winning configuration and register it.
bp2 = s2.best_params
lgb_best = _make_lgbm(bp2)
lgb_best.fit(X_train, y_train, eval_set=[(X_val, y_val)])
models['LightGBM_Tuned'] = lgb_best
lgb_tune_params = s2.best_params
# RF tuning (5 trials - fast)
print(" Tuning Random Forest (5 trials)...")


def _make_rf(hp):
    """Build a RandomForestClassifier from a dict keyed by the Optuna parameter names."""
    return RandomForestClassifier(
        n_estimators=hp['ne'],
        max_depth=hp['md'],
        min_samples_split=hp['mss'],
        min_samples_leaf=hp['msl'],
        class_weight=class_weights,
        random_state=SEED,
        n_jobs=-1,
    )


def rf_obj(trial):
    """Optuna objective: validation PR-AUC of a random-forest candidate."""
    # Suggestions are drawn in a fixed order so the seeded sampler yields the
    # same sequence of candidates on every run.
    hp = {
        'ne': trial.suggest_int('ne', 100, 200),
        'md': trial.suggest_int('md', 8, 15),
        'mss': trial.suggest_int('mss', 2, 10),
        'msl': trial.suggest_int('msl', 1, 5),
    }
    forest = _make_rf(hp)
    forest.fit(X_train, y_train)
    val_scores = forest.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, val_scores)


s3 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
s3.optimize(rf_obj, n_trials=5)
print(f" Best PR-AUC: {s3.best_value:.4f}")

# Refit a fresh model with the winning configuration and register it.
bp3 = s3.best_params
rf_best = _make_rf(bp3)
rf_best.fit(X_train, y_train)
models['Random_Forest_Tuned'] = rf_best
rf_tune_params = s3.best_params

# Persist the best hyperparameters of all three studies in one file.
tuning_results = {
    'xgboost': xgb_tune_params,
    'lightgbm': lgb_tune_params,
    'random_forest': rf_tune_params,
}
joblib.dump(tuning_results, os.path.join(MODELS_DIR, "tuning_results.joblib"))
# === 8. Voting Ensemble ===
print("\n[8/8] Voting Ensemble...")

# Soft-voting ensemble over the three tuned tree models, looked up by name
# in the model registry.
ensemble_members = [
    (member_name, models[member_name])
    for member_name in ('XGBoost_Tuned', 'LightGBM_Tuned', 'Random_Forest_Tuned')
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
voting_clf.fit(X_train, y_train)
models['Voting_Ensemble'] = voting_clf

# Score the validation split using the probability of class 1.
p = voting_clf.predict_proba(X_val)[:, 1]
ensemble_roc_auc = roc_auc_score(y_val, p)
ensemble_pr_auc = average_precision_score(y_val, p)
print(f" ROC-AUC: {ensemble_roc_auc:.4f}, PR-AUC: {ensemble_pr_auc:.4f}")
# Save all
# Two artifacts are written: the full registry, and a copy without the
# 'Autoencoder' entry.
joblib.dump(models, os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))

save_models = dict(models)
save_models.pop('Autoencoder', None)
joblib.dump(save_models, os.path.join(MODELS_DIR, "all_models.joblib"))

print("\n=== ALL TRAINING COMPLETE ===")
print(f"Models: {list(models)}")