"""Complete the training: RF tuning + Voting Ensemble + Save."""
import os
import sys
sys.path.insert(0, '/app/fraud_detection')
import numpy as np
import pandas as pd
import joblib
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from config import DATA_DIR, MODELS_DIR, SEED
# Load data
data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib"))
X_train = data['X_train']
X_val = data['X_val']
y_train = data['y_train']
y_val = data['y_val']
class_weights = data['class_weights']
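# Assumption: class_weights is a {label: weight} dict (e.g. {0: w0, 1: w1}) built
# upstream; it feeds RF's class_weight directly and, below, the ratio w1/w0 that
# the boosted models take as scale_pos_weight.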
# Load previously saved models
saved_models = joblib.load(os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))
print(f"Loaded {len(saved_models)} models: {list(saved_models.keys())}")
# Check which tuned models already exist
need_rf_tune = 'Random_Forest_Tuned' not in saved_models
need_xgb_tune = 'XGBoost_Tuned' not in saved_models
need_lgbm_tune = 'LightGBM_Tuned' not in saved_models
print(f"Need RF tune: {need_rf_tune}, XGB tune: {need_xgb_tune}, LGBM tune: {need_lgbm_tune}")
# Quick RF tune with just 5 trials
if need_rf_tune:
    print("\n--- Quick Optuna RF Tuning (5 trials) ---")

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 200),
            'max_depth': trial.suggest_int('max_depth', 8, 15),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'class_weight': class_weights,
            'random_state': SEED,
            'n_jobs': -1
        }
        model = RandomForestClassifier(**params)
        model.fit(X_train, y_train)
        val_pred = model.predict_proba(X_val)[:, 1]
        return average_precision_score(y_val, val_pred)

    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective, n_trials=5, show_progress_bar=False)
    print(f"  Best PR-AUC: {study.best_value:.4f}")
    print(f"  Best params: {study.best_params}")

    # Refit on the training set with the best hyperparameters
    best_params = study.best_params
    best_params['class_weight'] = class_weights
    best_params['random_state'] = SEED
    best_params['n_jobs'] = -1
    best_model = RandomForestClassifier(**best_params)
    best_model.fit(X_train, y_train)
    saved_models['Random_Forest_Tuned'] = best_model

    # Record the winning RF params alongside any earlier tuning results
    tuning_path = os.path.join(MODELS_DIR, "tuning_results.joblib")
    tuning_results = joblib.load(tuning_path) if os.path.exists(tuning_path) else {}
    tuning_results['random_forest'] = study.best_params
    joblib.dump(tuning_results, tuning_path)
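# At this point tuning_results.joblib should look roughly like
#   {'random_forest': {...}, 'xgboost': {...}, 'lightgbm': {...}}
# with the xgboost/lightgbm entries written by the earlier tuning script, if it ran.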
# Rebuild the tuned XGB/LGBM models from saved tuning results if missing
if need_xgb_tune or need_lgbm_tune:
    print("XGB/LGBM tuned models missing, re-running...")
    import xgboost as xgb
    import lightgbm as lgb
if need_xgb_tune:
    tuning = joblib.load(os.path.join(MODELS_DIR, "tuning_results.joblib"))
    if 'xgboost' in tuning:
        # XGBoost takes the class imbalance as a single ratio rather than a weight dict
        scale_pos_weight = class_weights[1] / class_weights[0]
        bp = tuning['xgboost']
        bp['scale_pos_weight'] = scale_pos_weight
        bp['random_state'] = SEED
        bp['eval_metric'] = 'aucpr'
        bp['n_jobs'] = -1
        bp['tree_method'] = 'hist'
        m = xgb.XGBClassifier(**bp)
        m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        saved_models['XGBoost_Tuned'] = m
if need_lgbm_tune:
    tuning = joblib.load(os.path.join(MODELS_DIR, "tuning_results.joblib"))
    if 'lightgbm' in tuning:
        scale_pos_weight = class_weights[1] / class_weights[0]
        bp = tuning['lightgbm']
        bp['scale_pos_weight'] = scale_pos_weight
        bp['random_state'] = SEED
        bp['n_jobs'] = -1
        bp['verbose'] = -1
        m = lgb.LGBMClassifier(**bp)
        m.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        saved_models['LightGBM_Tuned'] = m
# Create Voting Ensemble
if 'Voting_Ensemble' not in saved_models:
    print("\n--- Creating Voting Ensemble ---")
    # Soft voting averages the members' predicted probabilities
    ensemble_members = []
    for name in ['XGBoost_Tuned', 'LightGBM_Tuned', 'Random_Forest_Tuned']:
        if name in saved_models:
            ensemble_members.append((name, saved_models[name]))
    print(f"  Members: {[n for n, _ in ensemble_members]}")
    voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
    voting_clf.fit(X_train, y_train)
    saved_models['Voting_Ensemble'] = voting_clf
    val_pred = voting_clf.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, val_pred)
    val_pr_auc = average_precision_score(y_val, val_pred)
    print(f"  Voting Ensemble Val ROC-AUC: {val_auc:.4f}, PR-AUC: {val_pr_auc:.4f}")
# Save everything: the full bundle and a copy without the autoencoder
joblib.dump(saved_models, os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))
models_without_ae = {k: v for k, v in saved_models.items() if k != 'Autoencoder'}
joblib.dump(models_without_ae, os.path.join(MODELS_DIR, "all_models.joblib"))
print(f"\nFinal models saved: {list(saved_models.keys())}")
print("TRAINING COMPLETE")