"""Complete the training: RF tuning + Voting Ensemble + Save.

Resumes a partially finished training run:

1. Load the processed data splits and the models saved so far.
2. If no tuned Random Forest is stored, run a short Optuna search and fit
   the best configuration.
3. If tuned XGBoost / LightGBM models are missing, refit them from
   hyper-parameters stored in ``tuning_results.joblib`` (when available).
4. Build a soft-voting ensemble from whichever tuned models exist.
5. Write the model dictionaries back to ``MODELS_DIR``.
"""
import os
import sys

# Must run before ``from config import ...`` below; presumably this is the
# directory that holds the project's ``config`` module -- confirm on deploy.
sys.path.insert(0, '/app/fraud_detection')

import numpy as np
import pandas as pd
import joblib
import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)

import warnings

warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from config import DATA_DIR, MODELS_DIR, SEED

# Artifact locations, built once instead of at every use site.
MODELS_WITH_AE_PATH = os.path.join(MODELS_DIR, "all_models_with_ae.joblib")
MODELS_PATH = os.path.join(MODELS_DIR, "all_models.joblib")
TUNING_RESULTS_PATH = os.path.join(MODELS_DIR, "tuning_results.joblib")

# Number of Optuna trials for the quick Random Forest search.
N_RF_TRIALS = 5


def _load_tuning_results():
    """Return the stored tuning results, or an empty dict if the file is absent."""
    if os.path.exists(TUNING_RESULTS_PATH):
        return joblib.load(TUNING_RESULTS_PATH)
    return {}


# Load data
data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib"))
X_train = data['X_train']
X_val = data['X_val']
y_train = data['y_train']
y_val = data['y_val']
class_weights = data['class_weights']

# Load previously saved models
saved_models = joblib.load(MODELS_WITH_AE_PATH)
print(f"Loaded {len(saved_models)} models: {list(saved_models.keys())}")

# Check which tuned models still have to be produced
need_rf_tune = 'Random_Forest_Tuned' not in saved_models
need_xgb_tune = 'XGBoost_Tuned' not in saved_models
need_lgbm_tune = 'LightGBM_Tuned' not in saved_models
print(f"Need RF tune: {need_rf_tune}, XGB tune: {need_xgb_tune}, LGBM tune: {need_lgbm_tune}")

# Quick RF tune with just a handful of trials
if need_rf_tune:
    print(f"\n--- Quick Optuna RF Tuning ({N_RF_TRIALS} trials) ---")

    def objective(trial):
        """Fit one candidate Random Forest and return its validation PR-AUC."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 200),
            'max_depth': trial.suggest_int('max_depth', 8, 15),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'class_weight': class_weights,
            'random_state': SEED,
            'n_jobs': -1,
        }
        model = RandomForestClassifier(**params)
        model.fit(X_train, y_train)
        val_pred = model.predict_proba(X_val)[:, 1]
        return average_precision_score(y_val, val_pred)

    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(objective, n_trials=N_RF_TRIALS, show_progress_bar=False)
    print(f" Best PR-AUC: {study.best_value:.4f}")
    print(f" Best params: {study.best_params}")

    # Refit the winning configuration; the fixed (non-searched) settings are
    # layered on top of the searched hyper-parameters.
    best_params = {
        **study.best_params,
        'class_weight': class_weights,
        'random_state': SEED,
        'n_jobs': -1,
    }
    best_model = RandomForestClassifier(**best_params)
    best_model.fit(X_train, y_train)
    saved_models['Random_Forest_Tuned'] = best_model

    # Persist only the searched hyper-parameters alongside earlier results.
    tuning_results = _load_tuning_results()
    tuning_results['random_forest'] = study.best_params
    joblib.dump(tuning_results, TUNING_RESULTS_PATH)

# Refit XGB/LGBM tuned models from stored hyper-parameters if they are missing
if need_xgb_tune or need_lgbm_tune:
    print("XGB/LGBM tuned models missing, re-running...")

    # Load once, tolerating a missing file: without this guard a run that
    # skipped RF tuning crashed here before anything was saved.
    tuning = _load_tuning_results()

    if need_xgb_tune:
        if 'xgboost' in tuning:
            # Imported lazily so the library is only required when used.
            import xgboost as xgb

            scale_pos_weight = class_weights[1] / class_weights[0]
            # Build a new dict rather than mutating the loaded results.
            bp = {
                **tuning['xgboost'],
                'scale_pos_weight': scale_pos_weight,
                'random_state': SEED,
                'eval_metric': 'aucpr',
                'n_jobs': -1,
                'tree_method': 'hist',
            }
            m = xgb.XGBClassifier(**bp)
            m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
            saved_models['XGBoost_Tuned'] = m
        else:
            print(" No stored 'xgboost' params in tuning results; skipping XGBoost_Tuned")

    if need_lgbm_tune:
        if 'lightgbm' in tuning:
            # Imported lazily so the library is only required when used.
            import lightgbm as lgb

            scale_pos_weight = class_weights[1] / class_weights[0]
            # Build a new dict rather than mutating the loaded results.
            bp = {
                **tuning['lightgbm'],
                'scale_pos_weight': scale_pos_weight,
                'random_state': SEED,
                'n_jobs': -1,
                'verbose': -1,
            }
            m = lgb.LGBMClassifier(**bp)
            m.fit(X_train, y_train, eval_set=[(X_val, y_val)])
            saved_models['LightGBM_Tuned'] = m
        else:
            print(" No stored 'lightgbm' params in tuning results; skipping LightGBM_Tuned")

# Create Voting Ensemble
if 'Voting_Ensemble' not in saved_models:
    print("\n--- Creating Voting Ensemble ---")
    # Use whichever tuned models are available, in a fixed order.
    ensemble_members = [
        (name, saved_models[name])
        for name in ['XGBoost_Tuned', 'LightGBM_Tuned', 'Random_Forest_Tuned']
        if name in saved_models
    ]
    print(f" Members: {[n for n, _ in ensemble_members]}")

    voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
    voting_clf.fit(X_train, y_train)
    saved_models['Voting_Ensemble'] = voting_clf

    val_pred = voting_clf.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, val_pred)
    val_pr_auc = average_precision_score(y_val, val_pred)
    print(f" Voting Ensemble Val ROC-AUC: {val_auc:.4f}, PR-AUC: {val_pr_auc:.4f}")

# Save everything: one file with every model, one without the 'Autoencoder' entry
joblib.dump(saved_models, MODELS_WITH_AE_PATH)
save_models = {k: v for k, v in saved_models.items() if k != 'Autoencoder'}
joblib.dump(save_models, MODELS_PATH)

print(f"\nFinal models saved: {list(saved_models.keys())}")
print("TRAINING COMPLETE")