File size: 5,298 Bytes
408a9b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""Complete the training: RF tuning + Voting Ensemble + Save."""
import os, sys
sys.path.insert(0, '/app/fraud_detection')
import numpy as np
import pandas as pd
import joblib
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from config import DATA_DIR, MODELS_DIR, SEED

# ---- Load preprocessed splits produced by the earlier pipeline stage ----
data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib"))
X_train, y_train = data['X_train'], data['y_train']
X_val, y_val = data['X_val'], data['y_val']
class_weights = data['class_weights']

# ---- Load the model registry persisted by the previous training run ----
saved_models = joblib.load(os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))
print(f"Loaded {len(saved_models)} models: {list(saved_models.keys())}")

# Work out which tuned models still need to be (re)built this run.
need_rf_tune = 'Random_Forest_Tuned' not in saved_models
need_xgb_tune = 'XGBoost_Tuned' not in saved_models
need_lgbm_tune = 'LightGBM_Tuned' not in saved_models

print(f"Need RF tune: {need_rf_tune}, XGB tune: {need_xgb_tune}, LGBM tune: {need_lgbm_tune}")

# --- Stage 1: quick Optuna tuning of Random Forest (skipped if already saved) ---
if need_rf_tune:
    print("\n--- Quick Optuna RF Tuning (5 trials) ---")

    def objective(trial):
        """Optuna objective: fit an RF with sampled params, return validation PR-AUC."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 200),
            'max_depth': trial.suggest_int('max_depth', 8, 15),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            # Fixed (non-searched) settings shared by every trial:
            'class_weight': class_weights,
            'random_state': SEED,
            'n_jobs': -1
        }
        model = RandomForestClassifier(**params)
        model.fit(X_train, y_train)
        val_pred = model.predict_proba(X_val)[:, 1]
        # PR-AUC is the tuning target — more informative than ROC-AUC on
        # a heavily imbalanced fraud dataset.
        return average_precision_score(y_val, val_pred)

    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective, n_trials=5, show_progress_bar=False)
    print(f"  Best PR-AUC: {study.best_value:.4f}")
    print(f"  Best params: {study.best_params}")

    # Refit the winning configuration on the full training split.
    # Copy before mutating: the original code mutated study.best_params in
    # place, which risked persisting the fixed settings into tuning_results.
    best_params = dict(study.best_params)
    best_params.update(class_weight=class_weights, random_state=SEED, n_jobs=-1)
    best_model = RandomForestClassifier(**best_params)
    best_model.fit(X_train, y_train)
    saved_models['Random_Forest_Tuned'] = best_model

    # Persist the searched hyperparameters, merging into earlier results if any.
    tuning_path = os.path.join(MODELS_DIR, "tuning_results.joblib")
    tuning_results = joblib.load(tuning_path) if os.path.exists(tuning_path) else {}
    tuning_results['random_forest'] = dict(study.best_params)
    joblib.dump(tuning_results, tuning_path)

# --- Stage 2: rebuild tuned XGBoost / LightGBM from saved hyperparameters ---
if need_xgb_tune or need_lgbm_tune:
    print("XGB/LGBM tuned models missing, re-running...")
    import xgboost as xgb
    import lightgbm as lgb

    # Load the tuning results once, guarded: the original code called
    # joblib.load unconditionally (twice) and crashed if the file was absent.
    tuning_path = os.path.join(MODELS_DIR, "tuning_results.joblib")
    tuning = joblib.load(tuning_path) if os.path.exists(tuning_path) else {}
    # Ratio of class weights approximates the positive-class imbalance factor.
    scale_pos_weight = class_weights[1] / class_weights[0]

    if need_xgb_tune and 'xgboost' in tuning:
        bp = dict(tuning['xgboost'])  # copy so the persisted dict isn't polluted
        bp.update(scale_pos_weight=scale_pos_weight, random_state=SEED,
                  eval_metric='aucpr', n_jobs=-1, tree_method='hist')
        m = xgb.XGBClassifier(**bp)
        m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        saved_models['XGBoost_Tuned'] = m

    if need_lgbm_tune and 'lightgbm' in tuning:
        bp = dict(tuning['lightgbm'])
        bp.update(scale_pos_weight=scale_pos_weight, random_state=SEED,
                  n_jobs=-1, verbose=-1)
        m = lgb.LGBMClassifier(**bp)
        m.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        saved_models['LightGBM_Tuned'] = m

# --- Stage 3: soft-voting ensemble over the available tuned tree models ---
if 'Voting_Ensemble' not in saved_models:
    print("\n--- Creating Voting Ensemble ---")
    ensemble_members = [(name, saved_models[name])
                        for name in ('XGBoost_Tuned', 'LightGBM_Tuned', 'Random_Forest_Tuned')
                        if name in saved_models]

    print(f"  Members: {[n for n, _ in ensemble_members]}")
    if ensemble_members:
        # 'soft' voting averages the members' predicted probabilities.
        voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
        voting_clf.fit(X_train, y_train)
        saved_models['Voting_Ensemble'] = voting_clf

        # Report validation quality of the ensemble.
        val_pred = voting_clf.predict_proba(X_val)[:, 1]
        val_auc = roc_auc_score(y_val, val_pred)
        val_pr_auc = average_precision_score(y_val, val_pred)
        print(f"  Voting Ensemble Val ROC-AUC: {val_auc:.4f}, PR-AUC: {val_pr_auc:.4f}")
    else:
        # Guard: VotingClassifier.fit raises on an empty estimator list.
        print("  No tuned members available; skipping ensemble.")

# ---- Persist the final model registry ----
# Full registry, autoencoder entry included.
joblib.dump(saved_models, os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))
# Companion registry that omits the 'Autoencoder' entry.
save_models = {name: mdl for name, mdl in saved_models.items() if name != 'Autoencoder'}
joblib.dump(save_models, os.path.join(MODELS_DIR, "all_models.joblib"))

print(f"\nFinal models saved: {list(saved_models.keys())}")
print("TRAINING COMPLETE")