| """ |
| Module 4: Model Evaluation |
| Comprehensive evaluation: metrics, confusion matrices, ROC/PR curves, |
| threshold analysis, business impact estimation. |
| """ |
| import os, sys |
| sys.path.insert(0, '/app/fraud_detection') |
| import numpy as np |
| import pandas as pd |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| import joblib |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
# Imported so joblib can resolve these classes when unpickling the saved autoencoder model.
from ae_model import AutoencoderWrapper, Autoencoder

from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score,
    average_precision_score, matthews_corrcoef, confusion_matrix,
    roc_curve, precision_recall_curve, classification_report
)

from config import DATA_DIR, MODELS_DIR, FIGURES_DIR, FIG_DPI, FIG_BG

plt.style.use('seaborn-v0_8-whitegrid')


def evaluate_model(model, X, y, model_name, threshold=0.5):
    """Evaluate a single model with all metrics.

    Assumes the model exposes a scikit-learn style ``predict_proba``.
    """
    proba = model.predict_proba(X)[:, 1]
    preds = (proba >= threshold).astype(int)

    metrics = {
        'Model': model_name,
        'Precision': precision_score(y, preds, zero_division=0),
        'Recall': recall_score(y, preds, zero_division=0),
        'F1': f1_score(y, preds, zero_division=0),
        'ROC-AUC': roc_auc_score(y, proba),
        'PR-AUC': average_precision_score(y, proba),
        'MCC': matthews_corrcoef(y, preds),
    }

    cm = confusion_matrix(y, preds)
    return metrics, cm, proba, preds
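
# Illustrative usage sketch for evaluate_model (not executed by the pipeline).
# `clf` is a hypothetical fitted classifier; any estimator exposing a
# scikit-learn style predict_proba works the same way:
#
#     metrics, cm, proba, preds = evaluate_model(clf, X_test, y_test, "My_Model", threshold=0.5)
#     tn, fp, fn, tp = cm.ravel()  # sklearn confusion_matrix layout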


def evaluate_all_models(models, X_test, y_test):
    """Evaluate all models on the test set."""
    print("=" * 60)
    print("MODEL EVALUATION ON TEST SET")
    print("=" * 60)

    all_metrics = []
    all_cm = {}
    all_proba = {}
    all_preds = {}

    for name, model in models.items():
        print(f"\nEvaluating: {name}")
        metrics, cm, proba, preds = evaluate_model(model, X_test, y_test, name)
        all_metrics.append(metrics)
        all_cm[name] = cm
        all_proba[name] = proba
        all_preds[name] = preds

        print(f"  Precision: {metrics['Precision']:.4f}")
        print(f"  Recall:    {metrics['Recall']:.4f}")
        print(f"  F1:        {metrics['F1']:.4f}")
        print(f"  ROC-AUC:   {metrics['ROC-AUC']:.4f}")
        print(f"  PR-AUC:    {metrics['PR-AUC']:.4f}")
        print(f"  MCC:       {metrics['MCC']:.4f}")

    # Comparison table sorted by PR-AUC, the primary metric for this imbalanced task.
    df_metrics = pd.DataFrame(all_metrics)
    df_metrics = df_metrics.sort_values('PR-AUC', ascending=False)

    print("\n" + "=" * 60)
    print("MODEL COMPARISON TABLE")
    print("=" * 60)
    print(df_metrics.to_string(index=False, float_format='%.4f'))

    # Persist the comparison table alongside the generated figures.
    df_metrics.to_csv(os.path.join(FIGURES_DIR, "model_comparison.csv"), index=False)

    return df_metrics, all_cm, all_proba, all_preds
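
# Illustrative input for evaluate_all_models: a name -> fitted-estimator mapping.
# The names below are hypothetical; the real dictionary is loaded from
# all_models_with_ae.joblib in run_evaluation():
#
#     models = {"Logistic_Regression": lr, "Random_Forest_Tuned": rf}
#     df_metrics, all_cm, all_proba, all_preds = evaluate_all_models(models, X_test, y_test)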


def plot_confusion_matrices(all_cm, model_names):
    """Plot a grid of confusion matrices, one per model."""
    n = len(model_names)
    cols = 4
    rows = (n + cols - 1) // cols

    fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 4*rows), facecolor=FIG_BG)
    if rows == 1:
        axes = axes.reshape(1, -1)  # keep 2-D indexing when subplots returns a 1-D array

    for idx, name in enumerate(model_names):
        r, c = idx // cols, idx % cols
        cm = all_cm[name]
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[r, c],
                    xticklabels=['Legit', 'Fraud'], yticklabels=['Legit', 'Fraud'])
        axes[r, c].set_title(name.replace('_', ' '), fontsize=10, fontweight='bold')
        axes[r, c].set_ylabel('Actual')
        axes[r, c].set_xlabel('Predicted')

    # Hide any unused subplot slots.
    for idx in range(n, rows*cols):
        r, c = idx // cols, idx % cols
        axes[r, c].set_visible(False)

    plt.suptitle('Confusion Matrices (Test Set)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "confusion_matrices.png"), dpi=FIG_DPI, bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "confusion_matrices.pdf"), bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: confusion_matrices.png/pdf")


def plot_roc_curves(all_proba, y_test):
    """Plot ROC curves for all models."""
    fig, ax = plt.subplots(1, 1, figsize=(10, 8), facecolor=FIG_BG)

    colors = plt.cm.tab20(np.linspace(0, 1, len(all_proba)))

    for (name, proba), color in zip(all_proba.items(), colors):
        fpr, tpr, _ = roc_curve(y_test, proba)
        auc = roc_auc_score(y_test, proba)
        ax.plot(fpr, tpr, color=color, linewidth=2, label=f'{name.replace("_", " ")} (AUC={auc:.4f})')

    ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves - All Models', fontsize=14, fontweight='bold')
    ax.legend(loc='lower right', fontsize=9)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1.02])

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "roc_curves.png"), dpi=FIG_DPI, bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "roc_curves.pdf"), bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: roc_curves.png/pdf")


def plot_pr_curves(all_proba, y_test):
    """Plot Precision-Recall curves for all models."""
    fig, ax = plt.subplots(1, 1, figsize=(10, 8), facecolor=FIG_BG)

    colors = plt.cm.tab20(np.linspace(0, 1, len(all_proba)))

    for (name, proba), color in zip(all_proba.items(), colors):
        precision, recall, _ = precision_recall_curve(y_test, proba)
        pr_auc = average_precision_score(y_test, proba)
        ax.plot(recall, precision, color=color, linewidth=2, label=f'{name.replace("_", " ")} (AP={pr_auc:.4f})')

    # A no-skill classifier's precision equals the positive-class prevalence.
    baseline = y_test.mean()
    ax.axhline(y=baseline, color='k', linestyle='--', linewidth=1, label=f'Baseline ({baseline:.4f})')
    ax.set_xlabel('Recall', fontsize=12)
    ax.set_ylabel('Precision', fontsize=12)
    ax.set_title('Precision-Recall Curves - All Models', fontsize=14, fontweight='bold')
    ax.legend(loc='upper right', fontsize=9)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1.02])

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "pr_curves.png"), dpi=FIG_DPI, bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "pr_curves.pdf"), bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: pr_curves.png/pdf")


def threshold_analysis(best_model_name, best_proba, y_test):
    """Analyze threshold sensitivity for the best model."""
    print("\n" + "=" * 60)
    print(f"THRESHOLD SENSITIVITY ANALYSIS ({best_model_name})")
    print("=" * 60)

    thresholds = np.arange(0.05, 0.96, 0.05)
    results = []

    for t in thresholds:
        preds = (best_proba >= t).astype(int)
        prec = precision_score(y_test, preds, zero_division=0)
        rec = recall_score(y_test, preds, zero_division=0)
        f1 = f1_score(y_test, preds, zero_division=0)
        mcc = matthews_corrcoef(y_test, preds)
        results.append({'Threshold': t, 'Precision': prec, 'Recall': rec, 'F1': f1, 'MCC': mcc})

    df_thresh = pd.DataFrame(results)
    print(df_thresh.to_string(index=False, float_format='%.4f'))

    # Select the threshold that maximizes F1.
    best_idx = df_thresh['F1'].idxmax()
    best_thresh = df_thresh.loc[best_idx, 'Threshold']
    print(f"\nOptimal threshold (by F1): {best_thresh:.2f} → F1={df_thresh.loc[best_idx, 'F1']:.4f}")

    fig, axes = plt.subplots(1, 2, figsize=(14, 5), facecolor=FIG_BG)

    axes[0].plot(df_thresh['Threshold'], df_thresh['Precision'], 'b-', linewidth=2, label='Precision')
    axes[0].plot(df_thresh['Threshold'], df_thresh['Recall'], 'r-', linewidth=2, label='Recall')
    axes[0].plot(df_thresh['Threshold'], df_thresh['F1'], 'g-', linewidth=2, label='F1 Score')
    axes[0].axvline(x=best_thresh, color='gray', linestyle='--', label=f'Best Threshold ({best_thresh:.2f})')
    axes[0].set_xlabel('Decision Threshold', fontsize=12)
    axes[0].set_ylabel('Score', fontsize=12)
    axes[0].set_title(f'Threshold Analysis - {best_model_name}', fontsize=12, fontweight='bold')
    axes[0].legend()

    axes[1].plot(df_thresh['Threshold'], df_thresh['MCC'], 'm-', linewidth=2, label='MCC')
    axes[1].axvline(x=best_thresh, color='gray', linestyle='--')
    axes[1].set_xlabel('Decision Threshold', fontsize=12)
    axes[1].set_ylabel('MCC', fontsize=12)
    axes[1].set_title('Matthews Correlation Coefficient', fontsize=12, fontweight='bold')
    axes[1].legend()

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "threshold_analysis.png"), dpi=FIG_DPI, bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "threshold_analysis.pdf"), bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: threshold_analysis.png/pdf")

    return df_thresh, best_thresh
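
# Hedged sketch of applying the selected operating point downstream (not executed here);
# `best_proba` and `best_thresh` refer to the values computed and returned above:
#
#     tuned_preds = (best_proba >= best_thresh).astype(int)
#     print(classification_report(y_test, tuned_preds, digits=4))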


def business_impact_analysis(all_cm, y_test, X_test_amounts):
    """Estimate business impact: fraud losses caught vs. missed.

    X_test_amounts is currently unused; the estimate below relies on the
    dataset-average fraud amount rather than per-transaction amounts.
    """
    print("\n" + "=" * 60)
    print("BUSINESS IMPACT ANALYSIS")
    print("=" * 60)

    # Approximate dollar figures from the raw transaction amounts.
    df = pd.read_csv(os.path.join(DATA_DIR, "creditcard.csv"))
    avg_fraud_amount = df[df['Class'] == 1]['Amount'].mean()
    avg_legit_amount = df[df['Class'] == 0]['Amount'].mean()
    total_fraud_in_test = y_test.sum()

    print(f"Average fraud transaction amount: ${avg_fraud_amount:.2f}")
    print(f"Average legitimate transaction amount: ${avg_legit_amount:.2f}")
    print(f"Total fraudulent transactions in test set: {total_fraud_in_test}")
    print(f"Estimated total fraud exposure: ${total_fraud_in_test * avg_fraud_amount:,.2f}")

    impact_results = []
    for name, cm in all_cm.items():
        tn, fp, fn, tp = cm.ravel()  # sklearn layout: [[tn, fp], [fn, tp]]

        fraud_caught = tp * avg_fraud_amount
        fraud_missed = fn * avg_fraud_amount
        false_alarm_cost = fp * 5  # assumed $5 review cost per false alarm

        net_savings = fraud_caught - false_alarm_cost
        catch_rate = tp / (tp + fn) if (tp + fn) > 0 else 0

        impact_results.append({
            'Model': name,
            'True Positives': tp,
            'False Negatives': fn,
            'False Positives': fp,
            'Fraud Caught ($)': fraud_caught,
            'Fraud Missed ($)': fraud_missed,
            'False Alarm Cost ($)': false_alarm_cost,
            'Net Savings ($)': net_savings,
            'Catch Rate (%)': catch_rate * 100
        })

    df_impact = pd.DataFrame(impact_results)
    df_impact = df_impact.sort_values('Net Savings ($)', ascending=False)

    print("\n" + df_impact.to_string(index=False, float_format='%.2f'))
    df_impact.to_csv(os.path.join(FIGURES_DIR, "business_impact.csv"), index=False)

    return df_impact
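
# Worked example of the impact arithmetic above, with purely illustrative numbers
# (not results from this dataset): if avg_fraud_amount were $120 and a model gave
# tp=90, fn=10, fp=200, then
#     fraud_caught     = 90 * 120 = $10,800
#     fraud_missed     = 10 * 120 = $1,200
#     false_alarm_cost = 200 * 5  = $1,000
#     net_savings      = 10,800 - 1,000 = $9,800; catch rate = 90 / (90 + 10) = 90%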


def plot_feature_importance(models, feature_names):
    """Plot feature importance for tree-based models and logistic regression."""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12), facecolor=FIG_BG)

    # Tree-based models fill the first three panels; logistic regression uses the fourth.
    tree_models = {
        'Random Forest': 'Random_Forest_Tuned',
        'XGBoost': 'XGBoost_Tuned',
        'LightGBM': 'LightGBM_Tuned',
    }

    for idx, (title, key) in enumerate(tree_models.items()):
        if key in models:
            r, c = idx // 2, idx % 2
            model = models[key]
            importances = model.feature_importances_
            indices = np.argsort(importances)[-15:]

            axes[r, c].barh(range(len(indices)), importances[indices], color='steelblue', edgecolor='black', linewidth=0.3)
            axes[r, c].set_yticks(range(len(indices)))
            axes[r, c].set_yticklabels([feature_names[i] for i in indices], fontsize=9)
            axes[r, c].set_title(f'{title} - Top 15 Features', fontsize=11, fontweight='bold')
            axes[r, c].set_xlabel('Importance')

    # Logistic regression: rank features by absolute coefficient magnitude.
    if 'Logistic_Regression' in models:
        lr = models['Logistic_Regression']
        coefs = np.abs(lr.coef_[0])
        indices = np.argsort(coefs)[-15:]
        axes[1, 1].barh(range(len(indices)), coefs[indices], color='coral', edgecolor='black', linewidth=0.3)
        axes[1, 1].set_yticks(range(len(indices)))
        axes[1, 1].set_yticklabels([feature_names[i] for i in indices], fontsize=9)
        axes[1, 1].set_title('Logistic Regression - Top 15 Features (|coef|)', fontsize=11, fontweight='bold')
        axes[1, 1].set_xlabel('Absolute Coefficient')

    plt.suptitle('Feature Importance Across Models', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "feature_importance.png"), dpi=FIG_DPI, bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "feature_importance.pdf"), bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: feature_importance.png/pdf")


def run_evaluation():
    """Run the complete evaluation pipeline."""
    # Load the processed data splits and the trained model bundle.
    data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib"))
    models = joblib.load(os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))

    X_test = data['X_test']
    y_test = data['y_test']
    feature_names = data['feature_names']

    # Metrics, confusion matrices, probabilities, and hard predictions per model.
    df_metrics, all_cm, all_proba, all_preds = evaluate_all_models(models, X_test, y_test)

    # df_metrics is sorted by PR-AUC (descending), so the first row is the best model.
    best_model_name = df_metrics.iloc[0]['Model']
    print(f"\nBest model by PR-AUC: {best_model_name}")

    plot_confusion_matrices(all_cm, list(models.keys()))
    plot_roc_curves(all_proba, y_test)
    plot_pr_curves(all_proba, y_test)

    df_thresh, best_thresh = threshold_analysis(best_model_name, all_proba[best_model_name], y_test)

    df_impact = business_impact_analysis(all_cm, y_test, X_test)

    plot_feature_importance(models, feature_names)

    # Bundle all artifacts for downstream reporting.
    eval_results = {
        'metrics': df_metrics,
        'confusion_matrices': all_cm,
        'probabilities': all_proba,
        'predictions': all_preds,
        'threshold_analysis': df_thresh,
        'best_threshold': best_thresh,
        'business_impact': df_impact,
        'best_model': best_model_name
    }
    joblib.dump(eval_results, os.path.join(DATA_DIR, "evaluation_results.joblib"))

    print("\n" + "=" * 60)
    print("EVALUATION COMPLETE")
    print("=" * 60)

    return eval_results
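
# Hedged sketch of how a downstream report or notebook might reload these results;
# the variable name is hypothetical, only the artifact path comes from this module:
#
#     results = joblib.load(os.path.join(DATA_DIR, "evaluation_results.joblib"))
#     print(results['best_model'], results['best_threshold'])
#     print(results['metrics'].to_string(index=False))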


if __name__ == "__main__":
    eval_results = run_evaluation()
|