\documentclass[journal]{IEEEtran}
% --- Packages ----------------------------------------------------------------
\usepackage{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{xcolor}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{array}
\usepackage{float}
\usepackage{url}
\usepackage{balance}
% --- Listings Configuration --------------------------------------------------
\lstset{
language=Python,
basicstyle=\ttfamily\scriptsize,
keywordstyle=\color{blue},
stringstyle=\color{red},
commentstyle=\color{green!60!black},
breaklines=true,
frame=single,
numbers=left,
numberstyle=\tiny\color{gray},
captionpos=b,
}
% --- Graphics Path -----------------------------------------------------------
\graphicspath{{figures/}}
\begin{document}
% -----------------------------------------------------------------------------
% TITLE
% -----------------------------------------------------------------------------
\title{A Comprehensive Ensemble-Based Framework for Credit Card Fraud Detection with Explainable AI}
\author{}
\maketitle
% -----------------------------------------------------------------------------
% ABSTRACT
% -----------------------------------------------------------------------------
\begin{abstract}
Credit card fraud poses a significant threat to the global financial ecosystem, with estimated losses exceeding \$32 billion annually. This paper presents a comprehensive end-to-end fraud detection framework that systematically evaluates and compares seven machine learning approaches: Logistic Regression, Random Forest, XGBoost, LightGBM, Multilayer Perceptron, Autoencoder-based anomaly detection, and a Voting Ensemble. Using the benchmark European Cardholder dataset (284,807 transactions, 0.173\% fraud rate), we engineer 12 novel features and address the extreme class imbalance through both SMOTE oversampling and cost-sensitive learning with class weights. Our XGBoost model achieves the best performance with a PR-AUC of 0.8166, precision of 0.9048, recall of 0.8028, and F1-score of 0.8507 on the held-out test set. We demonstrate that optimizing the decision threshold from the default 0.5 to 0.55 improves F1 from 0.8507 to 0.8636. Comprehensive model explainability via SHAP and LIME analysis reveals that PCA components V4 and V14, together with the engineered PCA-magnitude feature, are the primary discriminative features. Error analysis shows that false negatives arise from sophisticated fraud patterns that closely mimic legitimate transaction behavior. We deploy the model as a production-ready FastAPI service achieving sub-10ms inference latency. The framework includes automated concept drift monitoring and retraining recommendations. All code, models, and results are publicly available.
\end{abstract}
\begin{IEEEkeywords}
Fraud detection, credit card, machine learning, XGBoost, ensemble learning, explainable AI, SHAP, class imbalance, anomaly detection
\end{IEEEkeywords}
% -----------------------------------------------------------------------------
% I. INTRODUCTION
% -----------------------------------------------------------------------------
\section{Introduction}
\IEEEPARstart{F}{inancial} fraud detection has become one of the most critical applications of machine learning in the modern digital economy. The proliferation of electronic payment systems has led to an exponential increase in both the volume of transactions and the sophistication of fraudulent activities~\cite{dal2015credit}. According to the Nilson Report, global card fraud losses reached \$32.34 billion in 2021 and are projected to exceed \$43 billion by 2026~\cite{nilson2022}.
The fundamental challenge in fraud detection lies in the extreme class imbalance inherent in transaction data. In typical datasets, fraudulent transactions constitute less than 0.5\% of all transactions~\cite{pozzolo2015calibrating}. This imbalance renders conventional classification metrics such as accuracy misleading and necessitates specialized evaluation criteria including Precision-Recall AUC and Matthews Correlation Coefficient~\cite{saito2015precision}.
Previous approaches to fraud detection have ranged from rule-based expert systems~\cite{bolton2002statistical} to sophisticated deep learning architectures~\cite{zhang2021fraud}. While deep learning methods have shown promise, tree-based ensemble methods such as XGBoost and LightGBM continue to demonstrate competitive or superior performance on tabular financial data~\cite{shwartz2022tabular}, particularly when augmented with careful feature engineering and proper handling of class imbalance.
This paper makes the following contributions:
\begin{enumerate}
\item A systematic comparison of seven machine learning approaches for fraud detection, including both supervised and unsupervised methods.
\item Novel feature engineering incorporating transaction velocity, amount deviation metrics, and PCA component interactions.
\item Rigorous evaluation methodology with SMOTE applied only after train-test splitting and feature scaling fitted exclusively on training data.
\item Comprehensive explainability analysis using SHAP and LIME to identify key fraud indicators.
\item A production-ready API deployment achieving sub-10ms inference latency.
\item Quantitative business impact analysis estimating financial savings from deployment.
\end{enumerate}
% -----------------------------------------------------------------------------
% II. RELATED WORK
% -----------------------------------------------------------------------------
\section{Related Work}
Credit card fraud detection has been extensively studied across multiple paradigms. Dal Pozzolo et al.~\cite{dal2015credit} provided a foundational analysis of the challenges posed by class imbalance and concept drift in real-world fraud detection systems. Their work established that undersampling strategies could be effective but risked losing valuable information from the majority class.
Chawla et al.~\cite{chawla2002smote} introduced SMOTE (Synthetic Minority Over-sampling Technique), which generates synthetic minority class samples by interpolating between existing examples. Subsequent work by Fernandez et al.~\cite{fernandez2018smote} demonstrated that SMOTE should be applied exclusively to training data, as applying it before splitting introduces data leakage.
Ensemble methods have shown particular promise in fraud detection. Xuan et al.~\cite{xuan2018random} demonstrated that Random Forests achieve robust performance through bagging and feature randomization. Chen and Guestrin~\cite{chen2016xgboost} introduced XGBoost, which has since become a dominant method for tabular data classification, and gradient-boosted trees have been applied successfully to credit card fraud detection~\cite{taha2020detection}.
Ke et al.~\cite{ke2017lightgbm} proposed LightGBM with leaf-wise tree growth and gradient-based one-side sampling, achieving faster training with comparable accuracy. Prokhorenkova et al.~\cite{prokhorenkova2018catboost} introduced CatBoost with ordered boosting to handle categorical features natively.
Deep learning approaches have also been explored. Pumsirirat and Yan~\cite{pumsirirat2018credit} employed autoencoders for anomaly-based fraud detection, training exclusively on legitimate transactions and detecting fraud through reconstruction error. Zhang et al.~\cite{zhang2021fraud} proposed a convolutional recurrent neural network that captures sequential transaction patterns.
Explainability in fraud detection has gained importance due to regulatory requirements. Lundberg and Lee~\cite{lundberg2017unified} introduced SHAP (SHapley Additive exPlanations), providing consistent feature attribution. Ribeiro et al.~\cite{ribeiro2016lime} proposed LIME (Local Interpretable Model-agnostic Explanations) for instance-level interpretability. Belle and Papantonis~\cite{belle2021principles} surveyed explainable AI methods applicable to financial decision-making.
Akiba et al.~\cite{akiba2019optuna} introduced Optuna, a hyperparameter optimization framework using Tree-structured Parzen Estimators (TPE) that efficiently explores complex search spaces.
Recent work by Shwartz-Ziv and Armon~\cite{shwartz2022tabular} demonstrated that well-tuned tree-based methods still outperform deep learning on most tabular datasets, supporting our choice of XGBoost as the primary model. Grinsztajn et al.~\cite{grinsztajn2022tree} further corroborated this finding with extensive benchmarking.
% -----------------------------------------------------------------------------
% III. DATASET AND EXPLORATORY DATA ANALYSIS
% -----------------------------------------------------------------------------
\section{Dataset and Exploratory Data Analysis}
\subsection{Dataset Description}
We use the European Cardholder Credit Card Fraud Detection dataset~\cite{dal2015credit}, containing 284,807 transactions made over two days in September 2013. The dataset includes 28 PCA-transformed features (V1--V28), the original \texttt{Time} and \texttt{Amount} features, and a binary \texttt{Class} label (0~=~legitimate, 1~=~fraud).
\subsection{Class Distribution}
The dataset exhibits extreme class imbalance (Table~\ref{tab:class_dist}, Fig.~\ref{fig:class_dist}), with only 492 fraudulent transactions (0.173\%), yielding an imbalance ratio of approximately 1:577. This severe imbalance necessitates specialized handling during both training and evaluation.
\begin{table}[!t]
\centering
\caption{Class Distribution in the Dataset}
\label{tab:class_dist}
\begin{tabular}{lrr}
\toprule
\textbf{Class} & \textbf{Count} & \textbf{Percentage} \\
\midrule
Legitimate (0) & 284,315 & 99.827\% \\
Fraud (1) & 492 & 0.173\% \\
\midrule
Total & 284,807 & 100\% \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Key Observations}
Our exploratory analysis revealed five critical findings:
\begin{enumerate}
\item \textbf{Amount Patterns}: Fraudulent transactions have a mean of \$122.21 (median \$9.25) versus legitimate mean of \$88.29 (median \$22.00), suggesting fraudsters often test with small amounts.
\item \textbf{Temporal Patterns}: Night-time (0--6h) fraud rate is 0.518\% versus 0.137\% during daytime, indicating higher fraud activity during low-monitoring periods.
\item \textbf{Discriminative Features}: V17 ($r = -0.326$), V14 ($r = -0.303$), and V12 ($r = -0.261$) show the strongest negative correlation with fraud; V11 ($r = 0.155$) and V4 ($r = 0.133$) show positive correlation.
\item \textbf{Data Quality}: No missing values are present. A total of 1,081 duplicate rows were identified and removed, leaving 283,726 transactions.
\item \textbf{Feature Scale}: V1--V28 are PCA-transformed; only Time and Amount require normalization.
\end{enumerate}
\begin{figure}[!t]
\centering
\includegraphics[width=\columnwidth]{class_distribution.png}
\caption{Class distribution showing extreme imbalance (0.173\% fraud rate).}
\label{fig:class_dist}
\end{figure}
% -----------------------------------------------------------------------------
% IV. METHODOLOGY
% -----------------------------------------------------------------------------
\section{Methodology}
\subsection{Feature Engineering}
We engineer 12 additional features to capture temporal, behavioral, and interaction patterns:
\begin{equation}
\text{Hour}_{\sin} = \sin\left(\frac{2\pi \cdot h}{24}\right), \quad \text{Hour}_{\cos} = \cos\left(\frac{2\pi \cdot h}{24}\right)
\end{equation}
where $h = (\texttt{Time} / 3600) \bmod 24$ is the hour of day, encoded cyclically to preserve temporal continuity.
\begin{equation}
\text{Amount}_{z} = \frac{A - \mu_A}{\sigma_A}
\end{equation}
where $A$ is the transaction amount, $\mu_A$ and $\sigma_A$ are the population mean and standard deviation respectively.
\begin{equation}
\text{Velocity} = \frac{1}{\Delta t + 1}
\end{equation}
where $\Delta t$ is the time difference from the previous transaction, approximating transaction frequency.
Interaction features capture joint effects of top PCA components:
\begin{equation}
I_{ij} = V_i \times V_j, \quad (i,j) \in \{(14,17), (12,14), (10,14)\}
\end{equation}
The PCA magnitude aggregates all principal components:
\begin{equation}
M = \sqrt{\sum_{i=1}^{28} V_i^2}
\end{equation}
\subsection{Class Imbalance Handling}
We compare two approaches for handling the 1:577 class imbalance:
\textbf{SMOTE}~\cite{chawla2002smote}: Applied exclusively to the training set after splitting, generating synthetic fraud samples to achieve a 1:2 minority-to-majority ratio.
\textbf{Cost-Sensitive Learning}: Applying class weights inversely proportional to class frequency:
\begin{equation}
w_c = \frac{N}{2 \cdot N_c}
\end{equation}
where $N$ is the number of training samples and $N_c$ is the number of training samples belonging to class $c$, yielding $w_0 = 0.501$ and $w_1 = 300.01$.
\subsection{Data Splitting and Scaling}
We employ stratified 70/15/15 train/validation/test splitting to preserve the fraud ratio across all sets. Feature scaling uses RobustScaler fitted exclusively on training data:
\begin{equation}
x' = \frac{x - Q_2(x)}{Q_3(x) - Q_1(x)}
\end{equation}
where $Q_1$, $Q_2$, $Q_3$ are the first quartile, median, and third quartile respectively.
\subsection{Models}
\subsubsection{Logistic Regression (Baseline)}
A linear model with L2 regularization ($C=0.1$) and class weights, serving as an interpretable baseline.
\subsubsection{Random Forest}
An ensemble of 150 decision trees with max depth 12 and balanced class weights, leveraging bagging for variance reduction.
\subsubsection{XGBoost}
Gradient boosted trees with 200 estimators, max depth 6, learning rate 0.1, and scale\_pos\_weight for imbalance handling. Uses histogram-based splitting for efficiency.
\subsubsection{LightGBM}
Leaf-wise gradient boosting with 200 estimators, max depth 8, and gradient-based one-side sampling for faster training.
\subsubsection{MLP Neural Network}
A three-layer perceptron (128-64-32 neurons) with ReLU activation, dropout regularization, and adaptive learning rate. Trained on SMOTE-augmented data.
\subsubsection{Autoencoder (Anomaly Detection)}
A symmetric autoencoder (42-64-32-16-32-64-42) trained exclusively on legitimate transactions. Fraud is detected through reconstruction error:
\begin{equation}
e(x) = \frac{1}{d}\sum_{i=1}^{d}(x_i - \hat{x}_i)^2
\end{equation}
where $\hat{x}$ is the reconstruction and $d$ is the feature dimensionality.
\subsubsection{Voting Ensemble}
Soft voting over the top three tuned models (XGBoost, LightGBM, Random Forest):
\begin{equation}
P(\text{fraud}|x) = \frac{1}{3}\sum_{m=1}^{3} P_m(\text{fraud}|x)
\end{equation}
\subsection{Hyperparameter Optimization}
We use Optuna~\cite{akiba2019optuna} with Tree-structured Parzen Estimators (TPE) to tune the top three models, optimizing PR-AUC on the validation set:
\begin{equation}
\theta^* = \arg\max_{\theta} \text{PR-AUC}(f_\theta, \mathcal{D}_{val})
\end{equation}
% -----------------------------------------------------------------------------
% V. EXPERIMENTAL SETUP
% -----------------------------------------------------------------------------
\section{Experimental Setup}
\subsection{Environment}
All experiments were conducted using Python 3.12 with scikit-learn 1.8.0, XGBoost 3.2.0, LightGBM 4.6.0, PyTorch 2.11.0, and Optuna 4.8.0. Computations were performed on CPU-based infrastructure.
\subsection{Evaluation Metrics}
Given the extreme class imbalance, we report six metrics:
\begin{itemize}
\item \textbf{Precision}: $P = \frac{TP}{TP + FP}$
\item \textbf{Recall}: $R = \frac{TP}{TP + FN}$
\item \textbf{F1 Score}: $F1 = \frac{2PR}{P + R}$
\item \textbf{ROC-AUC}: Area under the ROC curve
\item \textbf{PR-AUC}: Area under the Precision-Recall curve (primary metric)
\item \textbf{MCC}: $\frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}$
\end{itemize}
% -----------------------------------------------------------------------------
% VI. RESULTS AND DISCUSSION
% -----------------------------------------------------------------------------
\section{Results and Discussion}
\subsection{Model Comparison}
\begin{table*}[!t]
\centering
\caption{Comprehensive Model Comparison on Test Set (Threshold = 0.5)}
\label{tab:results}
\begin{tabular}{lcccccc}
\toprule
\textbf{Model} & \textbf{Precision} & \textbf{Recall} & \textbf{F1} & \textbf{ROC-AUC} & \textbf{PR-AUC} & \textbf{MCC} \\
\midrule
XGBoost & \textbf{0.9048} & 0.8028 & \textbf{0.8507} & 0.9735 & \textbf{0.8166} & \textbf{0.8520} \\
Voting Ensemble & 0.8636 & 0.8028 & 0.8321 & \textbf{0.9783} & 0.8007 & 0.8324 \\
LightGBM (Tuned) & 0.7073 & \textbf{0.8169} & 0.7582 & 0.9318 & 0.7958 & 0.7597 \\
XGBoost (Tuned) & 0.8382 & 0.8028 & 0.8201 & 0.9697 & 0.7929 & 0.8200 \\
RF (Tuned) & 0.8730 & 0.7746 & 0.8209 & 0.9675 & 0.7926 & 0.8221 \\
Random Forest & 0.8333 & 0.7746 & 0.8029 & 0.9526 & 0.7710 & 0.8031 \\
MLP & 0.6914 & 0.7887 & 0.7368 & 0.9433 & 0.7522 & 0.7380 \\
Logistic Regression & 0.0488 & 0.8873 & 0.0924 & 0.9615 & 0.7350 & 0.2042 \\
Autoencoder & 0.0033 & 1.0000 & 0.0067 & 0.9604 & 0.0442 & 0.0409 \\
\bottomrule
\end{tabular}
\end{table*}
Table~\ref{tab:results} presents the comprehensive evaluation results, and Figs.~\ref{fig:roc} and~\ref{fig:pr} show the corresponding ROC and Precision-Recall curves. XGBoost achieves the highest PR-AUC (0.8166), F1-score (0.8507), and MCC (0.8520), demonstrating superior overall performance. The Voting Ensemble achieves the highest ROC-AUC (0.9783) but a slightly lower PR-AUC (0.8007).
Key observations:
\textbf{Tree-based models dominate}: XGBoost, Random Forest, and LightGBM outperform the neural network approaches on F1-score, PR-AUC, and MCC, consistent with findings by Shwartz-Ziv and Armon~\cite{shwartz2022tabular}.
\textbf{Class weight handling matters}: Logistic Regression achieves high recall (0.8873) but extremely low precision (0.0488), indicating that the linear decision boundary with class weights is too aggressive in flagging transactions.
\textbf{Autoencoder limitations}: While achieving perfect recall (1.0), the autoencoder suffers from extremely low precision (0.0033), flagging roughly half of all test transactions as anomalous. This suggests that the reconstruction-based approach is too sensitive for this PCA-transformed feature space.
\begin{figure}[!t]
\centering
\includegraphics[width=\columnwidth]{roc_curves.png}
\caption{ROC curves for all models. XGBoost and Voting Ensemble achieve the highest AUC.}
\label{fig:roc}
\end{figure}
\begin{figure}[!t]
\centering
\includegraphics[width=\columnwidth]{pr_curves.png}
\caption{Precision-Recall curves. PR-AUC is the primary metric for imbalanced classification.}
\label{fig:pr}
\end{figure}
\subsection{Threshold Optimization}
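The default threshold of 0.5 is suboptimal for imbalanced data. As shown in Table~\ref{tab:threshold}, raising the threshold to 0.55 increases precision from 0.9048 to 0.9344 without reducing recall, which maximizes the F1-score at 0.8636; thresholds up to 0.70 yield the same result.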
\begin{table}[!t]
\centering
\caption{Threshold Sensitivity for XGBoost}
\label{tab:threshold}
\begin{tabular}{cccc}
\toprule
\textbf{Threshold} & \textbf{Precision} & \textbf{Recall} & \textbf{F1} \\
\midrule
0.30 & 0.8769 & 0.8028 & 0.8382 \\
0.40 & 0.9048 & 0.8028 & 0.8507 \\
0.50 & 0.9048 & 0.8028 & 0.8507 \\
\textbf{0.55} & \textbf{0.9344} & \textbf{0.8028} & \textbf{0.8636} \\
0.70 & 0.9344 & 0.8028 & 0.8636 \\
0.90 & 0.9322 & 0.7746 & 0.8462 \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Business Impact}
\begin{table}[!t]
\centering
\caption{Business Impact Analysis (Test Set)}
\label{tab:business}
\begin{tabular}{lrrr}
\toprule
\textbf{Model} & \textbf{Caught (\$)} & \textbf{Missed (\$)} & \textbf{Net (\$)} \\
\midrule
XGBoost & 6,966 & 1,711 & 6,936 \\
Ensemble & 6,966 & 1,711 & 6,921 \\
RF (Tuned) & 6,722 & 1,955 & 6,682 \\
LR & 7,699 & 978 & 1,554 \\
Autoencoder & 8,677 & 0 & $-$97,368 \\
\bottomrule
\end{tabular}
\end{table}
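Table~\ref{tab:business} demonstrates that XGBoost provides the highest net savings (\$6,936 on the test set), catching 80.3\% of fraudulent transactions while maintaining only 6 false positives. Net savings correspond to the fraud amount caught minus a review cost of \$5 per false positive (e.g., $6{,}966 - 6 \times 5 = 6{,}936$ for XGBoost). The Autoencoder, despite catching all fraud, generates massive false alarm costs.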
\subsection{Feature Importance}
SHAP analysis (Fig.~\ref{fig:shap}) reveals that V4 (mean $|\text{SHAP}| = 1.913$), V14 (1.843), and PCA\_magnitude (1.113) are the primary fraud discriminators. These features correspond to specific latent patterns in the PCA-transformed space that distinguish fraudulent from legitimate behavior.
\begin{figure}[!t]
\centering
\includegraphics[width=\columnwidth]{shap_summary.png}
\caption{SHAP summary plot showing feature contributions to fraud predictions.}
\label{fig:shap}
\end{figure}
% -----------------------------------------------------------------------------
% VII. ERROR ANALYSIS
% -----------------------------------------------------------------------------
\section{Error Analysis}
\subsection{False Negative Analysis}
Of the 14 false negatives (missed fraud), the mean predicted fraud probability was only 0.013, indicating these transactions were classified with high confidence as legitimate. Feature comparison reveals that false negatives have V14 values averaging $-0.97$ versus $-8.45$ for true positives, and PCA magnitude of 1.82 versus 12.25. These missed fraud transactions exhibit patterns remarkably similar to legitimate transactions, suggesting sophisticated fraud that deliberately mimics normal behavior.
\subsection{False Positive Analysis}
The 6 false positives have a mean predicted fraud probability of 0.827, with feature distributions (V14: $-7.13$, V12: $-6.80$) closely resembling actual fraud patterns. These represent legitimate transactions with genuinely anomalous characteristics---unusual amounts, timing, or spending patterns.
\subsection{Concept Drift Assessment}
Comparing model confidence between early and late test periods reveals a drift indicator of $+0.115$, suggesting modest temporal variation. We recommend weekly monitoring with automated retraining triggers when PR-AUC drops below 0.70.
% -----------------------------------------------------------------------------
% VIII. LIMITATIONS
% -----------------------------------------------------------------------------
\section{Limitations}
\begin{enumerate}
\item \textbf{PCA Anonymization}: The V1--V28 features are PCA-transformed, preventing domain-specific feature engineering and limiting interpretability to latent space patterns.
\item \textbf{Temporal Scope}: The dataset covers only two days, limiting assessment of long-term concept drift and seasonal fraud patterns.
\item \textbf{Single-Institution Data}: Results from one European bank may not generalize across institutions, geographies, or payment networks.
\item \textbf{Feature Limitations}: Without raw features (merchant category, location, device), important fraud signals are unavailable.
\item \textbf{Static Threshold}: The optimal threshold may shift as fraud patterns evolve; dynamic threshold adaptation is not implemented.
\end{enumerate}
% -----------------------------------------------------------------------------
% IX. FUTURE WORK
% -----------------------------------------------------------------------------
\section{Future Work}
Several promising directions emerge from this research:
\textbf{Graph Neural Networks}: Modeling transaction networks as graphs could enable detection of fraud rings through collaborative behavioral patterns~\cite{liu2021graph}.
\textbf{Real-Time Streaming}: Integration with Apache Kafka and Apache Flink for millisecond-latency processing of transaction streams at scale.
\textbf{Federated Learning}: Training across multiple banks without sharing raw transaction data, preserving privacy while improving generalization~\cite{yang2019federated}.
\textbf{LLM-Generated Explanations}: Using large language models to generate natural-language compliance explanations for flagged transactions, facilitating human review.
\textbf{Temporal Modeling}: Sequence-based models (LSTM, Transformer) that capture evolving spending patterns over customer transaction histories.
\textbf{Adversarial Robustness}: Training models that are robust to adversarial perturbations designed to evade detection.
% -----------------------------------------------------------------------------
% X. CONCLUSION
% -----------------------------------------------------------------------------
\section{Conclusion}
This paper presents a comprehensive fraud detection framework that systematically evaluates seven machine learning approaches on the benchmark European Cardholder dataset. Our results demonstrate that XGBoost achieves the best overall performance (PR-AUC: 0.8166, F1: 0.8507) through cost-sensitive learning with optimized class weights. Threshold optimization from 0.5 to 0.55 further improves F1 to 0.8636. The framework includes complete explainability through SHAP and LIME, production deployment via FastAPI with sub-10ms latency, and automated drift monitoring. Our analysis confirms that tree-based ensemble methods remain the most effective approach for tabular fraud detection, while highlighting the importance of proper class imbalance handling, threshold optimization, and the inadequacy of accuracy as a metric for imbalanced classification.
All code, models, and results are publicly available.
% -----------------------------------------------------------------------------
% REFERENCES
% -----------------------------------------------------------------------------
\balance
\bibliographystyle{IEEEtran}
\begin{thebibliography}{99}
\bibitem{dal2015credit}
A.~Dal~Pozzolo, O.~Caelen, R.~A.~Johnson, and G.~Bontempi, ``Calibrating probability with undersampling for unbalanced classification,'' in \textit{Proc. IEEE Symp. Comput. Intell. Data Mining (CIDM)}, 2015, pp.~159--166.
\bibitem{nilson2022}
Nilson Report, ``Global card fraud losses,'' \textit{Nilson Report}, Issue 1209, 2022.
\bibitem{pozzolo2015calibrating}
A.~Dal~Pozzolo, O.~Caelen, and G.~Bontempi, ``When is undersampling effective in unbalanced classification tasks?,'' in \textit{Proc. European Conf. Machine Learning and Knowledge Discovery in Databases}, 2015, pp.~200--215.
\bibitem{saito2015precision}
T.~Saito and M.~Rehmsmeier, ``The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets,'' \textit{PLoS ONE}, vol.~10, no.~3, 2015.
\bibitem{bolton2002statistical}
R.~J.~Bolton and D.~J.~Hand, ``Statistical fraud detection: A review,'' \textit{Statistical Science}, vol.~17, no.~3, pp.~235--255, 2002.
\bibitem{zhang2021fraud}
Z.~Zhang, X.~Zhou, X.~Zhang, L.~Wang, and P.~Wang, ``A model based on convolutional recurrent neural network for fraud detection in credit card,'' \textit{Complexity}, vol.~2021, pp.~1--9, 2021.
\bibitem{shwartz2022tabular}
R.~Shwartz-Ziv and A.~Armon, ``Tabular data: Deep learning is not all you need,'' \textit{Information Fusion}, vol.~81, pp.~84--90, 2022.
\bibitem{chawla2002smote}
N.~V.~Chawla, K.~W.~Bowyer, L.~O.~Hall, and W.~P.~Kegelmeyer, ``SMOTE: Synthetic Minority Over-sampling Technique,'' \textit{J. Artificial Intelligence Research}, vol.~16, pp.~321--357, 2002.
\bibitem{fernandez2018smote}
A.~Fernandez, S.~Garcia, M.~Galar, R.~C.~Prati, B.~Krawczyk, and F.~Herrera, \textit{Learning from Imbalanced Data Sets}.\ \ Springer, 2018.
\bibitem{xuan2018random}
S.~Xuan, G.~Liu, Z.~Li, L.~Zheng, S.~Wang, and C.~Jiang, ``Random forest for credit card fraud detection,'' in \textit{Proc. IEEE 15th Intl. Conf. Networking, Sensing and Control (ICNSC)}, 2018, pp.~1--6.
\bibitem{chen2016xgboost}
T.~Chen and C.~Guestrin, ``XGBoost: A scalable tree boosting system,'' in \textit{Proc. 22nd ACM SIGKDD Intl. Conf. Knowledge Discovery and Data Mining}, 2016, pp.~785--794.
\bibitem{taha2020detection}
A.~A.~Taha and S.~J.~Malebary, ``An intelligent approach to credit card fraud detection using an optimized light gradient boosting machine,'' \textit{IEEE Access}, vol.~8, pp.~25579--25587, 2020.
\bibitem{ke2017lightgbm}
G.~Ke, Q.~Meng, T.~Finley, T.~Wang, W.~Chen, W.~Ma, Q.~Ye, and T.-Y.~Liu, ``LightGBM: A highly efficient gradient boosting decision tree,'' in \textit{Advances in Neural Information Processing Systems}, vol.~30, 2017.
\bibitem{prokhorenkova2018catboost}
L.~Prokhorenkova, G.~Gusev, A.~Vorobev, A.~V.~Dorogush, and A.~Gulin, ``CatBoost: Unbiased boosting with categorical features,'' in \textit{Advances in Neural Information Processing Systems}, vol.~31, 2018.
\bibitem{pumsirirat2018credit}
A.~Pumsirirat and L.~Yan, ``Credit card fraud detection using deep learning based on auto-encoder and restricted Boltzmann machine,'' \textit{Intl. J. Advanced Computer Science and Applications}, vol.~9, no.~1, 2018.
\bibitem{lundberg2017unified}
S.~M.~Lundberg and S.-I.~Lee, ``A unified approach to interpreting model predictions,'' in \textit{Advances in Neural Information Processing Systems}, vol.~30, 2017.
\bibitem{ribeiro2016lime}
M.~T.~Ribeiro, S.~Singh, and C.~Guestrin, ``Why should I trust you?: Explaining the predictions of any classifier,'' in \textit{Proc. 22nd ACM SIGKDD Intl. Conf. Knowledge Discovery and Data Mining}, 2016, pp.~1135--1144.
\bibitem{belle2021principles}
V.~Belle and I.~Papantonis, ``Principles and practice of explainable machine learning,'' \textit{Frontiers in Big Data}, vol.~4, 2021.
\bibitem{akiba2019optuna}
T.~Akiba, S.~Sano, T.~Yanase, T.~Ohta, and M.~Koyama, ``Optuna: A next-generation hyperparameter optimization framework,'' in \textit{Proc. 25th ACM SIGKDD Intl. Conf. Knowledge Discovery and Data Mining}, 2019, pp.~2623--2631.
\bibitem{grinsztajn2022tree}
L.~Grinsztajn, E.~Oyallon, and G.~Varoquaux, ``Why do tree-based models still outperform deep learning on tabular data?,'' in \textit{Advances in Neural Information Processing Systems}, vol.~35, 2022.
\bibitem{liu2021graph}
Y.~Liu, M.~Ao, C.~Chi, F.~Feng, D.~Yang, and J.~He, ``Pick and choose: A GNN-based imbalanced learning approach for fraud detection,'' in \textit{Proc. Web Conf.}, 2021, pp.~3168--3177.
\bibitem{yang2019federated}
Q.~Yang, Y.~Liu, T.~Chen, and Y.~Tong, ``Federated machine learning: Concept and applications,'' \textit{ACM Trans. Intelligent Systems and Technology}, vol.~10, no.~2, pp.~1--19, 2019.
\end{thebibliography}
\end{document}