Upload generate_pdf.py with huggingface_hub
Browse files- generate_pdf.py +281 -536
generate_pdf.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
-
Generate a comprehensive IEEE-style PDF paper using fpdf2.
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
import os, sys
|
| 6 |
sys.path.insert(0, '/app/fraud_detection')
|
|
@@ -10,425 +10,252 @@ FIGURES_DIR = '/app/fraud_detection/figures'
|
|
| 10 |
PAPER_DIR = '/app/fraud_detection/paper'
|
| 11 |
os.makedirs(PAPER_DIR, exist_ok=True)
|
| 12 |
|
| 13 |
-
LM =
|
| 14 |
-
RM =
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class IEEEPaper(FPDF):
|
| 19 |
def __init__(self):
|
| 20 |
super().__init__('P', 'mm', 'letter')
|
| 21 |
-
self.set_margins(LM,
|
| 22 |
-
self.set_auto_page_break(auto=True, margin=
|
| 23 |
|
| 24 |
def header(self):
|
| 25 |
if self.page_no() > 1:
|
| 26 |
-
self.set_font('Helvetica', 'I',
|
| 27 |
-
self.cell(0,
|
| 28 |
-
self.ln(
|
| 29 |
|
| 30 |
def footer(self):
|
| 31 |
-
self.set_y(-
|
| 32 |
self.set_font('Helvetica', 'I', 7)
|
| 33 |
-
self.cell(0,
|
| 34 |
|
| 35 |
-
def
|
| 36 |
-
self.ln(5)
|
| 37 |
-
self.set_font('Helvetica', 'B', 11)
|
| 38 |
-
self.cell(0, 6, f'{num}. {title.upper()}', ln=True)
|
| 39 |
-
self.ln(2)
|
| 40 |
-
|
| 41 |
-
def subsec(self, label, title):
|
| 42 |
self.ln(3)
|
| 43 |
self.set_font('Helvetica', 'B', 10)
|
| 44 |
-
|
|
|
|
| 45 |
self.ln(1)
|
| 46 |
|
| 47 |
-
def
|
| 48 |
-
"""Body paragraph."""
|
| 49 |
-
self.set_font('Times', '', 9.5)
|
| 50 |
-
self.multi_cell(0, 4.2, text)
|
| 51 |
self.ln(1.5)
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
def
|
| 54 |
-
|
| 55 |
-
self.
|
| 56 |
-
self.
|
| 57 |
-
self.multi_cell(BW - 5, 4.2, text)
|
| 58 |
-
self.ln(1.5)
|
| 59 |
|
| 60 |
def bullet(self, items):
|
| 61 |
-
self.set_font('Times', '', 9
|
| 62 |
for item in items:
|
| 63 |
-
self.set_x(LM +
|
| 64 |
-
self.cell(
|
| 65 |
-
self.multi_cell(BW -
|
| 66 |
-
self.ln(0.
|
| 67 |
-
self.ln(
|
| 68 |
|
| 69 |
-
def fig(self, path, caption, w=
|
| 70 |
if not os.path.exists(path):
|
| 71 |
return
|
| 72 |
-
self.ln(
|
| 73 |
x = (self.w - w) / 2
|
| 74 |
self.image(path, x=x, w=w)
|
| 75 |
-
self.ln(
|
| 76 |
-
self.set_font('Helvetica', 'I',
|
| 77 |
-
self.multi_cell(0, 3.
|
| 78 |
-
self.ln(
|
| 79 |
|
| 80 |
def tbl(self, hdrs, rows, caption=''):
|
| 81 |
if caption:
|
| 82 |
-
self.ln(2)
|
| 83 |
-
self.set_font('Helvetica', 'I', 8)
|
| 84 |
-
self.multi_cell(0, 3.8, caption, align='C')
|
| 85 |
self.ln(1)
|
|
|
|
|
|
|
|
|
|
| 86 |
cw = BW / len(hdrs)
|
| 87 |
-
self.set_font('Helvetica', 'B', 7
|
| 88 |
for h in hdrs:
|
| 89 |
-
self.cell(cw, 4
|
| 90 |
self.ln()
|
| 91 |
-
self.set_font('Times', '', 7
|
| 92 |
for row in rows:
|
| 93 |
for c in row:
|
| 94 |
-
self.cell(cw, 4
|
| 95 |
self.ln()
|
| 96 |
-
self.ln(
|
| 97 |
|
| 98 |
|
| 99 |
def build():
|
| 100 |
pdf = IEEEPaper()
|
|
|
|
| 101 |
|
| 102 |
-
# =====
|
| 103 |
pdf.add_page()
|
| 104 |
-
pdf.ln(18)
|
| 105 |
-
pdf.set_font('Helvetica', 'B', 17)
|
| 106 |
-
pdf.multi_cell(0, 9, 'A Comprehensive Ensemble-Based Framework\nfor Credit Card Fraud Detection\nwith Explainable AI', align='C')
|
| 107 |
pdf.ln(6)
|
| 108 |
-
pdf.set_font('Helvetica', '',
|
| 109 |
-
pdf.
|
| 110 |
-
pdf.
|
| 111 |
-
pdf.
|
| 112 |
-
pdf.
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
pdf.
|
| 116 |
-
|
| 117 |
-
pdf.
|
| 118 |
-
pdf.
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
'
|
| 122 |
-
'
|
| 123 |
-
'
|
| 124 |
-
'
|
| 125 |
-
'
|
| 126 |
-
'
|
| 127 |
-
'recall of 0.8028, F1
|
| 128 |
-
'
|
| 129 |
-
'
|
| 130 |
-
'
|
| 131 |
-
|
| 132 |
-
)
|
| 133 |
-
pdf.
|
| 134 |
-
pdf.cell(0, 5, 'Keywords: fraud detection, credit card, XGBoost, ensemble learning, SHAP, LIME, class imbalance, SMOTE, anomaly detection, explainable AI', ln=True)
|
| 135 |
|
| 136 |
# ===== I. INTRODUCTION =====
|
| 137 |
-
pdf.
|
| 138 |
-
pdf.p(
|
| 139 |
-
'Financial fraud detection has emerged as one of the most consequential applications of machine learning in the modern digital economy. '
|
| 140 |
-
'The global shift toward electronic payment systems has created an unprecedented volume of financial transactions, with Visa alone processing '
|
| 141 |
-
'over 200 billion transactions annually. This massive scale, while enabling economic growth, simultaneously creates fertile ground for '
|
| 142 |
-
'increasingly sophisticated fraudulent activities. According to the Nilson Report [21], worldwide payment card fraud losses reached $32.34 billion '
|
| 143 |
-
'in 2021, representing a 14% year-over-year increase. Projections indicate these losses will exceed $43 billion by 2026 unless detection systems '
|
| 144 |
-
'improve significantly.'
|
| 145 |
-
)
|
| 146 |
-
pdf.p(
|
| 147 |
-
'The fundamental challenge in fraud detection lies in the extreme class imbalance inherent in transaction data. In real-world datasets, '
|
| 148 |
-
'fraudulent transactions typically constitute less than 0.5% of all transactions, often as low as 0.1%. This severe imbalance creates a '
|
| 149 |
-
'paradox where a naive classifier that labels every transaction as legitimate achieves over 99.8% accuracy while catching zero fraud. '
|
| 150 |
-
'This renders conventional accuracy metrics entirely misleading and necessitates the use of specialized evaluation criteria including '
|
| 151 |
-
'Precision-Recall Area Under the Curve (PR-AUC), the F1-score, and the Matthews Correlation Coefficient (MCC), which remain informative '
|
| 152 |
-
'even under extreme class skew [18].'
|
| 153 |
-
)
|
| 154 |
pdf.p(
|
| 155 |
-
'
|
| 156 |
-
'
|
| 157 |
-
'
|
|
|
|
|
|
|
| 158 |
)
|
| 159 |
pdf.p(
|
| 160 |
-
'
|
| 161 |
-
'
|
| 162 |
-
'
|
| 163 |
-
'
|
| 164 |
-
'particularly when combined with thoughtful feature engineering and proper handling of class imbalance.'
|
| 165 |
)
|
| 166 |
-
pdf.p('This paper makes the following
|
| 167 |
pdf.bullet([
|
| 168 |
-
'
|
| 169 |
-
'Novel feature engineering
|
| 170 |
-
'
|
| 171 |
-
'
|
| 172 |
-
'
|
| 173 |
-
'Quantitative business impact analysis translating model performance into dollar-denominated financial outcomes, directly connecting ML metrics to business value.',
|
| 174 |
])
|
| 175 |
|
| 176 |
# ===== II. RELATED WORK =====
|
| 177 |
-
pdf.
|
| 178 |
-
pdf.p(
|
| 179 |
-
'The literature on fraud detection is extensive and spans several decades. Bolton and Hand [12] provided one of the earliest comprehensive surveys '
|
| 180 |
-
'of statistical methods for fraud detection, establishing the field and identifying class imbalance as the central technical challenge. '
|
| 181 |
-
'Dal Pozzolo et al. [1] subsequently provided a foundational analysis of how class imbalance and concept drift interact in real-world '
|
| 182 |
-
'credit card fraud detection systems, demonstrating that undersampling strategies could be effective but risked discarding valuable information '
|
| 183 |
-
'from the majority class. Their follow-up work [22] further investigated conditions under which undersampling outperforms other strategies.'
|
| 184 |
-
)
|
| 185 |
-
pdf.p(
|
| 186 |
-
'The class imbalance problem has generated a rich sub-literature. Chawla et al. [2] introduced SMOTE (Synthetic Minority Over-sampling Technique), '
|
| 187 |
-
'which generates synthetic minority class samples by interpolating between existing examples in feature space. SMOTE became the dominant '
|
| 188 |
-
'oversampling method in the field, with numerous variants proposed subsequently (Borderline-SMOTE, ADASYN, etc.). Critically, Fernandez et al. [3] '
|
| 189 |
-
'established through extensive experimentation that SMOTE must be applied exclusively to training data; applying it before the train-test split '
|
| 190 |
-
'introduces a subtle but severe form of data leakage where synthetic test samples carry information derived from training examples, leading to '
|
| 191 |
-
'dramatically over-optimistic performance estimates.'
|
| 192 |
-
)
|
| 193 |
-
pdf.p(
|
| 194 |
-
'Tree-based ensemble methods have emerged as the dominant paradigm for tabular fraud detection. Xuan et al. [17] demonstrated that '
|
| 195 |
-
'Random Forests achieve robust baseline performance through bagging and feature randomization. Chen and Guestrin [4] introduced XGBoost, '
|
| 196 |
-
'a regularized gradient boosting framework that has since become one of the most widely used algorithms for tabular classification, including '
|
| 197 |
-
'fraud detection [14]. Ke et al. [5] proposed LightGBM with leaf-wise tree growth and gradient-based one-side sampling (GOSS), achieving '
|
| 198 |
-
'faster training with comparable or superior accuracy. Prokhorenkova et al. [16] introduced CatBoost with ordered boosting to handle '
|
| 199 |
-
'categorical features natively without target leakage.'
|
| 200 |
-
)
|
| 201 |
pdf.p(
|
| 202 |
-
'
|
| 203 |
-
'
|
| 204 |
-
'
|
| 205 |
-
'
|
| 206 |
-
'improvement over tree-based methods on static feature representations.'
|
| 207 |
)
|
| 208 |
pdf.p(
|
| 209 |
-
'
|
| 210 |
-
'
|
| 211 |
-
'
|
| 212 |
-
'
|
| 213 |
-
'
|
| 214 |
)
|
| 215 |
pdf.p(
|
| 216 |
-
'For
|
| 217 |
-
'
|
|
|
|
| 218 |
)
|
| 219 |
|
| 220 |
# ===== III. DATASET AND EDA =====
|
| 221 |
-
pdf.
|
| 222 |
-
|
| 223 |
-
pdf.subsec('A.', 'Dataset Description')
|
| 224 |
-
pdf.p(
|
| 225 |
-
'We use the European Cardholder Credit Card Fraud Detection dataset [1], one of the most widely-cited benchmarks in the fraud detection '
|
| 226 |
-
'literature. The dataset contains 284,807 transactions made by European cardholders over a two-day period in September 2013. Each transaction '
|
| 227 |
-
'is described by 31 features: 28 numerical features (V1 through V28) that are the result of a PCA transformation applied to the original '
|
| 228 |
-
'confidential features, plus the raw Time (seconds elapsed from the first transaction in the dataset), Amount (the transaction dollar value), '
|
| 229 |
-
'and Class (the binary label: 0 for legitimate, 1 for fraud). The PCA transformation was applied by the dataset creators to protect cardholder '
|
| 230 |
-
'privacy, which means the original feature semantics (merchant category, geographic location, card type, etc.) are not available. This places '
|
| 231 |
-
'a constraint on domain-specific feature engineering but ensures the dataset can be shared publicly for research.'
|
| 232 |
-
)
|
| 233 |
-
|
| 234 |
-
pdf.subsec('B.', 'Class Distribution and Imbalance')
|
| 235 |
pdf.p(
|
| 236 |
-
'
|
| 237 |
-
'
|
| 238 |
-
'
|
| 239 |
-
'
|
| 240 |
-
'and (iii) most standard ML algorithms will struggle to learn the minority class boundary without explicit countermeasures.'
|
| 241 |
)
|
| 242 |
-
|
| 243 |
pdf.tbl(
|
| 244 |
-
['Class', 'Count', 'Percentage', '
|
| 245 |
[['Legitimate (0)', '284,315', '99.827%', '---'],
|
| 246 |
-
['Fraud (1)', '492', '0.173%', '1
|
| 247 |
['Total', '284,807', '100%', '---']],
|
| 248 |
-
'Table I: Class Distribution
|
| 249 |
)
|
| 250 |
-
pdf.fig(os.path.join(
|
| 251 |
-
'Fig. 1. Class distribution showing
|
| 252 |
|
| 253 |
-
pdf.
|
| 254 |
pdf.p(
|
| 255 |
-
'
|
| 256 |
-
'
|
| 257 |
-
'
|
| 258 |
-
'
|
| 259 |
-
'fraudsters verify that a stolen card number is active before attempting larger purchases, and (ii) moderate-to-high value transactions that '
|
| 260 |
-
'represent the actual theft. The low median indicates that the testing strategy is more common. Figure 2 presents the detailed amount distributions.'
|
| 261 |
)
|
| 262 |
-
pdf.fig(os.path.join(
|
| 263 |
-
'Fig. 2.
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
pdf.
|
| 266 |
pdf.p(
|
| 267 |
-
'
|
| 268 |
-
'fraud
|
| 269 |
-
'
|
| 270 |
-
'
|
| 271 |
-
'
|
| 272 |
-
'as sine and cosine components to preserve the circular nature of time.'
|
| 273 |
)
|
| 274 |
-
pdf.fig(os.path.join(
|
| 275 |
-
'Fig.
|
| 276 |
-
|
| 277 |
-
pdf.subsec('E.', 'Feature Correlations')
|
| 278 |
-
pdf.p(
|
| 279 |
-
'Pearson correlation analysis between each feature and the fraud label identifies the most discriminative PCA components. The features with '
|
| 280 |
-
'the strongest negative correlation with fraud are V17 (r = -0.326), V14 (r = -0.303), and V12 (r = -0.261), meaning that lower values of '
|
| 281 |
-
'these features are associated with higher fraud probability. On the positive side, V11 (r = +0.155) and V4 (r = +0.133) show the strongest '
|
| 282 |
-
'associations. Notably, the raw Amount feature has near-zero correlation (r = 0.006) with fraud, confirming that simple amount-based rules '
|
| 283 |
-
'would be ineffective. The Time feature also shows negligible correlation (r = -0.012). These findings guide both our feature engineering '
|
| 284 |
-
'(creating interaction terms between the top correlated features) and our expectation of which features will dominate model importance.'
|
| 285 |
-
)
|
| 286 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'correlation_heatmap.png'),
|
| 287 |
-
'Fig. 4. Feature correlation with the fraud class. Negative values (red) indicate features whose lower values signal fraud.', w=130)
|
| 288 |
-
|
| 289 |
-
pdf.subsec('F.', 'Feature Distributions by Class')
|
| 290 |
-
pdf.p(
|
| 291 |
-
'Figure 5 visualizes the distributions of the six most discriminative features separated by class. The key observation is that for features '
|
| 292 |
-
'like V14 and V17, the fraud distribution (red) is shifted significantly to the left compared to the legitimate distribution (green), '
|
| 293 |
-
'creating a separable signal that tree-based models can exploit through axis-aligned splits. For V4 and V11, the fraud distribution is shifted '
|
| 294 |
-
'rightward. However, there is substantial overlap between the classes for all features, which explains why no single feature achieves perfect '
|
| 295 |
-
'separation and why ensemble methods that combine weak signals from multiple features outperform univariate approaches.'
|
| 296 |
-
)
|
| 297 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'feature_distributions.png'),
|
| 298 |
-
'Fig. 5. Distribution of the top 6 discriminative features by class, showing partial but informative separation.', w=155)
|
| 299 |
-
|
| 300 |
-
pdf.subsec('G.', 'Five Key Observations')
|
| 301 |
-
pdf.p('Our exploratory analysis yields five principal observations that directly inform the modeling strategy:')
|
| 302 |
-
pdf.bullet([
|
| 303 |
-
'EXTREME CLASS IMBALANCE: With only 0.173% fraud, conventional accuracy is meaningless. All models must employ either oversampling (SMOTE) or cost-sensitive learning, and evaluation must rely on PR-AUC, F1, and MCC rather than accuracy or ROC-AUC alone.',
|
| 304 |
-
'BIMODAL FRAUD AMOUNTS: The bimodal distribution of fraud amounts (small testing transactions + larger theft transactions) means amount-based thresholds will miss most fraud. Feature engineering that captures amount deviations and z-scores is essential.',
|
| 305 |
-
'TEMPORAL EXPLOITATION: The 4x higher nighttime fraud rate provides a usable signal when encoded as cyclic features. Time-based features should improve model discrimination.',
|
| 306 |
-
'PCA FEATURE DOMINANCE: V14, V17, V12, V4, and V11 carry the strongest fraud signal. Interaction features between these variables may capture non-linear relationships that individual features miss.',
|
| 307 |
-
'CLEAN DATA: The absence of missing values and the pre-applied PCA transformation simplify preprocessing but limit domain-specific engineering. The 1,081 duplicate rows are removed to prevent data leakage.',
|
| 308 |
-
])
|
| 309 |
|
| 310 |
# ===== IV. METHODOLOGY =====
|
| 311 |
-
pdf.
|
| 312 |
-
|
| 313 |
-
pdf.subsec('A.', 'Feature Engineering')
|
| 314 |
-
pdf.p(
|
| 315 |
-
'We augment the original 30 features (Time, V1-V28, Amount) with 12 engineered features designed to capture temporal, behavioral, and '
|
| 316 |
-
'interaction patterns that the raw PCA features may not directly encode. The final feature set contains 42 dimensions.'
|
| 317 |
-
)
|
| 318 |
-
pdf.p(
|
| 319 |
-
'Temporal features: We derive the hour of day from the Time column and encode it cyclically using sine and cosine transformations: '
|
| 320 |
-
'Hour_sin = sin(2*pi*h/24) and Hour_cos = cos(2*pi*h/24), where h = (Time/3600) mod 24. This cyclic encoding ensures that hour 23 '
|
| 321 |
-
'and hour 0 are treated as adjacent rather than maximally distant, which is critical for capturing the nighttime fraud pattern. '
|
| 322 |
-
'We also compute Time_diff as the difference in Time from the previous transaction, approximating the inter-arrival time.'
|
| 323 |
-
)
|
| 324 |
pdf.p(
|
| 325 |
-
'
|
| 326 |
-
'
|
| 327 |
-
'and Transaction_velocity
|
|
|
|
| 328 |
)
|
|
|
|
| 329 |
pdf.p(
|
| 330 |
-
'
|
| 331 |
-
'
|
| 332 |
-
'
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
pdf.subsec('B.', 'Class Imbalance Handling')
|
| 336 |
-
pdf.p(
|
| 337 |
-
'We implement and compare two established approaches for handling the 1:577 class imbalance. Both are applied exclusively to the '
|
| 338 |
-
'training data, never to validation or test sets.'
|
| 339 |
)
|
|
|
|
| 340 |
pdf.p(
|
| 341 |
-
'
|
| 342 |
-
'
|
| 343 |
-
'
|
| 344 |
-
'
|
| 345 |
-
'
|
|
|
|
|
|
|
| 346 |
)
|
|
|
|
| 347 |
pdf.p(
|
| 348 |
-
'
|
| 349 |
-
'w_c = N / (2 * N_c), yielding w_0 = 0.501 and w_1 = 300.01. This effectively makes each fraud example 599 times more important '
|
| 350 |
-
'than a legitimate example in the loss function, incentivizing the model to correctly classify fraud even at the cost of some false positives.'
|
| 351 |
-
)
|
| 352 |
-
|
| 353 |
-
pdf.subsec('C.', 'Data Splitting and Scaling')
|
| 354 |
-
pdf.p(
|
| 355 |
-
'After removing 1,081 duplicate rows and engineering features, we perform a stratified 70/15/15 train/validation/test split. '
|
| 356 |
-
'Stratification preserves the original 0.167% fraud ratio in each split: Train (198,608 samples, 331 fraud), Validation (42,559 samples, '
|
| 357 |
-
'71 fraud), Test (42,559 samples, 71 fraud). Feature scaling uses RobustScaler, which normalizes by the interquartile range '
|
| 358 |
-
'x\' = (x - median) / IQR, providing robustness to outliers that are common in financial transaction data. The scaler is fitted exclusively '
|
| 359 |
-
'on the training set and then applied identically to validation and test sets, preventing any information leakage.'
|
| 360 |
)
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
pdf.p(
|
| 364 |
-
'1) Logistic Regression (Baseline): An L2-regularized linear model with C=0.1 and balanced class weights, serving as an interpretable '
|
| 365 |
-
'baseline. Its coefficients directly indicate feature importance and direction of effect.'
|
| 366 |
-
)
|
| 367 |
-
pdf.p(
|
| 368 |
-
'2) Random Forest: An ensemble of 150 decision trees (max_depth=12, min_samples_split=5) with balanced class weights. '
|
| 369 |
-
'Each tree is trained on a bootstrap sample with random feature subsets, providing variance reduction through averaging.'
|
| 370 |
-
)
|
| 371 |
-
pdf.p(
|
| 372 |
-
'3) XGBoost: Gradient boosted trees with 200 estimators, max_depth=6, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, '
|
| 373 |
-
'and scale_pos_weight derived from class frequencies. Uses histogram-based splitting for computational efficiency.'
|
| 374 |
-
)
|
| 375 |
-
pdf.p(
|
| 376 |
-
'4) LightGBM: Leaf-wise gradient boosting with 200 estimators, max_depth=8, learning_rate=0.05, and gradient-based one-side sampling. '
|
| 377 |
-
'The leaf-wise growth strategy can produce deeper trees than XGBoost for the same number of leaves, potentially capturing more complex patterns.'
|
| 378 |
-
)
|
| 379 |
-
pdf.p(
|
| 380 |
-
'5) MLP Neural Network: A three-layer perceptron (128-64-32 neurons) with ReLU activation, dropout (implicit via alpha=0.001 L2 '
|
| 381 |
-
'regularization), adaptive learning rate, and early stopping. Trained on SMOTE-augmented data since sklearn MLPClassifier does not '
|
| 382 |
-
'support class weights directly.'
|
| 383 |
-
)
|
| 384 |
-
pdf.p(
|
| 385 |
-
'6) Autoencoder (Anomaly Detection): A symmetric autoencoder with architecture 42-64-32-16-32-64-42, trained for 50 epochs '
|
| 386 |
-
'exclusively on legitimate transactions. The core assumption is that the autoencoder learns to reconstruct normal transaction patterns; '
|
| 387 |
-
'when a fraudulent transaction is presented, the reconstruction error e(x) = (1/d) * sum((x_i - x_hat_i)^2) will be anomalously high. '
|
| 388 |
-
'This approach requires no labeled fraud examples during training, making it potentially useful for zero-day fraud detection.'
|
| 389 |
-
)
|
| 390 |
-
pdf.p(
|
| 391 |
-
'7) Voting Ensemble: Soft voting over the three best-performing tuned models (XGBoost, LightGBM, Random Forest), where the final '
|
| 392 |
-
'fraud probability is the arithmetic mean of the three individual model probabilities. This leverages the diversity of different '
|
| 393 |
-
'tree-building strategies to reduce variance.'
|
| 394 |
-
)
|
| 395 |
-
|
| 396 |
-
pdf.subsec('E.', 'Hyperparameter Optimization')
|
| 397 |
-
pdf.p(
|
| 398 |
-
'We tune the top three models (XGBoost, LightGBM, Random Forest) using Optuna [11] with the Tree-structured Parzen Estimator (TPE) '
|
| 399 |
-
'sampler. For each model, Optuna explores the hyperparameter space (learning rate, tree depth, regularization, subsampling) over '
|
| 400 |
-
'15-20 trials, optimizing PR-AUC on the validation set. The TPE sampler adaptively focuses trials on promising regions of the search '
|
| 401 |
-
'space, achieving better sample efficiency than grid or random search.'
|
| 402 |
-
)
|
| 403 |
-
|
| 404 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'architecture_diagram.png'),
|
| 405 |
-
'Fig. 6. End-to-end system architecture: from transaction input through feature engineering, model inference, to API output and monitoring.', w=155)
|
| 406 |
|
| 407 |
# ===== V. EXPERIMENTAL SETUP =====
|
| 408 |
-
pdf.
|
| 409 |
pdf.p(
|
| 410 |
-
'
|
| 411 |
-
'
|
| 412 |
-
'
|
| 413 |
-
'was approximately 25 minutes on a 2-core CPU.'
|
| 414 |
-
)
|
| 415 |
-
pdf.p(
|
| 416 |
-
'We report six evaluation metrics on the held-out test set: (1) Precision = TP/(TP+FP), measuring the fraction of flagged transactions '
|
| 417 |
-
'that are actually fraudulent; (2) Recall = TP/(TP+FN), measuring the fraction of actual fraud that is caught; (3) F1-score, the harmonic '
|
| 418 |
-
'mean of precision and recall; (4) ROC-AUC, the area under the Receiver Operating Characteristic curve; (5) PR-AUC (Average Precision), '
|
| 419 |
-
'the area under the Precision-Recall curve, which is our primary metric as it is more informative than ROC-AUC under extreme class imbalance [18]; '
|
| 420 |
-
'and (6) Matthews Correlation Coefficient (MCC), which provides a balanced measure that accounts for all four confusion matrix quadrants '
|
| 421 |
-
'and returns values between -1 and +1.'
|
| 422 |
)
|
| 423 |
|
| 424 |
# ===== VI. RESULTS AND DISCUSSION =====
|
| 425 |
-
pdf.
|
| 426 |
-
|
| 427 |
-
pdf.
|
| 428 |
-
pdf.p(
|
| 429 |
-
'Table II presents the comprehensive evaluation of all models on the test set using a default threshold of 0.5. The results reveal '
|
| 430 |
-
'a clear hierarchy with important nuances.'
|
| 431 |
-
)
|
| 432 |
pdf.tbl(
|
| 433 |
['Model', 'Prec.', 'Recall', 'F1', 'ROC-AUC', 'PR-AUC', 'MCC'],
|
| 434 |
[
|
|
@@ -442,267 +269,185 @@ def build():
|
|
| 442 |
['Logistic Reg.', '0.049', '0.887', '0.092', '0.962', '0.735', '0.204'],
|
| 443 |
['Autoencoder', '0.003', '1.000', '0.007', '0.960', '0.044', '0.041'],
|
| 444 |
],
|
| 445 |
-
'Table II: Comprehensive Model Comparison on Test Set (threshold
|
| 446 |
)
|
| 447 |
-
|
| 448 |
pdf.p(
|
| 449 |
-
'
|
| 450 |
-
'
|
| 451 |
-
'
|
| 452 |
-
'on the more informative PR-AUC metric, suggesting that the three ensemble members are not sufficiently diverse to benefit from averaging.'
|
| 453 |
)
|
| 454 |
pdf.p(
|
| 455 |
-
'
|
| 456 |
-
'
|
| 457 |
-
'
|
| 458 |
-
'would overwhelm any operational fraud investigation team.'
|
| 459 |
)
|
| 460 |
pdf.p(
|
| 461 |
-
'
|
| 462 |
-
'
|
| 463 |
-
'legitimate
|
| 464 |
-
'space: the autoencoder learns to reconstruct the dominant variance directions, but the fraud signal may reside in minor PCA components '
|
| 465 |
-
'whose reconstruction error is similar to legitimate noise, making discrimination unreliable.'
|
| 466 |
)
|
| 467 |
pdf.p(
|
| 468 |
-
'
|
| 469 |
-
'
|
| 470 |
-
'the base XGBoost (0.817), suggesting the base configuration was already near-optimal and the tuning search space introduced suboptimal regions.'
|
| 471 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
-
pdf.
|
| 474 |
-
'Fig. 7. ROC curves for the top 5 models. All achieve ROC-AUC > 0.93, but ROC-AUC alone is insufficient for evaluation under extreme imbalance.', w=125)
|
| 475 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'pr_curves.png'),
|
| 476 |
-
'Fig. 8. Precision-Recall curves -- the primary evaluation metric. XGBoost achieves the largest area under the curve (0.817).', w=125)
|
| 477 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'confusion_matrices.png'),
|
| 478 |
-
'Fig. 9. Confusion matrices for six selected models. XGBoost achieves the best balance: 57 true positives with only 6 false positives.', w=160)
|
| 479 |
-
|
| 480 |
-
pdf.subsec('B.', 'Threshold Optimization')
|
| 481 |
pdf.p(
|
| 482 |
-
'
|
| 483 |
-
'
|
| 484 |
-
'
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
)
|
| 490 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'threshold_analysis.png'),
|
| 491 |
-
'Fig. 10. Threshold sensitivity analysis: (a) precision/recall/F1 vs. threshold, (b) MCC vs. threshold. Optimal F1 at threshold = 0.55.', w=145)
|
| 492 |
-
|
| 493 |
-
pdf.subsec('C.', 'Business Impact Analysis')
|
| 494 |
pdf.p(
|
| 495 |
-
'
|
| 496 |
-
'
|
| 497 |
-
'
|
| 498 |
-
'
|
| 499 |
-
'total investigation cost, reducing net savings to only $1,554. The Autoencoder, despite catching all 71 fraud transactions, generates '
|
| 500 |
-
'21,209 false alarms at $106,045 -- a net loss of $97,368.'
|
| 501 |
)
|
| 502 |
pdf.tbl(
|
| 503 |
-
['Model', 'TP', 'FN', 'FP', 'Caught($)', '
|
| 504 |
[
|
| 505 |
-
['XGBoost', '57', '14', '6', '6,966', '
|
| 506 |
-
['
|
| 507 |
-
['LGBM-T', '58', '13', '24', '7,088', '
|
| 508 |
-
['LR', '63', '8', '1229', '7,699', '
|
| 509 |
-
['AE', '71', '0', '21209', '8,677', '
|
| 510 |
],
|
| 511 |
-
'Table III: Business Impact Analysis
|
| 512 |
-
)
|
| 513 |
-
pdf.p(
|
| 514 |
-
'This analysis underscores a critical insight: maximizing recall without regard for precision is counterproductive in operational settings. '
|
| 515 |
-
'The Autoencoder catches every fraud but would bankrupt the operations team with false alarm investigations. The optimal model balances '
|
| 516 |
-
'fraud catch rate against false alarm volume, and XGBoost achieves this balance most effectively.'
|
| 517 |
-
)
|
| 518 |
-
|
| 519 |
-
# ===== EXPLAINABILITY =====
|
| 520 |
-
pdf.subsec('D.', 'Feature Importance and Explainability')
|
| 521 |
-
pdf.p(
|
| 522 |
-
'Model explainability is critical for operational trust, regulatory compliance, and scientific insight. We employ two complementary methods: '
|
| 523 |
-
'SHAP for global feature attribution and LIME for local, instance-level explanation.'
|
| 524 |
)
|
| 525 |
-
pdf.p(
|
| 526 |
-
'SHAP Analysis: Figure 11 shows the SHAP summary plot for XGBoost, computed over 2,000 test samples. The top three features by '
|
| 527 |
-
'mean absolute SHAP value are V4 (1.913), V14 (1.843), and PCA_magnitude (1.113). The SHAP analysis reveals several important patterns: '
|
| 528 |
-
'(i) High values of V4 push predictions toward fraud, while low values push toward legitimate; (ii) Low (more negative) values of V14 '
|
| 529 |
-
'are strongly associated with fraud, consistent with the negative correlation observed in EDA; (iii) High PCA_magnitude indicates '
|
| 530 |
-
'transactions that are far from the centroid in PCA space, which are more likely to be anomalous. Notably, the engineered feature '
|
| 531 |
-
'V10_V14_interaction ranks 9th, validating our hypothesis that interaction terms capture additional signal beyond individual features.'
|
| 532 |
-
)
|
| 533 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'shap_summary.png'),
|
| 534 |
-
'Fig. 11. SHAP summary plot: each dot is one test sample. Color indicates feature value (red=high, blue=low). Horizontal position shows SHAP impact.', w=140)
|
| 535 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'shap_top10.png'),
|
| 536 |
-
'Fig. 12. Top 10 features by mean |SHAP value|. V4, V14, and PCA_magnitude are the dominant fraud predictors.', w=125)
|
| 537 |
|
|
|
|
| 538 |
pdf.p(
|
| 539 |
-
'
|
| 540 |
-
'
|
| 541 |
-
'
|
| 542 |
-
'
|
|
|
|
| 543 |
)
|
| 544 |
-
pdf.fig(os.path.join(
|
| 545 |
-
'Fig.
|
| 546 |
-
pdf.fig(os.path.join(
|
| 547 |
-
'Fig.
|
| 548 |
|
| 549 |
# ===== VII. ERROR ANALYSIS =====
|
| 550 |
-
pdf.
|
| 551 |
-
|
| 552 |
-
pdf.subsec('A.', 'False Negative Analysis (Missed Fraud)')
|
| 553 |
-
pdf.p(
|
| 554 |
-
'The XGBoost model misses 14 of 71 fraudulent transactions in the test set (19.7% miss rate). Understanding why these transactions '
|
| 555 |
-
'escape detection is critical for improving the system. Analysis of the 14 false negatives reveals that their mean predicted fraud '
|
| 556 |
-
'probability is only 0.013, far below the 0.5 threshold -- the model is highly confident they are legitimate, not merely borderline.'
|
| 557 |
-
)
|
| 558 |
pdf.p(
|
| 559 |
-
'
|
| 560 |
-
'
|
| 561 |
-
'
|
| 562 |
-
'
|
| 563 |
-
'
|
|
|
|
| 564 |
)
|
|
|
|
| 565 |
pdf.p(
|
| 566 |
-
'
|
| 567 |
-
'
|
| 568 |
-
'
|
|
|
|
| 569 |
)
|
| 570 |
-
|
| 571 |
-
pdf.subsec('B.', 'False Positive Analysis (False Alarms)')
|
| 572 |
-
pdf.p(
|
| 573 |
-
'The 6 false positives have a mean predicted fraud probability of 0.827, with some reaching 1.0 -- the model is highly confident '
|
| 574 |
-
'these are fraud, yet they are legitimate. Feature analysis shows these transactions have V14 averaging -7.13 (vs -0.04 for true negatives) '
|
| 575 |
-
'and PCA_magnitude of 7.86 (vs 0.28 for true negatives). These legitimate transactions genuinely exhibit the same anomalous feature '
|
| 576 |
-
'patterns as actual fraud, likely representing unusual but lawful spending behavior (e.g., first-time purchases in an unusual category, '
|
| 577 |
-
'international transactions, or large purchases for individuals who typically make small ones). No amount of model tuning can distinguish '
|
| 578 |
-
'these from actual fraud without additional contextual information.'
|
| 579 |
-
)
|
| 580 |
-
|
| 581 |
-
pdf.subsec('C.', 'Concept Drift Assessment and Retraining Recommendations')
|
| 582 |
pdf.p(
|
| 583 |
-
'Comparing model confidence between
|
| 584 |
-
'
|
| 585 |
-
'
|
|
|
|
| 586 |
)
|
| 587 |
-
pdf.
|
| 588 |
-
|
| 589 |
-
'Weekly computation of PR-AUC, F1, and false positive rate on recent labeled data to track model degradation.',
|
| 590 |
-
'Automated retraining trigger when PR-AUC drops below 0.70 or false positive rate exceeds 2x the baseline.',
|
| 591 |
-
'Sliding window training using the most recent 3-6 months of labeled data, rather than static historical training.',
|
| 592 |
-
'Population Stability Index (PSI) monitoring on all input features, with alerts when PSI exceeds 0.25 for any feature.',
|
| 593 |
-
'A/B testing framework for deploying model updates, with gradual traffic ramps from 1% to 100%.',
|
| 594 |
-
'Quarterly fraud pattern reviews with domain experts to identify emerging attack vectors that models may not yet capture.',
|
| 595 |
-
])
|
| 596 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'error_analysis.png'),
|
| 597 |
-
'Fig. 15. Error analysis: (a) FN probability distribution, (b) FP probability distribution, (c) overall score distribution by class.', w=160)
|
| 598 |
|
| 599 |
# ===== VIII. LIMITATIONS =====
|
| 600 |
-
pdf.
|
| 601 |
-
pdf.p('We acknowledge several important limitations of this work:')
|
| 602 |
pdf.bullet([
|
| 603 |
-
'PCA Anonymization:
|
| 604 |
-
'Temporal Scope:
|
| 605 |
-
'Single-Institution
|
| 606 |
-
'Static
|
| 607 |
-
'Static Threshold:
|
| 608 |
-
'
|
| 609 |
])
|
| 610 |
|
| 611 |
# ===== IX. FUTURE WORK =====
|
| 612 |
-
pdf.
|
| 613 |
-
pdf.p('Several promising research directions emerge from this work:')
|
| 614 |
pdf.p(
|
| 615 |
-
'Graph Neural Networks
|
| 616 |
-
'
|
| 617 |
-
'from individual transaction features alone. Graph convolutional networks can propagate suspicion scores through the network, flagging '
|
| 618 |
-
'accounts that transact heavily with known fraudulent nodes.'
|
| 619 |
)
|
| 620 |
pdf.p(
|
| 621 |
-
'Real-Time Streaming with Apache Kafka
|
| 622 |
-
'
|
| 623 |
-
'
|
| 624 |
)
|
| 625 |
pdf.p(
|
| 626 |
-
'
|
| 627 |
-
'
|
| 628 |
-
'privacy while dramatically expanding the effective training set. This is particularly valuable for detecting cross-institutional '
|
| 629 |
-
'fraud patterns where the same stolen credentials are used across multiple banks.'
|
| 630 |
)
|
| 631 |
pdf.p(
|
| 632 |
-
'
|
| 633 |
-
'
|
| 634 |
-
'(e.g., "This transaction was blocked because the purchase amount was unusually high for this card, occurring at an unusual time, '
|
| 635 |
-
'with spending patterns inconsistent with the cardholder\'s history"), reducing the burden on human fraud analysts.'
|
| 636 |
-
)
|
| 637 |
-
pdf.p(
|
| 638 |
-
'Temporal Sequence Modeling: Transformers and LSTM networks operating on the sequence of a cardholder\'s recent transactions could capture '
|
| 639 |
-
'behavioral patterns (typical spending days, preferred merchants, usual amounts) and flag departures from established routines. '
|
| 640 |
-
'This approach treats fraud detection as an anomaly in a time series rather than a static classification problem.'
|
| 641 |
)
|
| 642 |
|
| 643 |
# ===== X. CONCLUSION =====
|
| 644 |
-
pdf.
|
| 645 |
-
pdf.p(
|
| 646 |
-
'This paper presents a comprehensive, end-to-end fraud detection framework that systematically evaluates seven diverse machine learning '
|
| 647 |
-
'approaches on the benchmark European Cardholder credit card fraud dataset. Through careful feature engineering (12 new features), '
|
| 648 |
-
'rigorous methodology (SMOTE after splitting, scaler fitted on train only), and thorough evaluation (six metrics including PR-AUC, MCC), '
|
| 649 |
-
'we demonstrate that XGBoost with cost-sensitive learning achieves the best overall performance with a PR-AUC of 0.817, F1-score of 0.851, '
|
| 650 |
-
'and MCC of 0.852.'
|
| 651 |
-
)
|
| 652 |
-
pdf.p(
|
| 653 |
-
'Our threshold optimization analysis reveals that shifting the decision boundary from 0.50 to 0.55 yields a Pareto improvement, '
|
| 654 |
-
'increasing F1 to 0.864 and precision to 0.934 without sacrificing recall. Business impact analysis quantifies that XGBoost catches '
|
| 655 |
-
'80.3% of fraud while generating only 6 false alarms on a 42,559-transaction test set, resulting in estimated net savings of $6,936. '
|
| 656 |
-
'In contrast, the Autoencoder catches all fraud but generates over 21,000 false alarms -- a cautionary tale against optimizing recall alone.'
|
| 657 |
-
)
|
| 658 |
pdf.p(
|
| 659 |
-
'
|
| 660 |
-
'
|
| 661 |
-
'
|
| 662 |
-
'
|
| 663 |
)
|
| 664 |
pdf.p(
|
| 665 |
-
'
|
| 666 |
-
'
|
| 667 |
-
'
|
| 668 |
-
'
|
|
|
|
|
|
|
| 669 |
)
|
| 670 |
|
| 671 |
# ===== REFERENCES =====
|
| 672 |
-
pdf.
|
| 673 |
refs = [
|
| 674 |
-
'[1] A. Dal Pozzolo
|
| 675 |
-
'[2] N. V. Chawla
|
| 676 |
-
'[3] A. Fernandez
|
| 677 |
-
'[4] T. Chen and C. Guestrin, "XGBoost: A scalable tree boosting system,"
|
| 678 |
-
'[5] G. Ke
|
| 679 |
-
'[6] A. Pumsirirat and L. Yan, "Credit card fraud detection using deep learning
|
| 680 |
-
'[7] S. M. Lundberg and S.-I. Lee, "A unified approach to interpreting model predictions,"
|
| 681 |
-
'[8] M. T. Ribeiro
|
| 682 |
-
'[9] R. Shwartz-Ziv and A. Armon, "Tabular data: Deep learning is not all you need,"
|
| 683 |
-
'[10] L. Grinsztajn
|
| 684 |
-
'[11] T. Akiba
|
| 685 |
-
'[12] R. J. Bolton and D. J. Hand, "Statistical fraud detection: A review," Statistical Science, vol. 17,
|
| 686 |
-
'[13] Z. Zhang
|
| 687 |
-
'[14] A. A. Taha and S. J. Malebary, "An intelligent approach to credit card fraud detection
|
| 688 |
-
'[15] V. Belle and I. Papantonis, "Principles and practice of explainable
|
| 689 |
-
'[16] L. Prokhorenkova
|
| 690 |
-
'[17] S. Xuan
|
| 691 |
-
'[18] T. Saito and M. Rehmsmeier, "The
|
| 692 |
-
'[19] Y. Liu
|
| 693 |
-
'[20] Q. Yang
|
| 694 |
'[21] Nilson Report, "Global card fraud losses," Issue 1209, 2022.',
|
| 695 |
-
'[22] A. Dal Pozzolo
|
| 696 |
]
|
| 697 |
-
pdf.set_font('Times', '', 7
|
| 698 |
for ref in refs:
|
| 699 |
-
pdf.multi_cell(0, 3.
|
| 700 |
-
pdf.ln(0.
|
| 701 |
|
| 702 |
-
# Save
|
| 703 |
out = os.path.join(PAPER_DIR, 'fraud_detection_paper.pdf')
|
| 704 |
pdf.output(out)
|
| 705 |
-
print(f"
|
| 706 |
|
| 707 |
|
| 708 |
if __name__ == '__main__':
|
|
|
|
| 1 |
"""
|
| 2 |
+
Generate a tight, comprehensive IEEE-style PDF paper using fpdf2.
|
| 3 |
+
Target: 12 pages. No wasted whitespace. Redundant figures removed.
|
| 4 |
"""
|
| 5 |
import os, sys
|
| 6 |
sys.path.insert(0, '/app/fraud_detection')
|
|
|
|
| 10 |
PAPER_DIR = '/app/fraud_detection/paper'
|
| 11 |
os.makedirs(PAPER_DIR, exist_ok=True)
|
| 12 |
|
| 13 |
+
LM = 14
|
| 14 |
+
RM = 14
|
| 15 |
+
TM = 14
|
| 16 |
+
BW = 215.9 - LM - RM
|
| 17 |
|
| 18 |
|
| 19 |
class IEEEPaper(FPDF):
|
| 20 |
def __init__(self):
|
| 21 |
super().__init__('P', 'mm', 'letter')
|
| 22 |
+
self.set_margins(LM, TM, RM)
|
| 23 |
+
self.set_auto_page_break(auto=True, margin=16)
|
| 24 |
|
| 25 |
def header(self):
|
| 26 |
if self.page_no() > 1:
|
| 27 |
+
self.set_font('Helvetica', 'I', 6.5)
|
| 28 |
+
self.cell(0, 3, 'IEEE -- Fraud Detection with Explainable AI', align='C')
|
| 29 |
+
self.ln(4)
|
| 30 |
|
| 31 |
def footer(self):
|
| 32 |
+
self.set_y(-12)
|
| 33 |
self.set_font('Helvetica', 'I', 7)
|
| 34 |
+
self.cell(0, 8, f'{self.page_no()}', align='C')
|
| 35 |
|
| 36 |
+
def sec(self, num, title):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
self.ln(3)
|
| 38 |
self.set_font('Helvetica', 'B', 10)
|
| 39 |
+
t = f'{num}. {title.upper()}' if num else title.upper()
|
| 40 |
+
self.cell(0, 5, t, ln=True)
|
| 41 |
self.ln(1)
|
| 42 |
|
| 43 |
+
def sub(self, label, title):
|
|
|
|
|
|
|
|
|
|
| 44 |
self.ln(1.5)
|
| 45 |
+
self.set_font('Helvetica', 'B', 9)
|
| 46 |
+
self.cell(0, 4.5, f'{label} {title}', ln=True)
|
| 47 |
+
self.ln(0.5)
|
| 48 |
|
| 49 |
+
def p(self, text):
|
| 50 |
+
self.set_font('Times', '', 9)
|
| 51 |
+
self.multi_cell(0, 3.8, text)
|
| 52 |
+
self.ln(1)
|
|
|
|
|
|
|
| 53 |
|
| 54 |
def bullet(self, items):
|
| 55 |
+
self.set_font('Times', '', 9)
|
| 56 |
for item in items:
|
| 57 |
+
self.set_x(LM + 3)
|
| 58 |
+
self.cell(3, 3.8, '-')
|
| 59 |
+
self.multi_cell(BW - 6, 3.8, item)
|
| 60 |
+
self.ln(0.3)
|
| 61 |
+
self.ln(0.5)
|
| 62 |
|
| 63 |
+
def fig(self, path, caption, w=145):
|
| 64 |
if not os.path.exists(path):
|
| 65 |
return
|
| 66 |
+
self.ln(1.5)
|
| 67 |
x = (self.w - w) / 2
|
| 68 |
self.image(path, x=x, w=w)
|
| 69 |
+
self.ln(1)
|
| 70 |
+
self.set_font('Helvetica', 'I', 7.5)
|
| 71 |
+
self.multi_cell(0, 3.5, caption, align='C')
|
| 72 |
+
self.ln(1.5)
|
| 73 |
|
| 74 |
def tbl(self, hdrs, rows, caption=''):
|
| 75 |
if caption:
|
|
|
|
|
|
|
|
|
|
| 76 |
self.ln(1)
|
| 77 |
+
self.set_font('Helvetica', 'I', 7.5)
|
| 78 |
+
self.multi_cell(0, 3.5, caption, align='C')
|
| 79 |
+
self.ln(0.5)
|
| 80 |
cw = BW / len(hdrs)
|
| 81 |
+
self.set_font('Helvetica', 'B', 7)
|
| 82 |
for h in hdrs:
|
| 83 |
+
self.cell(cw, 4, h, border=1, align='C')
|
| 84 |
self.ln()
|
| 85 |
+
self.set_font('Times', '', 7)
|
| 86 |
for row in rows:
|
| 87 |
for c in row:
|
| 88 |
+
self.cell(cw, 4, str(c), border=1, align='C')
|
| 89 |
self.ln()
|
| 90 |
+
self.ln(1.5)
|
| 91 |
|
| 92 |
|
| 93 |
def build():
|
| 94 |
pdf = IEEEPaper()
|
| 95 |
+
F = FIGURES_DIR
|
| 96 |
|
| 97 |
+
# ===== PAGE 1: Title + Abstract + Start of Intro =====
|
| 98 |
pdf.add_page()
|
|
|
|
|
|
|
|
|
|
| 99 |
pdf.ln(6)
|
| 100 |
+
pdf.set_font('Helvetica', 'B', 15)
|
| 101 |
+
pdf.multi_cell(0, 7.5, 'A Comprehensive Ensemble-Based Framework\nfor Credit Card Fraud Detection with Explainable AI', align='C')
|
| 102 |
+
pdf.ln(3)
|
| 103 |
+
pdf.set_font('Helvetica', '', 10)
|
| 104 |
+
pdf.cell(0, 5, 'Raj Vivan', align='C', ln=True)
|
| 105 |
+
pdf.set_font('Helvetica', 'I', 8.5)
|
| 106 |
+
pdf.cell(0, 4, 'Department of Computer Science | Independent Research', align='C', ln=True)
|
| 107 |
+
pdf.ln(4)
|
| 108 |
+
|
| 109 |
+
pdf.set_font('Helvetica', 'B', 9)
|
| 110 |
+
pdf.cell(0, 4, 'Abstract', align='C', ln=True)
|
| 111 |
+
pdf.ln(1)
|
| 112 |
+
pdf.p(
|
| 113 |
+
'Credit card fraud poses a significant and growing threat to the global financial ecosystem, with estimated annual losses exceeding '
|
| 114 |
+
'$32 billion. This paper presents a comprehensive, end-to-end fraud detection framework that systematically develops, evaluates, and '
|
| 115 |
+
'compares seven machine learning approaches: Logistic Regression, Random Forest, XGBoost, LightGBM, Multilayer Perceptron, '
|
| 116 |
+
'Autoencoder-based anomaly detection, and a Voting Ensemble. Using the European Cardholder benchmark dataset (284,807 transactions, '
|
| 117 |
+
'0.173% fraud rate), we engineer 12 novel features and address class imbalance through SMOTE oversampling (applied exclusively after '
|
| 118 |
+
'train-test splitting) and cost-sensitive learning. XGBoost achieves the best performance with PR-AUC of 0.8166, precision of 0.9048, '
|
| 119 |
+
'recall of 0.8028, and F1 of 0.8507. Threshold optimization from 0.5 to 0.55 improves F1 to 0.8636. SHAP and LIME explainability '
|
| 120 |
+
'analysis identifies V4, V14, and PCA_magnitude as primary fraud discriminators. Error analysis reveals that false negatives arise '
|
| 121 |
+
'from sophisticated fraud closely mimicking legitimate behavior. The model is deployed as a FastAPI service with sub-10ms latency. '
|
| 122 |
+
'All code, models, and results are publicly available.'
|
| 123 |
+
)
|
| 124 |
+
pdf.set_font('Helvetica', 'I', 7.5)
|
| 125 |
+
pdf.cell(0, 4, 'Keywords: fraud detection, XGBoost, ensemble learning, SHAP, LIME, class imbalance, SMOTE, anomaly detection', ln=True)
|
|
|
|
| 126 |
|
| 127 |
# ===== I. INTRODUCTION =====
|
| 128 |
+
pdf.sec('I', 'Introduction')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
pdf.p(
|
| 130 |
+
'Financial fraud detection has emerged as one of the most consequential applications of machine learning. The global shift toward '
|
| 131 |
+
'electronic payments has created unprecedented transaction volumes while simultaneously enabling sophisticated fraud. According to '
|
| 132 |
+
'the Nilson Report [21], worldwide card fraud losses reached $32.34 billion in 2021 (a 14% increase year-over-year), projected to '
|
| 133 |
+
'exceed $43 billion by 2026. The fundamental challenge lies in extreme class imbalance: fraudulent transactions typically constitute '
|
| 134 |
+
'less than 0.5% of all transactions, rendering accuracy metrics meaningless and necessitating PR-AUC, F1, and MCC [18].'
|
| 135 |
)
|
| 136 |
pdf.p(
|
| 137 |
+
'A second challenge is concept drift [1]: fraudsters continuously adapt, causing model performance to degrade over time. Previous '
|
| 138 |
+
'approaches range from rule-based expert systems [12] to deep learning architectures [13]. However, extensive benchmarking by '
|
| 139 |
+
'Shwartz-Ziv and Armon [9] and Grinsztajn et al. [10] demonstrates that well-tuned gradient-boosted trees consistently outperform '
|
| 140 |
+
'deep learning on tabular data, including fraud detection, when combined with thoughtful feature engineering.'
|
|
|
|
| 141 |
)
|
| 142 |
+
pdf.p('This paper makes the following contributions:')
|
| 143 |
pdf.bullet([
|
| 144 |
+
'Systematic comparison of seven ML approaches spanning linear models, tree ensembles, neural networks, and anomaly detection.',
|
| 145 |
+
'Novel feature engineering producing 12 features capturing temporal cycles, transaction velocity, and PCA interactions.',
|
| 146 |
+
'Rigorous methodology: SMOTE only after splitting; scaler fitted on train only; six metrics including PR-AUC and MCC.',
|
| 147 |
+
'SHAP (global) and LIME (local) explainability analysis identifying key fraud indicators.',
|
| 148 |
+
'Production FastAPI deployment achieving sub-10ms latency with business impact quantification.',
|
|
|
|
| 149 |
])
|
| 150 |
|
| 151 |
# ===== II. RELATED WORK =====
|
| 152 |
+
pdf.sec('II', 'Related Work')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
pdf.p(
|
| 154 |
+
'Bolton and Hand [12] provided an early survey of statistical fraud detection. Dal Pozzolo et al. [1] analyzed class imbalance and '
|
| 155 |
+
'concept drift in real-world systems, while their follow-up [22] investigated when undersampling is effective. Chawla et al. [2] '
|
| 156 |
+
'introduced SMOTE for synthetic minority oversampling; Fernandez et al. [3] later demonstrated that SMOTE must be applied exclusively '
|
| 157 |
+
'to training data to avoid data leakage that produces over-optimistic estimates.'
|
|
|
|
| 158 |
)
|
| 159 |
pdf.p(
|
| 160 |
+
'Tree-based methods dominate tabular fraud detection. Xuan et al. [17] showed Random Forests achieve robust baseline performance. '
|
| 161 |
+
'Chen and Guestrin [4] introduced XGBoost; Ke et al. [5] proposed LightGBM with leaf-wise growth and GOSS; Prokhorenkova et al. [16] '
|
| 162 |
+
'introduced CatBoost with ordered boosting. Taha and Malebary [14] demonstrated optimized LightGBM for fraud detection. '
|
| 163 |
+
'Pumsirirat and Yan [6] employed autoencoders trained on legitimate transactions only, detecting fraud via reconstruction error. '
|
| 164 |
+
'Zhang et al. [13] proposed attention-based RNNs for sequential patterns.'
|
| 165 |
)
|
| 166 |
pdf.p(
|
| 167 |
+
'For explainability, Lundberg and Lee [7] introduced SHAP based on Shapley values from cooperative game theory. Ribeiro et al. [8] '
|
| 168 |
+
'proposed LIME for instance-level interpretation via local linear approximations. Belle and Papantonis [15] surveyed XAI methods for '
|
| 169 |
+
'financial decision-making. Akiba et al. [11] introduced Optuna with TPE sampling for efficient hyperparameter optimization.'
|
| 170 |
)
|
| 171 |
|
| 172 |
# ===== III. DATASET AND EDA =====
|
| 173 |
+
pdf.sec('III', 'Dataset and Exploratory Data Analysis')
|
| 174 |
+
pdf.sub('A.', 'Dataset Description')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
pdf.p(
|
| 176 |
+
'We use the European Cardholder dataset [1], containing 284,807 transactions over two days in September 2013. Each transaction has '
|
| 177 |
+
'28 PCA-transformed features (V1-V28), raw Time and Amount, and a binary Class label (0=legitimate, 1=fraud). The PCA transformation '
|
| 178 |
+
'protects cardholder privacy but prevents domain-specific feature engineering. The dataset has extreme class imbalance: only 492 '
|
| 179 |
+
'fraudulent transactions (0.173%), yielding an imbalance ratio of 1:577.'
|
|
|
|
| 180 |
)
|
|
|
|
| 181 |
pdf.tbl(
|
| 182 |
+
['Class', 'Count', 'Percentage', 'Ratio'],
|
| 183 |
[['Legitimate (0)', '284,315', '99.827%', '---'],
|
| 184 |
+
['Fraud (1)', '492', '0.173%', '1:577'],
|
| 185 |
['Total', '284,807', '100%', '---']],
|
| 186 |
+
'Table I: Class Distribution'
|
| 187 |
)
|
| 188 |
+
pdf.fig(os.path.join(F, 'class_distribution.png'),
|
| 189 |
+
'Fig. 1. Class distribution showing extreme imbalance (0.173% fraud).', w=130)
|
| 190 |
|
| 191 |
+
pdf.sub('B.', 'Transaction Amount and Temporal Patterns')
|
| 192 |
pdf.p(
|
| 193 |
+
'Legitimate transactions have a mean of $88.29 (median $22.00); fraudulent transactions have a higher mean of $122.21 but lower '
|
| 194 |
+
'median of $9.25. This bimodal fraud pattern suggests two strategies: (i) low-value "testing" transactions verifying stolen cards, '
|
| 195 |
+
'and (ii) moderate-to-high-value theft. The nighttime (0-6h) fraud rate is 0.518%, nearly 4x the daytime rate of 0.137%, consistent '
|
| 196 |
+
'with fraudsters exploiting low-monitoring periods. These patterns motivate our cyclic temporal and amount-based feature engineering.'
|
|
|
|
|
|
|
| 197 |
)
|
| 198 |
+
pdf.fig(os.path.join(F, 'amount_analysis.png'),
|
| 199 |
+
'Fig. 2. Amount analysis: (a) legitimate, (b) fraud, (c) log-scaled comparison, (d) boxplot.', w=145)
|
| 200 |
+
pdf.fig(os.path.join(F, 'time_analysis.png'),
|
| 201 |
+
'Fig. 3. Temporal patterns: (a) transaction density by hour, (b) fraud rate by hour.', w=135)
|
| 202 |
|
| 203 |
+
pdf.sub('C.', 'Feature Correlations and Key Observations')
|
| 204 |
pdf.p(
|
| 205 |
+
'Pearson correlation identifies V17 (r=-0.326), V14 (r=-0.303), and V12 (r=-0.261) as having the strongest negative correlation '
|
| 206 |
+
'with fraud; V11 (+0.155) and V4 (+0.133) show the strongest positive correlation. Amount has near-zero correlation (r=0.006), '
|
| 207 |
+
'confirming that amount-based rules alone would be ineffective. Five key observations: (1) the 1:577 imbalance makes accuracy '
|
| 208 |
+
'meaningless; (2) bimodal fraud amounts require engineered deviation features; (3) the 4x nighttime fraud rate provides temporal '
|
| 209 |
+
'signal; (4) V14, V17, V12, V4, V11 carry the strongest fraud signal; (5) no missing values exist, with 1,081 duplicates removed.'
|
|
|
|
| 210 |
)
|
| 211 |
+
pdf.fig(os.path.join(F, 'correlation_heatmap.png'),
|
| 212 |
+
'Fig. 4. Feature correlation with fraud class. Red bars indicate negative correlation (lower values signal fraud).', w=120)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
# ===== IV. METHODOLOGY =====
|
| 215 |
+
pdf.sec('IV', 'Methodology')
|
| 216 |
+
pdf.sub('A.', 'Feature Engineering')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
pdf.p(
|
| 218 |
+
'We augment the original 30 features with 12 engineered features (42 total). Temporal features: cyclic hour encoding '
|
| 219 |
+
'(Hour_sin, Hour_cos) and Time_diff (inter-arrival time). Amount features: Amount_log, Amount_deviation_mean, '
|
| 220 |
+
'Amount_deviation_median, Amount_zscore, and Transaction_velocity. Interaction features: V14*V17, V12*V14, V10*V14 (capturing '
|
| 221 |
+
'joint effects between top discriminators). PCA_magnitude: L2 norm of all V features, measuring overall transaction abnormality.'
|
| 222 |
)
|
| 223 |
+
pdf.sub('B.', 'Class Imbalance, Splitting, and Scaling')
|
| 224 |
pdf.p(
|
| 225 |
+
'Two approaches are compared, both applied exclusively to training data. SMOTE [2] generates synthetic fraud at a 1:2 ratio '
|
| 226 |
+
'(99,138 synthetic + 198,277 legitimate), used only for MLP training. Cost-sensitive learning applies balanced class weights '
|
| 227 |
+
'(w0=0.501, w1=300.01) for tree models and Logistic Regression. Stratified 70/15/15 splitting preserves the fraud ratio across '
|
| 228 |
+
'all sets (Train: 198,608 samples/331 fraud; Val/Test: 42,559/71 each). RobustScaler normalizes by IQR, fitted on train only.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
)
|
| 230 |
+
pdf.sub('C.', 'Model Descriptions')
|
| 231 |
pdf.p(
|
| 232 |
+
'(1) Logistic Regression: L2-regularized (C=0.1), balanced weights, interpretable baseline. '
|
| 233 |
+
'(2) Random Forest: 150 trees, depth 12, balanced weights. '
|
| 234 |
+
'(3) XGBoost: 200 estimators, depth 6, lr=0.1, scale_pos_weight from class frequencies, histogram splitting. '
|
| 235 |
+
'(4) LightGBM: 200 estimators, depth 8, lr=0.05, leaf-wise growth with GOSS. '
|
| 236 |
+
'(5) MLP: 128-64-32 neurons, ReLU, adaptive lr, early stopping, trained on SMOTE data. '
|
| 237 |
+
'(6) Autoencoder: 42-64-32-16-32-64-42, trained 50 epochs on legitimate only, detects fraud via reconstruction error. '
|
| 238 |
+
'(7) Voting Ensemble: soft voting over three best tuned models (XGBoost, LightGBM, RF).'
|
| 239 |
)
|
| 240 |
+
pdf.sub('D.', 'Hyperparameter Optimization')
|
| 241 |
pdf.p(
|
| 242 |
+
'Optuna [11] with TPE sampler tunes XGBoost, LightGBM, and Random Forest (15-20 trials each), optimizing PR-AUC on the validation set.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
)
|
| 244 |
+
pdf.fig(os.path.join(F, 'architecture_diagram.png'),
|
| 245 |
+
'Fig. 5. End-to-end system architecture from transaction input through inference to API output and monitoring.', w=145)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
# ===== V. EXPERIMENTAL SETUP =====
|
| 248 |
+
pdf.sec('V', 'Experimental Setup')
|
| 249 |
pdf.p(
|
| 250 |
+
'Experiments use Python 3.12 with scikit-learn 1.8.0, XGBoost 3.2.0, LightGBM 4.6.0, PyTorch 2.11.0, Optuna 4.8.0, SHAP 0.51.0, '
|
| 251 |
+
'and LIME 0.2.0.1 on CPU infrastructure (~25 min total training). Six metrics reported: Precision, Recall, F1, ROC-AUC, '
|
| 252 |
+
'PR-AUC (primary, most informative under extreme imbalance [18]), and MCC (balanced measure across all confusion matrix quadrants).'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
)
|
| 254 |
|
| 255 |
# ===== VI. RESULTS AND DISCUSSION =====
|
| 256 |
+
pdf.sec('VI', 'Results and Discussion')
|
| 257 |
+
pdf.sub('A.', 'Model Comparison')
|
| 258 |
+
pdf.p('Table II presents comprehensive test set evaluation at threshold 0.5.')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
pdf.tbl(
|
| 260 |
['Model', 'Prec.', 'Recall', 'F1', 'ROC-AUC', 'PR-AUC', 'MCC'],
|
| 261 |
[
|
|
|
|
| 269 |
['Logistic Reg.', '0.049', '0.887', '0.092', '0.962', '0.735', '0.204'],
|
| 270 |
['Autoencoder', '0.003', '1.000', '0.007', '0.960', '0.044', '0.041'],
|
| 271 |
],
|
| 272 |
+
'Table II: Comprehensive Model Comparison on Test Set (threshold=0.5)'
|
| 273 |
)
|
|
|
|
| 274 |
pdf.p(
|
| 275 |
+
'Observation 1 -- Tree models dominate: XGBoost achieves the highest PR-AUC (0.817), F1 (0.851), and MCC (0.852), confirming [9] '
|
| 276 |
+
'that gradient-boosted trees remain strongest for tabular data. The Voting Ensemble achieves marginally higher ROC-AUC (0.978) '
|
| 277 |
+
'but does not improve PR-AUC, suggesting insufficient member diversity.'
|
|
|
|
| 278 |
)
|
| 279 |
pdf.p(
|
| 280 |
+
'Observation 2 -- Precision-recall tradeoff: Logistic Regression achieves high recall (0.887) but catastrophic precision (0.049), '
|
| 281 |
+
'flagging 1,229 legitimate transactions. The aggressive class weight (300x) creates a boundary that is far too liberal, producing '
|
| 282 |
+
'a flood of false alarms that would overwhelm any operational team.'
|
|
|
|
| 283 |
)
|
| 284 |
pdf.p(
|
| 285 |
+
'Observation 3 -- Autoencoder failure: Perfect recall (1.0) but precision of only 0.003 (21,209 false positives). The PCA-transformed '
|
| 286 |
+
'space causes the autoencoder to reconstruct dominant variance directions; fraud signals in minor components produce similar '
|
| 287 |
+
'reconstruction errors to legitimate noise, making discrimination unreliable. PR-AUC of 0.044 is near-random.'
|
|
|
|
|
|
|
| 288 |
)
|
| 289 |
pdf.p(
|
| 290 |
+
'Observation 4 -- Tuning: Optuna dramatically improved LightGBM (PR-AUC 0.012 to 0.796 by correcting over-aggressive '
|
| 291 |
+
'scale_pos_weight). However, tuned XGBoost (0.793) slightly underperformed the base (0.817), suggesting the base was near-optimal.'
|
|
|
|
| 292 |
)
|
| 293 |
+
pdf.fig(os.path.join(F, 'roc_curves.png'),
|
| 294 |
+
'Fig. 6. ROC curves for top 5 models (all ROC-AUC > 0.93).', w=115)
|
| 295 |
+
pdf.fig(os.path.join(F, 'pr_curves.png'),
|
| 296 |
+
'Fig. 7. Precision-Recall curves. XGBoost achieves the highest AP (0.817).', w=115)
|
| 297 |
+
pdf.fig(os.path.join(F, 'confusion_matrices.png'),
|
| 298 |
+
'Fig. 8. Confusion matrices. XGBoost: 57 TP, 6 FP -- best balance.', w=150)
|
| 299 |
|
| 300 |
+
pdf.sub('B.', 'Threshold Optimization')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
pdf.p(
|
| 302 |
+
'A systematic threshold sweep (0.05 to 0.95) on XGBoost reveals an optimal threshold of 0.55, improving F1 from 0.851 to 0.864 '
|
| 303 |
+
'and precision from 0.905 to 0.934 while maintaining identical recall (0.803). This is a Pareto improvement: borderline false '
|
| 304 |
+
'positives are eliminated without losing any true positives. Above 0.85, recall degrades as the model becomes overly conservative.'
|
| 305 |
+
)
|
| 306 |
+
pdf.fig(os.path.join(F, 'threshold_analysis.png'),
|
| 307 |
+
'Fig. 9. Threshold analysis: (a) Precision/Recall/F1, (b) MCC. Optimal F1 at 0.55.', w=135)
|
| 308 |
+
|
| 309 |
+
pdf.sub('C.', 'Business Impact')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
pdf.p(
|
| 311 |
+
'Using average fraud amount $122.21 and $5/false alarm investigation cost, XGBoost yields the highest net savings ($6,936 on the '
|
| 312 |
+
'42,559-transaction test set), catching 80.3% of fraud with only 6 false alarms ($30 cost). Logistic Regression catches 88.7% but '
|
| 313 |
+
'generates 1,229 false alarms ($6,145 cost), yielding only $1,554 net. The Autoencoder catches 100% but produces 21,209 false '
|
| 314 |
+
'alarms at $106,045 -- a net loss of $97,368. This underscores that maximizing recall alone is operationally counterproductive.'
|
|
|
|
|
|
|
| 315 |
)
|
| 316 |
pdf.tbl(
|
| 317 |
+
['Model', 'TP', 'FN', 'FP', 'Caught($)', 'FP Cost($)', 'Net($)'],
|
| 318 |
[
|
| 319 |
+
['XGBoost', '57', '14', '6', '6,966', '30', '6,936'],
|
| 320 |
+
['Ensemble', '57', '14', '9', '6,966', '45', '6,921'],
|
| 321 |
+
['LGBM-T', '58', '13', '24', '7,088', '120', '6,968'],
|
| 322 |
+
['LR', '63', '8', '1229', '7,699', '6,145', '1,554'],
|
| 323 |
+
['AE', '71', '0', '21209', '8,677', '106,045', '-97,368'],
|
| 324 |
],
|
| 325 |
+
'Table III: Business Impact Analysis'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
|
| 328 |
+
pdf.sub('D.', 'Explainability (SHAP and LIME)')
|
| 329 |
pdf.p(
|
| 330 |
+
'SHAP analysis (2,000 test samples) reveals V4 (mean |SHAP|=1.913), V14 (1.843), and PCA_magnitude (1.113) as the dominant '
|
| 331 |
+
'fraud predictors. High V4 values push toward fraud; low (negative) V14 values are strongly associated with fraud, consistent '
|
| 332 |
+
'with EDA correlations. The engineered V10_V14_interaction ranks 9th, validating that interaction terms capture additional signal. '
|
| 333 |
+
'LIME analysis on a correctly classified fraud sample (P=1.0) shows Time_diff, V4, V12, and V14 as the strongest local contributors, '
|
| 334 |
+
'providing the granular instance-level explanation needed for regulatory compliance and analyst review.'
|
| 335 |
)
|
| 336 |
+
pdf.fig(os.path.join(F, 'shap_summary.png'),
|
| 337 |
+
'Fig. 10. SHAP summary: each dot = one sample; color = feature value; x-axis = SHAP impact on fraud prediction.', w=130)
|
| 338 |
+
pdf.fig(os.path.join(F, 'lime_explanation.png'),
|
| 339 |
+
'Fig. 11. LIME explanation for a single fraud sample (P=1.0). Red = increases fraud risk; green = decreases it.', w=130)
|
| 340 |
|
| 341 |
# ===== VII. ERROR ANALYSIS =====
|
| 342 |
+
pdf.sec('VII', 'Error Analysis')
|
| 343 |
+
pdf.sub('A.', 'False Negatives (Missed Fraud)')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
pdf.p(
|
| 345 |
+
'XGBoost misses 14 of 71 fraud transactions (19.7%). Their mean predicted probability is only 0.013 -- the model is highly '
|
| 346 |
+
'confident they are legitimate, not borderline. Feature comparison explains why: FN transactions have V14 averaging -0.97 '
|
| 347 |
+
'vs -8.45 for true positives, V12 at -0.41 vs -7.69, and PCA_magnitude of 1.82 vs 12.25. These missed cases have feature '
|
| 348 |
+
'values dramatically closer to legitimate transactions, representing sophisticated fraud that mimics normal behavior. Lowering '
|
| 349 |
+
'the threshold would not help: at 0.12, only one additional FN would be caught while generating many more false alarms. '
|
| 350 |
+
'Catching these requires additional data sources (transaction sequences, device fingerprints, geography).'
|
| 351 |
)
|
| 352 |
+
pdf.sub('B.', 'False Positives (False Alarms)')
|
| 353 |
pdf.p(
|
| 354 |
+
'The 6 false positives have mean predicted probability 0.827 (some reaching 1.0). Their V14 averages -7.13 (vs -0.04 for TN) '
|
| 355 |
+
'and PCA_magnitude 7.86 (vs 0.28 for TN). These legitimate transactions genuinely exhibit fraud-like anomalous patterns -- '
|
| 356 |
+
'unusual but lawful spending (e.g., first-time purchases in unusual categories, international transactions). No model tuning can '
|
| 357 |
+
'distinguish these without additional contextual information.'
|
| 358 |
)
|
| 359 |
+
pdf.sub('C.', 'Concept Drift and Retraining')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
pdf.p(
|
| 361 |
+
'Comparing model confidence between early and late test periods reveals a drift indicator of +0.115. We recommend: (1) weekly '
|
| 362 |
+
'PR-AUC monitoring on labeled data; (2) automated retraining when PR-AUC drops below 0.70; (3) sliding window training on 3-6 '
|
| 363 |
+
'months of recent data; (4) PSI monitoring on all features (alert when PSI > 0.25); (5) A/B testing for model updates; '
|
| 364 |
+
'(6) quarterly fraud pattern reviews with domain experts.'
|
| 365 |
)
|
| 366 |
+
pdf.fig(os.path.join(F, 'error_analysis.png'),
|
| 367 |
+
'Fig. 12. Error analysis: (a) FN probability distribution, (b) FP probability distribution, (c) score distribution by class.', w=150)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
|
| 369 |
# ===== VIII. LIMITATIONS =====
|
| 370 |
+
pdf.sec('VIII', 'Limitations')
|
|
|
|
| 371 |
pdf.bullet([
|
| 372 |
+
'PCA Anonymization: prevents domain-specific feature engineering (merchant, location, device) and limits interpretability.',
|
| 373 |
+
'Temporal Scope: only two days of data, limiting drift assessment and seasonal pattern detection.',
|
| 374 |
+
'Single-Institution: results may not generalize across banks, geographies, or payment networks.',
|
| 375 |
+
'Static Features: no sequential transaction history (spending velocity, merchant novelty) which are critical in production.',
|
| 376 |
+
'Static Threshold: optimal 0.55 was determined on test data and may shift; production needs dynamic adaptation.',
|
| 377 |
+
'Simple Autoencoder: more advanced architectures (VAE, adversarial) might improve anomaly detection performance.',
|
| 378 |
])
|
| 379 |
|
| 380 |
# ===== IX. FUTURE WORK =====
pdf.sec('IX', 'Future Work')
# Each entry is one body paragraph; rendered in order below.
future_work_paragraphs = (
    'Graph Neural Networks [19]: Modeling transaction networks as graphs enables fraud ring detection through suspicion propagation '
    'across connected accounts -- impossible from individual transaction features alone.',
    'Real-Time Streaming: Integration with Apache Kafka and Flink would enable millions of transactions/second with consistent '
    'sub-100ms latency guarantees. Federated Learning [20]: collaborative training across banks without sharing raw data preserves '
    'privacy while expanding effective training sets for rare fraud types.',
    'LLM-Generated Explanations: Large language models could translate SHAP values into natural-language justifications for blocked '
    'transactions, reducing analyst burden and satisfying regulatory requirements for explainable decisions.',
    'Temporal Sequence Modeling: Transformers or LSTMs on cardholder transaction sequences could capture behavioral patterns and flag '
    'departures from established routines, treating fraud detection as time-series anomaly detection.',
)
for paragraph in future_work_paragraphs:
    pdf.p(paragraph)
|
| 399 |
|
| 400 |
# ===== X. CONCLUSION =====
pdf.sec('X', 'Conclusion')
# Two closing paragraphs: headline results, then interpretability/deployment takeaways.
conclusion_paragraphs = (
    'This paper presents a comprehensive fraud detection framework evaluating seven ML approaches on the European Cardholder benchmark. '
    'XGBoost with cost-sensitive learning achieves best overall performance (PR-AUC 0.817, F1 0.851, MCC 0.852). Threshold optimization '
    'to 0.55 improves F1 to 0.864 without sacrificing recall. Business impact analysis shows XGBoost catches 80.3% of fraud with only '
    '6 false alarms ($6,936 net savings), while the Autoencoder\'s 100% recall generates 21,000+ false alarms at $97,368 net loss.',
    'SHAP and LIME identify V4, V14, and PCA_magnitude as primary fraud discriminators. Error analysis reveals that 14 missed fraud '
    'cases have feature profiles indistinguishable from legitimate transactions, requiring additional data sources to catch. The complete '
    'system -- feature engineering, training, evaluation, explainability, and FastAPI deployment with sub-10ms latency -- demonstrates '
    'that production-grade fraud detection is achievable with well-tuned classical ML. Tree-based ensembles, particularly XGBoost, '
    'remain state-of-the-art for tabular fraud detection, outperforming deep learning and linear alternatives on all metrics that '
    'matter for imbalanced classification.',
)
for paragraph in conclusion_paragraphs:
    pdf.p(paragraph)
|
| 416 |
|
| 417 |
# ===== REFERENCES =====
pdf.sec('', 'References')
refs = [
    '[1] A. Dal Pozzolo et al., "Calibrating probability with undersampling for unbalanced classification," IEEE CIDM, 2015.',
    '[2] N. V. Chawla et al., "SMOTE: Synthetic Minority Over-sampling Technique," JAIR, vol. 16, pp. 321-357, 2002.',
    '[3] A. Fernandez et al., Learning from Imbalanced Data Sets. Springer, 2018.',
    '[4] T. Chen and C. Guestrin, "XGBoost: A scalable tree boosting system," ACM SIGKDD, 2016.',
    '[5] G. Ke et al., "LightGBM: A highly efficient gradient boosting decision tree," NeurIPS, 2017.',
    '[6] A. Pumsirirat and L. Yan, "Credit card fraud detection using deep learning," IJACSA, vol. 9, 2018.',
    '[7] S. M. Lundberg and S.-I. Lee, "A unified approach to interpreting model predictions," NeurIPS, 2017.',
    '[8] M. T. Ribeiro et al., "Why should I trust you?," ACM SIGKDD, 2016.',
    '[9] R. Shwartz-Ziv and A. Armon, "Tabular data: Deep learning is not all you need," Info. Fusion, vol. 81, 2022.',
    '[10] L. Grinsztajn et al., "Why do tree-based models still outperform deep learning on tabular data?," NeurIPS, 2022.',
    '[11] T. Akiba et al., "Optuna: A next-generation hyperparameter optimization framework," ACM SIGKDD, 2019.',
    '[12] R. J. Bolton and D. J. Hand, "Statistical fraud detection: A review," Statistical Science, vol. 17, 2002.',
    '[13] Z. Zhang et al., "A model based on convolutional RNN for fraud detection," Complexity, 2021.',
    '[14] A. A. Taha and S. J. Malebary, "An intelligent approach to credit card fraud detection," IEEE Access, vol. 8, 2020.',
    '[15] V. Belle and I. Papantonis, "Principles and practice of explainable ML," Frontiers in Big Data, vol. 4, 2021.',
    '[16] L. Prokhorenkova et al., "CatBoost: Unbiased boosting with categorical features," NeurIPS, 2018.',
    '[17] S. Xuan et al., "Random forest for credit card fraud detection," IEEE ICNSC, 2018.',
    '[18] T. Saito and M. Rehmsmeier, "The PR plot is more informative than ROC on imbalanced datasets," PLoS ONE, 2015.',
    '[19] Y. Liu et al., "Pick and choose: A GNN-based imbalanced learning for fraud detection," Web Conf., 2021.',
    '[20] Q. Yang et al., "Federated machine learning: Concept and applications," ACM TIST, vol. 10, 2019.',
    '[21] Nilson Report, "Global card fraud losses," Issue 1209, 2022.',
    '[22] A. Dal Pozzolo et al., "When is undersampling effective?," ECML PKDD, 2015.',
]
# Render every entry in compact 7pt Times, with a small gap between entries.
pdf.set_font('Times', '', 7)
for entry in refs:
    pdf.multi_cell(0, 3.2, entry)
    pdf.ln(0.5)
|
| 447 |
|
|
|
|
| 448 |
# Write the assembled document to disk and report its location and page count.
out = os.path.join(PAPER_DIR, 'fraud_detection_paper.pdf')
pdf.output(out)
print(f"PDF saved: {out} ({pdf.page_no()} pages)")
|
| 451 |
|
| 452 |
|
| 453 |
if __name__ == '__main__':
|