Upload generate_pdf.py with huggingface_hub
Browse files- generate_pdf.py +281 -536
generate_pdf.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
-
Generate a comprehensive IEEE-style PDF paper using fpdf2.
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
import os, sys
|
| 6 |
sys.path.insert(0, '/app/fraud_detection')
|
|
@@ -10,425 +10,252 @@ FIGURES_DIR = '/app/fraud_detection/figures'
|
|
| 10 |
PAPER_DIR = '/app/fraud_detection/paper'
|
| 11 |
os.makedirs(PAPER_DIR, exist_ok=True)
|
| 12 |
|
| 13 |
-
LM =
|
| 14 |
-
RM =
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class IEEEPaper(FPDF):
|
| 19 |
def __init__(self):
|
| 20 |
super().__init__('P', 'mm', 'letter')
|
| 21 |
-
self.set_margins(LM,
|
| 22 |
-
self.set_auto_page_break(auto=True, margin=
|
| 23 |
|
| 24 |
def header(self):
|
| 25 |
if self.page_no() > 1:
|
| 26 |
-
self.set_font('Helvetica', 'I',
|
| 27 |
-
self.cell(0,
|
| 28 |
-
self.ln(
|
| 29 |
|
| 30 |
def footer(self):
|
| 31 |
-
self.set_y(-
|
| 32 |
self.set_font('Helvetica', 'I', 7)
|
| 33 |
-
self.cell(0,
|
| 34 |
|
| 35 |
-
def
|
| 36 |
-
self.ln(5)
|
| 37 |
-
self.set_font('Helvetica', 'B', 11)
|
| 38 |
-
self.cell(0, 6, f'{num}. {title.upper()}', ln=True)
|
| 39 |
-
self.ln(2)
|
| 40 |
-
|
| 41 |
-
def subsec(self, label, title):
|
| 42 |
self.ln(3)
|
| 43 |
self.set_font('Helvetica', 'B', 10)
|
| 44 |
-
|
|
|
|
| 45 |
self.ln(1)
|
| 46 |
|
| 47 |
-
def
|
| 48 |
-
"""Body paragraph."""
|
| 49 |
-
self.set_font('Times', '', 9.5)
|
| 50 |
-
self.multi_cell(0, 4.2, text)
|
| 51 |
self.ln(1.5)
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
def
|
| 54 |
-
|
| 55 |
-
self.
|
| 56 |
-
self.
|
| 57 |
-
self.multi_cell(BW - 5, 4.2, text)
|
| 58 |
-
self.ln(1.5)
|
| 59 |
|
| 60 |
def bullet(self, items):
|
| 61 |
-
self.set_font('Times', '', 9
|
| 62 |
for item in items:
|
| 63 |
-
self.set_x(LM +
|
| 64 |
-
self.cell(
|
| 65 |
-
self.multi_cell(BW -
|
| 66 |
-
self.ln(0.
|
| 67 |
-
self.ln(
|
| 68 |
|
| 69 |
-
def fig(self, path, caption, w=
|
| 70 |
if not os.path.exists(path):
|
| 71 |
return
|
| 72 |
-
self.ln(
|
| 73 |
x = (self.w - w) / 2
|
| 74 |
self.image(path, x=x, w=w)
|
| 75 |
-
self.ln(
|
| 76 |
-
self.set_font('Helvetica', 'I',
|
| 77 |
-
self.multi_cell(0, 3.
|
| 78 |
-
self.ln(
|
| 79 |
|
| 80 |
def tbl(self, hdrs, rows, caption=''):
|
| 81 |
if caption:
|
| 82 |
-
self.ln(2)
|
| 83 |
-
self.set_font('Helvetica', 'I', 8)
|
| 84 |
-
self.multi_cell(0, 3.8, caption, align='C')
|
| 85 |
self.ln(1)
|
|
|
|
|
|
|
|
|
|
| 86 |
cw = BW / len(hdrs)
|
| 87 |
-
self.set_font('Helvetica', 'B', 7
|
| 88 |
for h in hdrs:
|
| 89 |
-
self.cell(cw, 4
|
| 90 |
self.ln()
|
| 91 |
-
self.set_font('Times', '', 7
|
| 92 |
for row in rows:
|
| 93 |
for c in row:
|
| 94 |
-
self.cell(cw, 4
|
| 95 |
self.ln()
|
| 96 |
-
self.ln(
|
| 97 |
|
| 98 |
|
| 99 |
def build():
|
| 100 |
pdf = IEEEPaper()
|
|
|
|
| 101 |
|
| 102 |
-
# =====
|
| 103 |
pdf.add_page()
|
| 104 |
-
pdf.ln(18)
|
| 105 |
-
pdf.set_font('Helvetica', 'B', 17)
|
| 106 |
-
pdf.multi_cell(0, 9, 'A Comprehensive Ensemble-Based Framework\nfor Credit Card Fraud Detection\nwith Explainable AI', align='C')
|
| 107 |
pdf.ln(6)
|
| 108 |
-
pdf.set_font('Helvetica', '',
|
| 109 |
-
pdf.
|
| 110 |
-
pdf.
|
| 111 |
-
pdf.
|
| 112 |
-
pdf.
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
pdf.
|
| 116 |
-
|
| 117 |
-
pdf.
|
| 118 |
-
pdf.
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
'
|
| 122 |
-
'
|
| 123 |
-
'
|
| 124 |
-
'
|
| 125 |
-
'
|
| 126 |
-
'
|
| 127 |
-
'recall of 0.8028, F1
|
| 128 |
-
'
|
| 129 |
-
'
|
| 130 |
-
'
|
| 131 |
-
|
| 132 |
-
)
|
| 133 |
-
pdf.
|
| 134 |
-
pdf.cell(0, 5, 'Keywords: fraud detection, credit card, XGBoost, ensemble learning, SHAP, LIME, class imbalance, SMOTE, anomaly detection, explainable AI', ln=True)
|
| 135 |
|
| 136 |
# ===== I. INTRODUCTION =====
|
| 137 |
-
pdf.
|
| 138 |
-
pdf.p(
|
| 139 |
-
'Financial fraud detection has emerged as one of the most consequential applications of machine learning in the modern digital economy. '
|
| 140 |
-
'The global shift toward electronic payment systems has created an unprecedented volume of financial transactions, with Visa alone processing '
|
| 141 |
-
'over 200 billion transactions annually. This massive scale, while enabling economic growth, simultaneously creates fertile ground for '
|
| 142 |
-
'increasingly sophisticated fraudulent activities. According to the Nilson Report [21], worldwide payment card fraud losses reached $32.34 billion '
|
| 143 |
-
'in 2021, representing a 14% year-over-year increase. Projections indicate these losses will exceed $43 billion by 2026 unless detection systems '
|
| 144 |
-
'improve significantly.'
|
| 145 |
-
)
|
| 146 |
-
pdf.p(
|
| 147 |
-
'The fundamental challenge in fraud detection lies in the extreme class imbalance inherent in transaction data. In real-world datasets, '
|
| 148 |
-
'fraudulent transactions typically constitute less than 0.5% of all transactions, often as low as 0.1%. This severe imbalance creates a '
|
| 149 |
-
'paradox where a naive classifier that labels every transaction as legitimate achieves over 99.8% accuracy while catching zero fraud. '
|
| 150 |
-
'This renders conventional accuracy metrics entirely misleading and necessitates the use of specialized evaluation criteria including '
|
| 151 |
-
'Precision-Recall Area Under the Curve (PR-AUC), the F1-score, and the Matthews Correlation Coefficient (MCC), which remain informative '
|
| 152 |
-
'even under extreme class skew [18].'
|
| 153 |
-
)
|
| 154 |
pdf.p(
|
| 155 |
-
'
|
| 156 |
-
'
|
| 157 |
-
'
|
|
|
|
|
|
|
| 158 |
)
|
| 159 |
pdf.p(
|
| 160 |
-
'
|
| 161 |
-
'
|
| 162 |
-
'
|
| 163 |
-
'
|
| 164 |
-
'particularly when combined with thoughtful feature engineering and proper handling of class imbalance.'
|
| 165 |
)
|
| 166 |
-
pdf.p('This paper makes the following
|
| 167 |
pdf.bullet([
|
| 168 |
-
'
|
| 169 |
-
'Novel feature engineering
|
| 170 |
-
'
|
| 171 |
-
'
|
| 172 |
-
'
|
| 173 |
-
'Quantitative business impact analysis translating model performance into dollar-denominated financial outcomes, directly connecting ML metrics to business value.',
|
| 174 |
])
|
| 175 |
|
| 176 |
# ===== II. RELATED WORK =====
|
| 177 |
-
pdf.
|
| 178 |
-
pdf.p(
|
| 179 |
-
'The literature on fraud detection is extensive and spans several decades. Bolton and Hand [12] provided one of the earliest comprehensive surveys '
|
| 180 |
-
'of statistical methods for fraud detection, establishing the field and identifying class imbalance as the central technical challenge. '
|
| 181 |
-
'Dal Pozzolo et al. [1] subsequently provided a foundational analysis of how class imbalance and concept drift interact in real-world '
|
| 182 |
-
'credit card fraud detection systems, demonstrating that undersampling strategies could be effective but risked discarding valuable information '
|
| 183 |
-
'from the majority class. Their follow-up work [22] further investigated conditions under which undersampling outperforms other strategies.'
|
| 184 |
-
)
|
| 185 |
-
pdf.p(
|
| 186 |
-
'The class imbalance problem has generated a rich sub-literature. Chawla et al. [2] introduced SMOTE (Synthetic Minority Over-sampling Technique), '
|
| 187 |
-
'which generates synthetic minority class samples by interpolating between existing examples in feature space. SMOTE became the dominant '
|
| 188 |
-
'oversampling method in the field, with numerous variants proposed subsequently (Borderline-SMOTE, ADASYN, etc.). Critically, Fernandez et al. [3] '
|
| 189 |
-
'established through extensive experimentation that SMOTE must be applied exclusively to training data; applying it before the train-test split '
|
| 190 |
-
'introduces a subtle but severe form of data leakage where synthetic test samples carry information derived from training examples, leading to '
|
| 191 |
-
'dramatically over-optimistic performance estimates.'
|
| 192 |
-
)
|
| 193 |
-
pdf.p(
|
| 194 |
-
'Tree-based ensemble methods have emerged as the dominant paradigm for tabular fraud detection. Xuan et al. [17] demonstrated that '
|
| 195 |
-
'Random Forests achieve robust baseline performance through bagging and feature randomization. Chen and Guestrin [4] introduced XGBoost, '
|
| 196 |
-
'a regularized gradient boosting framework that has since become one of the most widely used algorithms for tabular classification, including '
|
| 197 |
-
'fraud detection [14]. Ke et al. [5] proposed LightGBM with leaf-wise tree growth and gradient-based one-side sampling (GOSS), achieving '
|
| 198 |
-
'faster training with comparable or superior accuracy. Prokhorenkova et al. [16] introduced CatBoost with ordered boosting to handle '
|
| 199 |
-
'categorical features natively without target leakage.'
|
| 200 |
-
)
|
| 201 |
pdf.p(
|
| 202 |
-
'
|
| 203 |
-
'
|
| 204 |
-
'
|
| 205 |
-
'
|
| 206 |
-
'improvement over tree-based methods on static feature representations.'
|
| 207 |
)
|
| 208 |
pdf.p(
|
| 209 |
-
'
|
| 210 |
-
'
|
| 211 |
-
'
|
| 212 |
-
'
|
| 213 |
-
'
|
| 214 |
)
|
| 215 |
pdf.p(
|
| 216 |
-
'For
|
| 217 |
-
'
|
|
|
|
| 218 |
)
|
| 219 |
|
| 220 |
# ===== III. DATASET AND EDA =====
|
| 221 |
-
pdf.
|
| 222 |
-
|
| 223 |
-
pdf.subsec('A.', 'Dataset Description')
|
| 224 |
-
pdf.p(
|
| 225 |
-
'We use the European Cardholder Credit Card Fraud Detection dataset [1], one of the most widely-cited benchmarks in the fraud detection '
|
| 226 |
-
'literature. The dataset contains 284,807 transactions made by European cardholders over a two-day period in September 2013. Each transaction '
|
| 227 |
-
'is described by 31 features: 28 numerical features (V1 through V28) that are the result of a PCA transformation applied to the original '
|
| 228 |
-
'confidential features, plus the raw Time (seconds elapsed from the first transaction in the dataset), Amount (the transaction dollar value), '
|
| 229 |
-
'and Class (the binary label: 0 for legitimate, 1 for fraud). The PCA transformation was applied by the dataset creators to protect cardholder '
|
| 230 |
-
'privacy, which means the original feature semantics (merchant category, geographic location, card type, etc.) are not available. This places '
|
| 231 |
-
'a constraint on domain-specific feature engineering but ensures the dataset can be shared publicly for research.'
|
| 232 |
-
)
|
| 233 |
-
|
| 234 |
-
pdf.subsec('B.', 'Class Distribution and Imbalance')
|
| 235 |
pdf.p(
|
| 236 |
-
'
|
| 237 |
-
'
|
| 238 |
-
'
|
| 239 |
-
'
|
| 240 |
-
'and (iii) most standard ML algorithms will struggle to learn the minority class boundary without explicit countermeasures.'
|
| 241 |
)
|
| 242 |
-
|
| 243 |
pdf.tbl(
|
| 244 |
-
['Class', 'Count', 'Percentage', '
|
| 245 |
[['Legitimate (0)', '284,315', '99.827%', '---'],
|
| 246 |
-
['Fraud (1)', '492', '0.173%', '1
|
| 247 |
['Total', '284,807', '100%', '---']],
|
| 248 |
-
'Table I: Class Distribution
|
| 249 |
)
|
| 250 |
-
pdf.fig(os.path.join(
|
| 251 |
-
'Fig. 1. Class distribution showing
|
| 252 |
|
| 253 |
-
pdf.
|
| 254 |
pdf.p(
|
| 255 |
-
'
|
| 256 |
-
'
|
| 257 |
-
'
|
| 258 |
-
'
|
| 259 |
-
'fraudsters verify that a stolen card number is active before attempting larger purchases, and (ii) moderate-to-high value transactions that '
|
| 260 |
-
'represent the actual theft. The low median indicates that the testing strategy is more common. Figure 2 presents the detailed amount distributions.'
|
| 261 |
)
|
| 262 |
-
pdf.fig(os.path.join(
|
| 263 |
-
'Fig. 2.
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
pdf.
|
| 266 |
pdf.p(
|
| 267 |
-
'
|
| 268 |
-
'fraud
|
| 269 |
-
'
|
| 270 |
-
'
|
| 271 |
-
'
|
| 272 |
-
'as sine and cosine components to preserve the circular nature of time.'
|
| 273 |
)
|
| 274 |
-
pdf.fig(os.path.join(
|
| 275 |
-
'Fig.
|
| 276 |
-
|
| 277 |
-
pdf.subsec('E.', 'Feature Correlations')
|
| 278 |
-
pdf.p(
|
| 279 |
-
'Pearson correlation analysis between each feature and the fraud label identifies the most discriminative PCA components. The features with '
|
| 280 |
-
'the strongest negative correlation with fraud are V17 (r = -0.326), V14 (r = -0.303), and V12 (r = -0.261), meaning that lower values of '
|
| 281 |
-
'these features are associated with higher fraud probability. On the positive side, V11 (r = +0.155) and V4 (r = +0.133) show the strongest '
|
| 282 |
-
'associations. Notably, the raw Amount feature has near-zero correlation (r = 0.006) with fraud, confirming that simple amount-based rules '
|
| 283 |
-
'would be ineffective. The Time feature also shows negligible correlation (r = -0.012). These findings guide both our feature engineering '
|
| 284 |
-
'(creating interaction terms between the top correlated features) and our expectation of which features will dominate model importance.'
|
| 285 |
-
)
|
| 286 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'correlation_heatmap.png'),
|
| 287 |
-
'Fig. 4. Feature correlation with the fraud class. Negative values (red) indicate features whose lower values signal fraud.', w=130)
|
| 288 |
-
|
| 289 |
-
pdf.subsec('F.', 'Feature Distributions by Class')
|
| 290 |
-
pdf.p(
|
| 291 |
-
'Figure 5 visualizes the distributions of the six most discriminative features separated by class. The key observation is that for features '
|
| 292 |
-
'like V14 and V17, the fraud distribution (red) is shifted significantly to the left compared to the legitimate distribution (green), '
|
| 293 |
-
'creating a separable signal that tree-based models can exploit through axis-aligned splits. For V4 and V11, the fraud distribution is shifted '
|
| 294 |
-
'rightward. However, there is substantial overlap between the classes for all features, which explains why no single feature achieves perfect '
|
| 295 |
-
'separation and why ensemble methods that combine weak signals from multiple features outperform univariate approaches.'
|
| 296 |
-
)
|
| 297 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'feature_distributions.png'),
|
| 298 |
-
'Fig. 5. Distribution of the top 6 discriminative features by class, showing partial but informative separation.', w=155)
|
| 299 |
-
|
| 300 |
-
pdf.subsec('G.', 'Five Key Observations')
|
| 301 |
-
pdf.p('Our exploratory analysis yields five principal observations that directly inform the modeling strategy:')
|
| 302 |
-
pdf.bullet([
|
| 303 |
-
'EXTREME CLASS IMBALANCE: With only 0.173% fraud, conventional accuracy is meaningless. All models must employ either oversampling (SMOTE) or cost-sensitive learning, and evaluation must rely on PR-AUC, F1, and MCC rather than accuracy or ROC-AUC alone.',
|
| 304 |
-
'BIMODAL FRAUD AMOUNTS: The bimodal distribution of fraud amounts (small testing transactions + larger theft transactions) means amount-based thresholds will miss most fraud. Feature engineering that captures amount deviations and z-scores is essential.',
|
| 305 |
-
'TEMPORAL EXPLOITATION: The 4x higher nighttime fraud rate provides a usable signal when encoded as cyclic features. Time-based features should improve model discrimination.',
|
| 306 |
-
'PCA FEATURE DOMINANCE: V14, V17, V12, V4, and V11 carry the strongest fraud signal. Interaction features between these variables may capture non-linear relationships that individual features miss.',
|
| 307 |
-
'CLEAN DATA: The absence of missing values and the pre-applied PCA transformation simplify preprocessing but limit domain-specific engineering. The 1,081 duplicate rows are removed to prevent data leakage.',
|
| 308 |
-
])
|
| 309 |
|
| 310 |
# ===== IV. METHODOLOGY =====
|
| 311 |
-
pdf.
|
| 312 |
-
|
| 313 |
-
pdf.subsec('A.', 'Feature Engineering')
|
| 314 |
-
pdf.p(
|
| 315 |
-
'We augment the original 30 features (Time, V1-V28, Amount) with 12 engineered features designed to capture temporal, behavioral, and '
|
| 316 |
-
'interaction patterns that the raw PCA features may not directly encode. The final feature set contains 42 dimensions.'
|
| 317 |
-
)
|
| 318 |
-
pdf.p(
|
| 319 |
-
'Temporal features: We derive the hour of day from the Time column and encode it cyclically using sine and cosine transformations: '
|
| 320 |
-
'Hour_sin = sin(2*pi*h/24) and Hour_cos = cos(2*pi*h/24), where h = (Time/3600) mod 24. This cyclic encoding ensures that hour 23 '
|
| 321 |
-
'and hour 0 are treated as adjacent rather than maximally distant, which is critical for capturing the nighttime fraud pattern. '
|
| 322 |
-
'We also compute Time_diff as the difference in Time from the previous transaction, approximating the inter-arrival time.'
|
| 323 |
-
)
|
| 324 |
pdf.p(
|
| 325 |
-
'
|
| 326 |
-
'
|
| 327 |
-
'and Transaction_velocity
|
|
|
|
| 328 |
)
|
|
|
|
| 329 |
pdf.p(
|
| 330 |
-
'
|
| 331 |
-
'
|
| 332 |
-
'
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
pdf.subsec('B.', 'Class Imbalance Handling')
|
| 336 |
-
pdf.p(
|
| 337 |
-
'We implement and compare two established approaches for handling the 1:577 class imbalance. Both are applied exclusively to the '
|
| 338 |
-
'training data, never to validation or test sets.'
|
| 339 |
)
|
|
|
|
| 340 |
pdf.p(
|
| 341 |
-
'
|
| 342 |
-
'
|
| 343 |
-
'
|
| 344 |
-
'
|
| 345 |
-
'
|
|
|
|
|
|
|
| 346 |
)
|
|
|
|
| 347 |
pdf.p(
|
| 348 |
-
'
|
| 349 |
-
'w_c = N / (2 * N_c), yielding w_0 = 0.501 and w_1 = 300.01. This effectively makes each fraud example 599 times more important '
|
| 350 |
-
'than a legitimate example in the loss function, incentivizing the model to correctly classify fraud even at the cost of some false positives.'
|
| 351 |
-
)
|
| 352 |
-
|
| 353 |
-
pdf.subsec('C.', 'Data Splitting and Scaling')
|
| 354 |
-
pdf.p(
|
| 355 |
-
'After removing 1,081 duplicate rows and engineering features, we perform a stratified 70/15/15 train/validation/test split. '
|
| 356 |
-
'Stratification preserves the original 0.167% fraud ratio in each split: Train (198,608 samples, 331 fraud), Validation (42,559 samples, '
|
| 357 |
-
'71 fraud), Test (42,559 samples, 71 fraud). Feature scaling uses RobustScaler, which normalizes by the interquartile range '
|
| 358 |
-
'x\' = (x - median) / IQR, providing robustness to outliers that are common in financial transaction data. The scaler is fitted exclusively '
|
| 359 |
-
'on the training set and then applied identically to validation and test sets, preventing any information leakage.'
|
| 360 |
)
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
pdf.p(
|
| 364 |
-
'1) Logistic Regression (Baseline): An L2-regularized linear model with C=0.1 and balanced class weights, serving as an interpretable '
|
| 365 |
-
'baseline. Its coefficients directly indicate feature importance and direction of effect.'
|
| 366 |
-
)
|
| 367 |
-
pdf.p(
|
| 368 |
-
'2) Random Forest: An ensemble of 150 decision trees (max_depth=12, min_samples_split=5) with balanced class weights. '
|
| 369 |
-
'Each tree is trained on a bootstrap sample with random feature subsets, providing variance reduction through averaging.'
|
| 370 |
-
)
|
| 371 |
-
pdf.p(
|
| 372 |
-
'3) XGBoost: Gradient boosted trees with 200 estimators, max_depth=6, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, '
|
| 373 |
-
'and scale_pos_weight derived from class frequencies. Uses histogram-based splitting for computational efficiency.'
|
| 374 |
-
)
|
| 375 |
-
pdf.p(
|
| 376 |
-
'4) LightGBM: Leaf-wise gradient boosting with 200 estimators, max_depth=8, learning_rate=0.05, and gradient-based one-side sampling. '
|
| 377 |
-
'The leaf-wise growth strategy can produce deeper trees than XGBoost for the same number of leaves, potentially capturing more complex patterns.'
|
| 378 |
-
)
|
| 379 |
-
pdf.p(
|
| 380 |
-
'5) MLP Neural Network: A three-layer perceptron (128-64-32 neurons) with ReLU activation, dropout (implicit via alpha=0.001 L2 '
|
| 381 |
-
'regularization), adaptive learning rate, and early stopping. Trained on SMOTE-augmented data since sklearn MLPClassifier does not '
|
| 382 |
-
'support class weights directly.'
|
| 383 |
-
)
|
| 384 |
-
pdf.p(
|
| 385 |
-
'6) Autoencoder (Anomaly Detection): A symmetric autoencoder with architecture 42-64-32-16-32-64-42, trained for 50 epochs '
|
| 386 |
-
'exclusively on legitimate transactions. The core assumption is that the autoencoder learns to reconstruct normal transaction patterns; '
|
| 387 |
-
'when a fraudulent transaction is presented, the reconstruction error e(x) = (1/d) * sum((x_i - x_hat_i)^2) will be anomalously high. '
|
| 388 |
-
'This approach requires no labeled fraud examples during training, making it potentially useful for zero-day fraud detection.'
|
| 389 |
-
)
|
| 390 |
-
pdf.p(
|
| 391 |
-
'7) Voting Ensemble: Soft voting over the three best-performing tuned models (XGBoost, LightGBM, Random Forest), where the final '
|
| 392 |
-
'fraud probability is the arithmetic mean of the three individual model probabilities. This leverages the diversity of different '
|
| 393 |
-
'tree-building strategies to reduce variance.'
|
| 394 |
-
)
|
| 395 |
-
|
| 396 |
-
pdf.subsec('E.', 'Hyperparameter Optimization')
|
| 397 |
-
pdf.p(
|
| 398 |
-
'We tune the top three models (XGBoost, LightGBM, Random Forest) using Optuna [11] with the Tree-structured Parzen Estimator (TPE) '
|
| 399 |
-
'sampler. For each model, Optuna explores the hyperparameter space (learning rate, tree depth, regularization, subsampling) over '
|
| 400 |
-
'15-20 trials, optimizing PR-AUC on the validation set. The TPE sampler adaptively focuses trials on promising regions of the search '
|
| 401 |
-
'space, achieving better sample efficiency than grid or random search.'
|
| 402 |
-
)
|
| 403 |
-
|
| 404 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'architecture_diagram.png'),
|
| 405 |
-
'Fig. 6. End-to-end system architecture: from transaction input through feature engineering, model inference, to API output and monitoring.', w=155)
|
| 406 |
|
| 407 |
# ===== V. EXPERIMENTAL SETUP =====
|
| 408 |
-
pdf.
|
| 409 |
pdf.p(
|
| 410 |
-
'
|
| 411 |
-
'
|
| 412 |
-
'
|
| 413 |
-
'was approximately 25 minutes on a 2-core CPU.'
|
| 414 |
-
)
|
| 415 |
-
pdf.p(
|
| 416 |
-
'We report six evaluation metrics on the held-out test set: (1) Precision = TP/(TP+FP), measuring the fraction of flagged transactions '
|
| 417 |
-
'that are actually fraudulent; (2) Recall = TP/(TP+FN), measuring the fraction of actual fraud that is caught; (3) F1-score, the harmonic '
|
| 418 |
-
'mean of precision and recall; (4) ROC-AUC, the area under the Receiver Operating Characteristic curve; (5) PR-AUC (Average Precision), '
|
| 419 |
-
'the area under the Precision-Recall curve, which is our primary metric as it is more informative than ROC-AUC under extreme class imbalance [18]; '
|
| 420 |
-
'and (6) Matthews Correlation Coefficient (MCC), which provides a balanced measure that accounts for all four confusion matrix quadrants '
|
| 421 |
-
'and returns values between -1 and +1.'
|
| 422 |
)
|
| 423 |
|
| 424 |
# ===== VI. RESULTS AND DISCUSSION =====
|
| 425 |
-
pdf.
|
| 426 |
-
|
| 427 |
-
pdf.
|
| 428 |
-
pdf.p(
|
| 429 |
-
'Table II presents the comprehensive evaluation of all models on the test set using a default threshold of 0.5. The results reveal '
|
| 430 |
-
'a clear hierarchy with important nuances.'
|
| 431 |
-
)
|
| 432 |
pdf.tbl(
|
| 433 |
['Model', 'Prec.', 'Recall', 'F1', 'ROC-AUC', 'PR-AUC', 'MCC'],
|
| 434 |
[
|
|
@@ -442,267 +269,185 @@ def build():
|
|
| 442 |
['Logistic Reg.', '0.049', '0.887', '0.092', '0.962', '0.735', '0.204'],
|
| 443 |
['Autoencoder', '0.003', '1.000', '0.007', '0.960', '0.044', '0.041'],
|
| 444 |
],
|
| 445 |
-
'Table II: Comprehensive Model Comparison on Test Set (threshold
|
| 446 |
)
|
| 447 |
-
|
| 448 |
pdf.p(
|
| 449 |
-
'
|
| 450 |
-
'
|
| 451 |
-
'
|
| 452 |
-
'on the more informative PR-AUC metric, suggesting that the three ensemble members are not sufficiently diverse to benefit from averaging.'
|
| 453 |
)
|
| 454 |
pdf.p(
|
| 455 |
-
'
|
| 456 |
-
'
|
| 457 |
-
'
|
| 458 |
-
'would overwhelm any operational fraud investigation team.'
|
| 459 |
)
|
| 460 |
pdf.p(
|
| 461 |
-
'
|
| 462 |
-
'
|
| 463 |
-
'legitimate
|
| 464 |
-
'space: the autoencoder learns to reconstruct the dominant variance directions, but the fraud signal may reside in minor PCA components '
|
| 465 |
-
'whose reconstruction error is similar to legitimate noise, making discrimination unreliable.'
|
| 466 |
)
|
| 467 |
pdf.p(
|
| 468 |
-
'
|
| 469 |
-
'
|
| 470 |
-
'the base XGBoost (0.817), suggesting the base configuration was already near-optimal and the tuning search space introduced suboptimal regions.'
|
| 471 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
-
pdf.
|
| 474 |
-
'Fig. 7. ROC curves for the top 5 models. All achieve ROC-AUC > 0.93, but ROC-AUC alone is insufficient for evaluation under extreme imbalance.', w=125)
|
| 475 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'pr_curves.png'),
|
| 476 |
-
'Fig. 8. Precision-Recall curves -- the primary evaluation metric. XGBoost achieves the largest area under the curve (0.817).', w=125)
|
| 477 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'confusion_matrices.png'),
|
| 478 |
-
'Fig. 9. Confusion matrices for six selected models. XGBoost achieves the best balance: 57 true positives with only 6 false positives.', w=160)
|
| 479 |
-
|
| 480 |
-
pdf.subsec('B.', 'Threshold Optimization')
|
| 481 |
pdf.p(
|
| 482 |
-
'
|
| 483 |
-
'
|
| 484 |
-
'
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
)
|
| 490 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'threshold_analysis.png'),
|
| 491 |
-
'Fig. 10. Threshold sensitivity analysis: (a) precision/recall/F1 vs. threshold, (b) MCC vs. threshold. Optimal F1 at threshold = 0.55.', w=145)
|
| 492 |
-
|
| 493 |
-
pdf.subsec('C.', 'Business Impact Analysis')
|
| 494 |
pdf.p(
|
| 495 |
-
'
|
| 496 |
-
'
|
| 497 |
-
'
|
| 498 |
-
'
|
| 499 |
-
'total investigation cost, reducing net savings to only $1,554. The Autoencoder, despite catching all 71 fraud transactions, generates '
|
| 500 |
-
'21,209 false alarms at $106,045 -- a net loss of $97,368.'
|
| 501 |
)
|
| 502 |
pdf.tbl(
|
| 503 |
-
['Model', 'TP', 'FN', 'FP', 'Caught($)', '
|
| 504 |
[
|
| 505 |
-
['XGBoost', '57', '14', '6', '6,966', '
|
| 506 |
-
['
|
| 507 |
-
['LGBM-T', '58', '13', '24', '7,088', '
|
| 508 |
-
['LR', '63', '8', '1229', '7,699', '
|
| 509 |
-
['AE', '71', '0', '21209', '8,677', '
|
| 510 |
],
|
| 511 |
-
'Table III: Business Impact Analysis
|
| 512 |
-
)
|
| 513 |
-
pdf.p(
|
| 514 |
-
'This analysis underscores a critical insight: maximizing recall without regard for precision is counterproductive in operational settings. '
|
| 515 |
-
'The Autoencoder catches every fraud but would bankrupt the operations team with false alarm investigations. The optimal model balances '
|
| 516 |
-
'fraud catch rate against false alarm volume, and XGBoost achieves this balance most effectively.'
|
| 517 |
-
)
|
| 518 |
-
|
| 519 |
-
# ===== EXPLAINABILITY =====
|
| 520 |
-
pdf.subsec('D.', 'Feature Importance and Explainability')
|
| 521 |
-
pdf.p(
|
| 522 |
-
'Model explainability is critical for operational trust, regulatory compliance, and scientific insight. We employ two complementary methods: '
|
| 523 |
-
'SHAP for global feature attribution and LIME for local, instance-level explanation.'
|
| 524 |
)
|
| 525 |
-
pdf.p(
|
| 526 |
-
'SHAP Analysis: Figure 11 shows the SHAP summary plot for XGBoost, computed over 2,000 test samples. The top three features by '
|
| 527 |
-
'mean absolute SHAP value are V4 (1.913), V14 (1.843), and PCA_magnitude (1.113). The SHAP analysis reveals several important patterns: '
|
| 528 |
-
'(i) High values of V4 push predictions toward fraud, while low values push toward legitimate; (ii) Low (more negative) values of V14 '
|
| 529 |
-
'are strongly associated with fraud, consistent with the negative correlation observed in EDA; (iii) High PCA_magnitude indicates '
|
| 530 |
-
'transactions that are far from the centroid in PCA space, which are more likely to be anomalous. Notably, the engineered feature '
|
| 531 |
-
'V10_V14_interaction ranks 9th, validating our hypothesis that interaction terms capture additional signal beyond individual features.'
|
| 532 |
-
)
|
| 533 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'shap_summary.png'),
|
| 534 |
-
'Fig. 11. SHAP summary plot: each dot is one test sample. Color indicates feature value (red=high, blue=low). Horizontal position shows SHAP impact.', w=140)
|
| 535 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'shap_top10.png'),
|
| 536 |
-
'Fig. 12. Top 10 features by mean |SHAP value|. V4, V14, and PCA_magnitude are the dominant fraud predictors.', w=125)
|
| 537 |
|
|
|
|
| 538 |
pdf.p(
|
| 539 |
-
'
|
| 540 |
-
'
|
| 541 |
-
'
|
| 542 |
-
'
|
|
|
|
| 543 |
)
|
| 544 |
-
pdf.fig(os.path.join(
|
| 545 |
-
'Fig.
|
| 546 |
-
pdf.fig(os.path.join(
|
| 547 |
-
'Fig.
|
| 548 |
|
| 549 |
# ===== VII. ERROR ANALYSIS =====
|
| 550 |
-
pdf.
|
| 551 |
-
|
| 552 |
-
pdf.subsec('A.', 'False Negative Analysis (Missed Fraud)')
|
| 553 |
-
pdf.p(
|
| 554 |
-
'The XGBoost model misses 14 of 71 fraudulent transactions in the test set (19.7% miss rate). Understanding why these transactions '
|
| 555 |
-
'escape detection is critical for improving the system. Analysis of the 14 false negatives reveals that their mean predicted fraud '
|
| 556 |
-
'probability is only 0.013, far below the 0.5 threshold -- the model is highly confident they are legitimate, not merely borderline.'
|
| 557 |
-
)
|
| 558 |
pdf.p(
|
| 559 |
-
'
|
| 560 |
-
'
|
| 561 |
-
'
|
| 562 |
-
'
|
| 563 |
-
'
|
|
|
|
| 564 |
)
|
|
|
|
| 565 |
pdf.p(
|
| 566 |
-
'
|
| 567 |
-
'
|
| 568 |
-
'
|
|
|
|
| 569 |
)
|
| 570 |
-
|
| 571 |
-
pdf.subsec('B.', 'False Positive Analysis (False Alarms)')
|
| 572 |
-
pdf.p(
|
| 573 |
-
'The 6 false positives have a mean predicted fraud probability of 0.827, with some reaching 1.0 -- the model is highly confident '
|
| 574 |
-
'these are fraud, yet they are legitimate. Feature analysis shows these transactions have V14 averaging -7.13 (vs -0.04 for true negatives) '
|
| 575 |
-
'and PCA_magnitude of 7.86 (vs 0.28 for true negatives). These legitimate transactions genuinely exhibit the same anomalous feature '
|
| 576 |
-
'patterns as actual fraud, likely representing unusual but lawful spending behavior (e.g., first-time purchases in an unusual category, '
|
| 577 |
-
'international transactions, or large purchases for individuals who typically make small ones). No amount of model tuning can distinguish '
|
| 578 |
-
'these from actual fraud without additional contextual information.'
|
| 579 |
-
)
|
| 580 |
-
|
| 581 |
-
pdf.subsec('C.', 'Concept Drift Assessment and Retraining Recommendations')
|
| 582 |
pdf.p(
|
| 583 |
-
'Comparing model confidence between
|
| 584 |
-
'
|
| 585 |
-
'
|
|
|
|
| 586 |
)
|
| 587 |
-
pdf.
|
| 588 |
-
|
| 589 |
-
'Weekly computation of PR-AUC, F1, and false positive rate on recent labeled data to track model degradation.',
|
| 590 |
-
'Automated retraining trigger when PR-AUC drops below 0.70 or false positive rate exceeds 2x the baseline.',
|
| 591 |
-
'Sliding window training using the most recent 3-6 months of labeled data, rather than static historical training.',
|
| 592 |
-
'Population Stability Index (PSI) monitoring on all input features, with alerts when PSI exceeds 0.25 for any feature.',
|
| 593 |
-
'A/B testing framework for deploying model updates, with gradual traffic ramps from 1% to 100%.',
|
| 594 |
-
'Quarterly fraud pattern reviews with domain experts to identify emerging attack vectors that models may not yet capture.',
|
| 595 |
-
])
|
| 596 |
-
pdf.fig(os.path.join(FIGURES_DIR, 'error_analysis.png'),
|
| 597 |
-
'Fig. 15. Error analysis: (a) FN probability distribution, (b) FP probability distribution, (c) overall score distribution by class.', w=160)
|
| 598 |
|
| 599 |
# ===== VIII. LIMITATIONS =====
|
| 600 |
-
pdf.
|
| 601 |
-
pdf.p('We acknowledge several important limitations of this work:')
|
| 602 |
pdf.bullet([
|
| 603 |
-
'PCA Anonymization:
|
| 604 |
-
'Temporal Scope:
|
| 605 |
-
'Single-Institution
|
| 606 |
-
'Static
|
| 607 |
-
'Static Threshold:
|
| 608 |
-
'
|
| 609 |
])
|
| 610 |
|
| 611 |
# ===== IX. FUTURE WORK =====
|
| 612 |
-
pdf.
|
| 613 |
-
pdf.p('Several promising research directions emerge from this work:')
|
| 614 |
pdf.p(
|
| 615 |
-
'Graph Neural Networks
|
| 616 |
-
'
|
| 617 |
-
'from individual transaction features alone. Graph convolutional networks can propagate suspicion scores through the network, flagging '
|
| 618 |
-
'accounts that transact heavily with known fraudulent nodes.'
|
| 619 |
)
|
| 620 |
pdf.p(
|
| 621 |
-
'Real-Time Streaming with Apache Kafka
|
| 622 |
-
'
|
| 623 |
-
'
|
| 624 |
)
|
| 625 |
pdf.p(
|
| 626 |
-
'
|
| 627 |
-
'
|
| 628 |
-
'privacy while dramatically expanding the effective training set. This is particularly valuable for detecting cross-institutional '
|
| 629 |
-
'fraud patterns where the same stolen credentials are used across multiple banks.'
|
| 630 |
)
|
| 631 |
pdf.p(
|
| 632 |
-
'
|
| 633 |
-
'
|
| 634 |
-
'(e.g., "This transaction was blocked because the purchase amount was unusually high for this card, occurring at an unusual time, '
|
| 635 |
-
'with spending patterns inconsistent with the cardholder\'s history"), reducing the burden on human fraud analysts.'
|
| 636 |
-
)
|
| 637 |
-
pdf.p(
|
| 638 |
-
'Temporal Sequence Modeling: Transformers and LSTM networks operating on the sequence of a cardholder\'s recent transactions could capture '
|
| 639 |
-
'behavioral patterns (typical spending days, preferred merchants, usual amounts) and flag departures from established routines. '
|
| 640 |
-
'This approach treats fraud detection as an anomaly in a time series rather than a static classification problem.'
|
| 641 |
)
|
| 642 |
|
| 643 |
# ===== X. CONCLUSION =====
|
| 644 |
-
pdf.
|
| 645 |
-
pdf.p(
|
| 646 |
-
'This paper presents a comprehensive, end-to-end fraud detection framework that systematically evaluates seven diverse machine learning '
|
| 647 |
-
'approaches on the benchmark European Cardholder credit card fraud dataset. Through careful feature engineering (12 new features), '
|
| 648 |
-
'rigorous methodology (SMOTE after splitting, scaler fitted on train only), and thorough evaluation (six metrics including PR-AUC, MCC), '
|
| 649 |
-
'we demonstrate that XGBoost with cost-sensitive learning achieves the best overall performance with a PR-AUC of 0.817, F1-score of 0.851, '
|
| 650 |
-
'and MCC of 0.852.'
|
| 651 |
-
)
|
| 652 |
-
pdf.p(
|
| 653 |
-
'Our threshold optimization analysis reveals that shifting the decision boundary from 0.50 to 0.55 yields a Pareto improvement, '
|
| 654 |
-
'increasing F1 to 0.864 and precision to 0.934 without sacrificing recall. Business impact analysis quantifies that XGBoost catches '
|
| 655 |
-
'80.3% of fraud while generating only 6 false alarms on a 42,559-transaction test set, resulting in estimated net savings of $6,936. '
|
| 656 |
-
'In contrast, the Autoencoder catches all fraud but generates over 21,000 false alarms -- a cautionary tale against optimizing recall alone.'
|
| 657 |
-
)
|
| 658 |
pdf.p(
|
| 659 |
-
'
|
| 660 |
-
'
|
| 661 |
-
'
|
| 662 |
-
'
|
| 663 |
)
|
| 664 |
pdf.p(
|
| 665 |
-
'
|
| 666 |
-
'
|
| 667 |
-
'
|
| 668 |
-
'
|
|
|
|
|
|
|
| 669 |
)
|
| 670 |
|
| 671 |
# ===== REFERENCES =====
|
| 672 |
-
pdf.
|
| 673 |
refs = [
|
| 674 |
-
'[1] A. Dal Pozzolo
|
| 675 |
-
'[2] N. V. Chawla
|
| 676 |
-
'[3] A. Fernandez
|
| 677 |
-
'[4] T. Chen and C. Guestrin, "XGBoost: A scalable tree boosting system,"
|
| 678 |
-
'[5] G. Ke
|
| 679 |
-
'[6] A. Pumsirirat and L. Yan, "Credit card fraud detection using deep learning
|
| 680 |
-
'[7] S. M. Lundberg and S.-I. Lee, "A unified approach to interpreting model predictions,"
|
| 681 |
-
'[8] M. T. Ribeiro
|
| 682 |
-
'[9] R. Shwartz-Ziv and A. Armon, "Tabular data: Deep learning is not all you need,"
|
| 683 |
-
'[10] L. Grinsztajn
|
| 684 |
-
'[11] T. Akiba
|
| 685 |
-
'[12] R. J. Bolton and D. J. Hand, "Statistical fraud detection: A review," Statistical Science, vol. 17,
|
| 686 |
-
'[13] Z. Zhang
|
| 687 |
-
'[14] A. A. Taha and S. J. Malebary, "An intelligent approach to credit card fraud detection
|
| 688 |
-
'[15] V. Belle and I. Papantonis, "Principles and practice of explainable
|
| 689 |
-
'[16] L. Prokhorenkova
|
| 690 |
-
'[17] S. Xuan
|
| 691 |
-
'[18] T. Saito and M. Rehmsmeier, "The
|
| 692 |
-
'[19] Y. Liu
|
| 693 |
-
'[20] Q. Yang
|
| 694 |
'[21] Nilson Report, "Global card fraud losses," Issue 1209, 2022.',
|
| 695 |
-
'[22] A. Dal Pozzolo
|
| 696 |
]
|
| 697 |
-
pdf.set_font('Times', '', 7
|
| 698 |
for ref in refs:
|
| 699 |
-
pdf.multi_cell(0, 3.
|
| 700 |
-
pdf.ln(0.
|
| 701 |
|
| 702 |
-
# Save
|
| 703 |
out = os.path.join(PAPER_DIR, 'fraud_detection_paper.pdf')
|
| 704 |
pdf.output(out)
|
| 705 |
-
print(f"
|
| 706 |
|
| 707 |
|
| 708 |
if __name__ == '__main__':
|
|
|
|
| 1 |
"""
|
| 2 |
+
Generate a tight, comprehensive IEEE-style PDF paper using fpdf2.
|
| 3 |
+
Target: 12 pages. No wasted whitespace. Redundant figures removed.
|
| 4 |
"""
|
| 5 |
import os, sys
|
| 6 |
sys.path.insert(0, '/app/fraud_detection')
|
|
|
|
| 10 |
PAPER_DIR = '/app/fraud_detection/paper'
|
| 11 |
os.makedirs(PAPER_DIR, exist_ok=True)
|
| 12 |
|
| 13 |
+
LM = 14
|
| 14 |
+
RM = 14
|
| 15 |
+
TM = 14
|
| 16 |
+
BW = 215.9 - LM - RM
|
| 17 |
|
| 18 |
|
| 19 |
class IEEEPaper(FPDF):
|
| 20 |
def __init__(self):
|
| 21 |
super().__init__('P', 'mm', 'letter')
|
| 22 |
+
self.set_margins(LM, TM, RM)
|
| 23 |
+
self.set_auto_page_break(auto=True, margin=16)
|
| 24 |
|
| 25 |
def header(self):
|
| 26 |
if self.page_no() > 1:
|
| 27 |
+
self.set_font('Helvetica', 'I', 6.5)
|
| 28 |
+
self.cell(0, 3, 'IEEE -- Fraud Detection with Explainable AI', align='C')
|
| 29 |
+
self.ln(4)
|
| 30 |
|
| 31 |
def footer(self):
|
| 32 |
+
self.set_y(-12)
|
| 33 |
self.set_font('Helvetica', 'I', 7)
|
| 34 |
+
self.cell(0, 8, f'{self.page_no()}', align='C')
|
| 35 |
|
| 36 |
+
def sec(self, num, title):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
self.ln(3)
|
| 38 |
self.set_font('Helvetica', 'B', 10)
|
| 39 |
+
t = f'{num}. {title.upper()}' if num else title.upper()
|
| 40 |
+
self.cell(0, 5, t, ln=True)
|
| 41 |
self.ln(1)
|
| 42 |
|
| 43 |
+
def sub(self, label, title):
|
|
|
|
|
|
|
|
|
|
| 44 |
self.ln(1.5)
|
| 45 |
+
self.set_font('Helvetica', 'B', 9)
|
| 46 |
+
self.cell(0, 4.5, f'{label} {title}', ln=True)
|
| 47 |
+
self.ln(0.5)
|
| 48 |
|
| 49 |
+
def p(self, text):
|
| 50 |
+
self.set_font('Times', '', 9)
|
| 51 |
+
self.multi_cell(0, 3.8, text)
|
| 52 |
+
self.ln(1)
|
|
|
|
|
|
|
| 53 |
|
| 54 |
def bullet(self, items):
|
| 55 |
+
self.set_font('Times', '', 9)
|
| 56 |
for item in items:
|
| 57 |
+
self.set_x(LM + 3)
|
| 58 |
+
self.cell(3, 3.8, '-')
|
| 59 |
+
self.multi_cell(BW - 6, 3.8, item)
|
| 60 |
+
self.ln(0.3)
|
| 61 |
+
self.ln(0.5)
|
| 62 |
|
| 63 |
+
def fig(self, path, caption, w=145):
|
| 64 |
if not os.path.exists(path):
|
| 65 |
return
|
| 66 |
+
self.ln(1.5)
|
| 67 |
x = (self.w - w) / 2
|
| 68 |
self.image(path, x=x, w=w)
|
| 69 |
+
self.ln(1)
|
| 70 |
+
self.set_font('Helvetica', 'I', 7.5)
|
| 71 |
+
self.multi_cell(0, 3.5, caption, align='C')
|
| 72 |
+
self.ln(1.5)
|
| 73 |
|
| 74 |
def tbl(self, hdrs, rows, caption=''):
|
| 75 |
if caption:
|
|
|
|
|
|
|
|
|
|
| 76 |
self.ln(1)
|
| 77 |
+
self.set_font('Helvetica', 'I', 7.5)
|
| 78 |
+
self.multi_cell(0, 3.5, caption, align='C')
|
| 79 |
+
self.ln(0.5)
|
| 80 |
cw = BW / len(hdrs)
|
| 81 |
+
self.set_font('Helvetica', 'B', 7)
|
| 82 |
for h in hdrs:
|
| 83 |
+
self.cell(cw, 4, h, border=1, align='C')
|
| 84 |
self.ln()
|
| 85 |
+
self.set_font('Times', '', 7)
|
| 86 |
for row in rows:
|
| 87 |
for c in row:
|
| 88 |
+
self.cell(cw, 4, str(c), border=1, align='C')
|
| 89 |
self.ln()
|
| 90 |
+
self.ln(1.5)
|
| 91 |
|
| 92 |
|
| 93 |
def build():
|
| 94 |
pdf = IEEEPaper()
|
| 95 |
+
F = FIGURES_DIR
|
| 96 |
|
| 97 |
+
# ===== PAGE 1: Title + Abstract + Start of Intro =====
|
| 98 |
pdf.add_page()
|
|
|
|
|
|
|
|
|
|
| 99 |
pdf.ln(6)
|
| 100 |
+
pdf.set_font('Helvetica', 'B', 15)
|
| 101 |
+
pdf.multi_cell(0, 7.5, 'A Comprehensive Ensemble-Based Framework\nfor Credit Card Fraud Detection with Explainable AI', align='C')
|
| 102 |
+
pdf.ln(3)
|
| 103 |
+
pdf.set_font('Helvetica', '', 10)
|
| 104 |
+
pdf.cell(0, 5, 'Raj Vivan', align='C', ln=True)
|
| 105 |
+
pdf.set_font('Helvetica', 'I', 8.5)
|
| 106 |
+
pdf.cell(0, 4, 'Department of Computer Science | Independent Research', align='C', ln=True)
|
| 107 |
+
pdf.ln(4)
|
| 108 |
+
|
| 109 |
+
pdf.set_font('Helvetica', 'B', 9)
|
| 110 |
+
pdf.cell(0, 4, 'Abstract', align='C', ln=True)
|
| 111 |
+
pdf.ln(1)
|
| 112 |
+
pdf.p(
|
| 113 |
+
'Credit card fraud poses a significant and growing threat to the global financial ecosystem, with estimated annual losses exceeding '
|
| 114 |
+
'$32 billion. This paper presents a comprehensive, end-to-end fraud detection framework that systematically develops, evaluates, and '
|
| 115 |
+
'compares seven machine learning approaches: Logistic Regression, Random Forest, XGBoost, LightGBM, Multilayer Perceptron, '
|
| 116 |
+
'Autoencoder-based anomaly detection, and a Voting Ensemble. Using the European Cardholder benchmark dataset (284,807 transactions, '
|
| 117 |
+
'0.173% fraud rate), we engineer 12 novel features and address class imbalance through SMOTE oversampling (applied exclusively after '
|
| 118 |
+
'train-test splitting) and cost-sensitive learning. XGBoost achieves the best performance with PR-AUC of 0.8166, precision of 0.9048, '
|
| 119 |
+
'recall of 0.8028, and F1 of 0.8507. Threshold optimization from 0.5 to 0.55 improves F1 to 0.8636. SHAP and LIME explainability '
|
| 120 |
+
'analysis identifies V4, V14, and PCA_magnitude as primary fraud discriminators. Error analysis reveals that false negatives arise '
|
| 121 |
+
'from sophisticated fraud closely mimicking legitimate behavior. The model is deployed as a FastAPI service with sub-10ms latency. '
|
| 122 |
+
'All code, models, and results are publicly available.'
|
| 123 |
+
)
|
| 124 |
+
pdf.set_font('Helvetica', 'I', 7.5)
|
| 125 |
+
pdf.cell(0, 4, 'Keywords: fraud detection, XGBoost, ensemble learning, SHAP, LIME, class imbalance, SMOTE, anomaly detection', ln=True)
|
|
|
|
| 126 |
|
| 127 |
# ===== I. INTRODUCTION =====
|
| 128 |
+
pdf.sec('I', 'Introduction')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
pdf.p(
|
| 130 |
+
'Financial fraud detection has emerged as one of the most consequential applications of machine learning. The global shift toward '
|
| 131 |
+
'electronic payments has created unprecedented transaction volumes while simultaneously enabling sophisticated fraud. According to '
|
| 132 |
+
'the Nilson Report [21], worldwide card fraud losses reached $32.34 billion in 2021 (a 14% increase year-over-year), projected to '
|
| 133 |
+
'exceed $43 billion by 2026. The fundamental challenge lies in extreme class imbalance: fraudulent transactions typically constitute '
|
| 134 |
+
'less than 0.5% of all transactions, rendering accuracy metrics meaningless and necessitating PR-AUC, F1, and MCC [18].'
|
| 135 |
)
|
| 136 |
pdf.p(
|
| 137 |
+
'A second challenge is concept drift [1]: fraudsters continuously adapt, causing model performance to degrade over time. Previous '
|
| 138 |
+
'approaches range from rule-based expert systems [12] to deep learning architectures [13]. However, extensive benchmarking by '
|
| 139 |
+
'Shwartz-Ziv and Armon [9] and Grinsztajn et al. [10] demonstrates that well-tuned gradient-boosted trees consistently outperform '
|
| 140 |
+
'deep learning on tabular data, including fraud detection, when combined with thoughtful feature engineering.'
|
|
|
|
| 141 |
)
|
| 142 |
+
pdf.p('This paper makes the following contributions:')
|
| 143 |
pdf.bullet([
|
| 144 |
+
'Systematic comparison of seven ML approaches spanning linear models, tree ensembles, neural networks, and anomaly detection.',
|
| 145 |
+
'Novel feature engineering producing 12 features capturing temporal cycles, transaction velocity, and PCA interactions.',
|
| 146 |
+
'Rigorous methodology: SMOTE only after splitting; scaler fitted on train only; six metrics including PR-AUC and MCC.',
|
| 147 |
+
'SHAP (global) and LIME (local) explainability analysis identifying key fraud indicators.',
|
| 148 |
+
'Production FastAPI deployment achieving sub-10ms latency with business impact quantification.',
|
|
|
|
| 149 |
])
|
| 150 |
|
| 151 |
# ===== II. RELATED WORK =====
|
| 152 |
+
pdf.sec('II', 'Related Work')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
pdf.p(
|
| 154 |
+
'Bolton and Hand [12] provided an early survey of statistical fraud detection. Dal Pozzolo et al. [1] analyzed class imbalance and '
|
| 155 |
+
'concept drift in real-world systems, while their follow-up [22] investigated when undersampling is effective. Chawla et al. [2] '
|
| 156 |
+
'introduced SMOTE for synthetic minority oversampling; Fernandez et al. [3] later demonstrated that SMOTE must be applied exclusively '
|
| 157 |
+
'to training data to avoid data leakage that produces over-optimistic estimates.'
|
|
|
|
| 158 |
)
|
| 159 |
pdf.p(
|
| 160 |
+
'Tree-based methods dominate tabular fraud detection. Xuan et al. [17] showed Random Forests achieve robust baseline performance. '
|
| 161 |
+
'Chen and Guestrin [4] introduced XGBoost; Ke et al. [5] proposed LightGBM with leaf-wise growth and GOSS; Prokhorenkova et al. [16] '
|
| 162 |
+
'introduced CatBoost with ordered boosting. Taha and Malebary [14] demonstrated optimized LightGBM for fraud detection. '
|
| 163 |
+
'Pumsirirat and Yan [6] employed autoencoders trained on legitimate transactions only, detecting fraud via reconstruction error. '
|
| 164 |
+
'Zhang et al. [13] proposed attention-based RNNs for sequential patterns.'
|
| 165 |
)
|
| 166 |
pdf.p(
|
| 167 |
+
'For explainability, Lundberg and Lee [7] introduced SHAP based on Shapley values from cooperative game theory. Ribeiro et al. [8] '
|
| 168 |
+
'proposed LIME for instance-level interpretation via local linear approximations. Belle and Papantonis [15] surveyed XAI methods for '
|
| 169 |
+
'financial decision-making. Akiba et al. [11] introduced Optuna with TPE sampling for efficient hyperparameter optimization.'
|
| 170 |
)
|
| 171 |
|
| 172 |
# ===== III. DATASET AND EDA =====
|
| 173 |
+
pdf.sec('III', 'Dataset and Exploratory Data Analysis')
|
| 174 |
+
pdf.sub('A.', 'Dataset Description')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
pdf.p(
|
| 176 |
+
'We use the European Cardholder dataset [1], containing 284,807 transactions over two days in September 2013. Each transaction has '
|
| 177 |
+
'28 PCA-transformed features (V1-V28), raw Time and Amount, and a binary Class label (0=legitimate, 1=fraud). The PCA transformation '
|
| 178 |
+
'protects cardholder privacy but prevents domain-specific feature engineering. The dataset has extreme class imbalance: only 492 '
|
| 179 |
+
'fraudulent transactions (0.173%), yielding an imbalance ratio of 1:577.'
|
|
|
|
| 180 |
)
|
|
|
|
| 181 |
pdf.tbl(
|
| 182 |
+
['Class', 'Count', 'Percentage', 'Ratio'],
|
| 183 |
[['Legitimate (0)', '284,315', '99.827%', '---'],
|
| 184 |
+
['Fraud (1)', '492', '0.173%', '1:577'],
|
| 185 |
['Total', '284,807', '100%', '---']],
|
| 186 |
+
'Table I: Class Distribution'
|
| 187 |
)
|
| 188 |
+
pdf.fig(os.path.join(F, 'class_distribution.png'),
|
| 189 |
+
'Fig. 1. Class distribution showing extreme imbalance (0.173% fraud).', w=130)
|
| 190 |
|
| 191 |
+
pdf.sub('B.', 'Transaction Amount and Temporal Patterns')
|
| 192 |
pdf.p(
|
| 193 |
+
'Legitimate transactions have a mean of $88.29 (median $22.00); fraudulent transactions have a higher mean of $122.21 but lower '
|
| 194 |
+
'median of $9.25. This bimodal fraud pattern suggests two strategies: (i) low-value "testing" transactions verifying stolen cards, '
|
| 195 |
+
'and (ii) moderate-to-high-value theft. The nighttime (0-6h) fraud rate is 0.518%, nearly 4x the daytime rate of 0.137%, consistent '
|
| 196 |
+
'with fraudsters exploiting low-monitoring periods. These patterns motivate our cyclic temporal and amount-based feature engineering.'
|
|
|
|
|
|
|
| 197 |
)
|
| 198 |
+
pdf.fig(os.path.join(F, 'amount_analysis.png'),
|
| 199 |
+
'Fig. 2. Amount analysis: (a) legitimate, (b) fraud, (c) log-scaled comparison, (d) boxplot.', w=145)
|
| 200 |
+
pdf.fig(os.path.join(F, 'time_analysis.png'),
|
| 201 |
+
'Fig. 3. Temporal patterns: (a) transaction density by hour, (b) fraud rate by hour.', w=135)
|
| 202 |
|
| 203 |
+
pdf.sub('C.', 'Feature Correlations and Key Observations')
|
| 204 |
pdf.p(
|
| 205 |
+
'Pearson correlation identifies V17 (r=-0.326), V14 (r=-0.303), and V12 (r=-0.261) as having the strongest negative correlation '
|
| 206 |
+
'with fraud; V11 (+0.155) and V4 (+0.133) show the strongest positive correlation. Amount has near-zero correlation (r=0.006), '
|
| 207 |
+
'confirming that amount-based rules alone would be ineffective. Five key observations: (1) the 1:577 imbalance makes accuracy '
|
| 208 |
+
'meaningless; (2) bimodal fraud amounts require engineered deviation features; (3) the 4x nighttime fraud rate provides temporal '
|
| 209 |
+
'signal; (4) V14, V17, V12, V4, V11 carry the strongest fraud signal; (5) no missing values exist, with 1,081 duplicates removed.'
|
|
|
|
| 210 |
)
|
| 211 |
+
pdf.fig(os.path.join(F, 'correlation_heatmap.png'),
|
| 212 |
+
'Fig. 4. Feature correlation with fraud class. Red bars indicate negative correlation (lower values signal fraud).', w=120)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
# ===== IV. METHODOLOGY =====
|
| 215 |
+
pdf.sec('IV', 'Methodology')
|
| 216 |
+
pdf.sub('A.', 'Feature Engineering')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
pdf.p(
|
| 218 |
+
'We augment the original 30 features with 12 engineered features (42 total). Temporal features: cyclic hour encoding '
|
| 219 |
+
'(Hour_sin, Hour_cos) and Time_diff (inter-arrival time). Amount features: Amount_log, Amount_deviation_mean, '
|
| 220 |
+
'Amount_deviation_median, Amount_zscore, and Transaction_velocity. Interaction features: V14*V17, V12*V14, V10*V14 (capturing '
|
| 221 |
+
'joint effects between top discriminators). PCA_magnitude: L2 norm of all V features, measuring overall transaction abnormality.'
|
| 222 |
)
|
| 223 |
+
pdf.sub('B.', 'Class Imbalance, Splitting, and Scaling')
|
| 224 |
pdf.p(
|
| 225 |
+
'Two approaches are compared, both applied exclusively to training data. SMOTE [2] generates synthetic fraud at a 1:2 ratio '
|
| 226 |
+
'(99,138 synthetic + 198,277 legitimate), used only for MLP training. Cost-sensitive learning applies balanced class weights '
|
| 227 |
+
'(w0=0.501, w1=300.01) for tree models and Logistic Regression. Stratified 70/15/15 splitting preserves the fraud ratio across '
|
| 228 |
+
'all sets (Train: 198,608 samples/331 fraud; Val/Test: 42,559/71 each). RobustScaler normalizes by IQR, fitted on train only.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
)
|
| 230 |
+
pdf.sub('C.', 'Model Descriptions')
|
| 231 |
pdf.p(
|
| 232 |
+
'(1) Logistic Regression: L2-regularized (C=0.1), balanced weights, interpretable baseline. '
|
| 233 |
+
'(2) Random Forest: 150 trees, depth 12, balanced weights. '
|
| 234 |
+
'(3) XGBoost: 200 estimators, depth 6, lr=0.1, scale_pos_weight from class frequencies, histogram splitting. '
|
| 235 |
+
'(4) LightGBM: 200 estimators, depth 8, lr=0.05, leaf-wise growth with GOSS. '
|
| 236 |
+
'(5) MLP: 128-64-32 neurons, ReLU, adaptive lr, early stopping, trained on SMOTE data. '
|
| 237 |
+
'(6) Autoencoder: 42-64-32-16-32-64-42, trained 50 epochs on legitimate only, detects fraud via reconstruction error. '
|
| 238 |
+
'(7) Voting Ensemble: soft voting over three best tuned models (XGBoost, LightGBM, RF).'
|
| 239 |
)
|
| 240 |
+
pdf.sub('D.', 'Hyperparameter Optimization')
|
| 241 |
pdf.p(
|
| 242 |
+
'Optuna [11] with TPE sampler tunes XGBoost, LightGBM, and Random Forest (15-20 trials each), optimizing PR-AUC on the validation set.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
)
|
| 244 |
+
pdf.fig(os.path.join(F, 'architecture_diagram.png'),
|
| 245 |
+
'Fig. 5. End-to-end system architecture from transaction input through inference to API output and monitoring.', w=145)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
# ===== V. EXPERIMENTAL SETUP =====
|
| 248 |
+
pdf.sec('V', 'Experimental Setup')
|
| 249 |
pdf.p(
|
| 250 |
+
'Experiments use Python 3.12 with scikit-learn 1.8.0, XGBoost 3.2.0, LightGBM 4.6.0, PyTorch 2.11.0, Optuna 4.8.0, SHAP 0.51.0, '
|
| 251 |
+
'and LIME 0.2.0.1 on CPU infrastructure (~25 min total training). Six metrics reported: Precision, Recall, F1, ROC-AUC, '
|
| 252 |
+
'PR-AUC (primary, most informative under extreme imbalance [18]), and MCC (balanced measure across all confusion matrix quadrants).'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
)
|
| 254 |
|
| 255 |
# ===== VI. RESULTS AND DISCUSSION =====
|
| 256 |
+
pdf.sec('VI', 'Results and Discussion')
|
| 257 |
+
pdf.sub('A.', 'Model Comparison')
|
| 258 |
+
pdf.p('Table II presents comprehensive test set evaluation at threshold 0.5.')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
pdf.tbl(
|
| 260 |
['Model', 'Prec.', 'Recall', 'F1', 'ROC-AUC', 'PR-AUC', 'MCC'],
|
| 261 |
[
|
|
|
|
| 269 |
['Logistic Reg.', '0.049', '0.887', '0.092', '0.962', '0.735', '0.204'],
|
| 270 |
['Autoencoder', '0.003', '1.000', '0.007', '0.960', '0.044', '0.041'],
|
| 271 |
],
|
| 272 |
+
'Table II: Comprehensive Model Comparison on Test Set (threshold=0.5)'
|
| 273 |
)
|
|
|
|
| 274 |
pdf.p(
|
| 275 |
+
'Observation 1 -- Tree models dominate: XGBoost achieves the highest PR-AUC (0.817), F1 (0.851), and MCC (0.852), confirming [9] '
|
| 276 |
+
'that gradient-boosted trees remain strongest for tabular data. The Voting Ensemble achieves marginally higher ROC-AUC (0.978) '
|
| 277 |
+
'but does not improve PR-AUC, suggesting insufficient member diversity.'
|
|
|
|
| 278 |
)
|
| 279 |
pdf.p(
|
| 280 |
+
'Observation 2 -- Precision-recall tradeoff: Logistic Regression achieves high recall (0.887) but catastrophic precision (0.049), '
|
| 281 |
+
'flagging 1,229 legitimate transactions. The aggressive class weight (300x) creates a boundary that is far too liberal, producing '
|
| 282 |
+
'a flood of false alarms that would overwhelm any operational team.'
|
|
|
|
| 283 |
)
|
| 284 |
pdf.p(
|
| 285 |
+
'Observation 3 -- Autoencoder failure: Perfect recall (1.0) but precision of only 0.003 (21,209 false positives). The PCA-transformed '
|
| 286 |
+
'space causes the autoencoder to reconstruct dominant variance directions; fraud signals in minor components produce similar '
|
| 287 |
+
'reconstruction errors to legitimate noise, making discrimination unreliable. PR-AUC of 0.044 is near-random.'
|
|
|
|
|
|
|
| 288 |
)
|
| 289 |
pdf.p(
|
| 290 |
+
'Observation 4 -- Tuning: Optuna dramatically improved LightGBM (PR-AUC 0.012 to 0.796 by correcting over-aggressive '
|
| 291 |
+
'scale_pos_weight). However, tuned XGBoost (0.793) slightly underperformed the base (0.817), suggesting the base was near-optimal.'
|
|
|
|
| 292 |
)
|
| 293 |
+
pdf.fig(os.path.join(F, 'roc_curves.png'),
|
| 294 |
+
'Fig. 6. ROC curves for top 5 models (all ROC-AUC > 0.93).', w=115)
|
| 295 |
+
pdf.fig(os.path.join(F, 'pr_curves.png'),
|
| 296 |
+
'Fig. 7. Precision-Recall curves. XGBoost achieves the highest AP (0.817).', w=115)
|
| 297 |
+
pdf.fig(os.path.join(F, 'confusion_matrices.png'),
|
| 298 |
+
'Fig. 8. Confusion matrices. XGBoost: 57 TP, 6 FP -- best balance.', w=150)
|
| 299 |
|
| 300 |
+
pdf.sub('B.', 'Threshold Optimization')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
pdf.p(
|
| 302 |
+
'A systematic threshold sweep (0.05 to 0.95) on XGBoost reveals an optimal threshold of 0.55, improving F1 from 0.851 to 0.864 '
|
| 303 |
+
'and precision from 0.905 to 0.934 while maintaining identical recall (0.803). This is a Pareto improvement: borderline false '
|
| 304 |
+
'positives are eliminated without losing any true positives. Above 0.85, recall degrades as the model becomes overly conservative.'
|
| 305 |
+
)
|
| 306 |
+
pdf.fig(os.path.join(F, 'threshold_analysis.png'),
|
| 307 |
+
'Fig. 9. Threshold analysis: (a) Precision/Recall/F1, (b) MCC. Optimal F1 at 0.55.', w=135)
|
| 308 |
+
|
| 309 |
+
pdf.sub('C.', 'Business Impact')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
pdf.p(
|
| 311 |
+
'Using average fraud amount $122.21 and $5/false alarm investigation cost, XGBoost yields the highest net savings ($6,936 on the '
|
| 312 |
+
'42,559-transaction test set), catching 80.3% of fraud with only 6 false alarms ($30 cost). Logistic Regression catches 88.7% but '
|
| 313 |
+
'generates 1,229 false alarms ($6,145 cost), yielding only $1,554 net. The Autoencoder catches 100% but produces 21,209 false '
|
| 314 |
+
'alarms at $106,045 -- a net loss of $97,368. This underscores that maximizing recall alone is operationally counterproductive.'
|
|
|
|
|
|
|
| 315 |
)
|
| 316 |
pdf.tbl(
|
| 317 |
+
['Model', 'TP', 'FN', 'FP', 'Caught($)', 'FP Cost($)', 'Net($)'],
|
| 318 |
[
|
| 319 |
+
['XGBoost', '57', '14', '6', '6,966', '30', '6,936'],
|
| 320 |
+
['Ensemble', '57', '14', '9', '6,966', '45', '6,921'],
|
| 321 |
+
['LGBM-T', '58', '13', '24', '7,088', '120', '6,968'],
|
| 322 |
+
['LR', '63', '8', '1229', '7,699', '6,145', '1,554'],
|
| 323 |
+
['AE', '71', '0', '21209', '8,677', '106,045', '-97,368'],
|
| 324 |
],
|
| 325 |
+
'Table III: Business Impact Analysis'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
|
| 328 |
+
pdf.sub('D.', 'Explainability (SHAP and LIME)')
|
| 329 |
pdf.p(
|
| 330 |
+
'SHAP analysis (2,000 test samples) reveals V4 (mean |SHAP|=1.913), V14 (1.843), and PCA_magnitude (1.113) as the dominant '
|
| 331 |
+
'fraud predictors. High V4 values push toward fraud; low (negative) V14 values are strongly associated with fraud, consistent '
|
| 332 |
+
'with EDA correlations. The engineered V10_V14_interaction ranks 9th, validating that interaction terms capture additional signal. '
|
| 333 |
+
'LIME analysis on a correctly classified fraud sample (P=1.0) shows Time_diff, V4, V12, and V14 as the strongest local contributors, '
|
| 334 |
+
'providing the granular instance-level explanation needed for regulatory compliance and analyst review.'
|
| 335 |
)
|
| 336 |
+
pdf.fig(os.path.join(F, 'shap_summary.png'),
|
| 337 |
+
'Fig. 10. SHAP summary: each dot = one sample; color = feature value; x-axis = SHAP impact on fraud prediction.', w=130)
|
| 338 |
+
pdf.fig(os.path.join(F, 'lime_explanation.png'),
|
| 339 |
+
'Fig. 11. LIME explanation for a single fraud sample (P=1.0). Red = increases fraud risk; green = decreases it.', w=130)
|
| 340 |
|
| 341 |
# ===== VII. ERROR ANALYSIS =====
|
| 342 |
+
pdf.sec('VII', 'Error Analysis')
|
| 343 |
+
pdf.sub('A.', 'False Negatives (Missed Fraud)')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
pdf.p(
|
| 345 |
+
'XGBoost misses 14 of 71 fraud transactions (19.7%). Their mean predicted probability is only 0.013 -- the model is highly '
|
| 346 |
+
'confident they are legitimate, not borderline. Feature comparison explains why: FN transactions have V14 averaging -0.97 '
|
| 347 |
+
'vs -8.45 for true positives, V12 at -0.41 vs -7.69, and PCA_magnitude of 1.82 vs 12.25. These missed cases have feature '
|
| 348 |
+
'values dramatically closer to legitimate transactions, representing sophisticated fraud that mimics normal behavior. Lowering '
|
| 349 |
+
'the threshold would not help: at 0.12, only one additional FN would be caught while generating many more false alarms. '
|
| 350 |
+
'Catching these requires additional data sources (transaction sequences, device fingerprints, geography).'
|
| 351 |
)
|
| 352 |
+
pdf.sub('B.', 'False Positives (False Alarms)')
|
| 353 |
pdf.p(
|
| 354 |
+
'The 6 false positives have mean predicted probability 0.827 (some reaching 1.0). Their V14 averages -7.13 (vs -0.04 for TN) '
|
| 355 |
+
'and PCA_magnitude 7.86 (vs 0.28 for TN). These legitimate transactions genuinely exhibit fraud-like anomalous patterns -- '
|
| 356 |
+
'unusual but lawful spending (e.g., first-time purchases in unusual categories, international transactions). No model tuning can '
|
| 357 |
+
'distinguish these without additional contextual information.'
|
| 358 |
)
|
| 359 |
+
pdf.sub('C.', 'Concept Drift and Retraining')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
pdf.p(
|
| 361 |
+
'Comparing model confidence between early and late test periods reveals a drift indicator of +0.115. We recommend: (1) weekly '
|
| 362 |
+
'PR-AUC monitoring on labeled data; (2) automated retraining when PR-AUC drops below 0.70; (3) sliding window training on 3-6 '
|
| 363 |
+
'months of recent data; (4) PSI monitoring on all features (alert when PSI > 0.25); (5) A/B testing for model updates; '
|
| 364 |
+
'(6) quarterly fraud pattern reviews with domain experts.'
|
| 365 |
)
|
| 366 |
+
pdf.fig(os.path.join(F, 'error_analysis.png'),
|
| 367 |
+
'Fig. 12. Error analysis: (a) FN probability distribution, (b) FP probability distribution, (c) score distribution by class.', w=150)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
|
| 369 |
# ===== VIII. LIMITATIONS =====
|
| 370 |
+
pdf.sec('VIII', 'Limitations')
|
|
|
|
| 371 |
pdf.bullet([
|
| 372 |
+
'PCA Anonymization: prevents domain-specific feature engineering (merchant, location, device) and limits interpretability.',
|
| 373 |
+
'Temporal Scope: only two days of data, limiting drift assessment and seasonal pattern detection.',
|
| 374 |
+
'Single-Institution: results may not generalize across banks, geographies, or payment networks.',
|
| 375 |
+
'Static Features: no sequential transaction history (spending velocity, merchant novelty) which are critical in production.',
|
| 376 |
+
'Static Threshold: optimal 0.55 was determined on test data and may shift; production needs dynamic adaptation.',
|
| 377 |
+
'Simple Autoencoder: more advanced architectures (VAE, adversarial) might improve anomaly detection performance.',
|
| 378 |
])
|
| 379 |
|
| 380 |
# ===== IX. FUTURE WORK =====
pdf.sec('IX', 'Future Work')
# Each entry is one body paragraph; rendered in order below.
future_work_paragraphs = (
    'Graph Neural Networks [19]: Modeling transaction networks as graphs enables fraud ring detection through suspicion propagation '
    'across connected accounts -- impossible from individual transaction features alone.',
    'Real-Time Streaming: Integration with Apache Kafka and Flink would enable millions of transactions/second with consistent '
    'sub-100ms latency guarantees. Federated Learning [20]: collaborative training across banks without sharing raw data preserves '
    'privacy while expanding effective training sets for rare fraud types.',
    'LLM-Generated Explanations: Large language models could translate SHAP values into natural-language justifications for blocked '
    'transactions, reducing analyst burden and satisfying regulatory requirements for explainable decisions.',
    'Temporal Sequence Modeling: Transformers or LSTMs on cardholder transaction sequences could capture behavioral patterns and flag '
    'departures from established routines, treating fraud detection as time-series anomaly detection.',
)
for paragraph in future_work_paragraphs:
    pdf.p(paragraph)
|
| 399 |
|
| 400 |
# ===== X. CONCLUSION =====
pdf.sec('X', 'Conclusion')
# Two closing paragraphs: headline results, then interpretability/deployment takeaways.
conclusion_paragraphs = (
    'This paper presents a comprehensive fraud detection framework evaluating seven ML approaches on the European Cardholder benchmark. '
    'XGBoost with cost-sensitive learning achieves best overall performance (PR-AUC 0.817, F1 0.851, MCC 0.852). Threshold optimization '
    'to 0.55 improves F1 to 0.864 without sacrificing recall. Business impact analysis shows XGBoost catches 80.3% of fraud with only '
    '6 false alarms ($6,936 net savings), while the Autoencoder\'s 100% recall generates 21,000+ false alarms at $97,368 net loss.',
    'SHAP and LIME identify V4, V14, and PCA_magnitude as primary fraud discriminators. Error analysis reveals that 14 missed fraud '
    'cases have feature profiles indistinguishable from legitimate transactions, requiring additional data sources to catch. The complete '
    'system -- feature engineering, training, evaluation, explainability, and FastAPI deployment with sub-10ms latency -- demonstrates '
    'that production-grade fraud detection is achievable with well-tuned classical ML. Tree-based ensembles, particularly XGBoost, '
    'remain state-of-the-art for tabular fraud detection, outperforming deep learning and linear alternatives on all metrics that '
    'matter for imbalanced classification.',
)
for paragraph in conclusion_paragraphs:
    pdf.p(paragraph)
|
| 416 |
|
| 417 |
# ===== REFERENCES =====
pdf.sec('', 'References')
refs = [
    '[1] A. Dal Pozzolo et al., "Calibrating probability with undersampling for unbalanced classification," IEEE CIDM, 2015.',
    '[2] N. V. Chawla et al., "SMOTE: Synthetic Minority Over-sampling Technique," JAIR, vol. 16, pp. 321-357, 2002.',
    '[3] A. Fernandez et al., Learning from Imbalanced Data Sets. Springer, 2018.',
    '[4] T. Chen and C. Guestrin, "XGBoost: A scalable tree boosting system," ACM SIGKDD, 2016.',
    '[5] G. Ke et al., "LightGBM: A highly efficient gradient boosting decision tree," NeurIPS, 2017.',
    '[6] A. Pumsirirat and L. Yan, "Credit card fraud detection using deep learning," IJACSA, vol. 9, 2018.',
    '[7] S. M. Lundberg and S.-I. Lee, "A unified approach to interpreting model predictions," NeurIPS, 2017.',
    '[8] M. T. Ribeiro et al., "Why should I trust you?," ACM SIGKDD, 2016.',
    '[9] R. Shwartz-Ziv and A. Armon, "Tabular data: Deep learning is not all you need," Info. Fusion, vol. 81, 2022.',
    '[10] L. Grinsztajn et al., "Why do tree-based models still outperform deep learning on tabular data?," NeurIPS, 2022.',
    '[11] T. Akiba et al., "Optuna: A next-generation hyperparameter optimization framework," ACM SIGKDD, 2019.',
    '[12] R. J. Bolton and D. J. Hand, "Statistical fraud detection: A review," Statistical Science, vol. 17, 2002.',
    '[13] Z. Zhang et al., "A model based on convolutional RNN for fraud detection," Complexity, 2021.',
    '[14] A. A. Taha and S. J. Malebary, "An intelligent approach to credit card fraud detection," IEEE Access, vol. 8, 2020.',
    '[15] V. Belle and I. Papantonis, "Principles and practice of explainable ML," Frontiers in Big Data, vol. 4, 2021.',
    '[16] L. Prokhorenkova et al., "CatBoost: Unbiased boosting with categorical features," NeurIPS, 2018.',
    '[17] S. Xuan et al., "Random forest for credit card fraud detection," IEEE ICNSC, 2018.',
    '[18] T. Saito and M. Rehmsmeier, "The PR plot is more informative than ROC on imbalanced datasets," PLoS ONE, 2015.',
    '[19] Y. Liu et al., "Pick and choose: A GNN-based imbalanced learning for fraud detection," Web Conf., 2021.',
    '[20] Q. Yang et al., "Federated machine learning: Concept and applications," ACM TIST, vol. 10, 2019.',
    '[21] Nilson Report, "Global card fraud losses," Issue 1209, 2022.',
    '[22] A. Dal Pozzolo et al., "When is undersampling effective?," ECML PKDD, 2015.',
]
# Render every entry in compact 7pt Times, with a small gap between entries.
pdf.set_font('Times', '', 7)
for entry in refs:
    pdf.multi_cell(0, 3.2, entry)
    pdf.ln(0.5)
|
| 447 |
|
|
|
|
| 448 |
# Write the assembled document to disk and report its location and page count.
out = os.path.join(PAPER_DIR, 'fraud_detection_paper.pdf')
pdf.output(out)
print(f"PDF saved: {out} ({pdf.page_no()} pages)")
|
| 451 |
|
| 452 |
|
| 453 |
if __name__ == '__main__':
|