| """ |
| Module 1: Exploratory Data Analysis (EDA) |
| Generates comprehensive analysis and figures for the credit card fraud dataset. |
| """ |
| import os |
| import numpy as np |
| import pandas as pd |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| import matplotlib.gridspec as gridspec |
| import seaborn as sns |
| from datasets import load_dataset |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
| from config import FIGURES_DIR, FIG_DPI, FIG_BG, DATASET_ID, DATA_DIR, SEED |
|
|
| |
| plt.style.use('seaborn-v0_8-whitegrid') |
| sns.set_palette("husl") |
|
|
|
|
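
# Make sure the output locations exist before any figures or data are written.
# This is a lightweight safeguard and assumes config.py does not already create
# FIGURES_DIR / DATA_DIR on import.
for _out_dir in (FIGURES_DIR, DATA_DIR):
    os.makedirs(_out_dir, exist_ok=True)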


def load_data():
    """Load the credit card fraud dataset from the Hugging Face Hub."""
    print("=" * 60)
    print("LOADING DATASET")
    print("=" * 60)
    ds = load_dataset(DATASET_ID, split="train")
    df = ds.to_pandas()

    # Keep a local CSV copy so the data can be reused without re-downloading.
    df.to_csv(os.path.join(DATA_DIR, "creditcard.csv"), index=False)
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    return df
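

def load_cached_data():
    """Optional convenience loader, sketched here as an assumption rather than an
    existing project API: reuse the CSV written by load_data() when it is present,
    otherwise fall back to the Hub download."""
    cache_path = os.path.join(DATA_DIR, "creditcard.csv")
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path)
    return load_data()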


def basic_statistics(df):
    """Print basic dataset statistics."""
    print("\n" + "=" * 60)
    print("BASIC STATISTICS")
    print("=" * 60)
    print(f"\nShape: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"\nData types:\n{df.dtypes.value_counts()}")
    print(f"\nMissing values: {df.isnull().sum().sum()}")
    print(f"\nDuplicate rows: {df.duplicated().sum()}")
    print("\nBasic stats for Amount:")
    print(df['Amount'].describe())
    print("\nBasic stats for Time:")
    print(df['Time'].describe())
    return df.describe()


def class_distribution_analysis(df):
    """Analyze and visualize class distribution."""
    print("\n" + "=" * 60)
    print("CLASS DISTRIBUTION ANALYSIS")
    print("=" * 60)

    class_counts = df['Class'].value_counts()
    fraud_ratio = class_counts[1] / len(df) * 100

    print(f"\nClass 0 (Legitimate): {class_counts[0]:,} ({100 - fraud_ratio:.3f}%)")
    print(f"Class 1 (Fraud): {class_counts[1]:,} ({fraud_ratio:.3f}%)")
    print(f"Imbalance ratio: 1:{class_counts[0] // class_counts[1]}")

    fig, axes = plt.subplots(1, 2, figsize=(14, 5), facecolor=FIG_BG)

    # Bar chart on a log scale so the minority class remains visible.
    colors = ['#2ecc71', '#e74c3c']
    counts = [class_counts[0], class_counts[1]]  # explicit order to match the labels
    bars = axes[0].bar(['Legitimate\n(Class 0)', 'Fraud\n(Class 1)'],
                       counts, color=colors, edgecolor='black', linewidth=0.5)
    axes[0].set_yscale('log')
    axes[0].set_ylabel('Number of Transactions (log scale)', fontsize=12)
    axes[0].set_title('Transaction Class Distribution', fontsize=14, fontweight='bold')
    for bar, count in zip(bars, counts):
        # Multiplicative offset keeps the annotation just above the bar on a log axis.
        axes[0].text(bar.get_x() + bar.get_width() / 2., bar.get_height() * 1.1,
                     f'{count:,}', ha='center', va='bottom', fontsize=11, fontweight='bold')

    # Pie chart of the overall fraud ratio.
    axes[1].pie(counts, labels=['Legitimate', 'Fraud'],
                colors=colors, autopct='%1.3f%%', startangle=90,
                explode=(0, 0.1), shadow=True, textprops={'fontsize': 12})
    axes[1].set_title('Fraud Ratio', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "class_distribution.png"), dpi=FIG_DPI,
                bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "class_distribution.pdf"),
                bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: class_distribution.png/pdf")

    return class_counts, fraud_ratio
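

def majority_baseline_accuracy(df):
    """Illustrative sketch (not called by run_eda): the accuracy of a trivial model
    that always predicts 'legitimate'. With fraud well under 1% of rows, this
    baseline scores above 99%, which is why raw accuracy is a misleading metric."""
    return (df['Class'] == 0).mean()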


def transaction_amount_analysis(df):
    """Analyze transaction amounts by class."""
    print("\n" + "=" * 60)
    print("TRANSACTION AMOUNT ANALYSIS")
    print("=" * 60)

    for cls, label in [(0, 'Legitimate'), (1, 'Fraud')]:
        subset = df[df['Class'] == cls]['Amount']
        print(f"\n{label} Transactions:")
        print(f"  Mean: ${subset.mean():.2f}")
        print(f"  Median: ${subset.median():.2f}")
        print(f"  Std: ${subset.std():.2f}")
        print(f"  Min: ${subset.min():.2f}")
        print(f"  Max: ${subset.max():.2f}")
        print(f"  Q25: ${subset.quantile(0.25):.2f}")
        print(f"  Q75: ${subset.quantile(0.75):.2f}")

    fig, axes = plt.subplots(2, 2, figsize=(14, 10), facecolor=FIG_BG)

    # Legitimate amounts (x-axis clipped at $2,500 for readability).
    axes[0, 0].hist(df[df['Class'] == 0]['Amount'], bins=100, color='#2ecc71',
                    alpha=0.7, edgecolor='black', linewidth=0.3)
    axes[0, 0].set_title('Legitimate Transaction Amounts', fontsize=12, fontweight='bold')
    axes[0, 0].set_xlabel('Amount ($)')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_xlim(0, 2500)

    # Fraudulent amounts.
    axes[0, 1].hist(df[df['Class'] == 1]['Amount'], bins=50, color='#e74c3c',
                    alpha=0.7, edgecolor='black', linewidth=0.3)
    axes[0, 1].set_title('Fraudulent Transaction Amounts', fontsize=12, fontweight='bold')
    axes[0, 1].set_xlabel('Amount ($)')
    axes[0, 1].set_ylabel('Frequency')

    # Log-scaled overlay makes the two heavy-tailed distributions comparable.
    for cls, color, label in [(0, '#2ecc71', 'Legitimate'), (1, '#e74c3c', 'Fraud')]:
        subset = df[df['Class'] == cls]['Amount']
        axes[1, 0].hist(np.log1p(subset), bins=50, color=color, alpha=0.6,
                        label=label, edgecolor='black', linewidth=0.3)
    axes[1, 0].set_title('Log-Scaled Amount Distribution', fontsize=12, fontweight='bold')
    axes[1, 0].set_xlabel('log(1 + Amount)')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].legend()

    # Box plot, zoomed to the $0-500 range for readability.
    df_plot = df[['Amount', 'Class']].copy()
    df_plot['Class'] = df_plot['Class'].map({0: 'Legitimate', 1: 'Fraud'})
    sns.boxplot(data=df_plot, x='Class', y='Amount', palette=['#2ecc71', '#e74c3c'], ax=axes[1, 1])
    axes[1, 1].set_title('Amount by Class (Box Plot)', fontsize=12, fontweight='bold')
    axes[1, 1].set_ylim(0, 500)

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "amount_analysis.png"), dpi=FIG_DPI,
                bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "amount_analysis.pdf"),
                bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: amount_analysis.png/pdf")


def time_analysis(df):
    """Analyze temporal patterns."""
    print("\n" + "=" * 60)
    print("TEMPORAL ANALYSIS")
    print("=" * 60)

    # 'Time' is seconds elapsed since the first recorded transaction, so 'Hour'
    # is relative to the start of the observation window, not true clock time.
    df_temp = df.copy()
    df_temp['Hour'] = (df_temp['Time'] / 3600) % 24

    fig, axes = plt.subplots(1, 2, figsize=(14, 5), facecolor=FIG_BG)

    # Transaction density per class across the 24-hour cycle.
    for cls, color, label in [(0, '#2ecc71', 'Legitimate'), (1, '#e74c3c', 'Fraud')]:
        subset = df_temp[df_temp['Class'] == cls]
        axes[0].hist(subset['Hour'], bins=48, color=color, alpha=0.6, label=label, density=True)
    axes[0].set_title('Transaction Density by Hour of Day', fontsize=12, fontweight='bold')
    axes[0].set_xlabel('Hour of Day')
    axes[0].set_ylabel('Density')
    axes[0].legend()

    # Fraud rate per (integer) hour.
    hourly_fraud = df_temp.groupby(df_temp['Hour'].astype(int))['Class'].mean() * 100
    axes[1].bar(hourly_fraud.index, hourly_fraud.values, color='#e74c3c', alpha=0.7,
                edgecolor='black', linewidth=0.3)
    axes[1].set_title('Fraud Rate by Hour', fontsize=12, fontweight='bold')
    axes[1].set_xlabel('Hour of Day')
    axes[1].set_ylabel('Fraud Rate (%)')

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "time_analysis.png"), dpi=FIG_DPI,
                bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "time_analysis.pdf"),
                bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: time_analysis.png/pdf")


def correlation_heatmap(df):
    """Generate correlation analysis and heatmap."""
    print("\n" + "=" * 60)
    print("CORRELATION ANALYSIS")
    print("=" * 60)

    # Pearson correlation of every feature with the Class label, sorted ascending.
    correlations = df.corr()['Class'].drop('Class').sort_values()
    print("\nTop 10 features positively correlated with Fraud:")
    print(correlations.tail(10))
    print("\nTop 10 features negatively correlated with Fraud:")
    print(correlations.head(10))

    fig, axes = plt.subplots(1, 2, figsize=(18, 7), facecolor=FIG_BG)

    # Bar chart of per-feature correlation with the Class label.
    colors = ['#e74c3c' if v < 0 else '#2ecc71' for v in correlations.values]
    axes[0].barh(correlations.index, correlations.values, color=colors, edgecolor='black', linewidth=0.3)
    axes[0].set_title('Feature Correlation with Fraud (Class)', fontsize=12, fontweight='bold')
    axes[0].set_xlabel('Pearson Correlation')
    axes[0].axvline(x=0, color='black', linewidth=0.5)

    # Heatmap of the most strongly correlated features plus Amount, Time and Class
    # (deduplicated in case Amount or Time already appear among the top features).
    important_features = list(dict.fromkeys(
        list(correlations.head(5).index) + list(correlations.tail(5).index) + ['Amount', 'Time', 'Class']))
    corr_matrix = df[important_features].corr()
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                ax=axes[1], square=True, linewidths=0.5)
    axes[1].set_title('Correlation Heatmap (Top Features)', fontsize=12, fontweight='bold')

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "correlation_heatmap.png"), dpi=FIG_DPI,
                bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "correlation_heatmap.pdf"),
                bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: correlation_heatmap.png/pdf")

    return correlations


def feature_distributions(df):
    """Plot distributions of key PCA features by class."""
    print("\n" + "=" * 60)
    print("FEATURE DISTRIBUTIONS")
    print("=" * 60)

    # Rank features by absolute correlation with the Class label and keep the top 12.
    corr_with_class = df.corr()['Class'].drop('Class').abs().sort_values(ascending=False)
    top_features = corr_with_class.head(12).index.tolist()

    fig, axes = plt.subplots(3, 4, figsize=(20, 12), facecolor=FIG_BG)
    axes = axes.ravel()

    for i, feat in enumerate(top_features):
        for cls, color, label in [(0, '#2ecc71', 'Legit'), (1, '#e74c3c', 'Fraud')]:
            subset = df[df['Class'] == cls][feat]
            axes[i].hist(subset, bins=50, color=color, alpha=0.5, label=label, density=True)
        axes[i].set_title(feat, fontsize=10, fontweight='bold')
        axes[i].legend(fontsize=8)

    plt.suptitle('Distribution of Top 12 Discriminative Features by Class',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, "feature_distributions.png"), dpi=FIG_DPI,
                bbox_inches='tight', facecolor=FIG_BG)
    plt.savefig(os.path.join(FIGURES_DIR, "feature_distributions.pdf"),
                bbox_inches='tight', facecolor=FIG_BG)
    plt.close()
    print("Saved: feature_distributions.png/pdf")


def missing_values_analysis(df):
    """Check for missing values."""
    print("\n" + "=" * 60)
    print("MISSING VALUES ANALYSIS")
    print("=" * 60)

    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100

    if missing.sum() == 0:
        print("No missing values found in the dataset.")
    else:
        missing_report = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
        missing_report = missing_report[missing_report['Missing Count'] > 0]
        print(missing_report)

    return missing


def key_observations(df, class_counts, fraud_ratio, correlations):
    """Generate five key observations from the data."""
    print("\n" + "=" * 60)
    print("5 KEY OBSERVATIONS")
    print("=" * 60)

    observations = []

    # 1. Class imbalance
    obs1 = (f"1. EXTREME CLASS IMBALANCE: Only {fraud_ratio:.3f}% of transactions are fraudulent "
            f"({class_counts[1]:,} out of {len(df):,}). The imbalance ratio is approximately "
            f"1:{class_counts[0] // class_counts[1]}, making accuracy a misleading metric.")
    observations.append(obs1)

    # 2. Transaction amounts
    fraud_amt = df[df['Class'] == 1]['Amount']
    legit_amt = df[df['Class'] == 0]['Amount']
    obs2 = (f"2. AMOUNT PATTERNS: Fraudulent transactions have a mean of ${fraud_amt.mean():.2f} "
            f"(median: ${fraud_amt.median():.2f}) vs legitimate mean of ${legit_amt.mean():.2f} "
            f"(median: ${legit_amt.median():.2f}). Both distributions are heavily right-skewed, "
            f"so medians are more representative than means when comparing the two classes.")
    observations.append(obs2)

    # 3. Temporal patterns (hours are relative to the first recorded transaction, not clock time)
    df_temp = df.copy()
    df_temp['Hour'] = (df_temp['Time'] / 3600) % 24
    night_mask = df_temp['Hour'] < 6
    night_fraud_rate = df_temp[night_mask]['Class'].mean() * 100
    day_fraud_rate = df_temp[~night_mask]['Class'].mean() * 100
    obs3 = (f"3. TEMPORAL PATTERNS: The fraud rate in hours 0-6 is {night_fraud_rate:.3f}% "
            f"vs {day_fraud_rate:.3f}% in hours 6-24, suggesting relatively more fraud during "
            f"low-volume periods (hours are measured from the first transaction, not clock time).")
    observations.append(obs3)

    # 4. Most discriminative features
    top_neg = correlations.head(3)
    top_pos = correlations.tail(3)
    obs4 = (f"4. KEY DISCRIMINATIVE FEATURES: Most negatively correlated with fraud: "
            f"{list(top_neg.index)} (r={top_neg.values[0]:.3f} to {top_neg.values[2]:.3f}). "
            f"Most positively correlated: {list(top_pos.index)} "
            f"(r={top_pos.values[0]:.3f} to {top_pos.values[2]:.3f}).")
    observations.append(obs4)

    # 5. Data quality
    obs5 = (f"5. DATA QUALITY: The dataset has no missing values and {df.duplicated().sum()} "
            f"duplicate rows. V1-V28 are PCA components and therefore mutually uncorrelated; "
            f"only 'Time' and 'Amount' are on their original scales and typically need scaling "
            f"before modelling.")
    observations.append(obs5)

    for obs in observations:
        print(f"\n{obs}")

    return observations
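

def scale_amount_time(df):
    """Illustrative sketch for observation 5: 'Amount' and 'Time' are the only
    raw-scale columns, so a robust scaling step is a natural follow-up. Assumes
    scikit-learn is available; the project's actual preprocessing may differ."""
    from sklearn.preprocessing import RobustScaler

    scaled = df.copy()
    scaled[['Amount', 'Time']] = RobustScaler().fit_transform(scaled[['Amount', 'Time']])
    return scaled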


def run_eda():
    """Run the complete EDA pipeline."""
    print("=" * 60)
    print("FRAUD DETECTION SYSTEM - EXPLORATORY DATA ANALYSIS")
    print("=" * 60)

    # 1. Load the dataset
    df = load_data()

    # 2. Basic statistics
    stats = basic_statistics(df)

    # 3. Class distribution
    class_counts, fraud_ratio = class_distribution_analysis(df)

    # 4. Transaction amounts
    transaction_amount_analysis(df)

    # 5. Temporal patterns
    time_analysis(df)

    # 6. Correlations
    correlations = correlation_heatmap(df)

    # 7. Feature distributions
    feature_distributions(df)

    # 8. Missing values
    missing_values_analysis(df)

    # 9. Key observations
    observations = key_observations(df, class_counts, fraud_ratio, correlations)

    print("\n" + "=" * 60)
    print("EDA COMPLETE - All figures saved to:", FIGURES_DIR)
    print("=" * 60)

    return df, stats, class_counts, fraud_ratio, correlations, observations


if __name__ == "__main__":
    df, stats, class_counts, fraud_ratio, correlations, observations = run_eda()