"""
Module 2: Data Preprocessing
Feature engineering, class imbalance handling, stratified splitting, scaling.
"""
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

from config import DATA_DIR, MODELS_DIR, SEED, TRAIN_RATIO, VAL_RATIO, TEST_RATIO

warnings.filterwarnings('ignore')


def engineer_features(df):
    """Engineer new features from raw data."""
    print("\n" + "=" * 60)
    print("FEATURE ENGINEERING")
    print("=" * 60)

    df = df.copy()

    # 1. Hour of day (cyclic encoding, so hours adjacent on the clock stay close)
    df['Hour'] = (df['Time'] / 3600) % 24
    df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)

    # 2. Time since last transaction (proxy: diff of the Time column)
    df['Time_diff'] = df['Time'].diff().fillna(0)

    # 3. Log-transformed transaction amount
    df['Amount_log'] = np.log1p(df['Amount'])

    # 4. Amount deviation from the global mean/median (note: these statistics
    # are computed over the full dataset, i.e. before the train/val/test split)
    df['Amount_deviation_mean'] = df['Amount'] - df['Amount'].mean()
    df['Amount_deviation_median'] = df['Amount'] - df['Amount'].median()

    # 5. Transaction velocity, approximated as the inverse of the time since
    # the previous transaction
    df['Transaction_velocity'] = 1.0 / (df['Time_diff'] + 1.0)

    # 6. Amount z-score (the epsilon guards against division by zero)
    df['Amount_zscore'] = (df['Amount'] - df['Amount'].mean()) / (df['Amount'].std() + 1e-8)

    # 7. Interaction features between top PCA components
    df['V14_V17_interaction'] = df['V14'] * df['V17']
    df['V12_V14_interaction'] = df['V12'] * df['V14']
    df['V10_V14_interaction'] = df['V10'] * df['V14']

    # 8. Overall magnitude of the PCA component vector
    pca_features = [f'V{i}' for i in range(1, 29)]
    df['PCA_magnitude'] = np.sqrt((df[pca_features] ** 2).sum(axis=1))

    # Drop the raw Hour column; the cyclic encoding replaces it
    df = df.drop('Hour', axis=1)

    new_features = ['Hour_sin', 'Hour_cos', 'Time_diff', 'Amount_log',
                    'Amount_deviation_mean', 'Amount_deviation_median',
                    'Transaction_velocity', 'Amount_zscore',
                    'V14_V17_interaction', 'V12_V14_interaction',
                    'V10_V14_interaction', 'PCA_magnitude']
    print(f"Engineered {len(new_features)} new features:")
    for f in new_features:
        print(f"  - {f}")
    print(f"\nDataset shape after feature engineering: {df.shape}")
    return df, new_features
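

# A quick worked check of the cyclic encoding above (illustrative sketch, not
# called anywhere in the pipeline): hours that are far apart numerically but
# adjacent on the clock should land close together on the unit circle.
def _cyclic_encoding_example():
    """Sketch only: 23:00 vs. 01:00 under the (sin, cos) clock encoding."""
    h23 = np.array([np.sin(2 * np.pi * 23 / 24), np.cos(2 * np.pi * 23 / 24)])
    h01 = np.array([np.sin(2 * np.pi * 1 / 24), np.cos(2 * np.pi * 1 / 24)])
    # Euclidean distance is ~0.52 here, versus 22.0 on the raw hour scale
    return np.linalg.norm(h23 - h01)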


def stratified_split(df, target_col='Class'):
    """Perform a stratified 70/15/15 train/val/test split."""
    print("\n" + "=" * 60)
    print("STRATIFIED DATA SPLITTING (70/15/15)")
    print("=" * 60)

    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # First split: 70% train, 30% temp
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=(VAL_RATIO + TEST_RATIO),
        random_state=SEED, stratify=y
    )
    # Second split: 50/50 of the 30% = 15/15
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=TEST_RATIO / (VAL_RATIO + TEST_RATIO),
        random_state=SEED, stratify=y_temp
    )

    print(f"\nTrain: {X_train.shape[0]:,} samples ({y_train.sum()} fraud, {y_train.mean()*100:.3f}%)")
    print(f"Val:   {X_val.shape[0]:,} samples ({y_val.sum()} fraud, {y_val.mean()*100:.3f}%)")
    print(f"Test:  {X_test.shape[0]:,} samples ({y_test.sum()} fraud, {y_test.mean()*100:.3f}%)")
    return X_train, X_val, X_test, y_train, y_val, y_test
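

# Arithmetic behind the two-stage split above: with the 70/15/15 target, the
# second call uses test_size = TEST_RATIO / (VAL_RATIO + TEST_RATIO)
# = 0.15 / 0.30 = 0.5, so the 30% holdout is halved into 15% val and 15% test.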

def scale_features(X_train, X_val, X_test):
    """Scale features; the scaler is fitted on the training set only."""
    print("\n" + "=" * 60)
    print("FEATURE SCALING (Fit on Train Only)")
    print("=" * 60)

    scaler = RobustScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_val_scaled = pd.DataFrame(
        scaler.transform(X_val),
        columns=X_val.columns,
        index=X_val.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )

    # Persist the fitted scaler for reuse at inference time
    scaler_path = os.path.join(MODELS_DIR, "scaler.joblib")
    joblib.dump(scaler, scaler_path)
    print(f"Scaler saved to: {scaler_path}")
    print("Scaling method: RobustScaler (robust to outliers)")
    return X_train_scaled, X_val_scaled, X_test_scaled, scaler
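

# Sketch of inference-time reuse (illustrative; the _example name is not part
# of the pipeline): new transactions must pass through the scaler fitted above,
# never a freshly fitted one, or the feature distribution shifts.
def _example_scale_new_data(X_new):
    """Sketch only: apply the persisted training scaler to unseen rows."""
    scaler = joblib.load(os.path.join(MODELS_DIR, "scaler.joblib"))
    return pd.DataFrame(scaler.transform(X_new),
                        columns=X_new.columns, index=X_new.index)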


def apply_smote(X_train, y_train):
    """Apply SMOTE oversampling to the training data only."""
    print("\n" + "=" * 60)
    print("SMOTE OVERSAMPLING (Train Set Only)")
    print("=" * 60)
    print("\nBefore SMOTE:")
    print(f"  Class 0: {(y_train == 0).sum():,}")
    print(f"  Class 1: {(y_train == 1).sum():,}")

    # sampling_strategy=0.5 resamples the minority class to half the majority
    # count, i.e. a 1:2 ratio instead of full 1:1 balance
    smote = SMOTE(random_state=SEED, sampling_strategy=0.5)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    print("\nAfter SMOTE (0.5 ratio):")
    print(f"  Class 0: {(y_train_smote == 0).sum():,}")
    print(f"  Class 1: {(y_train_smote == 1).sum():,}")
    return X_train_smote, y_train_smote
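

# Worked example of sampling_strategy=0.5 (numbers illustrative, not from this
# dataset): with 190,000 majority and 330 minority rows in the train split,
# SMOTE synthesizes minority samples up to 0.5 * 190,000 = 95,000, giving a
# 1:2 class ratio rather than a fully balanced 1:1.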


def compute_class_weights(y_train):
    """Compute class weights for cost-sensitive learning."""
    from sklearn.utils.class_weight import compute_class_weight

    classes = np.unique(y_train)
    weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weight_dict = dict(zip(classes, weights))
    print("\nClass weights (balanced):")
    print(f"  Class 0: {class_weight_dict[0]:.4f}")
    print(f"  Class 1: {class_weight_dict[1]:.4f}")
    return class_weight_dict
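

# Sketch of how the weight dict plugs into a cost-sensitive scikit-learn
# estimator (illustrative; the model choice here is an assumption, not part of
# this module).
def _example_class_weight_usage(X_train, y_train, class_weight_dict):
    """Sketch only: cost-sensitive fit using the balanced weights."""
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(class_weight=class_weight_dict, max_iter=1000)
    return clf.fit(X_train, y_train)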


def run_preprocessing():
    """Run the complete preprocessing pipeline."""
    print("=" * 60)
    print("FRAUD DETECTION SYSTEM - PREPROCESSING")
    print("=" * 60)

    # Load raw data
    df = pd.read_csv(os.path.join(DATA_DIR, "creditcard.csv"))
    print(f"Loaded dataset: {df.shape}")

    # Remove duplicates
    df = df.drop_duplicates()
    print(f"After removing duplicates: {df.shape}")

    # Feature engineering
    df, new_features = engineer_features(df)

    # Stratified split BEFORE any resampling
    X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(df)

    # Scale features (fit on train only)
    X_train_scaled, X_val_scaled, X_test_scaled, scaler = scale_features(
        X_train, X_val, X_test
    )

    # SMOTE on the train set only
    X_train_smote, y_train_smote = apply_smote(X_train_scaled, y_train)

    # Class weights (alternative to SMOTE)
    class_weights = compute_class_weights(y_train)

    # Save processed data
    data = {
        'X_train': X_train_scaled,
        'X_val': X_val_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
        'X_train_smote': X_train_smote,
        'y_train_smote': y_train_smote,
        'class_weights': class_weights,
        'feature_names': list(X_train.columns),
        'scaler': scaler,
        'new_features': new_features,
    }
    data_path = os.path.join(DATA_DIR, "processed_data.joblib")
    joblib.dump(data, data_path)
    print(f"\nProcessed data saved to: {data_path}")

    print("\n" + "=" * 60)
    print("PREPROCESSING COMPLETE")
    print("=" * 60)
    return data
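

# Sketch of how a downstream training module might consume the artifact
# written above (illustrative helper; the key names match the dict built in
# run_preprocessing).
def _example_load_processed():
    """Sketch only: reload the persisted preprocessing outputs."""
    data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib"))
    return data['X_train_smote'], data['y_train_smote'], data['class_weights']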


if __name__ == "__main__":
    data = run_preprocessing()