"""
Module 2: Data Preprocessing
Feature engineering, class imbalance handling, stratified splitting, scaling.
"""

import os
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from imblearn.over_sampling import SMOTE
import joblib

# Silence all warnings for the rest of the process (kept from the original
# module; note this is a process-wide side effect of importing this file).
warnings.filterwarnings('ignore')

from config import DATA_DIR, MODELS_DIR, SEED, TRAIN_RATIO, VAL_RATIO, TEST_RATIO


def engineer_features(df):
    """Add the engineered feature columns to a copy of the raw frame.

    The input frame is copied first, so the caller's object is not modified.

    Args:
        df: DataFrame that contains at least the columns ``Time``,
            ``Amount`` and ``V1`` .. ``V28``.

    Returns:
        A ``(frame, new_features)`` tuple: the copied frame with the new
        columns appended, and the list of the new column names.
    """
    print("\n" + "=" * 60)
    print("FEATURE ENGINEERING")
    print("=" * 60)

    df = df.copy()

    # 1. Hour of day with a cyclic (sin/cos) encoding so 23h and 0h are close.
    df['Hour'] = (df['Time'] / 3600) % 24
    df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)

    # 2. Time since the previous row (first row has no predecessor -> 0).
    df['Time_diff'] = df['Time'].diff().fillna(0)

    # 3. Log-compressed transaction amount.
    df['Amount_log'] = np.log1p(df['Amount'])

    # NOTE(review): the mean/median/std used below are taken over the whole
    # frame passed in; run_preprocessing() calls this before the
    # train/val/test split, so they include validation and test rows.
    amount_mean = df['Amount'].mean()

    # 4. Amount deviation from the global mean / median.
    df['Amount_deviation_mean'] = df['Amount'] - amount_mean
    df['Amount_deviation_median'] = df['Amount'] - df['Amount'].median()

    # 5. Transaction velocity, approximated as the inverse of the time since
    #    the previous row (+1 keeps a zero gap from dividing by zero).
    df['Transaction_velocity'] = 1.0 / (df['Time_diff'] + 1.0)

    # 6. Amount z-score (epsilon guards against a zero standard deviation).
    df['Amount_zscore'] = (df['Amount'] - amount_mean) / (df['Amount'].std() + 1e-8)

    # 7. Pairwise products of selected PCA components.
    df['V14_V17_interaction'] = df['V14'] * df['V17']
    df['V12_V14_interaction'] = df['V12'] * df['V14']
    df['V10_V14_interaction'] = df['V10'] * df['V14']

    # 8. Euclidean norm over V1..V28.
    pca_features = [f'V{i}' for i in range(1, 29)]
    df['PCA_magnitude'] = np.sqrt((df[pca_features] ** 2).sum(axis=1))

    # The raw hour is only a stepping stone for the cyclic encoding.
    df = df.drop('Hour', axis=1)

    new_features = [
        'Hour_sin', 'Hour_cos', 'Time_diff', 'Amount_log',
        'Amount_deviation_mean', 'Amount_deviation_median',
        'Transaction_velocity', 'Amount_zscore',
        'V14_V17_interaction', 'V12_V14_interaction',
        'V10_V14_interaction', 'PCA_magnitude',
    ]

    print(f"Engineered {len(new_features)} new features:")
    for name in new_features:
        print(f" - {name}")
    print(f"\nDataset shape after feature engineering: {df.shape}")

    return df, new_features


def stratified_split(df, target_col='Class'):
    """Split a frame into stratified train / validation / test partitions.

    The split is done in two stratified steps: first train vs. the rest
    (``VAL_RATIO + TEST_RATIO`` of the rows), then the rest into validation
    and test. The sizes come from the config ratios; the "70/15/15" in the
    printed banner is fixed text.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the label column used for stratification.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    print("\n" + "=" * 60)
    print("STRATIFIED DATA SPLITTING (70/15/15)")
    print("=" * 60)

    X = df.drop(target_col, axis=1)
    y = df[target_col]

    holdout_ratio = VAL_RATIO + TEST_RATIO

    # First split: train vs. everything held out.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        test_size=holdout_ratio,
        random_state=SEED,
        stratify=y,
    )

    # Second split: divide the held-out part into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        test_size=TEST_RATIO / holdout_ratio,
        random_state=SEED,
        stratify=y_temp,
    )

    print(f"\nTrain: {X_train.shape[0]:,} samples ({y_train.sum()} fraud, {y_train.mean()*100:.3f}%)")
    print(f"Val: {X_val.shape[0]:,} samples ({y_val.sum()} fraud, {y_val.mean()*100:.3f}%)")
    print(f"Test: {X_test.shape[0]:,} samples ({y_test.sum()} fraud, {y_test.mean()*100:.3f}%)")

    return X_train, X_val, X_test, y_train, y_val, y_test


def scale_features(X_train, X_val, X_test):
    """Scale the three partitions with a RobustScaler fitted on train only.

    The fitted scaler is also written to ``MODELS_DIR/scaler.joblib``; the
    directory is created first if it does not exist yet.

    Args:
        X_train: Training features; the scaler is fitted on this frame.
        X_val: Validation features, transformed with the fitted scaler.
        X_test: Test features, transformed with the fitted scaler.

    Returns:
        ``(X_train_scaled, X_val_scaled, X_test_scaled, scaler)`` where the
        scaled frames keep the columns and index of their inputs.
    """
    print("\n" + "=" * 60)
    print("FEATURE SCALING (Fit on Train Only)")
    print("=" * 60)

    scaler = RobustScaler()

    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index,
    )
    X_val_scaled = pd.DataFrame(
        scaler.transform(X_val),
        columns=X_val.columns,
        index=X_val.index,
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index,
    )

    # Persist the scaler. Make sure the target directory exists so a fresh
    # checkout does not fail with FileNotFoundError on the dump.
    os.makedirs(MODELS_DIR, exist_ok=True)
    scaler_path = os.path.join(MODELS_DIR, "scaler.joblib")
    joblib.dump(scaler, scaler_path)

    print(f"Scaler saved to: {scaler_path}")
    print("Scaling method: RobustScaler (robust to outliers)")

    return X_train_scaled, X_val_scaled, X_test_scaled, scaler


def apply_smote(X_train, y_train):
    """Oversample the training data with SMOTE.

    Uses ``sampling_strategy=0.5`` and ``random_state=SEED``, and prints the
    class counts before and after resampling.

    Args:
        X_train: Training features.
        y_train: Training labels (counts are reported for labels 0 and 1).

    Returns:
        ``(X_train_smote, y_train_smote)`` as returned by
        ``SMOTE.fit_resample``.
    """
    print("\n" + "=" * 60)
    print("SMOTE OVERSAMPLING (Train Set Only)")
    print("=" * 60)

    print("\nBefore SMOTE:")
    print(f" Class 0: {(y_train == 0).sum():,}")
    print(f" Class 1: {(y_train == 1).sum():,}")

    # 1:2 ratio instead of 1:1
    smote = SMOTE(random_state=SEED, sampling_strategy=0.5)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    print("\nAfter SMOTE (0.5 ratio):")
    print(f" Class 0: {(y_train_smote == 0).sum():,}")
    print(f" Class 1: {(y_train_smote == 1).sum():,}")

    return X_train_smote, y_train_smote


def compute_class_weights(y_train):
    """Compute 'balanced' class weights for cost-sensitive learning.

    Args:
        y_train: Training labels.

    Returns:
        Dict mapping each distinct label in ``y_train`` to its weight.
    """
    from sklearn.utils.class_weight import compute_class_weight

    classes = np.unique(y_train)
    weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weight_dict = dict(zip(classes, weights))

    print("\nClass weights (balanced):")
    # Report whatever labels are present rather than assuming exactly 0 and 1,
    # which would raise KeyError for any other labelling.
    for label, weight in class_weight_dict.items():
        print(f" Class {label}: {weight:.4f}")

    return class_weight_dict


def run_preprocessing():
    """Run the complete preprocessing pipeline and persist its outputs.

    Steps: load ``creditcard.csv`` from ``DATA_DIR``, drop duplicate rows,
    engineer features, split, scale (fit on train), apply SMOTE to the
    scaled training set, compute class weights, and dump everything to
    ``DATA_DIR/processed_data.joblib``.

    Returns:
        The dict that was written to disk (scaled partitions, labels,
        SMOTE-resampled training data, class weights, feature names, the
        fitted scaler and the list of engineered feature names).
    """
    print("=" * 60)
    print("FRAUD DETECTION SYSTEM - PREPROCESSING")
    print("=" * 60)

    # Load raw data
    df = pd.read_csv(os.path.join(DATA_DIR, "creditcard.csv"))
    print(f"Loaded dataset: {df.shape}")

    # Remove duplicates
    df = df.drop_duplicates()
    print(f"After removing duplicates: {df.shape}")

    # Feature engineering
    df, new_features = engineer_features(df)

    # Stratified split BEFORE any resampling
    X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(df)

    # Scale features (fit on train only)
    X_train_scaled, X_val_scaled, X_test_scaled, scaler = scale_features(
        X_train, X_val, X_test
    )

    # SMOTE on train set only
    X_train_smote, y_train_smote = apply_smote(X_train_scaled, y_train)

    # Class weights (alternative to SMOTE), from the un-resampled labels
    class_weights = compute_class_weights(y_train)

    # Save processed data
    data = {
        'X_train': X_train_scaled,
        'X_val': X_val_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
        'X_train_smote': X_train_smote,
        'y_train_smote': y_train_smote,
        'class_weights': class_weights,
        'feature_names': list(X_train.columns),
        'scaler': scaler,
        'new_features': new_features,
    }

    data_path = os.path.join(DATA_DIR, "processed_data.joblib")
    joblib.dump(data, data_path)
    print(f"\nProcessed data saved to: {data_path}")

    print("\n" + "=" * 60)
    print("PREPROCESSING COMPLETE")
    print("=" * 60)

    return data


if __name__ == "__main__":
    data = run_preprocessing()