| """ |
| Module 2: Data Preprocessing |
| Feature engineering, class imbalance handling, stratified splitting, scaling. |
| """ |
| import os |
| import numpy as np |
| import pandas as pd |
| from sklearn.model_selection import train_test_split |
| from sklearn.preprocessing import StandardScaler, RobustScaler |
| from imblearn.over_sampling import SMOTE |
| import joblib |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
| from config import DATA_DIR, MODELS_DIR, SEED, TRAIN_RATIO, VAL_RATIO, TEST_RATIO |
|
|
|
|
def engineer_features(df):
    """Derive additional features from the raw transaction columns.

    Adds cyclical hour-of-day encodings, inter-transaction timing, log and
    deviation transforms of Amount, selected V-feature interaction terms,
    and an overall PCA-component magnitude.

    NOTE(review): the mean/std/median statistics used below are computed on
    the full dataset before any train/test split — potential leakage into
    validation/test; confirm this is acceptable for the evaluation protocol.

    Args:
        df: raw transactions frame with 'Time', 'Amount' and 'V1'..'V28'.

    Returns:
        Tuple of (augmented DataFrame, list of engineered column names).
    """
    print("\n" + "=" * 60)
    print("FEATURE ENGINEERING")
    print("=" * 60)

    out = df.copy()

    # Cyclical encoding of the hour of day derived from elapsed seconds,
    # so 23:00 and 00:00 end up close together in feature space.
    hour = (out['Time'] / 3600) % 24
    out['Hour'] = hour
    out['Hour_sin'] = np.sin(2 * np.pi * hour / 24)
    out['Hour_cos'] = np.cos(2 * np.pi * hour / 24)

    # Seconds since the previous row; first row has no predecessor -> 0.
    # (Assumes rows are ordered by Time — TODO confirm upstream.)
    out['Time_diff'] = out['Time'].diff().fillna(0)

    # log1p compresses the heavy right tail of transaction amounts.
    out['Amount_log'] = np.log1p(out['Amount'])

    # How far each amount sits from the dataset's central tendency.
    amount = out['Amount']
    out['Amount_deviation_mean'] = amount - amount.mean()
    out['Amount_deviation_median'] = amount - amount.median()

    # Inverse gap between transactions; +1 avoids division by zero.
    out['Transaction_velocity'] = 1.0 / (out['Time_diff'] + 1.0)

    # Standardized amount; epsilon guards against a zero std.
    out['Amount_zscore'] = (amount - amount.mean()) / (amount.std() + 1e-8)

    # Pairwise products of PCA components commonly associated with fraud.
    for left, right in (('V14', 'V17'), ('V12', 'V14'), ('V10', 'V14')):
        out[f'{left}_{right}_interaction'] = out[left] * out[right]

    # Euclidean norm across all 28 PCA components.
    component_cols = [f'V{i}' for i in range(1, 29)]
    out['PCA_magnitude'] = np.sqrt((out[component_cols] ** 2).sum(axis=1))

    # The raw hour was only an intermediate for the sin/cos encoding.
    out = out.drop('Hour', axis=1)

    new_features = ['Hour_sin', 'Hour_cos', 'Time_diff', 'Amount_log',
                    'Amount_deviation_mean', 'Amount_deviation_median',
                    'Transaction_velocity', 'Amount_zscore',
                    'V14_V17_interaction', 'V12_V14_interaction', 'V10_V14_interaction',
                    'PCA_magnitude']

    print(f"Engineered {len(new_features)} new features:")
    for f in new_features:
        print(f"  - {f}")
    print(f"\nDataset shape after feature engineering: {df.shape}")

    return out, new_features
|
|
|
|
def stratified_split(df, target_col='Class'):
    """Split the frame into stratified train/val/test partitions (70/15/15).

    Stratification preserves the (highly imbalanced) class ratio in every
    partition. Ratios come from the module-level TRAIN/VAL/TEST constants.

    Args:
        df: feature frame including the target column.
        target_col: name of the binary label column.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test.
    """
    print("\n" + "=" * 60)
    print("STRATIFIED DATA SPLITTING (70/15/15)")
    print("=" * 60)

    features = df.drop(target_col, axis=1)
    labels = df[target_col]

    holdout_frac = VAL_RATIO + TEST_RATIO

    # First carve off the combined val+test pool, keeping class balance.
    X_train, X_pool, y_train, y_pool = train_test_split(
        features, labels,
        test_size=holdout_frac,
        random_state=SEED,
        stratify=labels,
    )

    # Then divide the pool so val and test each get their intended share.
    X_val, X_test, y_val, y_test = train_test_split(
        X_pool, y_pool,
        test_size=TEST_RATIO / holdout_frac,
        random_state=SEED,
        stratify=y_pool,
    )

    print(f"\nTrain: {X_train.shape[0]:,} samples ({y_train.sum()} fraud, {y_train.mean()*100:.3f}%)")
    print(f"Val:   {X_val.shape[0]:,} samples ({y_val.sum()} fraud, {y_val.mean()*100:.3f}%)")
    print(f"Test:  {X_test.shape[0]:,} samples ({y_test.sum()} fraud, {y_test.mean()*100:.3f}%)")

    return X_train, X_val, X_test, y_train, y_val, y_test
|
|
|
|
def scale_features(X_train, X_val, X_test):
    """Scale all feature partitions with a RobustScaler fit on train only.

    Fitting exclusively on the training partition prevents statistics from
    the validation/test sets leaking into the model. The fitted scaler is
    persisted to MODELS_DIR for reuse at inference time.

    Args:
        X_train, X_val, X_test: unscaled feature DataFrames.

    Returns:
        Scaled copies of the three frames (columns/index preserved) plus
        the fitted scaler.
    """
    print("\n" + "=" * 60)
    print("FEATURE SCALING (Fit on Train Only)")
    print("=" * 60)

    scaler = RobustScaler()
    # Learn center/scale from the training partition only.
    scaler.fit(X_train)

    def _scaled_frame(frame):
        # Re-wrap the ndarray so column names and row index survive scaling.
        return pd.DataFrame(
            scaler.transform(frame),
            columns=frame.columns,
            index=frame.index,
        )

    X_train_scaled = _scaled_frame(X_train)
    X_val_scaled = _scaled_frame(X_val)
    X_test_scaled = _scaled_frame(X_test)

    # Persist the fitted scaler alongside the models for inference reuse.
    scaler_path = os.path.join(MODELS_DIR, "scaler.joblib")
    joblib.dump(scaler, scaler_path)
    print(f"Scaler saved to: {scaler_path}")
    print(f"Scaling method: RobustScaler (robust to outliers)")

    return X_train_scaled, X_val_scaled, X_test_scaled, scaler
|
|
|
|
def apply_smote(X_train, y_train):
    """Oversample the minority class with SMOTE on the training data only.

    Uses sampling_strategy=0.5, i.e. synthesizes minority samples until the
    fraud class reaches half the size of the majority class (not full 1:1).
    Must never be applied to validation/test data.

    Args:
        X_train: scaled training features.
        y_train: binary training labels.

    Returns:
        Resampled (X, y) pair including the synthetic minority samples.
    """
    print("\n" + "=" * 60)
    print("SMOTE OVERSAMPLING (Train Set Only)")
    print("=" * 60)

    def _class_counts(header, labels):
        # Report the per-class sample counts under the given header.
        print(header)
        print(f"  Class 0: {(labels == 0).sum():,}")
        print(f"  Class 1: {(labels == 1).sum():,}")

    _class_counts(f"\nBefore SMOTE:", y_train)

    resampler = SMOTE(random_state=SEED, sampling_strategy=0.5)
    X_train_smote, y_train_smote = resampler.fit_resample(X_train, y_train)

    _class_counts(f"\nAfter SMOTE (0.5 ratio):", y_train_smote)

    return X_train_smote, y_train_smote
|
|
|
|
def compute_class_weights(y_train):
    """Compute 'balanced' class weights for cost-sensitive learning.

    Implements the same heuristic as sklearn's
    compute_class_weight('balanced', ...): for each class c,
    weight_c = n_samples / (n_classes * count_c), so rarer classes receive
    proportionally larger weights. Computing it directly with numpy removes
    the function-local sklearn import while producing identical values.

    Args:
        y_train: 1-D array-like of integer class labels (0/1 here).

    Returns:
        dict mapping each class label to its float weight.
    """
    y = np.asarray(y_train)
    classes, counts = np.unique(y, return_counts=True)
    # Balanced formula: n_samples / (n_classes * per-class count).
    weights = y.size / (classes.size * counts)
    class_weight_dict = dict(zip(classes, weights))

    print(f"\nClass weights (balanced):")
    print(f"  Class 0: {class_weight_dict[0]:.4f}")
    print(f"  Class 1: {class_weight_dict[1]:.4f}")

    return class_weight_dict
|
|
|
|
def run_preprocessing():
    """Execute the full preprocessing pipeline and persist the results.

    Steps: load raw CSV -> drop exact duplicates -> engineer features ->
    stratified 70/15/15 split -> scale (fit on train) -> SMOTE on the
    scaled train set -> compute class weights -> dump everything to a
    single joblib bundle under DATA_DIR.

    Returns:
        dict with scaled splits, SMOTE-resampled train data, labels,
        class weights, feature names, the fitted scaler, and the list of
        engineered feature names.
    """
    print("=" * 60)
    print("FRAUD DETECTION SYSTEM - PREPROCESSING")
    print("=" * 60)

    # Load the raw transactions.
    raw = pd.read_csv(os.path.join(DATA_DIR, "creditcard.csv"))
    print(f"Loaded dataset: {raw.shape}")

    # Exact duplicate rows add no information and could leak across splits.
    deduped = raw.drop_duplicates()
    print(f"After removing duplicates: {deduped.shape}")

    enriched, new_features = engineer_features(deduped)

    X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(enriched)

    X_train_scaled, X_val_scaled, X_test_scaled, scaler = scale_features(
        X_train, X_val, X_test
    )

    # SMOTE resampling is restricted to the scaled training partition.
    X_train_smote, y_train_smote = apply_smote(X_train_scaled, y_train)

    # Weights reflect the original (pre-SMOTE) class distribution.
    class_weights = compute_class_weights(y_train)

    # Bundle everything downstream modules need into one artifact.
    data = {
        'X_train': X_train_scaled,
        'X_val': X_val_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
        'X_train_smote': X_train_smote,
        'y_train_smote': y_train_smote,
        'class_weights': class_weights,
        'feature_names': list(X_train.columns),
        'scaler': scaler,
        'new_features': new_features,
    }

    data_path = os.path.join(DATA_DIR, "processed_data.joblib")
    joblib.dump(data, data_path)
    print(f"\nProcessed data saved to: {data_path}")

    print("\n" + "=" * 60)
    print("PREPROCESSING COMPLETE")
    print("=" * 60)

    return data
|
|
|
|
# Run the full pipeline when executed as a script (not when imported).
if __name__ == "__main__":
    data = run_preprocessing()
|
|