"""
Module 2: Data Preprocessing
Feature engineering, stratified splitting, scaling, and class imbalance handling.
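
Run as a script to write processed_data.joblib under DATA_DIR, or import
run_preprocessing() from another module.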
"""
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings('ignore')  # suppress all warnings for cleaner console output
from config import DATA_DIR, MODELS_DIR, SEED, TRAIN_RATIO, VAL_RATIO, TEST_RATIO
def engineer_features(df):
"""Engineer new features from raw data."""
print("\n" + "=" * 60)
print("FEATURE ENGINEERING")
print("=" * 60)
df = df.copy()
# 1. Hour of Day (cyclic encoding)
df['Hour'] = (df['Time'] / 3600) % 24
df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)
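    # The sin/cos pair places 23:00 and 00:00 next to each other on the unit
    # circle, a neighbourhood a raw 0-23 hour feature cannot express.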
    # 2. Time since last transaction (diff of the Time column; assumes rows
    #    are ordered by Time, as in the raw dataset)
df['Time_diff'] = df['Time'].diff().fillna(0)
# 3. Transaction Amount Features
df['Amount_log'] = np.log1p(df['Amount'])
    # 4. Amount deviation from the global mean/median. Note: these statistics
    #    are computed on the full dataset before splitting, which leaks a small
    #    amount of val/test information; compute them on the train split alone
    #    to be strict.
df['Amount_deviation_mean'] = df['Amount'] - df['Amount'].mean()
df['Amount_deviation_median'] = df['Amount'] - df['Amount'].median()
# 5. Transaction velocity (rolling count proxy using time windows)
# We approximate velocity as inverse of time since last transaction
df['Transaction_velocity'] = 1.0 / (df['Time_diff'] + 1.0)
    # 6. Amount z-score (same global-statistics caveat as above; the epsilon
    #    guards against division by zero if Amount has zero variance)
df['Amount_zscore'] = (df['Amount'] - df['Amount'].mean()) / (df['Amount'].std() + 1e-8)
# 7. Interaction features between top PCA components
df['V14_V17_interaction'] = df['V14'] * df['V17']
df['V12_V14_interaction'] = df['V12'] * df['V14']
df['V10_V14_interaction'] = df['V10'] * df['V14']
# 8. Magnitude features
pca_features = [f'V{i}' for i in range(1, 29)]
df['PCA_magnitude'] = np.sqrt((df[pca_features] ** 2).sum(axis=1))
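    # PCA_magnitude is the Euclidean (L2) norm of the V1..V28 vector: one
    # scalar summarising how far a transaction sits from the PCA origin.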
# Drop raw Hour (we have cyclic encoding)
df = df.drop('Hour', axis=1)
new_features = ['Hour_sin', 'Hour_cos', 'Time_diff', 'Amount_log',
'Amount_deviation_mean', 'Amount_deviation_median',
'Transaction_velocity', 'Amount_zscore',
'V14_V17_interaction', 'V12_V14_interaction', 'V10_V14_interaction',
'PCA_magnitude']
print(f"Engineered {len(new_features)} new features:")
for f in new_features:
print(f" - {f}")
print(f"\nDataset shape after feature engineering: {df.shape}")
return df, new_features
def stratified_split(df, target_col='Class'):
"""Perform stratified 70/15/15 train/val/test split."""
print("\n" + "=" * 60)
print("STRATIFIED DATA SPLITTING (70/15/15)")
print("=" * 60)
X = df.drop(target_col, axis=1)
y = df[target_col]
# First split: 70% train, 30% temp
X_train, X_temp, y_train, y_temp = train_test_split(
X, y, test_size=(VAL_RATIO + TEST_RATIO),
random_state=SEED, stratify=y
)
# Second split: 50/50 of the 30% = 15/15
X_val, X_test, y_val, y_test = train_test_split(
X_temp, y_temp, test_size=TEST_RATIO / (VAL_RATIO + TEST_RATIO),
random_state=SEED, stratify=y_temp
)
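    # stratify=y preserves the original fraud rate in every split; with such a
    # rare positive class, an unstratified split could starve the smaller
    # splits of fraud examples.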
print(f"\nTrain: {X_train.shape[0]:,} samples ({y_train.sum()} fraud, {y_train.mean()*100:.3f}%)")
print(f"Val: {X_val.shape[0]:,} samples ({y_val.sum()} fraud, {y_val.mean()*100:.3f}%)")
print(f"Test: {X_test.shape[0]:,} samples ({y_test.sum()} fraud, {y_test.mean()*100:.3f}%)")
return X_train, X_val, X_test, y_train, y_val, y_test
def scale_features(X_train, X_val, X_test):
"""Scale features: fit on train only."""
print("\n" + "=" * 60)
print("FEATURE SCALING (Fit on Train Only)")
print("=" * 60)
scaler = RobustScaler()
X_train_scaled = pd.DataFrame(
scaler.fit_transform(X_train),
columns=X_train.columns,
index=X_train.index
)
X_val_scaled = pd.DataFrame(
scaler.transform(X_val),
columns=X_val.columns,
index=X_val.index
)
X_test_scaled = pd.DataFrame(
scaler.transform(X_test),
columns=X_test.columns,
index=X_test.index
)
# Save scaler
scaler_path = os.path.join(MODELS_DIR, "scaler.joblib")
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to: {scaler_path}")
print(f"Scaling method: RobustScaler (robust to outliers)")
return X_train_scaled, X_val_scaled, X_test_scaled, scaler
def apply_smote(X_train, y_train):
"""Apply SMOTE to training data only."""
print("\n" + "=" * 60)
print("SMOTE OVERSAMPLING (Train Set Only)")
print("=" * 60)
print(f"\nBefore SMOTE:")
print(f" Class 0: {(y_train == 0).sum():,}")
print(f" Class 1: {(y_train == 1).sum():,}")
smote = SMOTE(random_state=SEED, sampling_strategy=0.5) # 1:2 ratio instead of 1:1
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(f"\nAfter SMOTE (0.5 ratio):")
print(f" Class 0: {(y_train_smote == 0).sum():,}")
print(f" Class 1: {(y_train_smote == 1).sum():,}")
return X_train_smote, y_train_smote
def compute_class_weights(y_train):
"""Compute class weights for cost-sensitive learning."""
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(y_train)
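    # 'balanced' uses sklearn's heuristic weight_c = n_samples / (n_classes * n_c),
    # so the rare fraud class receives a proportionally larger weight.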
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, weights))
print(f"\nClass weights (balanced):")
print(f" Class 0: {class_weight_dict[0]:.4f}")
print(f" Class 1: {class_weight_dict[1]:.4f}")
return class_weight_dict
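
# Illustrative usage of the imbalance artifacts (not executed here): the dict
# returned by compute_class_weights plugs directly into cost-sensitive
# estimators, e.g. LogisticRegression(class_weight=class_weight_dict), while
# (X_train_smote, y_train_smote) is a drop-in replacement for the original
# (X_train_scaled, y_train) when fitting on resampled data.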
def run_preprocessing():
"""Run the complete preprocessing pipeline."""
print("=" * 60)
print("FRAUD DETECTION SYSTEM - PREPROCESSING")
print("=" * 60)
# Load raw data
df = pd.read_csv(os.path.join(DATA_DIR, "creditcard.csv"))
print(f"Loaded dataset: {df.shape}")
# Remove duplicates
df = df.drop_duplicates()
print(f"After removing duplicates: {df.shape}")
# Feature engineering
df, new_features = engineer_features(df)
# Stratified split BEFORE any resampling
X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(df)
# Scale features (fit on train only)
X_train_scaled, X_val_scaled, X_test_scaled, scaler = scale_features(
X_train, X_val, X_test
)
# SMOTE on train set only
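    # SMOTE interpolates between minority-class neighbours, so it is applied to
    # the already-scaled features; validation and test sets are never resampled.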
X_train_smote, y_train_smote = apply_smote(X_train_scaled, y_train)
# Class weights (alternative to SMOTE)
class_weights = compute_class_weights(y_train)
# Save processed data
data = {
'X_train': X_train_scaled,
'X_val': X_val_scaled,
'X_test': X_test_scaled,
'y_train': y_train,
'y_val': y_val,
'y_test': y_test,
'X_train_smote': X_train_smote,
'y_train_smote': y_train_smote,
'class_weights': class_weights,
'feature_names': list(X_train.columns),
'scaler': scaler,
'new_features': new_features,
}
data_path = os.path.join(DATA_DIR, "processed_data.joblib")
joblib.dump(data, data_path)
print(f"\nProcessed data saved to: {data_path}")
print("\n" + "=" * 60)
print("PREPROCESSING COMPLETE")
print("=" * 60)
return data
if __name__ == "__main__":
data = run_preprocessing()