"""Train and evaluate price increase churn prediction model."""
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# NOTE(review): this silences every warning for the whole process, not only the
# ones raised by this module. Kept as-is because importers may rely on it;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


class PriceIncreaseChurnModel:
    """Binary classifier: will customer cancel within 90 days after price increase?

    Wraps a scikit-learn ``Pipeline`` (imputation + scaling / one-hot encoding,
    followed by an ``XGBClassifier``) and keeps track of the column lists and
    the post-encoding feature names.

    Attributes:
        random_state: Seed passed to XGBoost and to the CV splitter.
        pipeline: The current pipeline, or ``None`` until built/loaded.
        feature_names: Numeric column names followed by the one-hot output
            names; set by ``fit`` (and by ``load`` when recoverable).
        numeric_features: Columns routed to the numeric transformer.
        categorical_features: Columns routed to the categorical transformer.
    """

    # Columns that are never offered to the model (identifiers, target and
    # target-derived fields, raw geography, and raw categoricals).
    _EXCLUDED_COLUMNS = frozenset([
        'Customer ID', 'Churn', 'Churn Category', 'Churn Reason',
        'Customer Status', 'City', 'State', 'Zip Code', 'Lat Long',
        'Country', 'Latitude', 'Longitude', 'Quarter', 'Offer',
        'Internet Type', 'Gender', 'Payment Method', 'tenure_group',
        'age_group',
    ])

    # dtypes treated as numeric; every other dtype is handled as categorical.
    _NUMERIC_DTYPES = ('int64', 'float64', 'int32', 'float32')

    def __init__(self, random_state=42):
        """Create an untrained model.

        Args:
            random_state: Seed used for the classifier and for CV shuffling.
        """
        self.random_state = random_state
        self.pipeline = None
        self.feature_names = None
        self.numeric_features = None
        self.categorical_features = None

    def _get_feature_lists(self, df):
        """Identify numeric and categorical features from engineered dataframe.

        Args:
            df: DataFrame whose columns are inspected.

        Returns:
            ``(numeric, categorical)`` lists of column names, in the order the
            columns appear in ``df``. Excluded columns are in neither list.
        """
        numeric, categorical = [], []
        for col in df.columns:
            if col in self._EXCLUDED_COLUMNS:
                continue
            if df[col].dtype in self._NUMERIC_DTYPES:
                numeric.append(col)
            else:
                categorical.append(col)
        return numeric, categorical

    @staticmethod
    def _class_balance_weight(y):
        """Return ``count(y == 0) / count(y == 1)`` for XGBoost class weighting.

        Args:
            y: Binary target (any array-like) encoded as 0/1.

        Raises:
            ValueError: If either class is absent, since the ratio would be
                undefined (or zero) and training would be meaningless.
        """
        labels = np.asarray(y)
        neg = int((labels == 0).sum())
        pos = int((labels == 1).sum())
        if neg == 0 or pos == 0:
            raise ValueError(
                "Cannot auto-calculate scale_pos_weight: y must contain both "
                f"classes 0 and 1 (found {neg} negatives, {pos} positives)."
            )
        return neg / pos

    def _make_pipeline(self, numeric_features, categorical_features,
                       scale_pos_weight=1.0):
        """Construct a fresh, unfitted pipeline without touching instance state."""
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore',
                                     sparse_output=False)),
        ])
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
            ],
            remainder='drop',
        )
        classifier = XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=scale_pos_weight,
            eval_metric='logloss',
            random_state=self.random_state,
            n_jobs=-1,
            reg_alpha=0.1,
            reg_lambda=1.0,
            min_child_weight=3,
        )
        return Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', classifier),
        ])

    def build_pipeline(self, numeric_features, categorical_features,
                       scale_pos_weight=1.0):
        """Build sklearn pipeline with preprocessing + XGBoost.

        Stores the column lists and the new (unfitted) pipeline on the
        instance, replacing any pipeline that was there before.

        Args:
            numeric_features: Columns to median-impute and standard-scale.
            categorical_features: Columns to mode-impute and one-hot encode.
            scale_pos_weight: Forwarded to ``XGBClassifier``.

        Returns:
            The newly built pipeline (also available as ``self.pipeline``).
        """
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.pipeline = self._make_pipeline(
            numeric_features, categorical_features, scale_pos_weight
        )
        return self.pipeline

    def _fitted_feature_names(self):
        """Return numeric names followed by one-hot output names.

        The encoder is only queried when categorical columns exist:
        ``ColumnTransformer`` does not fit a transformer whose column
        selection is empty, so asking it for names would raise.
        """
        names = list(self.numeric_features)
        if self.categorical_features:
            preprocessor = self.pipeline.named_steps['preprocessor']
            encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
            names.extend(encoder.get_feature_names_out(self.categorical_features))
        return names

    def _require_pipeline(self):
        """Return the pipeline, or raise if none has been built or loaded.

        Raises:
            NotFittedError: When ``self.pipeline`` is ``None``. This exception
                subclasses both ``ValueError`` and ``AttributeError``, so code
                that previously caught the bare ``AttributeError`` still works.
        """
        if self.pipeline is None:
            raise NotFittedError(
                "PriceIncreaseChurnModel has no pipeline yet; call fit() or "
                "load() first."
            )
        return self.pipeline

    def fit(self, X, y, scale_pos_weight=None):
        """Train the model.

        Args:
            X: Feature DataFrame; feature columns are picked by
                ``_get_feature_lists``.
            y: Binary target encoded as 0/1.
            scale_pos_weight: Positive-class weight for XGBoost. When ``None``
                it is computed as negatives / positives from ``y``.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: If ``scale_pos_weight`` is ``None`` and ``y`` does not
                contain both classes.
        """
        numeric, categorical = self._get_feature_lists(X)

        if scale_pos_weight is None:
            scale_pos_weight = self._class_balance_weight(y)
            print(f"Auto-calculated scale_pos_weight: {scale_pos_weight:.2f}")

        self.build_pipeline(numeric, categorical, scale_pos_weight)
        self.pipeline.fit(X, y)

        # Store feature names for interpretability
        self.feature_names = self._fitted_feature_names()
        return self

    def predict(self, X):
        """Return hard class predictions from the underlying pipeline.

        Raises:
            NotFittedError: If no pipeline has been built or loaded.
        """
        return self._require_pipeline().predict(X)

    def predict_proba(self, X):
        """Return the predicted probability of the positive class (column 1).

        Raises:
            NotFittedError: If no pipeline has been built or loaded.
        """
        return self._require_pipeline().predict_proba(X)[:, 1]

    def evaluate(self, X_test, y_test, threshold=0.5):
        """Comprehensive model evaluation.

        Prints a summary and returns the metrics.

        Args:
            X_test: Held-out features.
            y_test: Held-out binary target encoded as 0/1.
            threshold: Probability at or above which a row is predicted
                as churn.

        Returns:
            Dict with keys ``auc_roc``, ``average_precision``, ``precision``,
            ``recall``, ``f1`` and ``confusion_matrix`` (a 2x2 nested list).
        """
        y_prob = self.predict_proba(X_test)
        y_pred = (y_prob >= threshold).astype(int)

        # Pin the label set so the matrix is always 2x2 and the report always
        # matches the two target names, even if one class is missing from
        # y_test or from the predictions at this threshold.
        labels = [0, 1]

        results = {
            'auc_roc': roc_auc_score(y_test, y_prob),
            'average_precision': average_precision_score(y_test, y_prob),
            'precision': precision_score(y_test, y_pred, zero_division=0),
            'recall': recall_score(y_test, y_pred, zero_division=0),
            'f1': f1_score(y_test, y_pred, zero_division=0),
            'confusion_matrix': confusion_matrix(
                y_test, y_pred, labels=labels
            ).tolist(),
        }

        print("\n" + "="*60)
        print("MODEL EVALUATION RESULTS")
        print("="*60)
        print(f"AUC-ROC: {results['auc_roc']:.4f}")
        print(f"Average Precision: {results['average_precision']:.4f}")
        print(f"Precision: {results['precision']:.4f}")
        print(f"Recall: {results['recall']:.4f}")
        print(f"F1 Score: {results['f1']:.4f}")
        print(f"\nConfusion Matrix:\n{np.array(results['confusion_matrix'])}")
        print("\nClassification Report:")
        print(classification_report(
            y_test, y_pred, labels=labels, target_names=['Stay', 'Churn'],
            zero_division=0,
        ))

        return results

    def cross_validate(self, X, y, cv=5, scale_pos_weight=None):
        """Stratified cross-validation.

        Scores a fresh pipeline; the instance's own ``pipeline`` and feature
        lists are left untouched, so a model trained with ``fit`` stays usable
        after calling this.

        Args:
            X: Feature DataFrame.
            y: Binary target encoded as 0/1.
            cv: Number of stratified folds.
            scale_pos_weight: Positive-class weight for XGBoost. When ``None``
                it is computed from ``y`` the same way ``fit`` does, so the
                cross-validated model matches the one ``fit`` would train.

        Returns:
            Array of per-fold ROC AUC scores.

        Raises:
            ValueError: If ``scale_pos_weight`` is ``None`` and ``y`` does not
                contain both classes.
        """
        numeric, categorical = self._get_feature_lists(X)
        if scale_pos_weight is None:
            scale_pos_weight = self._class_balance_weight(y)
        candidate = self._make_pipeline(numeric, categorical, scale_pos_weight)

        skf = StratifiedKFold(n_splits=cv, shuffle=True,
                              random_state=self.random_state)
        auc_scores = cross_val_score(candidate, X, y, cv=skf, scoring='roc_auc')

        print(f"\nCV AUC-ROC: {auc_scores.mean():.4f} (+/- {auc_scores.std()*2:.4f})")
        return auc_scores

    def save(self, path):
        """Persist the pipeline to ``path`` with joblib.

        Only the pipeline is written; feature metadata is rebuilt by ``load``.

        Raises:
            NotFittedError: If there is no pipeline to save.
        """
        joblib.dump(self._require_pipeline(), path)
        print(f"Model saved to {path}")

    def _restore_feature_metadata(self):
        """Rebuild column lists and feature names from a loaded pipeline.

        Best effort: if the loaded object is not a fitted pipeline with the
        expected ``preprocessor`` step, the existing attributes are left alone.
        """
        try:
            preprocessor = self.pipeline.named_steps['preprocessor']
            columns = {
                name: list(cols)
                for name, _, cols in preprocessor.transformers_
            }
        except (AttributeError, KeyError, TypeError):
            return
        self.numeric_features = columns.get('num', [])
        self.categorical_features = columns.get('cat', [])
        self.feature_names = self._fitted_feature_names()

    def load(self, path):
        """Load a pipeline previously written by ``save``.

        Also restores ``numeric_features``, ``categorical_features`` and
        ``feature_names`` when they can be recovered from the pipeline.
        """
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code. Only load model files from trusted sources.
        self.pipeline = joblib.load(path)
        self._restore_feature_metadata()
        print(f"Model loaded from {path}")