# Sairamr46's picture
# Upload model.py with huggingface_hub
# 535ccdd verified
"""Train and evaluate price increase churn prediction model."""
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (roc_auc_score, precision_score, recall_score,
f1_score, classification_report, confusion_matrix,
precision_recall_curve, average_precision_score)
from xgboost import XGBClassifier
import warnings
# Install a global "ignore" filter with no category or module restriction, so
# every warning raised after this point (by any library) is suppressed.
warnings.filterwarnings('ignore')
class PriceIncreaseChurnModel:
    """Binary classifier: will customer cancel within 90 days after price increase?

    The model is a scikit-learn ``Pipeline`` that imputes and scales numeric
    columns, imputes and one-hot encodes the remaining columns, and feeds the
    result to an ``XGBClassifier``.

    Attributes:
        random_state: Seed passed to the classifier and to the CV splitter.
        pipeline: The current ``Pipeline``; ``None`` until built or loaded.
        feature_names: Post-preprocessing feature names; set by ``fit`` and
            (when the stored pipeline is fitted) by ``load``.
        numeric_features: Input columns routed through the numeric branch.
        categorical_features: Input columns routed through the one-hot branch.
    """

    # Columns that are never offered to the model as input features.
    _EXCLUDED_COLUMNS = frozenset([
        'Customer ID', 'Churn', 'Churn Category', 'Churn Reason',
        'Customer Status', 'City', 'State', 'Zip Code', 'Lat Long',
        'Country', 'Latitude', 'Longitude', 'Quarter', 'Offer',
        'Internet Type', 'Gender', 'Payment Method', 'tenure_group',
        'age_group',
    ])

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.pipeline = None
        self.feature_names = None
        self.numeric_features = None
        self.categorical_features = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _get_feature_lists(self, df):
        """Identify numeric and categorical features from engineered dataframe.

        Args:
            df: DataFrame whose columns are candidate features.

        Returns:
            ``(numeric, categorical)``: two lists of column names, in the
            order they appear in ``df``. Excluded columns are in neither list.
        """
        numeric = []
        categorical = []
        for col in df.columns:
            if col in self._EXCLUDED_COLUMNS:
                continue
            dtype = df[col].dtype
            # Accept every numeric width (int8, uint8, float16, nullable
            # ints, ...) rather than a fixed list of four dtype names.
            # Booleans stay on the categorical branch, as they always have.
            if (pd.api.types.is_numeric_dtype(dtype)
                    and not pd.api.types.is_bool_dtype(dtype)):
                numeric.append(col)
            else:
                categorical.append(col)
        return numeric, categorical

    @staticmethod
    def _auto_scale_pos_weight(y):
        """Return the negative/positive class-count ratio of ``y``.

        The smaller label (after sorting) is treated as the negative class
        and the larger one as the positive class.

        Args:
            y: Array-like of binary labels (Series, ndarray or list).

        Returns:
            ``count(negative) / count(positive)`` as a float.

        Raises:
            ValueError: If ``y`` does not contain exactly two distinct labels.
        """
        counts = pd.Series(np.asarray(y).ravel()).value_counts().sort_index()
        if len(counts) != 2:
            raise ValueError(
                "Cannot derive scale_pos_weight: expected exactly 2 classes "
                f"in y, found {len(counts)}."
            )
        neg, pos = counts.iloc[0], counts.iloc[1]
        return float(neg / pos)

    def _make_pipeline(self, numeric_features, categorical_features,
                       scale_pos_weight=1.0):
        """Construct an unfitted pipeline without touching instance state."""
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ],
            remainder='drop'
        )
        classifier = XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=scale_pos_weight,
            eval_metric='logloss',
            random_state=self.random_state,
            n_jobs=-1,
            reg_alpha=0.1,
            reg_lambda=1.0,
            min_child_weight=3
        )
        return Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', classifier)
        ])

    @staticmethod
    def _output_feature_names(pipeline):
        """Return the fitted preprocessor's output names without branch prefix.

        ``ColumnTransformer.get_feature_names_out`` yields names such as
        ``num__Tenure`` or ``cat__Contract_Monthly``; the ``<branch>__``
        prefix is removed. Asking the fitted preprocessor keeps the names
        aligned with the matrix the classifier actually receives, including
        when a branch is empty.
        """
        preprocessor = pipeline.named_steps['preprocessor']
        return [name.split('__', 1)[-1]
                for name in preprocessor.get_feature_names_out()]

    def _require_pipeline(self):
        """Return ``self.pipeline`` or raise if none has been built/loaded."""
        if self.pipeline is None:
            raise RuntimeError(
                "No pipeline available: call fit() or load() first."
            )
        return self.pipeline

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def build_pipeline(self, numeric_features, categorical_features,
                       scale_pos_weight=1.0):
        """Build sklearn pipeline with preprocessing + XGBoost.

        Stores the feature lists and the new (unfitted) pipeline on the
        instance, replacing any pipeline already held.

        Args:
            numeric_features: Columns to median-impute and standardise.
            categorical_features: Columns to mode-impute and one-hot encode.
            scale_pos_weight: Passed through to ``XGBClassifier``.

        Returns:
            The newly built, unfitted ``Pipeline``.
        """
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.pipeline = self._make_pipeline(
            numeric_features, categorical_features, scale_pos_weight
        )
        return self.pipeline

    def fit(self, X, y, scale_pos_weight=None):
        """Train the model.

        Args:
            X: Feature DataFrame; feature columns are selected automatically.
            y: Binary target.
            scale_pos_weight: Positive-class weight for XGBoost. When ``None``
                it is set to the negative/positive count ratio of ``y``.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: If the weight must be derived and ``y`` does not
                contain exactly two classes.
        """
        numeric, categorical = self._get_feature_lists(X)
        if scale_pos_weight is None:
            scale_pos_weight = self._auto_scale_pos_weight(y)
            print(f"Auto-calculated scale_pos_weight: {scale_pos_weight:.2f}")
        self.build_pipeline(numeric, categorical, scale_pos_weight)
        self.pipeline.fit(X, y)
        # Store feature names for interpretability
        self.feature_names = self._output_feature_names(self.pipeline)
        return self

    def predict(self, X):
        """Return hard class predictions from the underlying pipeline.

        Raises:
            RuntimeError: If no pipeline has been fitted or loaded.
        """
        return self._require_pipeline().predict(X)

    def predict_proba(self, X):
        """Return the predicted probability of the positive class (column 1).

        Raises:
            RuntimeError: If no pipeline has been fitted or loaded.
        """
        return self._require_pipeline().predict_proba(X)[:, 1]

    def evaluate(self, X_test, y_test, threshold=0.5):
        """Comprehensive model evaluation.

        Prints a summary and a classification report.

        Args:
            X_test: Features to score.
            y_test: True binary labels.
            threshold: Probability at or above which a row is predicted
                positive.

        Returns:
            Dict with keys ``auc_roc``, ``average_precision``, ``precision``,
            ``recall``, ``f1`` and ``confusion_matrix`` (nested list).
        """
        y_prob = self.predict_proba(X_test)
        y_pred = (y_prob >= threshold).astype(int)
        results = {
            'auc_roc': roc_auc_score(y_test, y_prob),
            'average_precision': average_precision_score(y_test, y_prob),
            'precision': precision_score(y_test, y_pred, zero_division=0),
            'recall': recall_score(y_test, y_pred, zero_division=0),
            'f1': f1_score(y_test, y_pred, zero_division=0),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
        }
        print("\n" + "="*60)
        print("MODEL EVALUATION RESULTS")
        print("="*60)
        print(f"AUC-ROC: {results['auc_roc']:.4f}")
        print(f"Average Precision: {results['average_precision']:.4f}")
        print(f"Precision: {results['precision']:.4f}")
        print(f"Recall: {results['recall']:.4f}")
        print(f"F1 Score: {results['f1']:.4f}")
        print(f"\nConfusion Matrix:\n{np.array(results['confusion_matrix'])}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, target_names=['Stay', 'Churn']))
        return results

    def cross_validate(self, X, y, cv=5, scale_pos_weight=None):
        """Stratified cross-validation.

        Scores a fresh, throw-away pipeline so that a model already trained
        with ``fit`` (or restored with ``load``) is left untouched.

        Args:
            X: Feature DataFrame.
            y: Binary target.
            cv: Number of stratified, shuffled folds.
            scale_pos_weight: Positive-class weight for XGBoost. When ``None``
                it is derived from ``y`` exactly as ``fit`` does, so the
                cross-validated model matches the one ``fit`` trains.

        Returns:
            Array of per-fold ROC AUC scores.
        """
        numeric, categorical = self._get_feature_lists(X)
        if scale_pos_weight is None:
            scale_pos_weight = self._auto_scale_pos_weight(y)
        cv_pipeline = self._make_pipeline(numeric, categorical, scale_pos_weight)
        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=self.random_state)
        auc_scores = cross_val_score(cv_pipeline, X, y, cv=skf, scoring='roc_auc')
        print(f"\nCV AUC-ROC: {auc_scores.mean():.4f} (+/- {auc_scores.std()*2:.4f})")
        return auc_scores

    def save(self, path):
        """Persist the pipeline to ``path`` with joblib.

        Raises:
            RuntimeError: If there is no pipeline to save.
        """
        joblib.dump(self._require_pipeline(), path)
        print(f"Model saved to {path}")

    def load(self, path):
        """Load a pipeline from ``path`` and restore feature metadata.

        When the stored object is a fitted pipeline with a ``preprocessor``
        step, ``numeric_features``, ``categorical_features`` and
        ``feature_names`` are rebuilt from it; otherwise they are left as is.
        """
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load model files from a trusted source.
        self.pipeline = joblib.load(path)
        steps = getattr(self.pipeline, 'named_steps', None) or {}
        preprocessor = steps.get('preprocessor')
        if preprocessor is not None and hasattr(preprocessor, 'transformers_'):
            fitted_columns = {
                name: cols for name, _, cols in preprocessor.transformers_
            }
            self.numeric_features = list(fitted_columns.get('num', []))
            self.categorical_features = list(fitted_columns.get('cat', []))
            self.feature_names = self._output_feature_names(self.pipeline)
        print(f"Model loaded from {path}")