Upload model.py with huggingface_hub

535ccdd verified 3 days ago

6.53 kB

	"""Train and evaluate price increase churn prediction model."""
	import pandas as pd
	import numpy as np
	import joblib
	from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline
	from sklearn.impute import SimpleImputer
	from sklearn.preprocessing import OneHotEncoder, StandardScaler
	from sklearn.metrics import (roc_auc_score, precision_score, recall_score,
	f1_score, classification_report, confusion_matrix,
	precision_recall_curve, average_precision_score)
	from xgboost import XGBClassifier
	import warnings
	warnings.filterwarnings('ignore')


	class PriceIncreaseChurnModel:
	    """Binary classifier: will customer cancel within 90 days after price increase?

	    Wraps a scikit-learn ``Pipeline`` made of a ``ColumnTransformer``
	    (median imputation + scaling for numeric columns, most-frequent
	    imputation + one-hot encoding for everything else) followed by an
	    ``XGBClassifier``.
	    """

	    # Columns that are never offered to the model as input features.
	    _EXCLUDED_COLUMNS = frozenset({
	        'Customer ID', 'Churn', 'Churn Category', 'Churn Reason',
	        'Customer Status', 'City', 'State', 'Zip Code', 'Lat Long',
	        'Country', 'Latitude', 'Longitude', 'Quarter', 'Offer',
	        'Internet Type', 'Gender', 'Payment Method', 'tenure_group',
	        'age_group',
	    })

	    def __init__(self, random_state=42):
	        """Create an untrained model.

	        Args:
	            random_state: Seed passed to XGBoost and to the CV splitter.
	        """
	        self.random_state = random_state
	        # Pipeline built by build_pipeline()/fit() or restored by load().
	        self.pipeline = None
	        # Output column names of the preprocessor; set by fit()/load().
	        self.feature_names = None
	        # Input columns routed to the numeric / categorical branches.
	        self.numeric_features = None
	        self.categorical_features = None

	    def _get_feature_lists(self, df):
	        """Identify numeric and categorical features from engineered dataframe.

	        Columns named in ``_EXCLUDED_COLUMNS`` are skipped. A column is
	        numeric when it has a NumPy integer, unsigned or float dtype of any
	        width; every other column (strings, booleans, pandas extension
	        dtypes, ...) is treated as categorical.

	        Args:
	            df: DataFrame holding the candidate feature columns.

	        Returns:
	            Tuple ``(numeric, categorical)`` of column-name lists, each in
	            the order the columns appear in ``df``.
	        """
	        numeric = []
	        categorical = []
	        for col, dtype in df.dtypes.items():
	            if col in self._EXCLUDED_COLUMNS:
	                continue
	            # Checking dtype.kind (not a fixed list of names) keeps narrow
	            # types such as int8/uint8/float16 out of the one-hot encoder.
	            if isinstance(dtype, np.dtype) and dtype.kind in 'iuf':
	                numeric.append(col)
	            else:
	                categorical.append(col)

	        return numeric, categorical

	    @staticmethod
	    def _auto_scale_pos_weight(y):
	        """Return the negative/positive class-count ratio of a binary target.

	        The class that sorts first is taken as the negative class.

	        Raises:
	            ValueError: If ``y`` does not contain exactly two classes.
	        """
	        counts = pd.Series(np.asarray(y).ravel()).value_counts().sort_index()
	        if len(counts) != 2:
	            raise ValueError(
	                "y must contain exactly two classes to derive "
	                f"scale_pos_weight; found {len(counts)}: {list(counts.index)}"
	            )
	        neg, pos = counts
	        return neg / pos

	    def _make_pipeline(self, numeric_features, categorical_features,
	                       scale_pos_weight):
	        """Assemble an unfitted pipeline without touching instance state."""
	        numeric_transformer = Pipeline(steps=[
	            ('imputer', SimpleImputer(strategy='median')),
	            ('scaler', StandardScaler())
	        ])

	        categorical_transformer = Pipeline(steps=[
	            ('imputer', SimpleImputer(strategy='most_frequent')),
	            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
	        ])

	        preprocessor = ColumnTransformer(
	            transformers=[
	                ('num', numeric_transformer, numeric_features),
	                ('cat', categorical_transformer, categorical_features)
	            ],
	            remainder='drop'
	        )

	        classifier = XGBClassifier(
	            n_estimators=300,
	            max_depth=6,
	            learning_rate=0.05,
	            subsample=0.8,
	            colsample_bytree=0.8,
	            scale_pos_weight=scale_pos_weight,
	            eval_metric='logloss',
	            random_state=self.random_state,
	            n_jobs=-1,
	            reg_alpha=0.1,
	            reg_lambda=1.0,
	            min_child_weight=3
	        )

	        return Pipeline(steps=[
	            ('preprocessor', preprocessor),
	            ('classifier', classifier)
	        ])

	    def _transformed_feature_names(self, preprocessor):
	        """Return preprocessor output names: numeric columns, then one-hot columns."""
	        cat_names = []
	        # With no categorical columns the 'cat' branch is never fitted, so
	        # its encoder cannot be asked for feature names.
	        if self.categorical_features:
	            encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
	            cat_names = list(encoder.get_feature_names_out(self.categorical_features))
	        return list(self.numeric_features) + cat_names

	    def _require_pipeline(self):
	        """Return ``self.pipeline`` or raise if none has been built or loaded.

	        Raises:
	            sklearn.exceptions.NotFittedError: When ``self.pipeline`` is
	                ``None``. It subclasses both ``ValueError`` and
	                ``AttributeError``, so code that caught the former
	                ``AttributeError`` keeps working.
	        """
	        if self.pipeline is None:
	            from sklearn.exceptions import NotFittedError
	            raise NotFittedError(
	                "No pipeline available: call fit() or load() first."
	            )
	        return self.pipeline

	    def build_pipeline(self, numeric_features, categorical_features,
	                       scale_pos_weight=1.0):
	        """Build sklearn pipeline with preprocessing + XGBoost.

	        Stores the feature lists and the new, unfitted pipeline on the
	        instance.

	        Args:
	            numeric_features: Columns to median-impute and standardise.
	            categorical_features: Columns to impute (most frequent) and
	                one-hot encode.
	            scale_pos_weight: Positive-class weight forwarded to XGBoost.

	        Returns:
	            The unfitted ``Pipeline`` (also stored as ``self.pipeline``).
	        """
	        self.numeric_features = numeric_features
	        self.categorical_features = categorical_features
	        self.pipeline = self._make_pipeline(
	            numeric_features, categorical_features, scale_pos_weight
	        )
	        return self.pipeline

	    def fit(self, X, y, scale_pos_weight=None):
	        """Train the model.

	        Args:
	            X: DataFrame of features; columns are split into numeric and
	                categorical by ``_get_feature_lists``.
	            y: Binary target.
	            scale_pos_weight: Positive-class weight. When ``None`` it is set
	                to the negative/positive count ratio of ``y``.

	        Returns:
	            ``self``.

	        Raises:
	            ValueError: If the weight must be derived and ``y`` does not
	                contain exactly two classes.
	        """
	        numeric, categorical = self._get_feature_lists(X)

	        if scale_pos_weight is None:
	            scale_pos_weight = self._auto_scale_pos_weight(y)
	            print(f"Auto-calculated scale_pos_weight: {scale_pos_weight:.2f}")

	        self.build_pipeline(numeric, categorical, scale_pos_weight)
	        self.pipeline.fit(X, y)

	        # Store feature names for interpretability
	        self.feature_names = self._transformed_feature_names(
	            self.pipeline.named_steps['preprocessor']
	        )

	        return self

	    def predict(self, X):
	        """Return the pipeline's hard class predictions for ``X``."""
	        return self._require_pipeline().predict(X)

	    def predict_proba(self, X):
	        """Return the second column of ``predict_proba`` (positive class)."""
	        return self._require_pipeline().predict_proba(X)[:, 1]

	    def evaluate(self, X_test, y_test, threshold=0.5):
	        """Comprehensive model evaluation.

	        Prints a report and returns the metrics.

	        Args:
	            X_test: Feature DataFrame to score.
	            y_test: True labels, encoded as 0 (stay) / 1 (churn).
	            threshold: Probability at or above which a row is predicted 1.

	        Returns:
	            Dict with keys ``auc_roc``, ``average_precision``, ``precision``,
	            ``recall``, ``f1`` and ``confusion_matrix`` (nested list).
	        """
	        y_prob = self.predict_proba(X_test)
	        y_pred = (y_prob >= threshold).astype(int)

	        # Pinning labels keeps the matrix 2x2 and the report aligned with
	        # target_names even when only one class shows up in y_test/y_pred.
	        class_labels = [0, 1]

	        results = {
	            'auc_roc': roc_auc_score(y_test, y_prob),
	            'average_precision': average_precision_score(y_test, y_prob),
	            'precision': precision_score(y_test, y_pred, zero_division=0),
	            'recall': recall_score(y_test, y_pred, zero_division=0),
	            'f1': f1_score(y_test, y_pred, zero_division=0),
	            'confusion_matrix': confusion_matrix(
	                y_test, y_pred, labels=class_labels
	            ).tolist()
	        }

	        print("\n" + "="*60)
	        print("MODEL EVALUATION RESULTS")
	        print("="*60)
	        print(f"AUC-ROC: {results['auc_roc']:.4f}")
	        print(f"Average Precision: {results['average_precision']:.4f}")
	        print(f"Precision: {results['precision']:.4f}")
	        print(f"Recall: {results['recall']:.4f}")
	        print(f"F1 Score: {results['f1']:.4f}")
	        print(f"\nConfusion Matrix:\n{np.array(results['confusion_matrix'])}")
	        print("\nClassification Report:")
	        print(classification_report(y_test, y_pred, labels=class_labels,
	                                    target_names=['Stay', 'Churn'],
	                                    zero_division=0))

	        return results

	    def cross_validate(self, X, y, cv=5, scale_pos_weight=None):
	        """Stratified cross-validation.

	        Scores a throwaway pipeline, so a model already trained with
	        ``fit`` (``self.pipeline``) is left untouched.

	        Args:
	            X: Feature DataFrame.
	            y: Binary target.
	            cv: Number of stratified, shuffled folds.
	            scale_pos_weight: Positive-class weight. When ``None`` it is
	                derived from ``y`` exactly as ``fit`` does, so the score
	                describes the configuration ``fit`` would train. Pass
	                ``1.0`` for an unweighted model.

	        Returns:
	            Array with one ROC-AUC score per fold.
	        """
	        numeric, categorical = self._get_feature_lists(X)
	        if scale_pos_weight is None:
	            scale_pos_weight = self._auto_scale_pos_weight(y)
	        candidate = self._make_pipeline(numeric, categorical, scale_pos_weight)

	        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=self.random_state)
	        auc_scores = cross_val_score(candidate, X, y, cv=skf, scoring='roc_auc')

	        print(f"\nCV AUC-ROC: {auc_scores.mean():.4f} (+/- {auc_scores.std()*2:.4f})")
	        return auc_scores

	    def save(self, path):
	        """Persist the pipeline to ``path`` with joblib.

	        Raises:
	            sklearn.exceptions.NotFittedError: If there is no pipeline to
	                save (previously ``None`` was written silently).
	        """
	        joblib.dump(self._require_pipeline(), path)
	        print(f"Model saved to {path}")

	    def load(self, path):
	        """Load a pipeline written by ``save`` and restore feature metadata.

	        SECURITY: joblib files are pickles and can execute arbitrary code
	        when loaded; only load files from a trusted source.
	        """
	        self.pipeline = joblib.load(path)

	        # Recover the feature lists from the fitted ColumnTransformer so
	        # feature_names is usable after loading, not only after fit().
	        named_steps = getattr(self.pipeline, 'named_steps', None)
	        preprocessor = named_steps.get('preprocessor') if named_steps else None
	        fitted = getattr(preprocessor, 'transformers_', None)
	        if fitted is not None:
	            columns = {name: cols for name, _, cols in fitted}
	            self.numeric_features = list(columns.get('num', []))
	            self.categorical_features = list(columns.get('cat', []))
	            self.feature_names = self._transformed_feature_names(preprocessor)

	        print(f"Model loaded from {path}")