Upload analysis.py with huggingface_hub

9174e35 verified about 22 hours ago

6.33 kB

	"""Business analysis and insights from the churn model."""
	import pandas as pd
	import numpy as np
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	import seaborn as sns
	from data_loader import load_data, clean_data
	from features import SubscriptionFeatureEngineer
	from model import PriceIncreaseChurnModel
	import joblib


	def analyze_risk_segments(df, y_prob, y_true, output_dir='/app/price_increase_model'):
	    """Bucket customers into churn-risk tiers, report revenue at risk, and plot.

	    Args:
	        df: Customer frame. The code reads the columns ``rmr``, ``cltv``,
	            ``tenure``, ``num_services`` and ``satisfaction_score``.
	        y_prob: One predicted churn probability per row of ``df``.
	        y_true: Observed churn labels, one per row, matched by position
	            (Series, array, or list).
	        output_dir: Directory that receives ``risk_segments.png``; it is
	            created when missing.

	    Returns:
	        A copy of ``df`` with ``churn_prob``, ``actual_churn``, ``risk_tier``
	        and ``monthly_revenue_at_risk`` columns added.
	    """
	    import os

	    tier_labels = ['Low', 'Medium', 'High']
	    tier_colors = ['#2ecc71', '#f39c12', '#e74c3c']
	    color_by_tier = dict(zip(tier_labels, tier_colors))

	    df = df.copy()
	    df['churn_prob'] = y_prob
	    # Positional assignment: strip any index so labels line up row by row.
	    df['actual_churn'] = np.asarray(y_true)

	    # Tiers are [0, 0.4], (0.4, 0.7], (0.7, 1.0]. include_lowest keeps a
	    # probability of exactly 0.0 in 'Low' instead of leaving it as NaN.
	    # NOTE(review): price_sensitivity_analysis counts >= 0.7 as high risk,
	    # whereas exactly 0.7 lands in 'Medium' here - confirm which is intended.
	    df['risk_tier'] = pd.cut(
	        df['churn_prob'],
	        bins=[0, 0.4, 0.7, 1.0],
	        labels=tier_labels,
	        include_lowest=True,
	    )

	    print("\n" + "="*60)
	    print("RISK SEGMENT ANALYSIS")
	    print("="*60)

	    # observed=False spells out the historical default (empty tiers still get
	    # a row) and avoids the pandas deprecation warning for categorical keys.
	    segment_summary = df.groupby('risk_tier', observed=False).agg({
	        'churn_prob': ['count', 'mean'],
	        'actual_churn': 'mean',
	        'rmr': 'mean',
	        'cltv': 'mean',
	        'tenure': 'mean',
	        'num_services': 'mean',
	        'satisfaction_score': 'mean',
	    }).round(3)
	    print("\nSegment Summary:")
	    print(segment_summary)

	    # Expected monthly revenue loss: each customer's revenue weighted by
	    # their churn probability.
	    df['monthly_revenue_at_risk'] = df['rmr'] * df['churn_prob']
	    total_revenue_at_risk = df['monthly_revenue_at_risk'].sum()
	    total_revenue = df['rmr'].sum()
	    # Avoid dividing by zero when there is no revenue at all (e.g. empty df).
	    at_risk_pct = total_revenue_at_risk / total_revenue * 100 if total_revenue else 0.0

	    print(f"\nTotal Monthly Revenue: ${total_revenue:,.2f}")
	    print(f"Monthly Revenue at Risk: ${total_revenue_at_risk:,.2f}")
	    print(f"Revenue at Risk %: {at_risk_pct:.1f}%")

	    # value_counts() orders by frequency, so walk the tiers explicitly to keep
	    # each wedge paired with its own colour; empty tiers are left out.
	    counts = df['risk_tier'].value_counts()
	    wedges = [(tier, counts[tier]) for tier in tier_labels if counts.get(tier, 0) > 0]

	    os.makedirs(output_dir, exist_ok=True)
	    plot_path = f'{output_dir}/risk_segments.png'

	    fig, (pie_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 6))
	    try:
	        pie_ax.pie(
	            [size for _, size in wedges],
	            labels=[tier for tier, _ in wedges],
	            autopct='%1.1f%%',
	            colors=[color_by_tier[tier] for tier, _ in wedges],
	        )
	        pie_ax.set_title('Customer Distribution by Risk Tier')

	        sns.boxplot(data=df, x='risk_tier', y='rmr', order=tier_labels,
	                    palette=tier_colors, ax=box_ax)
	        box_ax.set_title('Monthly Revenue by Risk Tier')
	        box_ax.set_ylabel('Monthly Revenue ($)')

	        fig.tight_layout()
	        fig.savefig(plot_path, dpi=150)
	    finally:
	        # Release the figure even if plotting or saving fails.
	        plt.close(fig)
	    print(f"Risk segment plot saved to {output_dir}/risk_segments.png")

	    return df


	def analyze_feature_effects(df, model, output_dir='/app/price_increase_model', top_n=15):
	    """Rank the classifier's features by importance and print the leaders.

	    Args:
	        df: Unused; kept so existing callers keep working.
	        model: Object exposing ``pipeline.named_steps`` with ``'classifier'``
	            and ``'preprocessor'`` entries, plus ``numeric_features`` and
	            ``categorical_features`` sequences.
	        output_dir: Unused; kept so existing callers keep working.
	        top_n: Number of rows to print (the full ranking is still returned).

	    Returns:
	        DataFrame with ``feature`` and ``importance`` columns, sorted by
	        importance in descending order.

	    Raises:
	        ValueError: If the number of reconstructed feature names differs from
	            the number of importances reported by the classifier.
	    """
	    steps = model.pipeline.named_steps
	    classifier = steps['classifier']
	    preprocessor = steps['preprocessor']

	    # Rebuild the post-encoding column names: numeric names first, then the
	    # one-hot expanded categorical names.
	    # assumes the preprocessor emits numeric columns before the one-hot
	    # columns - TODO confirm against the pipeline definition
	    onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
	    cat_names = list(onehot.get_feature_names_out(model.categorical_features))
	    # list() lets numeric_features be any sequence (tuple, Index, ...).
	    feature_names = list(model.numeric_features) + cat_names

	    importances = classifier.feature_importances_
	    if len(feature_names) != len(importances):
	        raise ValueError(
	            f"Got {len(feature_names)} feature names but "
	            f"{len(importances)} importances; the model's feature lists do "
	            "not match the fitted pipeline"
	        )

	    fi_df = pd.DataFrame({
	        'feature': feature_names,
	        'importance': importances,
	    }).sort_values('importance', ascending=False)

	    print("\n" + "="*60)
	    print(f"TOP {top_n} FEATURE IMPORTANCES")
	    print("="*60)
	    top = fi_df.head(top_n)
	    for name, score in zip(top['feature'], top['importance']):
	        print(f"{name:35s} {score:.4f}")

	    return fi_df


	def price_sensitivity_analysis(df, model, output_dir='/app/price_increase_model',
	                               price_increases=None, sample_size=500,
	                               high_risk_threshold=0.7):
	    """Score a customer sample under several price increases and plot the trend.

	    Args:
	        df: Customer frame accepted by ``SubscriptionFeatureEngineer.transform``.
	        model: Object whose ``pipeline.predict_proba`` returns a 2-D array;
	            column 1 is read as the churn probability.
	        output_dir: Directory that receives ``price_sensitivity.png``; it is
	            created when missing.
	        price_increases: Fractional price increases to simulate. Defaults to
	            5% through 30% in 5-point steps.
	        sample_size: Maximum number of customers to score per scenario.
	        high_risk_threshold: Probability at or above which a customer counts
	            as high risk.

	    Returns:
	        DataFrame with one row per scenario and the columns
	        ``price_increase_pct`` (in percent), ``avg_churn_prob`` and
	        ``high_risk_pct``.

	    Raises:
	        ValueError: If ``df`` has no rows.
	    """
	    import os

	    if price_increases is None:
	        price_increases = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
	    if len(df) == 0:
	        raise ValueError("price_sensitivity_analysis requires at least one customer row")

	    print("\n" + "="*60)
	    print("PRICE SENSITIVITY ANALYSIS")
	    print("="*60)

	    # A fixed-seed subsample keeps the run fast and guarantees every scenario
	    # is scored on the same customers.
	    sample_df = df.sample(n=min(sample_size, len(df)), random_state=42).copy()

	    # NOTE(review): main() passes a frame that was already transformed at 15%;
	    # confirm SubscriptionFeatureEngineer.transform is safe to apply again.
	    results = []
	    for pct in price_increases:
	        engineered = SubscriptionFeatureEngineer(price_increase_pct=pct).transform(sample_df)
	        # Keep the target out of the model input, matching how main() scores
	        # the full dataset.
	        engineered = engineered.drop(columns=['Churn'], errors='ignore')
	        churn_probs = model.pipeline.predict_proba(engineered)[:, 1]
	        avg_churn = churn_probs.mean()
	        high_risk_pct = (churn_probs >= high_risk_threshold).mean() * 100

	        results.append({
	            'price_increase_pct': pct * 100,
	            'avg_churn_prob': avg_churn,
	            'high_risk_pct': high_risk_pct,
	        })
	        print(f"Price Increase {pct*100:.0f}%: Avg Churn Prob = {avg_churn:.3f}, High Risk = {high_risk_pct:.1f}%")

	    # Explicit columns keep the frame well-formed when no scenario was given.
	    results_df = pd.DataFrame(
	        results, columns=['price_increase_pct', 'avg_churn_prob', 'high_risk_pct']
	    )

	    os.makedirs(output_dir, exist_ok=True)
	    plot_path = f'{output_dir}/price_sensitivity.png'

	    fig, (avg_ax, high_ax) = plt.subplots(1, 2, figsize=(10, 5))
	    try:
	        avg_ax.plot(results_df['price_increase_pct'], results_df['avg_churn_prob'],
	                    'o-', linewidth=2)
	        avg_ax.set_xlabel('Price Increase (%)')
	        avg_ax.set_ylabel('Average Churn Probability')
	        avg_ax.set_title('Churn Risk vs Price Increase')
	        avg_ax.grid(True, alpha=0.3)

	        high_ax.plot(results_df['price_increase_pct'], results_df['high_risk_pct'],
	                     'o-', color='red', linewidth=2)
	        high_ax.set_xlabel('Price Increase (%)')
	        high_ax.set_ylabel('High Risk Customers (%)')
	        high_ax.set_title('High Risk % vs Price Increase')
	        high_ax.grid(True, alpha=0.3)

	        fig.tight_layout()
	        fig.savefig(plot_path, dpi=150)
	    finally:
	        # Release the figure even if plotting or saving fails.
	        plt.close(fig)
	    print(f"Price sensitivity plot saved to {output_dir}/price_sensitivity.png")

	    return results_df


	def main(output_dir='/app/price_increase_model', model_path=None, price_increase_pct=0.15):
	    """Run the risk-segment, feature-importance and price-sensitivity analyses.

	    Args:
	        output_dir: Directory handed to each analysis for its plots.
	        model_path: File passed to ``PriceIncreaseChurnModel.load``. Defaults
	            to ``price_increase_churn_model.pkl`` inside ``output_dir``.
	        price_increase_pct: Fractional price increase used when engineering
	            features for the full-dataset scoring.

	    Returns:
	        Tuple ``(risk_df, fi_df, sensitivity)`` holding the three analyses'
	        return values, in that order.
	    """
	    if model_path is None:
	        model_path = f'{output_dir}/price_increase_churn_model.pkl'

	    print("="*60)
	    print("BUSINESS IMPACT ANALYSIS")
	    print("="*60)

	    # Load and prepare data
	    df = clean_data(load_data())
	    df = SubscriptionFeatureEngineer(price_increase_pct=price_increase_pct).transform(df)

	    # Load model
	    # NOTE(review): the .pkl suffix suggests pickle/joblib deserialization -
	    # only point model_path at trusted files.
	    model = PriceIncreaseChurnModel()
	    model.load(model_path)

	    # Split target from features once; both the feature-list extraction and
	    # the prediction below use the same frame.
	    features_df = df.drop(columns=['Churn'])
	    y_true = df['Churn']

	    # Re-extract feature lists from data
	    model.numeric_features, model.categorical_features = model._get_feature_lists(features_df)

	    # Predict on full dataset
	    y_prob = model.predict_proba(features_df)

	    # Run analyses
	    risk_df = analyze_risk_segments(df, y_prob, y_true, output_dir=output_dir)
	    fi_df = analyze_feature_effects(df, model, output_dir=output_dir)
	    sensitivity = price_sensitivity_analysis(df, model, output_dir=output_dir)

	    print("\n" + "="*60)
	    print("ANALYSIS COMPLETE")
	    print("="*60)

	    return risk_df, fi_df, sensitivity


	if __name__ == '__main__':
	    # Script entry point. The results stay bound at module level so they can
	    # be inspected afterwards (e.g. with ``python -i``).
	    risk_df, fi_df, sensitivity = main()