"""Business analysis and insights from the churn model.""" import pandas as pd import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import seaborn as sns from data_loader import load_data, clean_data from features import SubscriptionFeatureEngineer from model import PriceIncreaseChurnModel import joblib def analyze_risk_segments(df, y_prob, y_true, output_dir='/app/price_increase_model'): """Analyze customer segments by churn risk.""" df = df.copy() df['churn_prob'] = y_prob df['actual_churn'] = y_true.values # Risk tiers df['risk_tier'] = pd.cut(df['churn_prob'], bins=[0, 0.4, 0.7, 1.0], labels=['Low', 'Medium', 'High']) print("\n" + "="*60) print("RISK SEGMENT ANALYSIS") print("="*60) segment_summary = df.groupby('risk_tier').agg({ 'churn_prob': ['count', 'mean'], 'actual_churn': 'mean', 'rmr': 'mean', 'cltv': 'mean', 'tenure': 'mean', 'num_services': 'mean', 'satisfaction_score': 'mean' }).round(3) print("\nSegment Summary:") print(segment_summary) # Revenue at risk df['monthly_revenue_at_risk'] = df['rmr'] * df['churn_prob'] total_revenue_at_risk = df['monthly_revenue_at_risk'].sum() total_revenue = df['rmr'].sum() print(f"\nTotal Monthly Revenue: ${total_revenue:,.2f}") print(f"Monthly Revenue at Risk: ${total_revenue_at_risk:,.2f}") print(f"Revenue at Risk %: {total_revenue_at_risk/total_revenue*100:.1f}%") # Plot risk distribution plt.figure(figsize=(10, 6)) plt.subplot(1, 2, 1) tier_counts = df['risk_tier'].value_counts() colors = ['#2ecc71', '#f39c12', '#e74c3c'] plt.pie(tier_counts, labels=tier_counts.index, autopct='%1.1f%%', colors=colors) plt.title('Customer Distribution by Risk Tier') plt.subplot(1, 2, 2) sns.boxplot(data=df, x='risk_tier', y='rmr', palette=colors) plt.title('Monthly Revenue by Risk Tier') plt.ylabel('Monthly Revenue ($)') plt.tight_layout() plt.savefig(f'{output_dir}/risk_segments.png', dpi=150) plt.close() print(f"Risk segment plot saved to {output_dir}/risk_segments.png") return df def analyze_feature_effects(df, model, output_dir='/app/price_increase_model'): """Analyze how key features affect churn probability.""" classifier = model.pipeline.named_steps['classifier'] preprocessor = model.pipeline.named_steps['preprocessor'] # Get feature names cat_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot'] cat_names = list(cat_encoder.get_feature_names_out(model.categorical_features)) feature_names = model.numeric_features + cat_names # Get feature importances importances = classifier.feature_importances_ fi_df = pd.DataFrame({ 'feature': feature_names, 'importance': importances }).sort_values('importance', ascending=False) print("\n" + "="*60) print("TOP 15 FEATURE IMPORTANCES") print("="*60) for i, row in fi_df.head(15).iterrows(): print(f"{row['feature']:35s} {row['importance']:.4f}") return fi_df def price_sensitivity_analysis(df, model, output_dir='/app/price_increase_model'): """Analyze how different price increase % affect churn risk.""" print("\n" + "="*60) print("PRICE SENSITIVITY ANALYSIS") print("="*60) price_increases = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30] results = [] # Sample 500 customers for speed sample_df = df.sample(n=min(500, len(df)), random_state=42).copy() for pct in price_increases: engineer = SubscriptionFeatureEngineer(price_increase_pct=pct) engineered = engineer.transform(sample_df) probs = model.pipeline.predict_proba(engineered) avg_churn = probs[:, 1].mean() high_risk_pct = (probs[:, 1] >= 0.7).mean() * 100 results.append({ 'price_increase_pct': pct * 100, 'avg_churn_prob': avg_churn, 'high_risk_pct': high_risk_pct }) print(f"Price Increase {pct*100:.0f}%: Avg Churn Prob = {avg_churn:.3f}, High Risk = {high_risk_pct:.1f}%") results_df = pd.DataFrame(results) # Plot plt.figure(figsize=(10, 5)) plt.subplot(1, 2, 1) plt.plot(results_df['price_increase_pct'], results_df['avg_churn_prob'], 'o-', linewidth=2) plt.xlabel('Price Increase (%)') plt.ylabel('Average Churn Probability') plt.title('Churn Risk vs Price Increase') plt.grid(True, alpha=0.3) plt.subplot(1, 2, 2) plt.plot(results_df['price_increase_pct'], results_df['high_risk_pct'], 'o-', color='red', linewidth=2) plt.xlabel('Price Increase (%)') plt.ylabel('High Risk Customers (%)') plt.title('High Risk % vs Price Increase') plt.grid(True, alpha=0.3) plt.tight_layout() plt.savefig(f'{output_dir}/price_sensitivity.png', dpi=150) plt.close() print(f"Price sensitivity plot saved to {output_dir}/price_sensitivity.png") return results_df def main(): print("="*60) print("BUSINESS IMPACT ANALYSIS") print("="*60) # Load and prepare data df = load_data() df = clean_data(df) engineer = SubscriptionFeatureEngineer(price_increase_pct=0.15) df = engineer.transform(df) # Load model model = PriceIncreaseChurnModel() model.load('/app/price_increase_model/price_increase_churn_model.pkl') # Re-extract feature lists from data model.numeric_features, model.categorical_features = model._get_feature_lists(df.drop(columns=['Churn'])) # Predict on full dataset features_df = df.drop(columns=['Churn']) y_prob = model.predict_proba(features_df) y_true = df['Churn'] # Run analyses risk_df = analyze_risk_segments(df, y_prob, y_true) fi_df = analyze_feature_effects(df, model) sensitivity = price_sensitivity_analysis(df, model) print("\n" + "="*60) print("ANALYSIS COMPLETE") print("="*60) return risk_df, fi_df, sensitivity if __name__ == '__main__': from sklearn.metrics import roc_auc_score risk_df, fi_df, sensitivity = main()