| """Business analysis and insights from the churn model.""" |
| import pandas as pd |
| import numpy as np |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| from data_loader import load_data, clean_data |
| from features import SubscriptionFeatureEngineer |
| from model import PriceIncreaseChurnModel |
| import joblib |
|
|
|
|
def analyze_risk_segments(df, y_prob, y_true, output_dir='/app/price_increase_model'):
    """Bucket customers into churn-risk tiers and report revenue at risk.

    Works on a copy of ``df``. The copy gains four columns: ``churn_prob``,
    ``actual_churn``, ``risk_tier`` and ``monthly_revenue_at_risk``. A
    per-tier summary and revenue totals are printed, and a two-panel figure
    (tier share pie chart, revenue box plot) is written to
    ``{output_dir}/risk_segments.png``.

    Args:
        df: Customer frame. Must contain the columns aggregated below:
            ``rmr``, ``cltv``, ``tenure``, ``num_services`` and
            ``satisfaction_score``.
        y_prob: Predicted churn probability for each row of ``df``, in row
            order. Values outside [0, 1] fall outside the bins and get a
            missing ``risk_tier``.
        y_true: Observed churn label for each row, in row order. A pandas
            Series, NumPy array or plain sequence.
        output_dir: Directory the figure is written to; created if missing.

    Returns:
        The augmented copy of ``df``.
    """
    import os

    tier_labels = ['Low', 'Medium', 'High']
    tier_colors = ['#2ecc71', '#f39c12', '#e74c3c']  # green / orange / red

    df = df.copy()
    df['churn_prob'] = y_prob
    # Assign positionally (ignoring any index on y_true). Falling back to the
    # object itself lets arrays and lists work as well as Series.
    df['actual_churn'] = getattr(y_true, 'values', y_true)

    # include_lowest=True closes the first bin on the left so that a
    # probability of exactly 0.0 is tiered as 'Low' instead of becoming NaN.
    df['risk_tier'] = pd.cut(df['churn_prob'],
                             bins=[0, 0.4, 0.7, 1.0],
                             labels=tier_labels,
                             include_lowest=True)

    print("\n" + "="*60)
    print("RISK SEGMENT ANALYSIS")
    print("="*60)

    # observed=False keeps every tier in the summary (even empty ones) and
    # states the grouping behaviour explicitly for categorical keys.
    segment_summary = df.groupby('risk_tier', observed=False).agg({
        'churn_prob': ['count', 'mean'],
        'actual_churn': 'mean',
        'rmr': 'mean',
        'cltv': 'mean',
        'tenure': 'mean',
        'num_services': 'mean',
        'satisfaction_score': 'mean'
    }).round(3)
    print("\nSegment Summary:")
    print(segment_summary)

    # Expected monthly loss per customer: revenue weighted by churn probability.
    df['monthly_revenue_at_risk'] = df['rmr'] * df['churn_prob']
    total_revenue_at_risk = df['monthly_revenue_at_risk'].sum()
    total_revenue = df['rmr'].sum()

    print(f"\nTotal Monthly Revenue: ${total_revenue:,.2f}")
    print(f"Monthly Revenue at Risk: ${total_revenue_at_risk:,.2f}")
    if total_revenue != 0:
        print(f"Revenue at Risk %: {total_revenue_at_risk/total_revenue*100:.1f}%")
    else:
        # Avoid a 0/0 percentage when there is no revenue (e.g. empty frame).
        print("Revenue at Risk %: n/a (total revenue is zero)")

    fig, (pie_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 6))

    # value_counts() orders by frequency; reindex to Low/Medium/High so each
    # wedge is paired with its intended colour.
    tier_counts = df['risk_tier'].value_counts().reindex(tier_labels, fill_value=0)
    pie_ax.pie(tier_counts, labels=tier_counts.index, autopct='%1.1f%%',
               colors=tier_colors)
    pie_ax.set_title('Customer Distribution by Risk Tier')

    sns.boxplot(data=df, x='risk_tier', y='rmr', order=tier_labels,
                palette=tier_colors, ax=box_ax)
    box_ax.set_title('Monthly Revenue by Risk Tier')
    box_ax.set_ylabel('Monthly Revenue ($)')

    fig.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(f'{output_dir}/risk_segments.png', dpi=150)
    plt.close(fig)
    print(f"Risk segment plot saved to {output_dir}/risk_segments.png")

    return df
|
|
|
|
def analyze_feature_effects(df, model, output_dir='/app/price_increase_model'):
    """Rank the fitted classifier's features by importance.

    Feature names are rebuilt as the model's numeric feature list followed by
    the one-hot encoder's output names for the categorical features, then
    paired with ``feature_importances_`` from the pipeline's classifier step.
    The 15 highest-ranked features are printed.

    Args:
        df: Not used by this function.
        model: Object exposing ``pipeline`` (with ``classifier`` and
            ``preprocessor`` named steps), ``numeric_features`` and
            ``categorical_features``.
        output_dir: Not used by this function.

    Returns:
        DataFrame with ``feature`` and ``importance`` columns, sorted by
        importance in descending order.
    """
    steps = model.pipeline.named_steps
    clf = steps['classifier']
    onehot = steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']

    # Numeric names first, then the expanded one-hot column names.
    encoded_names = list(onehot.get_feature_names_out(model.categorical_features))
    all_names = model.numeric_features + encoded_names

    ranking = pd.DataFrame({'feature': all_names,
                            'importance': clf.feature_importances_})
    ranking = ranking.sort_values('importance', ascending=False)

    banner = "=" * 60
    print("\n" + banner)
    print("TOP 15 FEATURE IMPORTANCES")
    print(banner)
    top = ranking.head(15)
    for name, score in zip(top['feature'], top['importance']):
        print(f"{name:35s} {score:.4f}")

    return ranking
|
|
|
|
def price_sensitivity_analysis(df, model, output_dir='/app/price_increase_model',
                               price_increases=None, sample_size=500,
                               high_risk_threshold=0.7):
    """Score a customer sample under several price-increase scenarios.

    For each scenario a ``SubscriptionFeatureEngineer`` is built with that
    ``price_increase_pct``, the sample is transformed, and the model pipeline's
    positive-class probabilities are summarised. One line is printed per
    scenario and a two-panel line plot is written to
    ``{output_dir}/price_sensitivity.png``.

    Args:
        df: Customer frame to sample from.
        model: Object exposing ``pipeline.predict_proba``; column 1 of its
            output is treated as the churn probability.
        output_dir: Directory the figure is written to; created if missing.
        price_increases: Iterable of fractional increases (0.10 means 10%).
            Defaults to 5% through 30% in 5-point steps.
        sample_size: Maximum number of rows to score per scenario.
        high_risk_threshold: Probability at or above which a customer is
            counted as high risk.

    Returns:
        DataFrame with one row per scenario and the columns
        ``price_increase_pct`` (in percent), ``avg_churn_prob`` and
        ``high_risk_pct`` (in percent).
    """
    import os

    if price_increases is None:
        price_increases = (0.05, 0.10, 0.15, 0.20, 0.25, 0.30)

    print("\n" + "="*60)
    print("PRICE SENSITIVITY ANALYSIS")
    print("="*60)

    # Fixed seed so every scenario, and every run, scores the same customers.
    sample_df = df.sample(n=min(sample_size, len(df)), random_state=42).copy()

    results = []
    for pct in price_increases:
        engineer = SubscriptionFeatureEngineer(price_increase_pct=pct)
        engineered = engineer.transform(sample_df)
        # Slice the positive-class column once and reuse it for both metrics.
        churn_probs = model.pipeline.predict_proba(engineered)[:, 1]
        avg_churn = churn_probs.mean()
        high_risk_pct = (churn_probs >= high_risk_threshold).mean() * 100

        results.append({
            'price_increase_pct': pct * 100,
            'avg_churn_prob': avg_churn,
            'high_risk_pct': high_risk_pct
        })
        print(f"Price Increase {pct*100:.0f}%: Avg Churn Prob = {avg_churn:.3f}, High Risk = {high_risk_pct:.1f}%")

    # Explicit columns keep the frame well-formed even with no scenarios.
    results_df = pd.DataFrame(
        results,
        columns=['price_increase_pct', 'avg_churn_prob', 'high_risk_pct'])

    fig, (avg_ax, high_ax) = plt.subplots(1, 2, figsize=(10, 5))

    avg_ax.plot(results_df['price_increase_pct'], results_df['avg_churn_prob'],
                'o-', linewidth=2)
    avg_ax.set_xlabel('Price Increase (%)')
    avg_ax.set_ylabel('Average Churn Probability')
    avg_ax.set_title('Churn Risk vs Price Increase')
    avg_ax.grid(True, alpha=0.3)

    high_ax.plot(results_df['price_increase_pct'], results_df['high_risk_pct'],
                 'o-', color='red', linewidth=2)
    high_ax.set_xlabel('Price Increase (%)')
    high_ax.set_ylabel('High Risk Customers (%)')
    high_ax.set_title('High Risk % vs Price Increase')
    high_ax.grid(True, alpha=0.3)

    fig.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(f'{output_dir}/price_sensitivity.png', dpi=150)
    plt.close(fig)
    print(f"Price sensitivity plot saved to {output_dir}/price_sensitivity.png")

    return results_df
|
|
|
|
def main(model_dir='/app/price_increase_model', price_increase_pct=0.15):
    """Run the full business-impact analysis and return its three results.

    Loads and cleans the data, engineers features for the given price
    increase, loads the saved churn model from ``model_dir``, scores every
    customer, then runs the risk-segment, feature-importance and
    price-sensitivity analyses in that order.

    Args:
        model_dir: Directory holding ``price_increase_churn_model.pkl``; also
            passed to the analyses as their output directory.
        price_increase_pct: Fractional price increase used for the baseline
            feature engineering (0.15 means 15%).

    Returns:
        Tuple ``(risk_df, fi_df, sensitivity)`` as returned by
        ``analyze_risk_segments``, ``analyze_feature_effects`` and
        ``price_sensitivity_analysis`` respectively.
    """
    print("="*60)
    print("BUSINESS IMPACT ANALYSIS")
    print("="*60)

    df = clean_data(load_data())
    engineer = SubscriptionFeatureEngineer(price_increase_pct=price_increase_pct)
    df = engineer.transform(df)

    model = PriceIncreaseChurnModel()
    model.load(f'{model_dir}/price_increase_churn_model.pkl')

    # Build the feature frame once and reuse it for both the feature-list
    # lookup and the scoring call.
    features_df = df.drop(columns=['Churn'])

    # NOTE(review): the feature lists are re-derived here after load(),
    # presumably because load() does not restore them -- confirm against
    # PriceIncreaseChurnModel.
    model.numeric_features, model.categorical_features = model._get_feature_lists(features_df)

    y_prob = model.predict_proba(features_df)
    y_true = df['Churn']

    risk_df = analyze_risk_segments(df, y_prob, y_true, output_dir=model_dir)
    fi_df = analyze_feature_effects(df, model, output_dir=model_dir)
    sensitivity = price_sensitivity_analysis(df, model, output_dir=model_dir)

    print("\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)

    return risk_df, fi_df, sensitivity
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run the analysis and bind its three results at
    # module scope.
    risk_df, fi_df, sensitivity = main()
|
|