# Sairamr46's picture
# Upload analysis.py with huggingface_hub
# 9174e35 verified
"""Business analysis and insights from the churn model."""
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from data_loader import load_data, clean_data
from features import SubscriptionFeatureEngineer
from model import PriceIncreaseChurnModel
import joblib
def analyze_risk_segments(df, y_prob, y_true, output_dir='/app/price_increase_model'):
    """Analyze customer segments by churn risk.

    Buckets every row into a Low / Medium / High risk tier from its
    predicted churn probability, prints a per-tier summary plus the expected
    monthly revenue at risk, and saves a two-panel figure
    (``risk_segments.png``) into ``output_dir``.

    Args:
        df: Customer DataFrame. This function reads the columns ``rmr``,
            ``cltv``, ``tenure``, ``num_services`` and ``satisfaction_score``.
        y_prob: Predicted churn probabilities, one per row of ``df``.
        y_true: Actual churn labels, one per row of ``df``. A pandas Series
            or any array-like; values are matched to rows by position.
        output_dir: Directory the plot is written to; created if missing.

    Returns:
        A copy of ``df`` with the added columns ``churn_prob``,
        ``actual_churn``, ``risk_tier`` and ``monthly_revenue_at_risk``.
    """
    import os

    df = df.copy()
    df['churn_prob'] = y_prob
    # np.asarray drops any index so the labels are assigned positionally,
    # and also accepts plain lists / ndarrays (which have no ``.values``).
    df['actual_churn'] = np.asarray(y_true)

    # Risk tiers. Bins are right-closed, i.e. (0, 0.4], (0.4, 0.7], (0.7, 1.0];
    # include_lowest=True makes the first bin also contain exactly 0.0, which
    # would otherwise be left without a tier (NaN).
    tier_labels = ['Low', 'Medium', 'High']
    tier_colors = dict(zip(tier_labels, ['#2ecc71', '#f39c12', '#e74c3c']))
    df['risk_tier'] = pd.cut(
        df['churn_prob'],
        bins=[0, 0.4, 0.7, 1.0],
        labels=tier_labels,
        include_lowest=True,
    )

    print("\n" + "="*60)
    print("RISK SEGMENT ANALYSIS")
    print("="*60)

    # observed=False keeps every tier in the summary (even empty ones) and
    # makes the choice explicit instead of relying on the pandas default.
    segment_summary = df.groupby('risk_tier', observed=False).agg({
        'churn_prob': ['count', 'mean'],
        'actual_churn': 'mean',
        'rmr': 'mean',
        'cltv': 'mean',
        'tenure': 'mean',
        'num_services': 'mean',
        'satisfaction_score': 'mean',
    }).round(3)
    print("\nSegment Summary:")
    print(segment_summary)

    # Revenue at risk: each customer's monthly revenue weighted by churn probability.
    df['monthly_revenue_at_risk'] = df['rmr'] * df['churn_prob']
    total_revenue_at_risk = df['monthly_revenue_at_risk'].sum()
    total_revenue = df['rmr'].sum()
    # Guard the ratio so a zero revenue total does not print nan/inf.
    if total_revenue != 0:
        revenue_at_risk_pct = total_revenue_at_risk / total_revenue * 100
    else:
        revenue_at_risk_pct = 0.0
    print(f"\nTotal Monthly Revenue: ${total_revenue:,.2f}")
    print(f"Monthly Revenue at Risk: ${total_revenue_at_risk:,.2f}")
    print(f"Revenue at Risk %: {revenue_at_risk_pct:.1f}%")

    # Plot risk distribution
    plt.figure(figsize=(10, 6))

    plt.subplot(1, 2, 1)
    # value_counts() orders by frequency, so look the tiers up in their fixed
    # Low/Medium/High order to keep each tier paired with its own colour.
    # Empty tiers are skipped so they do not produce zero-width wedges.
    tier_counts = df['risk_tier'].value_counts()
    present_tiers = [tier for tier in tier_labels if tier_counts.get(tier, 0) > 0]
    plt.pie(
        [tier_counts[tier] for tier in present_tiers],
        labels=present_tiers,
        autopct='%1.1f%%',
        colors=[tier_colors[tier] for tier in present_tiers],
    )
    plt.title('Customer Distribution by Risk Tier')

    plt.subplot(1, 2, 2)
    # Explicit order keeps the palette aligned with the tier labels.
    sns.boxplot(
        data=df,
        x='risk_tier',
        y='rmr',
        order=tier_labels,
        palette=[tier_colors[tier] for tier in tier_labels],
    )
    plt.title('Monthly Revenue by Risk Tier')
    plt.ylabel('Monthly Revenue ($)')

    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(f'{output_dir}/risk_segments.png', dpi=150)
    plt.close()
    print(f"Risk segment plot saved to {output_dir}/risk_segments.png")

    return df
def analyze_feature_effects(df, model, output_dir='/app/price_increase_model'):
    """Print and return the classifier's feature importances, highest first.

    Feature names are rebuilt as the model's numeric feature names followed
    by the one-hot encoder's output names for the categorical features, then
    paired with ``classifier.feature_importances_``.

    Args:
        df: Unused here; kept so the signature matches the other analyses.
        model: Object exposing ``pipeline.named_steps`` with the entries
            ``'classifier'`` and ``'preprocessor'``, plus the attributes
            ``numeric_features`` and ``categorical_features``.
        output_dir: Unused here; kept for signature compatibility.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        ``importance`` in descending order.
    """
    pipeline_steps = model.pipeline.named_steps
    classifier = pipeline_steps['classifier']
    preprocessor = pipeline_steps['preprocessor']

    # Get feature names
    cat_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_names = list(cat_encoder.get_feature_names_out(model.categorical_features))
    # list(...) so a tuple or pandas Index of numeric names is concatenated
    # as a sequence instead of raising or adding element-wise.
    feature_names = list(model.numeric_features) + cat_names

    # Get feature importances
    fi_df = pd.DataFrame({
        'feature': feature_names,
        'importance': classifier.feature_importances_,
    }).sort_values('importance', ascending=False)

    print("\n" + "="*60)
    print("TOP 15 FEATURE IMPORTANCES")
    print("="*60)
    top_features = fi_df.head(15)
    for name, importance in zip(top_features['feature'], top_features['importance']):
        print(f"{name:35s} {importance:.4f}")

    return fi_df
def price_sensitivity_analysis(df, model, output_dir='/app/price_increase_model',
                               price_increases=None, sample_size=500,
                               high_risk_threshold=0.7):
    """Analyze how different price increase % affect churn risk.

    For each candidate price increase, re-runs feature engineering on a fixed
    random sample of customers, scores the sample with ``model.pipeline`` and
    records the mean churn probability and the share of customers at or above
    the high-risk threshold. Results are printed, plotted to
    ``price_sensitivity.png`` in ``output_dir`` and returned.

    Args:
        df: Customer DataFrame accepted by ``SubscriptionFeatureEngineer.transform``.
        model: Object exposing ``pipeline.predict_proba``; column 1 of its
            output is used as the churn probability.
        output_dir: Directory the plot is written to; created if missing.
        price_increases: Iterable of price increases as fractions
            (e.g. ``0.10`` for 10%). Defaults to 5%-30% in 5-point steps.
        sample_size: Maximum number of customers sampled (for speed).
        high_risk_threshold: Probability at or above which a customer counts
            as high risk.

    Returns:
        DataFrame with one row per price increase and the columns
        ``price_increase_pct`` (in percent), ``avg_churn_prob`` and
        ``high_risk_pct``.
    """
    import os

    if price_increases is None:
        price_increases = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]

    print("\n" + "="*60)
    print("PRICE SENSITIVITY ANALYSIS")
    print("="*60)

    # Score a fixed-seed sample so every price point sees the same customers.
    sample_df = df.sample(n=min(sample_size, len(df)), random_state=42).copy()

    # NOTE(review): main() passes a frame that was already engineered and
    # still contains 'Churn'; this assumes transform() can be re-applied and
    # that the pipeline ignores extra columns - confirm.
    results = []
    for pct in price_increases:
        engineer = SubscriptionFeatureEngineer(price_increase_pct=pct)
        engineered = engineer.transform(sample_df)
        churn_probs = model.pipeline.predict_proba(engineered)[:, 1]
        avg_churn = churn_probs.mean()
        high_risk_pct = (churn_probs >= high_risk_threshold).mean() * 100
        results.append({
            'price_increase_pct': pct * 100,
            'avg_churn_prob': avg_churn,
            'high_risk_pct': high_risk_pct,
        })
        print(f"Price Increase {pct*100:.0f}%: Avg Churn Prob = {avg_churn:.3f}, High Risk = {high_risk_pct:.1f}%")

    results_df = pd.DataFrame(results)

    # Plot
    plt.figure(figsize=(10, 5))

    plt.subplot(1, 2, 1)
    plt.plot(results_df['price_increase_pct'], results_df['avg_churn_prob'], 'o-', linewidth=2)
    plt.xlabel('Price Increase (%)')
    plt.ylabel('Average Churn Probability')
    plt.title('Churn Risk vs Price Increase')
    plt.grid(True, alpha=0.3)

    plt.subplot(1, 2, 2)
    plt.plot(results_df['price_increase_pct'], results_df['high_risk_pct'], 'o-',
             color='red', linewidth=2)
    plt.xlabel('Price Increase (%)')
    plt.ylabel('High Risk Customers (%)')
    plt.title('High Risk % vs Price Increase')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(f'{output_dir}/price_sensitivity.png', dpi=150)
    plt.close()
    print(f"Price sensitivity plot saved to {output_dir}/price_sensitivity.png")

    return results_df
def main():
    """Run the full business-impact analysis end to end.

    Loads and cleans the data, engineers features for a 15% price increase,
    loads the saved churn model, scores every row, then runs the risk-segment,
    feature-importance and price-sensitivity analyses in that order.

    Returns:
        Tuple ``(risk_df, fi_df, sensitivity)`` holding the three analyses'
        return values.
    """
    rule = "="*60
    print(rule)
    print("BUSINESS IMPACT ANALYSIS")
    print(rule)

    # Load and prepare data
    cleaned = clean_data(load_data())
    df = SubscriptionFeatureEngineer(price_increase_pct=0.15).transform(cleaned)

    # Load model
    model = PriceIncreaseChurnModel()
    model.load('/app/price_increase_model/price_increase_churn_model.pkl')

    # Re-extract feature lists from data (target column excluded)
    numeric_cols, categorical_cols = model._get_feature_lists(df.drop(columns=['Churn']))
    model.numeric_features = numeric_cols
    model.categorical_features = categorical_cols

    # Predict on full dataset
    y_prob = model.predict_proba(df.drop(columns=['Churn']))
    y_true = df['Churn']

    # Run analyses
    risk_df = analyze_risk_segments(df, y_prob, y_true)
    fi_df = analyze_feature_effects(df, model)
    sensitivity = price_sensitivity_analysis(df, model)

    print("\n" + rule)
    print("ANALYSIS COMPLETE")
    print(rule)

    return risk_df, fi_df, sensitivity
# Script entry point: run the analysis and keep the results at module level.
if __name__ == "__main__":
    # Not referenced in the visible code; retained because it binds a
    # module-level name when the file is run as a script.
    from sklearn.metrics import roc_auc_score

    risk_df, fi_df, sensitivity = main()