"""Feature engineering for subscription price-increase churn prediction.""" import pandas as pd import numpy as np from sklearn.base import BaseEstimator, TransformerMixin class SubscriptionFeatureEngineer(BaseEstimator, TransformerMixin): """Engineer features from raw subscription data.""" def __init__(self, price_increase_pct=0.15): self.price_increase_pct = price_increase_pct def fit(self, X, y=None): return self def transform(self, X): df = X.copy() # --- Core subscription features (directly from user's columns) --- # 1. TENURE features (tenure is #1 protective factor per literature) df['tenure'] = df['Tenure in Months'] df['tenure_log'] = np.log1p(df['tenure']) df['tenure_squared'] = df['tenure'] ** 2 df['tenure_group'] = pd.cut(df['tenure'], bins=[0, 6, 12, 24, 48, 999], labels=['0-6m', '6-12m', '1-2y', '2-4y', '4y+']) # 2. CONTRACT / days left features contract_map = {'Month-to-Month': 0, 'One Year': 12, 'Two Year': 24} df['contract_months'] = df['Contract'].map(contract_map).fillna(0) df['days_left_in_contract'] = (df['contract_months'] - df['tenure']).clip(lower=0) * 30 df['contract_ending_soon'] = (df['days_left_in_contract'] <= 90).astype(int) df['is_month_to_month'] = (df['Contract'] == 'Month-to-Month').astype(int) # 3. RMR (Recurring Monthly Revenue) features df['rmr'] = df['Monthly Charge'] df['rmr_log'] = np.log1p(df['rmr']) df['rmr_per_tenure_month'] = df['rmr'] / (df['tenure'] + 1) # 4. Total spend / engagement features df['total_charges'] = df['Total Charges'] df['total_revenue'] = df['Total Revenue'] df['cltv'] = df['CLTV'] df['cltv_log'] = np.log1p(df['cltv']) # 5. PRICE INCREASE specific features (the core business scenario) df['price_increase_pct'] = self.price_increase_pct df['new_monthly_charge'] = df['rmr'] * (1 + self.price_increase_pct) df['monthly_increase_amount'] = df['new_monthly_charge'] - df['rmr'] # Price sensitivity proxies df['price_to_cltv_ratio'] = df['new_monthly_charge'] / (df['cltv'] + 1) df['price_increase_burden'] = df['monthly_increase_amount'] / (df['total_charges'] / (df['tenure'] + 1) + 1) # 6. SERVICE engagement features (annual_services proxy) service_cols = ['Phone Service', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Multiple Lines', 'Unlimited Data'] available_service_cols = [c for c in service_cols if c in df.columns] df['num_services'] = df[available_service_cols].sum(axis=1) df['service_diversity'] = df['num_services'] / len(available_service_cols) df['rmr_per_service'] = df['rmr'] / (df['num_services'] + 1) # 7. RECENCY features (from last_appointment_date proxy) # Use satisfaction score and churn score as recency/engagement proxies df['satisfaction_score'] = df['Satisfaction Score'] df['churn_risk_score'] = df['Churn Score'] df['low_satisfaction'] = (df['satisfaction_score'] <= 2).astype(int) # 8. COHORT / temporal features df['age_group'] = pd.cut(df['Age'], bins=[0, 30, 45, 60, 100], labels=['<30', '30-45', '45-60', '60+']) df['is_senior'] = df['Senior Citizen'] df['has_dependents'] = df['Dependents'] df['is_married'] = df['Married'] # 9. INTERACTION features (price increase × customer characteristics) df['tenure_x_price_increase'] = df['tenure'] * df['price_increase_pct'] df['contract_ending_x_price_increase'] = df['contract_ending_soon'] * df['price_increase_pct'] df['low_sat_x_price_increase'] = df['low_satisfaction'] * df['price_increase_pct'] df['rmr_x_tenure'] = df['rmr'] * df['tenure'] # 10. RFM-style features df['recency_score'] = 6 - df['satisfaction_score'].clip(1, 5) # lower sat = higher recency df['frequency_score'] = pd.qcut(df['num_services'].rank(method='first'), q=5, labels=[1,2,3,4,5], duplicates='drop').astype(float) df['monetary_score'] = pd.qcut(df['total_revenue'].rank(method='first'), q=5, labels=[1,2,3,4,5], duplicates='drop').astype(float) df['rfm_score'] = df['recency_score'] + df['frequency_score'].fillna(3) + df['monetary_score'].fillna(3) return df