| """Feature engineering for subscription price-increase churn prediction.""" |
| import pandas as pd |
| import numpy as np |
| from sklearn.base import BaseEstimator, TransformerMixin |
|
|
|
|
class SubscriptionFeatureEngineer(BaseEstimator, TransformerMixin):
    """Engineer features from raw subscription data.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a copy of the input frame with derived columns appended.

    Parameters
    ----------
    price_increase_pct : float, default=0.15
        Fractional price increase applied to ``Monthly Charge`` when
        simulating the new price (``0.15`` means +15%).
    """

    # Contract label -> contract length in months. Labels not listed here
    # are treated as 0 months in ``transform``.
    _CONTRACT_MONTHS = {'Month-to-Month': 0, 'One Year': 12, 'Two Year': 24}

    # Service columns that are summed row-wise when present in the input.
    # NOTE(review): summing assumes these are already numeric (e.g. 0/1)
    # -- confirm against the upstream preprocessing.
    _SERVICE_COLS = ['Phone Service', 'Internet Service', 'Online Security',
                     'Online Backup', 'Device Protection Plan', 'Premium Tech Support',
                     'Streaming TV', 'Streaming Movies', 'Streaming Music',
                     'Multiple Lines', 'Unlimited Data']

    def __init__(self, price_increase_pct: float = 0.15):
        self.price_increase_pct = price_increase_pct

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (no parameters are learned)."""
        return self

    @staticmethod
    def _quintile_score(values: pd.Series) -> pd.Series:
        """Return 1-5 quintile scores for *values*, ranked within the batch.

        Ties are broken by row order (``rank(method='first')``), so ranks are
        distinct. With fewer than two non-missing values every quantile edge
        coincides and ``pd.qcut`` would raise, so NaN is returned for all
        rows instead; missing inputs also score NaN.
        """
        ranks = values.rank(method='first')
        if ranks.notna().sum() < 2:
            return pd.Series(np.nan, index=values.index, dtype=float)
        binned = pd.qcut(ranks, q=5, labels=[1, 2, 3, 4, 5], duplicates='drop')
        return binned.astype(float)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *X* with engineered feature columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``Tenure in Months``, ``Contract``,
            ``Monthly Charge``, ``Total Charges``, ``Total Revenue``,
            ``CLTV``, ``Satisfaction Score``, ``Churn Score``, ``Age``,
            ``Senior Citizen``, ``Dependents`` and ``Married``. The service
            columns are optional; only those present are used.

        Returns
        -------
        pandas.DataFrame
            All original columns plus the derived ones. ``tenure_group`` and
            ``age_group`` are categorical. *X* itself is not modified.

        Raises
        ------
        KeyError
            If a required column is missing from *X*.

        Notes
        -----
        ``frequency_score`` and ``monetary_score`` are quintiles of the rows
        passed to this call, so the same customer can score differently in
        different batches.
        """
        df = X.copy()
        increase = self.price_increase_pct

        # --- Tenure ---------------------------------------------------
        df['tenure'] = df['Tenure in Months']
        df['tenure_log'] = np.log1p(df['tenure'])
        df['tenure_squared'] = df['tenure'] ** 2
        # include_lowest=True so that tenure == 0 lands in '0-6m' instead of
        # falling outside the first (left-open) bin and becoming NaN.
        df['tenure_group'] = pd.cut(df['tenure'],
                                    bins=[0, 6, 12, 24, 48, 999],
                                    labels=['0-6m', '6-12m', '1-2y', '2-4y', '4y+'],
                                    include_lowest=True)

        # --- Contract -------------------------------------------------
        df['contract_months'] = df['Contract'].map(self._CONTRACT_MONTHS).fillna(0)
        # Months are converted to days with a flat 30 days per month.
        df['days_left_in_contract'] = (df['contract_months'] - df['tenure']).clip(lower=0) * 30
        # NOTE(review): month-to-month (and unmapped) contracts have 0
        # contract months, and customers whose tenure exceeds the contract
        # length clip to 0 days left, so both are flagged as ending soon.
        # Confirm this is the intended meaning.
        df['contract_ending_soon'] = (df['days_left_in_contract'] <= 90).astype(int)
        df['is_month_to_month'] = (df['Contract'] == 'Month-to-Month').astype(int)

        # --- Monthly charge -------------------------------------------
        df['rmr'] = df['Monthly Charge']
        df['rmr_log'] = np.log1p(df['rmr'])
        # +1 in the denominator avoids division by zero at tenure == 0.
        df['rmr_per_tenure_month'] = df['rmr'] / (df['tenure'] + 1)

        # --- Lifetime value -------------------------------------------
        df['total_charges'] = df['Total Charges']
        df['total_revenue'] = df['Total Revenue']
        df['cltv'] = df['CLTV']
        df['cltv_log'] = np.log1p(df['cltv'])

        # --- Simulated price increase ---------------------------------
        df['price_increase_pct'] = increase
        df['new_monthly_charge'] = df['rmr'] * (1 + increase)
        df['monthly_increase_amount'] = df['new_monthly_charge'] - df['rmr']

        # --- Price sensitivity ratios (+1 guards against zero divisors)
        df['price_to_cltv_ratio'] = df['new_monthly_charge'] / (df['cltv'] + 1)
        df['price_increase_burden'] = df['monthly_increase_amount'] / (
            df['total_charges'] / (df['tenure'] + 1) + 1)

        # --- Services -------------------------------------------------
        available_service_cols = [c for c in self._SERVICE_COLS if c in df.columns]
        df['num_services'] = df[available_service_cols].sum(axis=1)
        # With no service columns present num_services is 0 for every row;
        # dividing by max(n, 1) yields 0.0 instead of NaN from 0 / 0.
        df['service_diversity'] = df['num_services'] / max(len(available_service_cols), 1)
        df['rmr_per_service'] = df['rmr'] / (df['num_services'] + 1)

        # --- Satisfaction ---------------------------------------------
        df['satisfaction_score'] = df['Satisfaction Score']
        df['churn_risk_score'] = df['Churn Score']
        df['low_satisfaction'] = (df['satisfaction_score'] <= 2).astype(int)

        # --- Demographics ---------------------------------------------
        df['age_group'] = pd.cut(df['Age'], bins=[0, 30, 45, 60, 100],
                                 labels=['<30', '30-45', '45-60', '60+'])
        df['is_senior'] = df['Senior Citizen']
        df['has_dependents'] = df['Dependents']
        df['is_married'] = df['Married']

        # --- Interactions ---------------------------------------------
        df['tenure_x_price_increase'] = df['tenure'] * df['price_increase_pct']
        df['contract_ending_x_price_increase'] = df['contract_ending_soon'] * df['price_increase_pct']
        df['low_sat_x_price_increase'] = df['low_satisfaction'] * df['price_increase_pct']
        df['rmr_x_tenure'] = df['rmr'] * df['tenure']

        # --- RFM-style composite --------------------------------------
        # Recency is derived from satisfaction: 6 minus the score clipped
        # to [1, 5], so a higher satisfaction gives a lower recency score.
        df['recency_score'] = 6 - df['satisfaction_score'].clip(1, 5)
        df['frequency_score'] = self._quintile_score(df['num_services'])
        df['monetary_score'] = self._quintile_score(df['total_revenue'])
        # Missing quintiles (NaN inputs or batches too small to bin) count
        # as the middle score, 3.
        df['rfm_score'] = (df['recency_score']
                           + df['frequency_score'].fillna(3)
                           + df['monetary_score'].fillna(3))

        return df
|
|