# Sairamr46's picture
# Upload features.py with huggingface_hub
# d4514a6 verified
"""Feature engineering for subscription price-increase churn prediction."""
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class SubscriptionFeatureEngineer(BaseEstimator, TransformerMixin):
    """Engineer churn-model features from raw subscription data.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    returns a copy of the input frame with the engineered columns appended
    (the raw columns are kept).

    Parameters
    ----------
    price_increase_pct : float, default=0.15
        Fractional price increase to simulate (0.15 means +15%).

    Notes
    -----
    ``transform`` reads these input columns: 'Tenure in Months', 'Contract',
    'Monthly Charge', 'Total Charges', 'Total Revenue', 'CLTV',
    'Satisfaction Score', 'Churn Score', 'Age', 'Senior Citizen',
    'Dependents' and 'Married'.  The service columns listed in
    ``_SERVICE_COLUMNS`` are optional; only those present are used.

    The quintile-based scores ('frequency_score', 'monetary_score') are
    computed from the rows passed to ``transform``, so they depend on the
    batch being transformed.
    """

    # Contract label -> contract length in months.
    _CONTRACT_MONTHS = {'Month-to-Month': 0, 'One Year': 12, 'Two Year': 24}

    # Optional service columns that are summed into the engagement features.
    _SERVICE_COLUMNS = (
        'Phone Service', 'Internet Service', 'Online Security',
        'Online Backup', 'Device Protection Plan', 'Premium Tech Support',
        'Streaming TV', 'Streaming Movies', 'Streaming Music',
        'Multiple Lines', 'Unlimited Data',
    )

    def __init__(self, price_increase_pct: float = 0.15):
        self.price_increase_pct = price_increase_pct

    def fit(self, X: pd.DataFrame, y=None) -> "SubscriptionFeatureEngineer":
        """Do nothing and return ``self`` (no state is learned)."""
        return self

    @staticmethod
    def _quintile_score(values: pd.Series) -> pd.Series:
        """Score *values* from 1 to 5 by quintile of their rank in this batch.

        Ranking with ``method='first'`` breaks ties by row order so the
        quantile edges are distinct.  With fewer than two non-missing values
        five bins cannot be formed, so every row gets NaN instead of
        ``pd.qcut`` failing; callers decide how to fill it.
        """
        ranks = values.rank(method='first')
        if ranks.notna().sum() < 2:
            return pd.Series(np.nan, index=values.index, dtype=float)
        return pd.qcut(ranks, q=5, labels=[1, 2, 3, 4, 5],
                       duplicates='drop').astype(float)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *X* with the engineered feature columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw subscription data; see the class docstring for the columns
            that are read.

        Returns
        -------
        pandas.DataFrame
            Copy of *X* plus the engineered columns.  *X* is not modified.

        Raises
        ------
        KeyError
            If a required input column is missing.
        """
        df = X.copy()

        # --- Core subscription features (directly from user's columns) ---
        # 1. TENURE features (tenure is #1 protective factor per literature)
        df['tenure'] = df['Tenure in Months']
        df['tenure_log'] = np.log1p(df['tenure'])
        df['tenure_squared'] = df['tenure'] ** 2
        # include_lowest keeps tenure == 0 in the first bucket; the top edge
        # is open-ended to match the '4y+' label.
        df['tenure_group'] = pd.cut(
            df['tenure'],
            bins=[0, 6, 12, 24, 48, np.inf],
            labels=['0-6m', '6-12m', '1-2y', '2-4y', '4y+'],
            include_lowest=True,
        )

        # 2. CONTRACT / days left features
        # Unmapped contract labels are treated as 0 months.
        df['contract_months'] = (
            df['Contract'].map(self._CONTRACT_MONTHS).fillna(0)
        )
        # NOTE(review): this subtracts total tenure from the contract length,
        # so a customer past their first term shows 0 days left - confirm
        # that renewals are meant to be treated this way.
        df['days_left_in_contract'] = (
            (df['contract_months'] - df['tenure']).clip(lower=0) * 30
        )
        # Month-to-month (0 months) therefore always counts as ending soon.
        df['contract_ending_soon'] = (
            df['days_left_in_contract'] <= 90
        ).astype(int)
        df['is_month_to_month'] = (
            df['Contract'] == 'Month-to-Month'
        ).astype(int)

        # 3. RMR (Recurring Monthly Revenue) features
        df['rmr'] = df['Monthly Charge']
        df['rmr_log'] = np.log1p(df['rmr'])
        # +1 avoids division by zero for tenure == 0.
        df['rmr_per_tenure_month'] = df['rmr'] / (df['tenure'] + 1)

        # 4. Total spend / engagement features
        df['total_charges'] = df['Total Charges']
        df['total_revenue'] = df['Total Revenue']
        df['cltv'] = df['CLTV']
        df['cltv_log'] = np.log1p(df['cltv'])

        # 5. PRICE INCREASE specific features (the core business scenario)
        df['price_increase_pct'] = self.price_increase_pct
        df['new_monthly_charge'] = df['rmr'] * (1 + self.price_increase_pct)
        df['monthly_increase_amount'] = df['new_monthly_charge'] - df['rmr']
        # Price sensitivity proxies (+1 terms guard against zero denominators)
        df['price_to_cltv_ratio'] = df['new_monthly_charge'] / (df['cltv'] + 1)
        df['price_increase_burden'] = df['monthly_increase_amount'] / (
            df['total_charges'] / (df['tenure'] + 1) + 1
        )

        # 6. SERVICE engagement features (annual_services proxy)
        # NOTE(review): the sum assumes the service columns are already
        # numeric 0/1 flags - confirm the upstream encoding.
        available_service_cols = [
            c for c in self._SERVICE_COLUMNS if c in df.columns
        ]
        if available_service_cols:
            df['num_services'] = df[available_service_cols].sum(axis=1)
            df['service_diversity'] = (
                df['num_services'] / len(available_service_cols)
            )
        else:
            # No service columns at all: report zero engagement rather than
            # the NaN that 0 / 0 would produce.
            df['num_services'] = 0.0
            df['service_diversity'] = 0.0
        df['rmr_per_service'] = df['rmr'] / (df['num_services'] + 1)

        # 7. RECENCY features (from last_appointment_date proxy)
        # Use satisfaction score and churn score as recency/engagement proxies
        df['satisfaction_score'] = df['Satisfaction Score']
        df['churn_risk_score'] = df['Churn Score']
        df['low_satisfaction'] = (df['satisfaction_score'] <= 2).astype(int)

        # 8. COHORT / temporal features
        # Top edge is open-ended to match the '60+' label.
        df['age_group'] = pd.cut(
            df['Age'],
            bins=[0, 30, 45, 60, np.inf],
            labels=['<30', '30-45', '45-60', '60+'],
        )
        df['is_senior'] = df['Senior Citizen']
        df['has_dependents'] = df['Dependents']
        df['is_married'] = df['Married']

        # 9. INTERACTION features (price increase x customer characteristics)
        df['tenure_x_price_increase'] = (
            df['tenure'] * df['price_increase_pct']
        )
        df['contract_ending_x_price_increase'] = (
            df['contract_ending_soon'] * df['price_increase_pct']
        )
        df['low_sat_x_price_increase'] = (
            df['low_satisfaction'] * df['price_increase_pct']
        )
        df['rmr_x_tenure'] = df['rmr'] * df['tenure']

        # 10. RFM-style features
        # lower sat = higher recency
        df['recency_score'] = 6 - df['satisfaction_score'].clip(1, 5)
        df['frequency_score'] = self._quintile_score(df['num_services'])
        df['monetary_score'] = self._quintile_score(df['total_revenue'])
        # Missing quintile scores fall back to the neutral middle value 3.
        df['rfm_score'] = (
            df['recency_score']
            + df['frequency_score'].fillna(3)
            + df['monetary_score'].fillna(3)
        )
        return df