File size: 1,367 Bytes
"""Load and prepare dataset for price increase churn modeling."""
import pandas as pd
import numpy as np
from datasets import load_dataset
from config import DATASET_NAME, TARGET_COL, CUSTOMER_ID_COL
def load_data():
    """Load the telco customer churn dataset from Hugging Face as one frame.

    The dataset identified by ``DATASET_NAME`` is fetched, its 'train',
    'validation' and 'test' splits are each converted to a DataFrame, and the
    three are stacked in that order under a fresh, contiguous index.

    Returns:
        A single DataFrame holding the rows of all three splits.
    """
    splits = load_dataset(DATASET_NAME)
    frames = [
        pd.DataFrame(splits[split_name])
        for split_name in ('train', 'validation', 'test')
    ]
    # ignore_index discards the per-split indices so rows are numbered 0..n-1.
    return pd.concat(frames, ignore_index=True)
def clean_data(df):
    """Clean and type-cast columns.

    Operates on a copy, so the caller's frame is left untouched.

    * 'Total Charges' is parsed as numeric; values that cannot be parsed
      (such as spaces or empty strings) become NaN and are then replaced by 0.
    * Every other known numeric column is parsed as numeric, with unparseable
      values left as NaN.

    Any of these columns that is absent from ``df`` is skipped instead of
    raising ``KeyError``, so the function also works on partial frames.

    Args:
        df: Raw customer DataFrame.

    Returns:
        A new DataFrame with the numeric columns coerced to numeric dtypes.
    """
    df = df.copy()

    # Total Charges may have spaces/empty strings; guard it like the other
    # columns so a frame without it does not fail.
    if 'Total Charges' in df.columns:
        df['Total Charges'] = pd.to_numeric(
            df['Total Charges'], errors='coerce'
        ).fillna(0)

    # Ensure numeric types
    numeric_cols = [
        'Monthly Charge', 'Tenure in Months', 'Age', 'Number of Dependents',
        'Number of Referrals', 'Avg Monthly GB Download',
        'Avg Monthly Long Distance Charges', 'Total Long Distance Charges',
        'Total Extra Data Charges', 'Total Refunds', 'Total Revenue', 'CLTV',
        'Churn Score', 'Satisfaction Score', 'Population',
    ]
    for col in numeric_cols:
        if col in df.columns:
            # Unparseable entries are kept as NaN here (no zero fill).
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
|