"""Load and prepare dataset for price increase churn modeling.""" import pandas as pd import numpy as np from datasets import load_dataset from config import DATASET_NAME, TARGET_COL, CUSTOMER_ID_COL def load_data(): """Load telco customer churn dataset from Hugging Face.""" ds = load_dataset(DATASET_NAME) train_df = pd.DataFrame(ds['train']) val_df = pd.DataFrame(ds['validation']) test_df = pd.DataFrame(ds['test']) df = pd.concat([train_df, val_df, test_df], ignore_index=True) return df def clean_data(df): """Clean and type-cast columns.""" df = df.copy() # Total Charges may have spaces/empty strings df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce') df['Total Charges'] = df['Total Charges'].fillna(0) # Ensure numeric types numeric_cols = ['Monthly Charge', 'Tenure in Months', 'Age', 'Number of Dependents', 'Number of Referrals', 'Avg Monthly GB Download', 'Avg Monthly Long Distance Charges', 'Total Long Distance Charges', 'Total Extra Data Charges', 'Total Refunds', 'Total Revenue', 'CLTV', 'Churn Score', 'Satisfaction Score', 'Population'] for col in numeric_cols: if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce') return df