| """Load and prepare dataset for price increase churn modeling.""" |
| import pandas as pd |
| import numpy as np |
| from datasets import load_dataset |
| from config import DATASET_NAME, TARGET_COL, CUSTOMER_ID_COL |
|
|
|
|
def load_data():
    """Fetch the dataset named by ``DATASET_NAME`` and merge its splits.

    The ``train``, ``validation`` and ``test`` splits are each converted to
    a pandas DataFrame and stacked in that order.

    Returns:
        A single DataFrame holding the rows of all three splits, re-indexed
        from 0 so that the per-split indices do not collide.
    """
    splits = load_dataset(DATASET_NAME)
    # Keep the train -> validation -> test order so row positions are stable.
    frames = [
        pd.DataFrame(splits[split_name])
        for split_name in ('train', 'validation', 'test')
    ]
    return pd.concat(frames, ignore_index=True)
|
|
|
|
def clean_data(df):
    """Return a copy of *df* with its known numeric columns cast to numbers.

    The input frame is never modified. Columns that are absent from *df*
    are skipped, so the function can be applied to partial frames.

    Args:
        df: DataFrame whose columns may still hold numbers as strings.

    Returns:
        A new DataFrame in which:

        * ``'Total Charges'`` (if present) is numeric, with values that
          cannot be parsed replaced by 0;
        * every other listed numeric column (if present) is numeric, with
          values that cannot be parsed left as NaN.
    """
    df = df.copy()

    # Guarded like the columns below: previously a frame without this column
    # raised KeyError even though all other columns were treated as optional.
    if 'Total Charges' in df.columns:
        # Unparseable entries (e.g. blank strings) become NaN, then 0.
        df['Total Charges'] = pd.to_numeric(
            df['Total Charges'], errors='coerce'
        ).fillna(0)

    numeric_cols = ['Monthly Charge', 'Tenure in Months', 'Age', 'Number of Dependents',
                    'Number of Referrals', 'Avg Monthly GB Download',
                    'Avg Monthly Long Distance Charges', 'Total Long Distance Charges',
                    'Total Extra Data Charges', 'Total Refunds', 'Total Revenue', 'CLTV',
                    'Churn Score', 'Satisfaction Score', 'Population']
    for col in numeric_cols:
        if col in df.columns:
            # NaNs are kept here on purpose; only 'Total Charges' is zero-filled.
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df
|
|