File size: 1,367 Bytes
7585b30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""Load and prepare dataset for price increase churn modeling."""
import pandas as pd
import numpy as np
from datasets import load_dataset
from config import DATASET_NAME, TARGET_COL, CUSTOMER_ID_COL


def load_data():
    """Load the telco customer churn dataset from Hugging Face as one frame.

    The dataset named by ``DATASET_NAME`` is fetched with ``load_dataset``;
    its 'train', 'validation' and 'test' splits are each converted to a
    DataFrame and stacked in that order.

    Returns:
        A single DataFrame holding the rows of all three splits, with the
        index renumbered from 0 (the per-split indices are discarded).
    """
    splits = load_dataset(DATASET_NAME)
    # Order matters for row positions in the result: train, validation, test.
    frames = [
        pd.DataFrame(splits[split_name])
        for split_name in ('train', 'validation', 'test')
    ]
    return pd.concat(frames, ignore_index=True)


def clean_data(df):
    """Return a cleaned copy of ``df`` with numeric columns cast to numbers.

    The input frame is never modified; all work happens on a copy. Each
    known numeric column that is present in ``df`` is converted with
    ``pd.to_numeric(..., errors='coerce')``, so values that cannot be parsed
    become NaN. 'Total Charges' additionally has its NaN values replaced
    with 0. Columns that are absent from ``df`` are skipped, and columns not
    listed here are left untouched.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame with the conversions described above applied.
    """
    df = df.copy()

    # Total Charges may have spaces/empty strings; those coerce to NaN and
    # are then filled with 0. Guarded like every other column so a frame
    # without it does not raise KeyError.
    if 'Total Charges' in df.columns:
        df['Total Charges'] = pd.to_numeric(
            df['Total Charges'], errors='coerce'
        ).fillna(0)

    # Ensure numeric types (unparseable values are left as NaN here).
    numeric_cols = [
        'Monthly Charge', 'Tenure in Months', 'Age', 'Number of Dependents',
        'Number of Referrals', 'Avg Monthly GB Download',
        'Avg Monthly Long Distance Charges', 'Total Long Distance Charges',
        'Total Extra Data Charges', 'Total Refunds', 'Total Revenue', 'CLTV',
        'Churn Score', 'Satisfaction Score', 'Population',
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df