File size: 1,568 Bytes
63289b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import pandas as pd
import sys,os


# Make the parent of this script's directory importable so the ``src.*``
# imports below resolve when the script is launched directly.
# abspath() guards against a relative ``__file__``: applying dirname() twice
# to e.g. "run.py" yields "", which would put the CWD on sys.path instead.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.load_data import load_data
from src.data.preprocess import preprocess_data
from src.features.build_features import build_features
from src.utils.dataframe_validator import validate_dataframe


# Location of the raw Telco customer-churn CSV (machine-specific absolute path).
data_path = r"C:\Users\Wind\Downloads\archive (10)\WA_Fn-UseC_-Telco-Customer-Churn.csv"
# Column name passed as ``target_col`` to the preprocessing and feature steps.
target_col = "Churn"

def main():
    """Run phase one of the pipeline: load, clean, validate, featurize, save.

    Reads the CSV at the module-level ``data_path``, coerces the
    ``TotalCharges`` and ``MonthlyCharges`` columns to numeric and drops rows
    where either value is missing afterwards. Up to five validation errors
    are printed (validation problems are reported, not fatal), then
    ``preprocess_data`` and ``build_features`` are applied with
    ``target_col``. The result is written, without the index, to
    ``data/processed/train_processed.csv`` relative to the current working
    directory. Progress is printed to stdout; nothing is returned.
    """
    print("Loading Data...\n")
    df = load_data(data_path)
    print(df.head(3))

    # errors="coerce" turns unparseable entries into NaN so that the dropna
    # below removes them together with genuinely missing values.
    charge_cols = ["TotalCharges", "MonthlyCharges"]
    for col in charge_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Drop rows with missing numeric values
    df = df.dropna(subset=charge_cols)

    print("Validating Data\n")
    # The boolean verdict is not used here; only the error list is reported.
    # Each entry is unpacked as a (row index, error) pair.
    _valid, errors = validate_dataframe(df)
    if errors:
        print("Printing first 5 errors...")
        for idx, err in errors[:5]:
            print(f"Row: {idx}, Error: {err}")

    print("Preprocessing Data...\n")
    df_clean = preprocess_data(df, target_col=target_col)
    print(df_clean.head(3))

    print("Building Features\n")
    df_features = build_features(df_clean, target_col=target_col)
    print(df_features.head(3))
    print("Phase one done !")

    output_path = "data/processed/train_processed.csv"
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target folder exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df_features.to_csv(output_path, index=False)
    print(f"Saved processed data to {output_path}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()