# telco-churn-app/src/features/build_features.py
# ziadkassem's picture
# feat: Dockerize app, fix XGBoost dependencies, and add Gradio UI
# 061507f
import pandas as pd
def _map_binary_series(s: pd.Series) -> pd.Series:
    """Encode a two-label Series as 0/1 integers.

    The distinct non-null values are compared by their string form:

    * ``Yes``/``No`` becomes 1/0 and ``Male``/``Female`` becomes 1/0.
    * Any other pair of labels is encoded by sorted string order
      (first label -> 0, second label -> 1).
    * Entries that match no label, including missing values, become 0.

    When none of the cases above applies, ``s`` is returned as-is; no
    numeric conversion is attempted.
    """
    distinct = list(s.dropna().unique().astype(str))

    # Label pairs with a fixed, meaningful encoding, checked in order.
    known_encodings = (
        {"Yes": 1, "No": 0},
        {"Male": 1, "Female": 0},
    )
    for encoding in known_encodings:
        if set(distinct) == set(encoding):
            return s.map(encoding).fillna(0).astype(int)

    if len(distinct) != 2:
        return s

    # Arbitrary pair: fall back to a deterministic, order-based encoding.
    low, high = sorted(distinct)
    return s.astype(str).map({low: 0, high: 1}).fillna(0).astype(int)
def build_features(df: pd.DataFrame, target_col: str = "Churn") -> pd.DataFrame:
    """Encode the categorical columns of ``df`` as integers.

    The input frame is copied, so the caller's data is not modified.
    Steps, in order:

    1. Object columns (other than ``target_col``) with exactly two distinct
       non-null values are mapped to 0/1 by ``_map_binary_series``.
    2. Boolean columns are cast to int.
    3. Object columns with more than two distinct non-null values are
       one-hot encoded via ``pd.get_dummies(..., drop_first=True)``.
    4. Any object column still left (e.g. constant or all-null) is coerced
       with ``pd.to_numeric``; unparseable values become 0 and the result
       is cast to int.

    Column cardinality is measured on the frame passed in, so the encoding
    depends on the batch: a single-row frame has no binary or multi-valued
    columns and every object column goes through step 4.

    Args:
        df: Input frame to encode.
        target_col: Column left out of the object-column encoding steps
            (binary mapping, one-hot encoding and the numeric fallback).

    Returns:
        A new DataFrame holding the encoded features.
    """
    df = df.copy()

    # NOTE(review): numeric-looking text columns such as "TotalCharges" are
    # not coerced here -- presumably that happens upstream; confirm, since an
    # object column with many distinct values is one-hot encoded below.
    object_cols = [
        c for c in df.select_dtypes(include="object").columns if c != target_col
    ]
    # Only used for the log line below.
    numeric_colns = df.select_dtypes(include=["Int64", "Float64"]).columns.tolist()
    print(f"Found:{len(object_cols)} categorical cols and {len(numeric_colns)} numerical")

    # Measure each column once and reuse the result for both buckets.
    cardinality = {c: df[c].dropna().nunique() for c in object_cols}
    binary_features = [c for c in object_cols if cardinality[c] == 2]
    multi_colns = [c for c in object_cols if cardinality[c] > 2]
    print(f"Found:{len(binary_features)} binary features and {len(multi_colns)} multi colns")

    for c in binary_features:
        original = df[c].dtype
        # Pass the raw column. Casting to str first would turn missing values
        # into an extra literal label, the helper would then see three labels
        # and return the column unmapped, and the fallback at the end would
        # zero the whole column.
        df[c] = _map_binary_series(df[c])
        print(f"{c}: {original} → binary (0/1)")

    bool_cols = df.select_dtypes(include="bool").columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)
        print(f"Converted {len(bool_cols)} boolean columns to int: {bool_cols}")

    if multi_colns:
        print("Applying One-Hot to multi colns")
        df = pd.get_dummies(df, columns=multi_colns, drop_first=True, dtype=int)

    # Final check: make sure no object column (except the target) is left.
    remaining_object_cols = [
        c for c in df.select_dtypes(include="object").columns if c != target_col
    ]
    if remaining_object_cols:
        print(f"Warning: Converting remaining object columns to numeric: {remaining_object_cols}")
        for c in remaining_object_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

    print(f" Feature engineering complete: {df.shape[1]} final features")
    return df