import pandas as pd


def _map_binary_series(s: pd.Series) -> pd.Series:
    """Encode a two-valued series as integer 0/1.

    The distinct non-missing values (compared as strings) select the mapping:

    * ``{"Yes", "No"}``      -> ``Yes=1, No=0``
    * ``{"Male", "Female"}`` -> ``Male=1, Female=0``
    * any other two labels   -> alphabetically first label is 0, the other 1

    Missing or unmapped entries become 0.

    Args:
        s: Series to encode.

    Returns:
        An integer series of 0/1 when one of the rules above applies;
        otherwise ``s`` itself, unchanged.
    """
    labels = list(s.dropna().unique().astype(str))
    label_set = set(labels)

    if label_set == {"Yes", "No"}:
        return s.map({"Yes": 1, "No": 0}).fillna(0).astype(int)
    if label_set == {"Male", "Female"}:
        return s.map({"Male": 1, "Female": 0}).fillna(0).astype(int)
    if len(labels) == 2:
        low, high = sorted(labels)
        # Stringify only the values that are present: astype(str) alone would
        # turn a missing entry into text such as "nan", which could then be
        # mistaken for (or collide with) a real label.
        as_text = s.astype(str).where(s.notna())
        return as_text.map({low: 0, high: 1}).fillna(0).astype(int)

    # No rule applies: hand the series back untouched (no numeric conversion
    # is attempted here; the caller deals with leftover object columns).
    return s


def build_features(df: pd.DataFrame, target_col: str = "Churn") -> pd.DataFrame:
    """Turn the object/bool columns of ``df`` into numeric model features.

    Works on a copy of ``df`` and prints progress messages to stdout.
    ``target_col`` is excluded from every object-column step below.

    Steps:
        1. Object columns with exactly two distinct non-missing values are
           encoded as 0/1 via ``_map_binary_series`` (missing values -> 0).
        2. Boolean columns are cast to ``int``.
        3. Object columns with more than two distinct non-missing values are
           one-hot encoded with ``pd.get_dummies(drop_first=True, dtype=int)``.
        4. Any object column still left (fewer than two distinct non-missing
           values) is coerced with ``pd.to_numeric``; values that cannot be
           parsed become 0.

    Args:
        df: Input frame; it is not modified.
        target_col: Column name to leave out of the object-column encoding.

    Returns:
        A new DataFrame with the encoded features.
    """
    df = df.copy()

    # NOTE(review): numeric-looking columns stored as text (for example a
    # "TotalCharges" column read as strings) are object dtype, so they are
    # treated as categorical below and will be one-hot encoded. Convert them
    # with pd.to_numeric before calling this function if that is not intended.

    object_cols = [
        c for c in df.select_dtypes(include="object").columns if c != target_col
    ]
    numeric_colns = df.select_dtypes(include=["Int64", "Float64"]).columns.tolist()
    print(f"Found:{len(object_cols)} categorical cols and {len(numeric_colns)} numerical")

    # Count distinct non-missing values once per column instead of twice.
    distinct_counts = {c: df[c].dropna().nunique() for c in object_cols}
    binary_features = [c for c in object_cols if distinct_counts[c] == 2]
    multi_colns = [c for c in object_cols if distinct_counts[c] > 2]
    print(f"Found:{len(binary_features)} binary features and {len(multi_colns)} multi colns")

    for c in binary_features:
        original = df[c].dtype
        # Pass the raw column: stringifying it first would turn missing
        # entries into a third label, the mapping would be skipped, and the
        # whole column would later be coerced to zeros.
        df[c] = _map_binary_series(df[c])
        print(f"{c}: {original} → binary (0/1)")

    bool_cols = df.select_dtypes(include="bool").columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)
        print(f"Converted {len(bool_cols)} boolean columns to int: {bool_cols}")

    if multi_colns:
        print("Applying One-Hot to multi colns")
        df = pd.get_dummies(df, columns=multi_colns, drop_first=True, dtype=int)

    # Final check: whatever is still object dtype (except the target) is
    # forced to numeric; unparseable values end up as 0.
    remaining_object_cols = [
        c for c in df.select_dtypes(include="object").columns if c != target_col
    ]
    if remaining_object_cols:
        print(f"Warning: Converting remaining object columns to numeric: {remaining_object_cols}")
        for c in remaining_object_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

    print(f" Feature engineering complete: {df.shape[1]} final features")
    return df