from sklearn.preprocessing import LabelEncoder


def feature_engineering(df):
    """Drop the identifier column and add derived income/EMI features.

    Args:
        df: DataFrame containing ``ApplicantIncome``, ``CoapplicantIncome``,
            ``LoanAmount`` and ``Loan_Amount_Term``. ``Loan_ID`` is optional.

    Returns:
        A new DataFrame without ``Loan_ID`` and with ``Total_Income``,
        ``EMI`` and ``Balance_Income`` added. The input frame is not
        modified.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Loan_ID is just an identifier, so we remove it. drop() returns a new
    # frame whether or not the column exists, so the caller's frame is never
    # mutated (previously it was mutated only when Loan_ID was absent).
    df = df.drop(columns="Loan_ID", errors="ignore")

    # create some useful new features
    df["Total_Income"] = df["ApplicantIncome"] + df["CoapplicantIncome"]
    # NOTE(review): the * 1000 presumably converts LoanAmount from thousands
    # to currency units -- confirm against the dataset description.
    # The division is unguarded: missing values propagate as NaN and a zero
    # Loan_Amount_Term produces inf under pandas division rules.
    df["EMI"] = (df["LoanAmount"] * 1000) / df["Loan_Amount_Term"]
    df["Balance_Income"] = df["Total_Income"] - df["EMI"]

    return df


def encode_data(df):
    """Label-encode the target column and the categorical feature columns.

    Args:
        df: DataFrame containing ``Loan_Status`` plus the categorical columns
            ``Gender``, ``Married``, ``Dependents``, ``Education``,
            ``Self_Employed`` and ``Property_Area``.

    Returns:
        A tuple ``(encoded_df, encoders, target_encoder)`` where
        ``encoded_df`` is an encoded copy of ``df`` (the input is left
        untouched), ``encoders`` maps each categorical column name to its
        fitted ``LabelEncoder``, and ``target_encoder`` is the encoder
        fitted on ``Loan_Status``.

    Raises:
        KeyError: If ``Loan_Status`` or any categorical column is missing.
    """
    # Work on a copy so the caller keeps the original, un-encoded values.
    df = df.copy()
    encoders = {}

    # convert target (Y/N) into numeric
    target_encoder = LabelEncoder()
    df["Loan_Status"] = target_encoder.fit_transform(df["Loan_Status"])

    # encode categorical columns
    cols = [
        "Gender",
        "Married",
        "Dependents",
        "Education",
        "Self_Employed",
        "Property_Area",
    ]

    for col in cols:
        le = LabelEncoder()
        # astype(str) stringifies every value first, so missing entries
        # (e.g. NaN -> "nan") are encoded as a class of their own.
        df[col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le

    return df, encoders, target_encoder