| import snowflake.snowpark as snowpark |
| from snowflake.snowpark.functions import col |
| from sklearn.experimental import enable_iterative_imputer |
| from sklearn.impute import SimpleImputer, IterativeImputer |
| from imblearn.over_sampling import SMOTE |
| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.metrics import classification_report, accuracy_score, precision_recall_curve, auc |
| from sklearn.model_selection import StratifiedKFold |
| import pandas as pd |
| import numpy as np |
| import json |
|
|
| def main(session: snowpark.Session): |
| |
| df = session.table("PUBLIC.BEQUESTS_CLEAN").to_pandas() |
|
|
| |
| imputers = { |
| 'mean': SimpleImputer(strategy='mean'), |
| 'median': SimpleImputer(strategy='median'), |
| 'mice': IterativeImputer(random_state=42) |
| } |
|
|
| |
| results_dead = [] |
| results_alive = [] |
| results_modeling = [] |
|
|
| |
| def evaluate_imputation(df, imputer_name, imputer): |
| |
| df['BIRTH_YEAR'] = imputer.fit_transform(df[['BIRTH_YEAR']]) |
| |
| |
| df = pd.get_dummies(df, columns=['REGION_CODE'], drop_first=True) |
| |
| |
| feature_columns = [ |
| 'TOTAL_TRANSACTIONS', |
| 'TOTAL_AMOUNT', |
| 'FIRST_GIFT_AMOUNT', |
| 'MRC_AMOUNT', |
| 'HPC_AMOUNT', |
| 'YEARS_SINCE_FIRST_GIFT', |
| 'YEARS_SINCE_MRC_GIFT', |
| 'YEARS_SINCE_HPC_GIFT', |
| 'BIRTH_YEAR' |
| ] + [col for col in df.columns if col.startswith('REGION_CODE_')] |
|
|
| |
| df_dead = df[df['DEATH_FLAG'] == 1] |
| df_alive = df[df['DEATH_FLAG'] == 0] |
|
|
| |
| if len(df_dead) > 0: |
| X_dead = df_dead[feature_columns] |
| y_dead = df_dead['BEQUEST_RECEIVED'] |
| ROI_FAMILY_ID_dead = df_dead['ROI_FAMILY_ID'] |
| |
| |
| skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) |
| smote = SMOTE(random_state=42) |
| model = RandomForestClassifier(random_state=42, n_jobs=-1) |
|
|
| |
| y_pred_dead = np.zeros(len(y_dead)) |
| y_pred_proba_dead = np.zeros(len(y_dead)) |
| for train_index, test_index in skf.split(X_dead, y_dead): |
| X_train, X_test = X_dead.iloc[train_index], X_dead.iloc[test_index] |
| y_train, y_test = y_dead.iloc[train_index], y_dead.iloc[test_index] |
|
|
| X_train_res, y_train_res = smote.fit_resample(X_train, y_train) |
| model.fit(X_train_res, y_train_res) |
| y_pred_dead[test_index] = model.predict(X_test) |
| y_pred_proba_dead[test_index] = model.predict_proba(X_test)[:, 1] |
|
|
| |
| accuracy_dead = accuracy_score(y_dead, y_pred_dead) |
| precision_dead, recall_dead, _ = precision_recall_curve(y_dead, y_pred_proba_dead) |
| auc_pr_dead = auc(recall_dead, precision_dead) |
| report_dead = classification_report(y_dead, y_pred_dead, output_dict=True) |
| model.fit(X_dead, y_dead) |
| feature_importance_dead = pd.DataFrame({ |
| 'Feature': X_dead.columns, |
| 'Importance': model.feature_importances_ |
| }).sort_values(by='Importance', ascending=False) |
|
|
| results_dead.append({ |
| 'imputer': imputer_name, |
| 'accuracy': accuracy_dead, |
| 'auc_pr': auc_pr_dead, |
| 'report': pd.DataFrame(report_dead).transpose(), |
| 'feature_importance': feature_importance_dead, |
| 'ROI_FAMILY_ID': ROI_FAMILY_ID_dead, |
| 'y_true': y_dead, |
| 'y_pred': y_pred_dead |
| }) |
|
|
| results_modeling.append({ |
| 'imputer': imputer_name, |
| 'accuracy': accuracy_dead, |
| 'auc_pr': auc_pr_dead, |
| 'classification_report': json.dumps(report_dead), |
| 'feature_importance': feature_importance_dead.to_dict(orient='list') |
| }) |
|
|
| |
| if len(df_alive) > 0: |
| X_alive = df_alive[feature_columns] |
| y_pred_alive = model.predict(X_alive) |
| ROI_FAMILY_ID_alive = df_alive['ROI_FAMILY_ID'] |
|
|
| results_alive.append({ |
| 'imputer': imputer_name, |
| 'ROI_FAMILY_ID': ROI_FAMILY_ID_alive, |
| 'y_pred': y_pred_alive |
| }) |
|
|
| |
| for imputer_name, imputer in imputers.items(): |
| evaluate_imputation(df.copy(), imputer_name, imputer) |
|
|
| |
| for result in results_dead: |
| print(f"Imputer: {result['imputer']} (Dead)") |
| print("Accuracy:", result['accuracy']) |
| print("AUC-PR:", result['auc_pr']) |
| print("Classification Report:") |
| print(result['report']) |
| print("Feature Importance:") |
| print(result['feature_importance']) |
| print("\n" + "-"*50 + "\n") |
|
|
| |
| predictions_dead_df = pd.concat([ |
| pd.DataFrame({ |
| 'ROI_FAMILY_ID': result['ROI_FAMILY_ID'], |
| 'imputer': result['imputer'], |
| 'y_true': result['y_true'], |
| 'y_pred': result['y_pred'], |
| 'status': 'dead' |
| }) for result in results_dead |
| ], ignore_index=True) |
|
|
| |
| predictions_alive_df = pd.concat([ |
| pd.DataFrame({ |
| 'ROI_FAMILY_ID': result['ROI_FAMILY_ID'], |
| 'imputer': result['imputer'], |
| 'y_pred': result['y_pred'], |
| 'status': 'alive' |
| }) for result in results_alive |
| ], ignore_index=True) |
|
|
| |
| session.write_pandas(predictions_dead_df, 'BEQUEST_PREDICTIONS_DEAD', auto_create_table=True) |
|
|
| |
| session.write_pandas(predictions_alive_df, 'BEQUEST_PREDICTIONS_ALIVE', auto_create_table=True) |
|
|
| |
| modeling_results_df = pd.DataFrame(results_modeling) |
| session.write_pandas(modeling_results_df, 'BEQUEST_MODELING_RESULTS', auto_create_table=True) |
|
|
| |
| return "Data processing, prediction, and table creation completed successfully." |
|
|