| """ |
| TRAIN |
| Impute any null data, save ethnicity info for each ID and scale |
| final dataset |
| """ |
| import json |
| import joblib |
| import pandas as pd |
| import numpy as np |
| from numpy import savetxt |
| from sklearn.preprocessing import MinMaxScaler |
| from utils.reduction import calc_ds_med |
|
|
|
|
# Demographic grouping keys used throughout for group-wise median imputation.
demo_cols = ['age_bin', 'sex_bin']


# Timedelta columns counting days since key clinical events.
ds_cols = ['days_since_copd_resp', 'days_since_adm', 'days_since_rescue']


# Lab-result columns (2-year medians) that may contain nulls to be imputed.
null_cols = ['alt_med_2yr', 'ast_med_2yr', 'albumin_med_2yr',
             'alkaline_phosphatase_med_2yr', 'basophils_med_2yr',
             'c_reactive_protein_med_2yr', 'chloride_med_2yr',
             'creatinine_med_2yr', 'eosinophils_med_2yr',
             'estimated_gfr_med_2yr', 'haematocrit_med_2yr',
             'haemoglobin_med_2yr', 'lymphocytes_med_2yr',
             'mch_med_2yr', 'mean_cell_volume_med_2yr',
             'monocytes_med_2yr', 'neutrophils_med_2yr',
             'platelets_med_2yr', 'potassium_med_2yr',
             'red_blood_count_med_2yr', 'sodium_med_2yr',
             'total_bilirubin_med_2yr', 'urea_med_2yr',
             'white_blood_count_med_2yr', 'neut_lymph_med_2yr']


# Columns dropped in main() before scaling (not used as model features).
cols2drop = ['eth_grp', 'entry_dataset', 'first_entry', 'obf_dob',
             'sex_bin', 'marital_status', 'age_bin',
             'days_since_copd_resp_med', 'days_since_adm_med',
             'days_since_rescue_med', 'simd_vigintile', 'simd_decile',
             'simd_quintile']
|
|
|
|
def calc_age_bins_train(df, data_path):
    """
    Split ages into 10 quantile bins and save results for median filling test data
    --------
    :param df: dataframe to be updated
    :param data_path: path to generated data
    :return: updated dataframe
    """

    # Single qcut call returning integer bin codes plus the decile edges
    # (the previous implementation called qcut twice with identical
    # arguments just to build the labels from the edges).
    codes, edges = pd.qcut(df['age'], q=10, precision=0, retbins=True,
                           labels=False)
    # Label each row with the truncated upper edge of its decile bin —
    # identical to the original labels=edges[1:] followed by astype(int).
    df['age_bin'] = edges[codes + 1].astype(int)

    # Persist the edges so the test pipeline can bin ages consistently.
    np.savetxt(data_path + 'age_bins_train.csv', edges, delimiter=',')

    return df
|
|
|
|
def calc_df_med(df, data_path):
    """
    Calculate and persist per-demographic-group medians for the dataset
    --------
    :param df: dataframe to update
    :param data_path: path to generated data
    :return: dataframe with days_since median (*_med) columns joined on
        the demographic bins
    """

    # Median of every feature within each (age_bin, sex_bin) group,
    # excluding the identifier columns.
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])
    group_medians = df[feature_cols].groupby(demo_cols).median()

    # Derived medians for the days_since columns (see utils.reduction).
    ds_med = df[demo_cols + ds_cols].groupby(demo_cols).apply(calc_ds_med)

    # Persist the combined medians for filling the test data later.
    group_medians.join(ds_med).to_pickle(data_path + 'medians.pkl')

    # Attach the days_since medians to every row under *_med column names.
    ds_med.columns += '_med'
    df = df.join(ds_med, on=demo_cols)

    return df
|
|
|
|
def ds_fill_5year_train(df, col):
    """
    Fill days_since_X columns for patients in the dataset under 5 years
    --------
    :param df: dataframe to be updated
    :param col: column to check
    :return: dataframe with column nulls filled where patient has ggc_years < 5
    """
    # Patients with fewer than 5 years of records get the column-wide
    # maximum in place of a missing value.
    short_tenure = df['ggc_years'] < 5
    col_max = df[col].max()
    df.loc[short_tenure, col] = df.loc[short_tenure, col].fillna(col_max)

    return df
|
|
|
|
def scale_data_train(df, data_path, scaler):
    """
    Min-max scale final dataset
    -----
    :param df: dataframe to be scaled
    :param data_path: path to generated data
    :param scaler: scaler object to apply to df
    :return: scaled dataset for modelling
    """
    # Fit the scaler on all feature columns (identifiers excluded)
    # and transform them in a single pass.
    feature_cols = df.columns.drop(['SafeHavenID', 'eoy'])
    scaled = scaler.fit_transform(df[feature_cols].to_numpy())
    scaled_df = pd.DataFrame(scaled, columns=feature_cols)

    # Re-attach the identifier columns, which are never scaled.
    id_cols = df[['SafeHavenID', 'eoy']].reset_index(drop=True)
    df_final = id_cols.join(scaled_df)

    # Persist the fitted scaler so the test data can be scaled identically.
    joblib.dump(scaler, data_path + 'min_max_scaler_train.pkl')

    return df_final
|
|
|
|
def main():
    """
    Run the train-set preprocessing pipeline.

    Loads the merged training data, bins ages, fills null lab values with
    demographic-group medians, converts days_since columns to integer days,
    drops unused columns and writes the min-max scaled dataset to disk.
    """

    # Paths come from the shared project config file.
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']

    df = pd.read_pickle(data_path + 'merged_train.pkl')

    # Add the age_bin column and persist the decile edges for the test set.
    df = calc_age_bins_train(df, data_path)

    # Persist group medians and join the days_since *_med columns.
    df = calc_df_med(df, data_path)

    # Fill null lab columns with the median of each (age_bin, sex_bin) group.
    df[null_cols] = df.groupby(demo_cols)[null_cols].apply(
        lambda x: x.fillna(x.median()))

    # Save the days_since column maxima BEFORE any filling (used downstream
    # for test-set filling), then fill remaining nulls and convert the
    # timedelta columns to integer day counts.
    day = np.timedelta64(1, 'D')
    df[ds_cols].max().to_pickle(data_path + 'maxs.pkl')
    for col in ds_cols:
        df = ds_fill_5year_train(df, col)
        df[col] = df[col].fillna(df[col + '_med'])
        df[col] = (df[col] / day).astype(int)

    # Checkpoint the fully-imputed (but unscaled) dataset.
    df.to_pickle(data_path + 'filled_train.pkl')

    # Drop identifier/demographic columns not used as model features.
    df = df.drop(cols2drop, axis=1)

    scaler = MinMaxScaler()

    df_final = scale_data_train(df, data_path, scaler)

    df_final.to_pickle(data_path + 'min_max_train.pkl')
|
|
|
|
# Guard the entry point so importing this module does not run the pipeline.
if __name__ == '__main__':
    main()
|
|