| |
| import pandas as pd |
|
|
| |
# Placeholder directory locations for this script: the first is where the
# cohort CSV is read from, the second is where the result is written.
# Both end in '/' because file names are appended to them by plain string
# concatenation.
input_file_path = '<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/'
output_file_path = '<YOUR_DATA_PATH>/summary_csv_files/'
|
|
|
|
def read_data(file):
    """
    Load a CSV data source
    --------
    :param file: string filename
    :return: dataframe parsed from the CSV
    """
    return pd.read_csv(file)
|
|
|
|
def format_data_for_output(survival_data):
    """
    Keep only the columns needed for output
    --------
    :param survival_data: dataframe containing date of death field
    :return: dataframe holding just the 'SafeHavenID' and 'DOD' columns,
        in that order
    """
    required_columns = ['SafeHavenID', 'DOD']
    return survival_data[required_columns]
|
|
|
|
def filter_data(survival_data, date):
    """
    Filter data to only include rows whose date of death is on or after
    the index date
    --------
    :param survival_data: dataframe containing a 'DOD' (date of death)
        column; it is not modified
    :param date: Index date in 'DD-MM-YYYY' format
    :return: new dataframe, with 'DOD' converted to datetime, including
        only the rows where 'DOD' is on or after the index date
    """
    # Parse the index date once, day first, so that it matches the
    # documented 'DD-MM-YYYY' format (a bare string comparison lets pandas
    # read ambiguous dates such as '02-01-2020' month first).
    index_date = pd.to_datetime(date, dayfirst=True)

    # assign() builds a new frame, leaving the caller's 'DOD' untouched.
    converted = survival_data.assign(
        DOD=pd.to_datetime(survival_data['DOD']))

    # Rows with a missing DOD (NaT) fail this comparison and are dropped.
    on_or_after_index = converted['DOD'] >= index_date

    # Return an independent copy so callers can add columns to the result
    # without chained-assignment warnings.
    return converted[on_or_after_index].copy()
|
|
|
|
def calulate_days_survived(survival_data, date):
    """
    Calculate days survived following the index date
    --------
    Side effect: adds (or overwrites) an 'index_date' datetime column on
    survival_data in place.

    :param survival_data: dataframe containing date of death field 'DOD'
    :param date: Index date in 'DD-MM-YYYY' format
    :return: series of whole days from the index date to 'DOD'
    """
    # Parse the index date once, day first, to match the documented
    # 'DD-MM-YYYY' format instead of parsing the same string on every row.
    index_date = pd.to_datetime(date, dayfirst=True)
    survival_data['index_date'] = index_date

    # Converting DOD is a no-op when it is already datetime, and lets the
    # function work when it still holds date strings.
    date_of_death = pd.to_datetime(survival_data['DOD'])
    return (date_of_death - survival_data['index_date']).dt.days
|
|
|
|
def main():
    """
    Build the survival-from-index dataset and save it as a pickle
    --------
    Reads the deaths cohort CSV, drops duplicate rows, keeps the
    'SafeHavenID' and 'DOD' columns, keeps rows whose date of death is on
    or after the index date, adds a 'days_survived' column, and writes the
    result to the output directory.
    """
    # Defined once so the filter and the days-survived calculation always
    # use the same index date.
    index_date = '01-01-2020'

    # Read in the deaths data
    survival_file = input_file_path + "Deaths_Cohort3R.csv"
    survival_data = read_data(survival_file)

    # Drop exact duplicate rows before any further processing
    survival_data = survival_data.drop_duplicates()

    # Keep only the columns needed downstream
    survival_data = format_data_for_output(survival_data)

    # Work on an explicit copy of the filtered rows so the column
    # assignments below do not act on a slice of another dataframe.
    survival_data = filter_data(survival_data, index_date).copy()

    survival_data['days_survived'] = calulate_days_survived(
        survival_data, index_date)

    survival_data.to_pickle(output_file_path + 'Survival_from_index.pkl')
|
|
|
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger file reads and writes.
if __name__ == '__main__':
    main()
|
|