shivapriyasom committed on
Commit
de235f8
·
verified ·
1 Parent(s): 1186868

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -60
app.py CHANGED
@@ -23,9 +23,6 @@ import matplotlib.pyplot as plt
23
  import warnings
24
  warnings.filterwarnings('ignore')
25
 
26
- # ---------------------------------------------------------------------------
27
- # Globals
28
- # ---------------------------------------------------------------------------
29
  training_data = None
30
  column_names = None
31
  test_list = []
@@ -33,12 +30,8 @@ test_list = []
33
  DEFAULT_N_BOOT_CI = 1000
34
 
35
 
36
- # ---------------------------------------------------------------------------
37
- # Probability calibration helpers (ported from first codebase)
38
- # ---------------------------------------------------------------------------
39
-
40
  def calibrate_probabilities_undersampling(p_s, beta):
41
- """Pozzolo et al. correction for undersampling bias."""
42
  p_s = np.asarray(p_s, dtype=float)
43
  numerator = beta * p_s
44
  denominator = np.maximum((beta - 1.0) * p_s + 1.0, 1e-10)
@@ -52,10 +45,7 @@ def bootstrap_ci_from_oof(
52
  confidence: float = 0.95,
53
  random_state: int = 42,
54
  ) -> tuple:
55
- """
56
- Bootstrap percentile CI for a point estimate, anchored on OOF probabilities
57
- (mirrors the implementation in the first codebase).
58
- """
59
  if oof_probs is None or len(oof_probs) == 0:
60
  return float(point_estimate), float(point_estimate)
61
 
@@ -124,9 +114,6 @@ def compute_efs_ci(
124
  return p_efs, efs_lo, efs_hi
125
 
126
 
127
- # ---------------------------------------------------------------------------
128
- # Original modelling helpers (unchanged from second codebase)
129
- # ---------------------------------------------------------------------------
130
 
131
  def rand_for(neww_list, x_te, rf, lab, x_tr, actual, paramss,
132
  X_Tempp, enco, my_table_str, my_table_num, tabl, tracount):
@@ -279,10 +266,6 @@ def run_model(x_tr, x_te, y_tr, deltaa, lab, rf, X_Tempp, track,
279
  return a_lisy, probab_lisy, secondlisy
280
 
281
 
282
- # ---------------------------------------------------------------------------
283
- # Data loading
284
- # ---------------------------------------------------------------------------
285
-
286
  def load_training_data():
287
  global training_data, column_names, test_list
288
 
@@ -301,9 +284,6 @@ def load_training_data():
301
  return "No training Data"
302
 
303
 
304
- # ---------------------------------------------------------------------------
305
- # Main evaluation function
306
- # ---------------------------------------------------------------------------
307
 
308
  def train_and_evaluate(input_file):
309
  global training_data, column_names, test_list
@@ -323,9 +303,7 @@ def train_and_evaluate(input_file):
323
  if not available_features_input:
324
  return "Error: No matching columns found between datasets", None, None
325
 
326
- # -------------------------------------------------------------------
327
- # Outcome columns: add DWOGF so EFS can be derived from it + GF
328
- # -------------------------------------------------------------------
329
  base_outcome_cols = ['DEAD', 'GF', 'AGVHD', 'CGVHD', 'VOCPSHI', 'STROKEHI']
330
  efs_outcomes = ['DWOGF', 'GF'] # needed for EFS calculation
331
  all_model_outcomes = base_outcome_cols.copy()
@@ -344,9 +322,7 @@ def train_and_evaluate(input_file):
344
  X_input = inter_input[available_features][my_table.columns]
345
  my_test = X_input
346
 
347
- # -------------------------------------------------------------------
348
- # Encode training features
349
- # -------------------------------------------------------------------
350
  li1 = ['Yes', 'No']
351
  cols_yes_no_train = [col for col in my_table.columns if my_table[col].isin(li1).all()]
352
  my_ye_train = my_table[cols_yes_no_train].replace({'Yes': 1, 'No': 0}).astype('int64')
@@ -354,14 +330,14 @@ def train_and_evaluate(input_file):
354
  my_table_str = my_table_modify.select_dtypes(exclude=['number'])
355
  my_table_num = my_table_modify.select_dtypes(include=['number'])
356
 
357
- # Encode test features
358
  cols_yes_no_test = [col for col in my_test.columns if my_test[col].isin(li1).all()]
359
  my_ye_test = my_test[cols_yes_no_test].replace({'Yes': 1, 'No': 0}).astype('int64')
360
  my_test_modify = pd.concat([my_test.drop(cols_yes_no_test, axis=1), my_ye_test], axis=1)
361
  my_test_str_raw = my_test_modify.select_dtypes(exclude=['number'])
362
  my_test_num = my_test_modify.select_dtypes(include=['number'])
363
 
364
- # Fit encoder on combined train+test categorical columns
365
  df_combined = pd.concat([my_table_str, my_test_str_raw], axis=0, ignore_index=True)
366
  enco = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
367
  encoded = enco.fit_transform(df_combined)
@@ -372,23 +348,21 @@ def train_and_evaluate(input_file):
372
  my_test_str = encoded_df.iloc[len(my_table_str):].reset_index(drop=True)
373
  my_test_real = pd.concat([my_test_str, my_test_num], axis=1)
374
 
375
- # -------------------------------------------------------------------
376
- # Train models for every outcome (including DWOGF)
377
- # -------------------------------------------------------------------
378
  outcome_display_names = {
379
- 'DEAD': 'Overall Survival', # reported as 1 – P(DEAD)
380
  'GF': 'Graft Failure',
381
  'AGVHD': 'Acute GVHD',
382
  'CGVHD': 'Chronic GVHD',
383
  'VOCPSHI': 'Vaso-Occlusive Crisis Post-HCT',
384
  'STROKEHI': 'Stroke Post-HCT',
385
- 'DWOGF': 'Death Without Graft Failure', # internal; used for EFS
386
  }
387
 
388
- # Storage for per-outcome predicted probabilities (test set)
389
- all_pred_proba = {} # outcome -> np.ndarray of calibrated probs (test set)
390
- all_pred_labels = {} # outcome -> list of predicted labels
391
- all_y_test = {} # outcome -> np.ndarray of true labels
392
 
393
  metrics_results = []
394
  calibration_results = []
@@ -429,9 +403,9 @@ def train_and_evaluate(input_file):
429
  all_pred_labels[outcome_col] = y_pred
430
  all_y_test[outcome_col] = y_test
431
 
432
- # Only report metrics/plots for the display outcomes (not DWOGF standalone)
433
  if outcome_col == 'DWOGF':
434
- continue # used internally for EFS; no separate display row
435
 
436
  outcome_name = outcome_display_names.get(outcome_col, outcome_col)
437
 
@@ -467,22 +441,15 @@ def train_and_evaluate(input_file):
467
  plt.tight_layout()
468
  calibration_plots.append(fig)
469
 
470
- # -------------------------------------------------------------------
471
- # EFS calculation (mirrors first codebase exactly)
472
- # EFS = 1 – P(DWOGF) – P(GF)
473
- # CI = bootstrap percentile on shifted OOF distributions
474
- # -------------------------------------------------------------------
475
  if 'DWOGF' in all_pred_proba and 'GF' in all_pred_proba:
476
  proba_dwogf = all_pred_proba['DWOGF'] # test-set probabilities
477
  proba_gf = all_pred_proba['GF']
478
 
479
- # We treat the full test-set probability vectors as the "OOF"
480
- # equivalents for CI construction (matches the spirit of the
481
- # first codebase where oof_probs_calibrated were stored from
482
- # training; here we use the held-out test predictions instead).
483
  efs_probs = np.clip(1.0 - proba_dwogf - proba_gf, 0.0, 1.0)
484
 
485
- # Point estimate: mean EFS across the test cohort
486
  p_efs_point = float(np.mean(efs_probs))
487
 
488
  p_efs, efs_lo, efs_hi = compute_efs_ci(
@@ -498,9 +465,7 @@ def train_and_evaluate(input_file):
498
  f"[95% CI: {efs_lo:.3f} – {efs_hi:.3f}]"
499
  )
500
 
501
- # --- EFS calibration plot ---
502
- # For a calibration curve we need a binary "EFS occurred" label.
503
- # EFS event = DWOGF event OR GF event (i.e. 1 if either happened).
504
  if 'DWOGF' in all_y_test and 'GF' in all_y_test:
505
  n_min_efs = min(len(all_y_test['DWOGF']), len(all_y_test['GF']))
506
  y_efs_true = np.clip(
@@ -570,9 +535,7 @@ def train_and_evaluate(input_file):
570
  except Exception as e:
571
  print(f"Warning: EFS metrics computation failed: {e}")
572
 
573
- # -------------------------------------------------------------------
574
- # Assemble output DataFrames
575
- # -------------------------------------------------------------------
576
  metrics_df = pd.DataFrame(
577
  metrics_results,
578
  columns=['Outcome', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'AUC']
@@ -590,9 +553,7 @@ def train_and_evaluate(input_file):
590
  return f"Error processing data: {str(e)}", None, None
591
 
592
 
593
- # ---------------------------------------------------------------------------
594
- # Gradio interface
595
- # ---------------------------------------------------------------------------
596
 
597
  def create_interface():
598
  load_training_data()
 
23
  import warnings
24
  warnings.filterwarnings('ignore')
25
 
 
 
 
26
  training_data = None
27
  column_names = None
28
  test_list = []
 
30
  DEFAULT_N_BOOT_CI = 1000
31
 
32
 
 
 
 
 
33
  def calibrate_probabilities_undersampling(p_s, beta):
34
+
35
  p_s = np.asarray(p_s, dtype=float)
36
  numerator = beta * p_s
37
  denominator = np.maximum((beta - 1.0) * p_s + 1.0, 1e-10)
 
45
  confidence: float = 0.95,
46
  random_state: int = 42,
47
  ) -> tuple:
48
+
 
 
 
49
  if oof_probs is None or len(oof_probs) == 0:
50
  return float(point_estimate), float(point_estimate)
51
 
 
114
  return p_efs, efs_lo, efs_hi
115
 
116
 
 
 
 
117
 
118
  def rand_for(neww_list, x_te, rf, lab, x_tr, actual, paramss,
119
  X_Tempp, enco, my_table_str, my_table_num, tabl, tracount):
 
266
  return a_lisy, probab_lisy, secondlisy
267
 
268
 
 
 
 
 
269
  def load_training_data():
270
  global training_data, column_names, test_list
271
 
 
284
  return "No training Data"
285
 
286
 
 
 
 
287
 
288
  def train_and_evaluate(input_file):
289
  global training_data, column_names, test_list
 
303
  if not available_features_input:
304
  return "Error: No matching columns found between datasets", None, None
305
 
306
+
 
 
307
  base_outcome_cols = ['DEAD', 'GF', 'AGVHD', 'CGVHD', 'VOCPSHI', 'STROKEHI']
308
  efs_outcomes = ['DWOGF', 'GF'] # needed for EFS calculation
309
  all_model_outcomes = base_outcome_cols.copy()
 
322
  X_input = inter_input[available_features][my_table.columns]
323
  my_test = X_input
324
 
325
+
 
 
326
  li1 = ['Yes', 'No']
327
  cols_yes_no_train = [col for col in my_table.columns if my_table[col].isin(li1).all()]
328
  my_ye_train = my_table[cols_yes_no_train].replace({'Yes': 1, 'No': 0}).astype('int64')
 
330
  my_table_str = my_table_modify.select_dtypes(exclude=['number'])
331
  my_table_num = my_table_modify.select_dtypes(include=['number'])
332
 
333
+
334
  cols_yes_no_test = [col for col in my_test.columns if my_test[col].isin(li1).all()]
335
  my_ye_test = my_test[cols_yes_no_test].replace({'Yes': 1, 'No': 0}).astype('int64')
336
  my_test_modify = pd.concat([my_test.drop(cols_yes_no_test, axis=1), my_ye_test], axis=1)
337
  my_test_str_raw = my_test_modify.select_dtypes(exclude=['number'])
338
  my_test_num = my_test_modify.select_dtypes(include=['number'])
339
 
340
+
341
  df_combined = pd.concat([my_table_str, my_test_str_raw], axis=0, ignore_index=True)
342
  enco = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
343
  encoded = enco.fit_transform(df_combined)
 
348
  my_test_str = encoded_df.iloc[len(my_table_str):].reset_index(drop=True)
349
  my_test_real = pd.concat([my_test_str, my_test_num], axis=1)
350
 
351
+
 
 
352
  outcome_display_names = {
353
+ 'DEAD': 'Overall Survival',
354
  'GF': 'Graft Failure',
355
  'AGVHD': 'Acute GVHD',
356
  'CGVHD': 'Chronic GVHD',
357
  'VOCPSHI': 'Vaso-Occlusive Crisis Post-HCT',
358
  'STROKEHI': 'Stroke Post-HCT',
359
+ 'DWOGF': 'Death Without Graft Failure',
360
  }
361
 
362
+
363
+ all_pred_proba = {}
364
+ all_pred_labels = {}
365
+ all_y_test = {}
366
 
367
  metrics_results = []
368
  calibration_results = []
 
403
  all_pred_labels[outcome_col] = y_pred
404
  all_y_test[outcome_col] = y_test
405
 
406
+
407
  if outcome_col == 'DWOGF':
408
+ continue
409
 
410
  outcome_name = outcome_display_names.get(outcome_col, outcome_col)
411
 
 
441
  plt.tight_layout()
442
  calibration_plots.append(fig)
443
 
444
+
 
 
 
 
445
  if 'DWOGF' in all_pred_proba and 'GF' in all_pred_proba:
446
  proba_dwogf = all_pred_proba['DWOGF'] # test-set probabilities
447
  proba_gf = all_pred_proba['GF']
448
 
449
+
 
 
 
450
  efs_probs = np.clip(1.0 - proba_dwogf - proba_gf, 0.0, 1.0)
451
 
452
+
453
  p_efs_point = float(np.mean(efs_probs))
454
 
455
  p_efs, efs_lo, efs_hi = compute_efs_ci(
 
465
  f"[95% CI: {efs_lo:.3f} – {efs_hi:.3f}]"
466
  )
467
 
468
+
 
 
469
  if 'DWOGF' in all_y_test and 'GF' in all_y_test:
470
  n_min_efs = min(len(all_y_test['DWOGF']), len(all_y_test['GF']))
471
  y_efs_true = np.clip(
 
535
  except Exception as e:
536
  print(f"Warning: EFS metrics computation failed: {e}")
537
 
538
+
 
 
539
  metrics_df = pd.DataFrame(
540
  metrics_results,
541
  columns=['Outcome', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'AUC']
 
553
  return f"Error processing data: {str(e)}", None, None
554
 
555
 
556
+
 
 
557
 
558
  def create_interface():
559
  load_training_data()