Satyawan1 commited on
Commit
d13de86
Β·
verified Β·
1 Parent(s): ede2ce3

Upload retrain_with_pca.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. retrain_with_pca.py +279 -0
retrain_with_pca.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Retrain all EEG + Braak models with PCA dimensionality reduction
4
+ ================================================================
5
+ Compares: No PCA vs PCA(95%) vs PCA(99%) vs PCA+SelectKBest
6
+ Also retrains EEG without MMSE to find pure EEG biomarkers
7
+
8
+ Author: Satyawan Singh β€” Infonova Solutions
9
+ """
10
+ import os, json, pickle, warnings
11
+ import numpy as np
12
+ import pandas as pd
13
+ warnings.filterwarnings('ignore')
14
+
15
+ if not hasattr(np, 'trapz'):
16
+ np.trapz = np.trapezoid
17
+
18
+ from sklearn.decomposition import PCA
19
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
20
+ from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
21
+ from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
22
+ from sklearn.svm import SVC
23
+ from sklearn.feature_selection import SelectKBest, f_classif
24
+ from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
25
+ from sklearn.pipeline import Pipeline
26
+
27
+ OUTPUT_DIR = '/Users/satyawansingh/Documents/alzheimer-research-complete/models/pca_analysis'
28
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
29
+
30
+ results = {}
31
+
32
+ # ══════════════════════════════════════════════════════════════
33
+ # 1. DEEP EEG MODEL (515 features, 88 subjects)
34
+ # ══════════════════════════════════════════════════════════════
35
+ print("=" * 70)
36
+ print(" MODEL 1: DEEP EEG β€” AD vs FTD vs Control (88 subjects)")
37
+ print("=" * 70)
38
+
39
+ df_eeg = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/deep_features.csv')
40
+ labels = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/labels.npy', allow_pickle=True)
41
+
42
+ feature_cols = [c for c in df_eeg.columns if c not in ['subject', 'group']]
43
+ X_raw = df_eeg[feature_cols].fillna(0).values
44
+ y = LabelEncoder().fit_transform(labels) # AD=0, C=1, F=2
45
+
46
+ # Binary: AD vs non-AD
47
+ y_binary = (labels == 'A').astype(int)
48
+
49
+ print(f"Features: {X_raw.shape[1]}, Samples: {X_raw.shape[0]}")
50
+ print(f"Classes: AD={sum(labels=='A')}, Control={sum(labels=='C')}, FTD={sum(labels=='F')}")
51
+
52
+ cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
53
+
54
+ # --- With MMSE vs Without MMSE ---
55
+ mmse_col_idx = feature_cols.index('mmse') if 'mmse' in feature_cols else None
56
+ age_col_idx = feature_cols.index('age') if 'age' in feature_cols else None
57
+
58
+ eeg_only_cols = [i for i, c in enumerate(feature_cols) if c not in ['mmse', 'age', 'sex']]
59
+ X_eeg_only = X_raw[:, eeg_only_cols]
60
+
61
+ print(f"\nWith MMSE+demographics: {X_raw.shape[1]} features")
62
+ print(f"Pure EEG only: {X_eeg_only.shape[1]} features")
63
+
64
+ for data_name, X_data in [("All features", X_raw), ("Pure EEG only", X_eeg_only)]:
65
+ print(f"\n{'─' * 60}")
66
+ print(f" {data_name}")
67
+ print(f"{'─' * 60}")
68
+
69
+ for task_name, y_task in [("3-class", y), ("AD vs non-AD", y_binary)]:
70
+ print(f"\n Task: {task_name}")
71
+
72
+ configs = {
73
+ 'No reduction (SelectKBest 120)': ('skb', 120),
74
+ 'PCA 95% variance': ('pca', 0.95),
75
+ 'PCA 99% variance': ('pca', 0.99),
76
+ 'PCA 30 components': ('pca', 30),
77
+ 'PCA 15 components': ('pca', 15),
78
+ 'PCA 95% + SelectKBest 20': ('pca_skb', (0.95, 20)),
79
+ }
80
+
81
+ for config_name, (method, param) in configs.items():
82
+ scaler = StandardScaler()
83
+ X_scaled = scaler.fit_transform(X_data)
84
+
85
+ if method == 'skb':
86
+ k = min(param, X_data.shape[1])
87
+ selector = SelectKBest(f_classif, k=k)
88
+ X_reduced = selector.fit_transform(X_scaled, y_task)
89
+ elif method == 'pca':
90
+ if isinstance(param, float):
91
+ pca = PCA(n_components=param, random_state=42)
92
+ else:
93
+ pca = PCA(n_components=min(param, X_data.shape[1], X_data.shape[0]-1), random_state=42)
94
+ X_reduced = pca.fit_transform(X_scaled)
95
+ elif method == 'pca_skb':
96
+ pca_param, skb_k = param
97
+ pca = PCA(n_components=pca_param, random_state=42)
98
+ X_pca = pca.fit_transform(X_scaled)
99
+ skb_k = min(skb_k, X_pca.shape[1])
100
+ selector = SelectKBest(f_classif, k=skb_k)
101
+ X_reduced = selector.fit_transform(X_pca, y_task)
102
+
103
+ n_components = X_reduced.shape[1]
104
+
105
+ # Train GradientBoosting
106
+ gb = GradientBoostingClassifier(n_estimators=200, max_depth=3,
107
+ learning_rate=0.05, random_state=42)
108
+
109
+ try:
110
+ y_pred = cross_val_predict(gb, X_reduced, y_task, cv=cv)
111
+ acc = accuracy_score(y_task, y_pred)
112
+
113
+ if len(np.unique(y_task)) == 2:
114
+ y_proba = cross_val_predict(gb, X_reduced, y_task, cv=cv, method='predict_proba')
115
+ auc = roc_auc_score(y_task, y_proba[:, 1])
116
+ print(f" {config_name:40s} β†’ {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
117
+ key = f"deep_eeg_{data_name}_{task_name}_{config_name}"
118
+ results[key] = {'acc': acc, 'auc': auc, 'dims': n_components}
119
+ else:
120
+ y_proba = cross_val_predict(gb, X_reduced, y_task, cv=cv, method='predict_proba')
121
+ auc = roc_auc_score(y_task, y_proba, multi_class='ovr')
122
+ print(f" {config_name:40s} β†’ {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
123
+ key = f"deep_eeg_{data_name}_{task_name}_{config_name}"
124
+ results[key] = {'acc': acc, 'auc': auc, 'dims': n_components}
125
+ except Exception as e:
126
+ print(f" {config_name:40s} β†’ FAILED: {e}")
127
+
128
+
129
+ # ══════════════════════════════════════════════════════════════
130
+ # 2. BASIC EEG MODEL (851 features, 88 subjects)
131
+ # ══════════════════════════════════════════════════════════════
132
+ print(f"\n\n{'=' * 70}")
133
+ print(" MODEL 2: BASIC EEG CLASSIFIER (851 features, 88 subjects)")
134
+ print("=" * 70)
135
+
136
+ df_basic = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_features.csv')
137
+ y_basic = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_labels.npy', allow_pickle=True)
138
+
139
+ X_basic = df_basic.fillna(0).values
140
+ le = LabelEncoder()
141
+ y_basic_enc = le.fit_transform(y_basic)
142
+ y_basic_binary = (y_basic == 'A').astype(int)
143
+
144
+ print(f"Features: {X_basic.shape[1]}, Samples: {X_basic.shape[0]}")
145
+
146
+ for task_name, y_task in [("3-class", y_basic_enc), ("AD vs non-AD", y_basic_binary)]:
147
+ print(f"\n Task: {task_name}")
148
+
149
+ configs = {
150
+ 'No reduction (SelectKBest 100)': ('skb', 100),
151
+ 'PCA 95% variance': ('pca', 0.95),
152
+ 'PCA 99% variance': ('pca', 0.99),
153
+ 'PCA 30 components': ('pca', 30),
154
+ 'PCA 15 components': ('pca', 15),
155
+ 'PCA 10 components': ('pca', 10),
156
+ }
157
+
158
+ for config_name, (method, param) in configs.items():
159
+ scaler = StandardScaler()
160
+ X_scaled = scaler.fit_transform(X_basic)
161
+
162
+ if method == 'skb':
163
+ k = min(param, X_basic.shape[1])
164
+ selector = SelectKBest(f_classif, k=k)
165
+ X_reduced = selector.fit_transform(X_scaled, y_task)
166
+ elif method == 'pca':
167
+ if isinstance(param, float):
168
+ pca = PCA(n_components=param, random_state=42)
169
+ else:
170
+ pca = PCA(n_components=min(param, X_basic.shape[1], X_basic.shape[0]-1), random_state=42)
171
+ X_reduced = pca.fit_transform(X_scaled)
172
+
173
+ n_components = X_reduced.shape[1]
174
+
175
+ gb = GradientBoostingClassifier(n_estimators=200, max_depth=3,
176
+ learning_rate=0.05, random_state=42)
177
+ try:
178
+ y_pred = cross_val_predict(gb, X_reduced, y_task, cv=cv)
179
+ acc = accuracy_score(y_task, y_pred)
180
+
181
+ if len(np.unique(y_task)) == 2:
182
+ y_proba = cross_val_predict(gb, X_reduced, y_task, cv=cv, method='predict_proba')
183
+ auc = roc_auc_score(y_task, y_proba[:, 1])
184
+ print(f" {config_name:40s} β†’ {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
185
+ else:
186
+ y_proba = cross_val_predict(gb, X_reduced, y_task, cv=cv, method='predict_proba')
187
+ auc = roc_auc_score(y_task, y_proba, multi_class='ovr')
188
+ print(f" {config_name:40s} β†’ {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
189
+
190
+ results[f"basic_eeg_{task_name}_{config_name}"] = {'acc': acc, 'auc': auc, 'dims': n_components}
191
+ except Exception as e:
192
+ print(f" {config_name:40s} β†’ FAILED: {e}")
193
+
194
+
195
+ # ══════════════════════════════════════════════════════════════
196
+ # 3. BRAAK PREDICTOR (141 features, 174 subjects)
197
+ # ══════════════════════════════════════════════════════════════
198
+ print(f"\n\n{'=' * 70}")
199
+ print(" MODEL 3: BRAAK STAGE PREDICTOR (141 features, 174 donors)")
200
+ print("=" * 70)
201
+
202
+ df_braak = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/braak_predictor/donor_cell_features.csv')
203
+ df_braak = df_braak.dropna(subset=['braak_numeric'])
204
+
205
+ feature_cols_braak = [c for c in df_braak.columns if c not in
206
+ ['donor_label', 'braak_numeric', 'total_cells']]
207
+ X_braak = df_braak[feature_cols_braak].fillna(0).values
208
+ y_braak = df_braak['braak_numeric'].astype(int).values
209
+
210
+ # 3-group: Low (0-I), Mid (II-III), High (IV-VI)
211
+ y_braak_grouped = np.where(y_braak <= 1, 0, np.where(y_braak <= 3, 1, 2))
212
+
213
+ print(f"Features: {X_braak.shape[1]}, Samples: {X_braak.shape[0]}")
214
+ print(f"Groups: Low={sum(y_braak_grouped==0)}, Mid={sum(y_braak_grouped==1)}, High={sum(y_braak_grouped==2)}")
215
+
216
+ cv_braak = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
217
+
218
+ for config_name, (method, param) in {
219
+ 'No reduction (SelectKBest 50)': ('skb', 50),
220
+ 'PCA 95% variance': ('pca', 0.95),
221
+ 'PCA 30 components': ('pca', 30),
222
+ 'PCA 15 components': ('pca', 15),
223
+ 'PCA 10 components': ('pca', 10),
224
+ }.items():
225
+ scaler = StandardScaler()
226
+ X_scaled = scaler.fit_transform(X_braak)
227
+
228
+ if method == 'skb':
229
+ k = min(param, X_braak.shape[1])
230
+ selector = SelectKBest(f_classif, k=k)
231
+ X_reduced = selector.fit_transform(X_scaled, y_braak_grouped)
232
+ elif method == 'pca':
233
+ if isinstance(param, float):
234
+ pca = PCA(n_components=param, random_state=42)
235
+ else:
236
+ pca = PCA(n_components=min(param, X_braak.shape[1], X_braak.shape[0]-1), random_state=42)
237
+ X_reduced = pca.fit_transform(X_scaled)
238
+
239
+ n_components = X_reduced.shape[1]
240
+
241
+ gb = GradientBoostingClassifier(n_estimators=200, max_depth=3,
242
+ learning_rate=0.05, random_state=42)
243
+ try:
244
+ y_pred = cross_val_predict(gb, X_reduced, y_braak_grouped, cv=cv_braak)
245
+ acc = accuracy_score(y_braak_grouped, y_pred)
246
+ y_proba = cross_val_predict(gb, X_reduced, y_braak_grouped, cv=cv_braak, method='predict_proba')
247
+ auc = roc_auc_score(y_braak_grouped, y_proba, multi_class='ovr')
248
+ print(f" {config_name:40s} β†’ {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
249
+ results[f"braak_{config_name}"] = {'acc': acc, 'auc': auc, 'dims': n_components}
250
+ except Exception as e:
251
+ print(f" {config_name:40s} β†’ FAILED: {e}")
252
+
253
+
254
+ # ══════════════════════════════════════════════════════════════
255
+ # 4. PCA VARIANCE ANALYSIS
256
+ # ══════════════════════════════════════════════════════════════
257
+ print(f"\n\n{'=' * 70}")
258
+ print(" PCA VARIANCE ANALYSIS β€” How many components needed?")
259
+ print("=" * 70)
260
+
261
+ for name, X_data in [("Deep EEG (515)", X_raw), ("Basic EEG (851)", X_basic), ("Braak (141)", X_braak)]:
262
+ X_s = StandardScaler().fit_transform(X_data)
263
+ pca_full = PCA(random_state=42)
264
+ pca_full.fit(X_s)
265
+ cumvar = np.cumsum(pca_full.explained_variance_ratio_)
266
+
267
+ for threshold in [0.80, 0.90, 0.95, 0.99]:
268
+ n = np.argmax(cumvar >= threshold) + 1
269
+ print(f" {name:20s} {threshold:.0%} variance β†’ {n:3d} components (out of {X_data.shape[1]})")
270
+ print()
271
+
272
+
273
+ # Save results
274
+ with open(os.path.join(OUTPUT_DIR, 'pca_comparison_results.json'), 'w') as f:
275
+ json.dump(results, f, indent=2)
276
+
277
+ print(f"\n{'=' * 70}")
278
+ print(" COMPLETE β€” Results saved to {OUTPUT_DIR}")
279
+ print("=" * 70)