Upload retrain_with_pca.py with huggingface_hub
Browse files- retrain_with_pca.py +279 -0
retrain_with_pca.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Retrain all EEG + Braak models with PCA dimensionality reduction
|
| 4 |
+
================================================================
|
| 5 |
+
Compares: No PCA vs PCA(95%) vs PCA(99%) vs PCA+SelectKBest
|
| 6 |
+
Also retrains EEG without MMSE to find pure EEG biomarkers
|
| 7 |
+
|
| 8 |
+
Author: Satyawan Singh β Infonova Solutions
|
| 9 |
+
"""
|
| 10 |
+
import os, json, pickle, warnings
|
| 11 |
+
import numpy as np
|
| 12 |
+
import pandas as pd
|
| 13 |
+
warnings.filterwarnings('ignore')
|
| 14 |
+
|
| 15 |
+
if not hasattr(np, 'trapz'):
|
| 16 |
+
np.trapz = np.trapezoid
|
| 17 |
+
|
| 18 |
+
from sklearn.decomposition import PCA
|
| 19 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| 20 |
+
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
|
| 21 |
+
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
|
| 22 |
+
from sklearn.svm import SVC
|
| 23 |
+
from sklearn.feature_selection import SelectKBest, f_classif
|
| 24 |
+
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
|
| 25 |
+
from sklearn.pipeline import Pipeline
|
| 26 |
+
|
| 27 |
+
OUTPUT_DIR = '/Users/satyawansingh/Documents/alzheimer-research-complete/models/pca_analysis'
|
| 28 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 29 |
+
|
| 30 |
+
results = {}
|
| 31 |
+
|
| 32 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
# 1. DEEP EEG MODEL (515 features, 88 subjects)
|
| 34 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 35 |
+
print("=" * 70)
|
| 36 |
+
print(" MODEL 1: DEEP EEG β AD vs FTD vs Control (88 subjects)")
|
| 37 |
+
print("=" * 70)
|
| 38 |
+
|
| 39 |
+
df_eeg = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/deep_features.csv')
|
| 40 |
+
labels = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/labels.npy', allow_pickle=True)
|
| 41 |
+
|
| 42 |
+
feature_cols = [c for c in df_eeg.columns if c not in ['subject', 'group']]
|
| 43 |
+
X_raw = df_eeg[feature_cols].fillna(0).values
|
| 44 |
+
y = LabelEncoder().fit_transform(labels) # AD=0, C=1, F=2
|
| 45 |
+
|
| 46 |
+
# Binary: AD vs non-AD
|
| 47 |
+
y_binary = (labels == 'A').astype(int)
|
| 48 |
+
|
| 49 |
+
print(f"Features: {X_raw.shape[1]}, Samples: {X_raw.shape[0]}")
|
| 50 |
+
print(f"Classes: AD={sum(labels=='A')}, Control={sum(labels=='C')}, FTD={sum(labels=='F')}")
|
| 51 |
+
|
| 52 |
+
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
|
| 53 |
+
|
| 54 |
+
# --- With MMSE vs Without MMSE ---
|
| 55 |
+
mmse_col_idx = feature_cols.index('mmse') if 'mmse' in feature_cols else None
|
| 56 |
+
age_col_idx = feature_cols.index('age') if 'age' in feature_cols else None
|
| 57 |
+
|
| 58 |
+
eeg_only_cols = [i for i, c in enumerate(feature_cols) if c not in ['mmse', 'age', 'sex']]
|
| 59 |
+
X_eeg_only = X_raw[:, eeg_only_cols]
|
| 60 |
+
|
| 61 |
+
print(f"\nWith MMSE+demographics: {X_raw.shape[1]} features")
|
| 62 |
+
print(f"Pure EEG only: {X_eeg_only.shape[1]} features")
|
| 63 |
+
|
| 64 |
+
for data_name, X_data in [("All features", X_raw), ("Pure EEG only", X_eeg_only)]:
|
| 65 |
+
print(f"\n{'β' * 60}")
|
| 66 |
+
print(f" {data_name}")
|
| 67 |
+
print(f"{'β' * 60}")
|
| 68 |
+
|
| 69 |
+
for task_name, y_task in [("3-class", y), ("AD vs non-AD", y_binary)]:
|
| 70 |
+
print(f"\n Task: {task_name}")
|
| 71 |
+
|
| 72 |
+
configs = {
|
| 73 |
+
'No reduction (SelectKBest 120)': ('skb', 120),
|
| 74 |
+
'PCA 95% variance': ('pca', 0.95),
|
| 75 |
+
'PCA 99% variance': ('pca', 0.99),
|
| 76 |
+
'PCA 30 components': ('pca', 30),
|
| 77 |
+
'PCA 15 components': ('pca', 15),
|
| 78 |
+
'PCA 95% + SelectKBest 20': ('pca_skb', (0.95, 20)),
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
for config_name, (method, param) in configs.items():
|
| 82 |
+
scaler = StandardScaler()
|
| 83 |
+
X_scaled = scaler.fit_transform(X_data)
|
| 84 |
+
|
| 85 |
+
if method == 'skb':
|
| 86 |
+
k = min(param, X_data.shape[1])
|
| 87 |
+
selector = SelectKBest(f_classif, k=k)
|
| 88 |
+
X_reduced = selector.fit_transform(X_scaled, y_task)
|
| 89 |
+
elif method == 'pca':
|
| 90 |
+
if isinstance(param, float):
|
| 91 |
+
pca = PCA(n_components=param, random_state=42)
|
| 92 |
+
else:
|
| 93 |
+
pca = PCA(n_components=min(param, X_data.shape[1], X_data.shape[0]-1), random_state=42)
|
| 94 |
+
X_reduced = pca.fit_transform(X_scaled)
|
| 95 |
+
elif method == 'pca_skb':
|
| 96 |
+
pca_param, skb_k = param
|
| 97 |
+
pca = PCA(n_components=pca_param, random_state=42)
|
| 98 |
+
X_pca = pca.fit_transform(X_scaled)
|
| 99 |
+
skb_k = min(skb_k, X_pca.shape[1])
|
| 100 |
+
selector = SelectKBest(f_classif, k=skb_k)
|
| 101 |
+
X_reduced = selector.fit_transform(X_pca, y_task)
|
| 102 |
+
|
| 103 |
+
n_components = X_reduced.shape[1]
|
| 104 |
+
|
| 105 |
+
# Train GradientBoosting
|
| 106 |
+
gb = GradientBoostingClassifier(n_estimators=200, max_depth=3,
|
| 107 |
+
learning_rate=0.05, random_state=42)
|
| 108 |
+
|
| 109 |
+
try:
|
| 110 |
+
y_pred = cross_val_predict(gb, X_reduced, y_task, cv=cv)
|
| 111 |
+
acc = accuracy_score(y_task, y_pred)
|
| 112 |
+
|
| 113 |
+
if len(np.unique(y_task)) == 2:
|
| 114 |
+
y_proba = cross_val_predict(gb, X_reduced, y_task, cv=cv, method='predict_proba')
|
| 115 |
+
auc = roc_auc_score(y_task, y_proba[:, 1])
|
| 116 |
+
print(f" {config_name:40s} β {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
|
| 117 |
+
key = f"deep_eeg_{data_name}_{task_name}_{config_name}"
|
| 118 |
+
results[key] = {'acc': acc, 'auc': auc, 'dims': n_components}
|
| 119 |
+
else:
|
| 120 |
+
y_proba = cross_val_predict(gb, X_reduced, y_task, cv=cv, method='predict_proba')
|
| 121 |
+
auc = roc_auc_score(y_task, y_proba, multi_class='ovr')
|
| 122 |
+
print(f" {config_name:40s} β {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
|
| 123 |
+
key = f"deep_eeg_{data_name}_{task_name}_{config_name}"
|
| 124 |
+
results[key] = {'acc': acc, 'auc': auc, 'dims': n_components}
|
| 125 |
+
except Exception as e:
|
| 126 |
+
print(f" {config_name:40s} β FAILED: {e}")
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
# 2. BASIC EEG MODEL (851 features, 88 subjects)
|
| 131 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 132 |
+
print(f"\n\n{'=' * 70}")
|
| 133 |
+
print(" MODEL 2: BASIC EEG CLASSIFIER (851 features, 88 subjects)")
|
| 134 |
+
print("=" * 70)
|
| 135 |
+
|
| 136 |
+
df_basic = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_features.csv')
|
| 137 |
+
y_basic = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_labels.npy', allow_pickle=True)
|
| 138 |
+
|
| 139 |
+
X_basic = df_basic.fillna(0).values
|
| 140 |
+
le = LabelEncoder()
|
| 141 |
+
y_basic_enc = le.fit_transform(y_basic)
|
| 142 |
+
y_basic_binary = (y_basic == 'A').astype(int)
|
| 143 |
+
|
| 144 |
+
print(f"Features: {X_basic.shape[1]}, Samples: {X_basic.shape[0]}")
|
| 145 |
+
|
| 146 |
+
for task_name, y_task in [("3-class", y_basic_enc), ("AD vs non-AD", y_basic_binary)]:
|
| 147 |
+
print(f"\n Task: {task_name}")
|
| 148 |
+
|
| 149 |
+
configs = {
|
| 150 |
+
'No reduction (SelectKBest 100)': ('skb', 100),
|
| 151 |
+
'PCA 95% variance': ('pca', 0.95),
|
| 152 |
+
'PCA 99% variance': ('pca', 0.99),
|
| 153 |
+
'PCA 30 components': ('pca', 30),
|
| 154 |
+
'PCA 15 components': ('pca', 15),
|
| 155 |
+
'PCA 10 components': ('pca', 10),
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
for config_name, (method, param) in configs.items():
|
| 159 |
+
scaler = StandardScaler()
|
| 160 |
+
X_scaled = scaler.fit_transform(X_basic)
|
| 161 |
+
|
| 162 |
+
if method == 'skb':
|
| 163 |
+
k = min(param, X_basic.shape[1])
|
| 164 |
+
selector = SelectKBest(f_classif, k=k)
|
| 165 |
+
X_reduced = selector.fit_transform(X_scaled, y_task)
|
| 166 |
+
elif method == 'pca':
|
| 167 |
+
if isinstance(param, float):
|
| 168 |
+
pca = PCA(n_components=param, random_state=42)
|
| 169 |
+
else:
|
| 170 |
+
pca = PCA(n_components=min(param, X_basic.shape[1], X_basic.shape[0]-1), random_state=42)
|
| 171 |
+
X_reduced = pca.fit_transform(X_scaled)
|
| 172 |
+
|
| 173 |
+
n_components = X_reduced.shape[1]
|
| 174 |
+
|
| 175 |
+
gb = GradientBoostingClassifier(n_estimators=200, max_depth=3,
|
| 176 |
+
learning_rate=0.05, random_state=42)
|
| 177 |
+
try:
|
| 178 |
+
y_pred = cross_val_predict(gb, X_reduced, y_task, cv=cv)
|
| 179 |
+
acc = accuracy_score(y_task, y_pred)
|
| 180 |
+
|
| 181 |
+
if len(np.unique(y_task)) == 2:
|
| 182 |
+
y_proba = cross_val_predict(gb, X_reduced, y_task, cv=cv, method='predict_proba')
|
| 183 |
+
auc = roc_auc_score(y_task, y_proba[:, 1])
|
| 184 |
+
print(f" {config_name:40s} β {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
|
| 185 |
+
else:
|
| 186 |
+
y_proba = cross_val_predict(gb, X_reduced, y_task, cv=cv, method='predict_proba')
|
| 187 |
+
auc = roc_auc_score(y_task, y_proba, multi_class='ovr')
|
| 188 |
+
print(f" {config_name:40s} β {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
|
| 189 |
+
|
| 190 |
+
results[f"basic_eeg_{task_name}_{config_name}"] = {'acc': acc, 'auc': auc, 'dims': n_components}
|
| 191 |
+
except Exception as e:
|
| 192 |
+
print(f" {config_name:40s} β FAILED: {e}")
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
# 3. BRAAK PREDICTOR (141 features, 174 subjects)
|
| 197 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 198 |
+
print(f"\n\n{'=' * 70}")
|
| 199 |
+
print(" MODEL 3: BRAAK STAGE PREDICTOR (141 features, 174 donors)")
|
| 200 |
+
print("=" * 70)
|
| 201 |
+
|
| 202 |
+
df_braak = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/braak_predictor/donor_cell_features.csv')
|
| 203 |
+
df_braak = df_braak.dropna(subset=['braak_numeric'])
|
| 204 |
+
|
| 205 |
+
feature_cols_braak = [c for c in df_braak.columns if c not in
|
| 206 |
+
['donor_label', 'braak_numeric', 'total_cells']]
|
| 207 |
+
X_braak = df_braak[feature_cols_braak].fillna(0).values
|
| 208 |
+
y_braak = df_braak['braak_numeric'].astype(int).values
|
| 209 |
+
|
| 210 |
+
# 3-group: Low (0-I), Mid (II-III), High (IV-VI)
|
| 211 |
+
y_braak_grouped = np.where(y_braak <= 1, 0, np.where(y_braak <= 3, 1, 2))
|
| 212 |
+
|
| 213 |
+
print(f"Features: {X_braak.shape[1]}, Samples: {X_braak.shape[0]}")
|
| 214 |
+
print(f"Groups: Low={sum(y_braak_grouped==0)}, Mid={sum(y_braak_grouped==1)}, High={sum(y_braak_grouped==2)}")
|
| 215 |
+
|
| 216 |
+
cv_braak = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
|
| 217 |
+
|
| 218 |
+
for config_name, (method, param) in {
|
| 219 |
+
'No reduction (SelectKBest 50)': ('skb', 50),
|
| 220 |
+
'PCA 95% variance': ('pca', 0.95),
|
| 221 |
+
'PCA 30 components': ('pca', 30),
|
| 222 |
+
'PCA 15 components': ('pca', 15),
|
| 223 |
+
'PCA 10 components': ('pca', 10),
|
| 224 |
+
}.items():
|
| 225 |
+
scaler = StandardScaler()
|
| 226 |
+
X_scaled = scaler.fit_transform(X_braak)
|
| 227 |
+
|
| 228 |
+
if method == 'skb':
|
| 229 |
+
k = min(param, X_braak.shape[1])
|
| 230 |
+
selector = SelectKBest(f_classif, k=k)
|
| 231 |
+
X_reduced = selector.fit_transform(X_scaled, y_braak_grouped)
|
| 232 |
+
elif method == 'pca':
|
| 233 |
+
if isinstance(param, float):
|
| 234 |
+
pca = PCA(n_components=param, random_state=42)
|
| 235 |
+
else:
|
| 236 |
+
pca = PCA(n_components=min(param, X_braak.shape[1], X_braak.shape[0]-1), random_state=42)
|
| 237 |
+
X_reduced = pca.fit_transform(X_scaled)
|
| 238 |
+
|
| 239 |
+
n_components = X_reduced.shape[1]
|
| 240 |
+
|
| 241 |
+
gb = GradientBoostingClassifier(n_estimators=200, max_depth=3,
|
| 242 |
+
learning_rate=0.05, random_state=42)
|
| 243 |
+
try:
|
| 244 |
+
y_pred = cross_val_predict(gb, X_reduced, y_braak_grouped, cv=cv_braak)
|
| 245 |
+
acc = accuracy_score(y_braak_grouped, y_pred)
|
| 246 |
+
y_proba = cross_val_predict(gb, X_reduced, y_braak_grouped, cv=cv_braak, method='predict_proba')
|
| 247 |
+
auc = roc_auc_score(y_braak_grouped, y_proba, multi_class='ovr')
|
| 248 |
+
print(f" {config_name:40s} β {n_components:3d} dims Acc={acc:.1%} AUC={auc:.3f}")
|
| 249 |
+
results[f"braak_{config_name}"] = {'acc': acc, 'auc': auc, 'dims': n_components}
|
| 250 |
+
except Exception as e:
|
| 251 |
+
print(f" {config_name:40s} β FAILED: {e}")
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 255 |
+
# 4. PCA VARIANCE ANALYSIS
|
| 256 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 257 |
+
print(f"\n\n{'=' * 70}")
|
| 258 |
+
print(" PCA VARIANCE ANALYSIS β How many components needed?")
|
| 259 |
+
print("=" * 70)
|
| 260 |
+
|
| 261 |
+
for name, X_data in [("Deep EEG (515)", X_raw), ("Basic EEG (851)", X_basic), ("Braak (141)", X_braak)]:
|
| 262 |
+
X_s = StandardScaler().fit_transform(X_data)
|
| 263 |
+
pca_full = PCA(random_state=42)
|
| 264 |
+
pca_full.fit(X_s)
|
| 265 |
+
cumvar = np.cumsum(pca_full.explained_variance_ratio_)
|
| 266 |
+
|
| 267 |
+
for threshold in [0.80, 0.90, 0.95, 0.99]:
|
| 268 |
+
n = np.argmax(cumvar >= threshold) + 1
|
| 269 |
+
print(f" {name:20s} {threshold:.0%} variance β {n:3d} components (out of {X_data.shape[1]})")
|
| 270 |
+
print()
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# Save results
|
| 274 |
+
with open(os.path.join(OUTPUT_DIR, 'pca_comparison_results.json'), 'w') as f:
|
| 275 |
+
json.dump(results, f, indent=2)
|
| 276 |
+
|
| 277 |
+
print(f"\n{'=' * 70}")
|
| 278 |
+
print(" COMPLETE β Results saved to {OUTPUT_DIR}")
|
| 279 |
+
print("=" * 70)
|