alzheimer-research-complete / train_advanced_ml.py
Satyawan1's picture
Upload train_advanced_ml.py with huggingface_hub
55be291 verified
#!/usr/bin/env python3
"""
Advanced ML for AD Classification
===================================
Techniques: Data augmentation, SMOTE, Regularized models, Stacking ensemble,
Manifold learning (UMAP), Nested CV, Bayesian-tuned hyperparameters,
Permutation importance, Bootstrap confidence intervals
Dataset: OpenNeuro ds004504 (88 subjects)
Author: Satyawan Singh β€” Infonova Solutions
"""
import os, json, pickle, time, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
if not hasattr(np, 'trapz'):
np.trapz = np.trapezoid
from scipy.stats import sem
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import (
StratifiedKFold, cross_val_predict, cross_val_score,
LeaveOneOut, RepeatedStratifiedKFold
)
from sklearn.ensemble import (
GradientBoostingClassifier, RandomForestClassifier,
ExtraTreesClassifier, AdaBoostClassifier, StackingClassifier,
BaggingClassifier
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.metrics import (
accuracy_score, roc_auc_score, classification_report,
balanced_accuracy_score, f1_score
)
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
OUTPUT_DIR = '/Users/satyawansingh/Documents/alzheimer-research-complete/models/advanced_ml'
os.makedirs(OUTPUT_DIR, exist_ok=True)
results = {}
# ══════════════════════════════════════════════════════════════
# LOAD DATA
# ══════════════════════════════════════════════════════════════
print("=" * 70)
print(" ADVANCED ML FOR ALZHEIMER'S CLASSIFICATION")
print("=" * 70)
# Deep EEG features
df_deep = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/deep_features.csv')
labels_deep = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/labels.npy', allow_pickle=True)
# Basic EEG features
df_basic = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_features.csv')
labels_basic = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_labels.npy', allow_pickle=True)
# Braak
df_braak = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/braak_predictor/donor_cell_features.csv')
df_braak = df_braak.dropna(subset=['braak_numeric'])
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 1: SMOTE + DATA AUGMENTATION
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 1: SMOTE Oversampling + Noise Augmentation")
print("=" * 70)
try:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek
HAS_IMBLEARN = True
print(" imblearn available")
except ImportError:
HAS_IMBLEARN = False
print(" imblearn not installed β€” using manual augmentation")
def augment_eeg(X, y, noise_std=0.05, n_augmented=2, random_state=42):
"""Add Gaussian noise augmentation to minority classes"""
rng = np.random.RandomState(random_state)
classes, counts = np.unique(y, return_counts=True)
max_count = counts.max()
X_aug, y_aug = [X.copy()], [y.copy()]
for cls, cnt in zip(classes, counts):
if cnt < max_count:
n_needed = max_count - cnt
cls_idx = np.where(y == cls)[0]
for _ in range(n_needed):
idx = rng.choice(cls_idx)
noise = rng.normal(0, noise_std, X.shape[1])
X_aug.append((X[idx] + noise).reshape(1, -1))
y_aug.append(np.array([cls]))
return np.vstack(X_aug), np.concatenate(y_aug)
# Prepare deep EEG data β€” pure EEG only (no MMSE)
feature_cols = [c for c in df_deep.columns if c not in ['subject', 'group']]
eeg_only_cols = [c for c in feature_cols if c not in ['mmse', 'age', 'sex']]
X_eeg = df_deep[eeg_only_cols].fillna(0).values
y_3class = LabelEncoder().fit_transform(labels_deep)
print(f"\n Pure EEG features: {X_eeg.shape[1]}")
print(f" Samples: {X_eeg.shape[0]}")
print(f" Classes: {dict(zip(*np.unique(y_3class, return_counts=True)))}")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_eeg)
# PCA to reduce dimensionality first
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)
print(f" PCA 95% β†’ {X_pca.shape[1]} components")
cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Baseline without augmentation
gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42)
y_pred = cross_val_predict(gb, X_pca, y_3class, cv=cv5)
acc_base = accuracy_score(y_3class, y_pred)
bal_acc_base = balanced_accuracy_score(y_3class, y_pred)
print(f"\n Baseline (PCA only): Acc={acc_base:.1%} Balanced={bal_acc_base:.1%}")
# With noise augmentation inside CV
accs_aug = []
for train_idx, test_idx in cv5.split(X_pca, y_3class):
X_train, X_test = X_pca[train_idx], X_pca[test_idx]
y_train, y_test = y_3class[train_idx], y_3class[test_idx]
X_train_aug, y_train_aug = augment_eeg(X_train, y_train, noise_std=0.1, n_augmented=3)
if HAS_IMBLEARN:
try:
smote = SMOTE(random_state=42, k_neighbors=min(3, min(np.bincount(y_train_aug))-1))
X_train_aug, y_train_aug = smote.fit_resample(X_train_aug, y_train_aug)
except:
pass
gb_aug = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42)
gb_aug.fit(X_train_aug, y_train_aug)
y_pred_fold = gb_aug.predict(X_test)
accs_aug.append(balanced_accuracy_score(y_test, y_pred_fold))
acc_aug = np.mean(accs_aug)
print(f" Noise Augmentation + SMOTE: Acc={np.mean([accuracy_score(y_3class[test_idx], gb_aug.predict(X_pca[test_idx])) for train_idx, test_idx in cv5.split(X_pca, y_3class)]):.1%} Balanced={acc_aug:.1%}")
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 2: STACKING ENSEMBLE (Meta-learner)
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 2: Stacking Ensemble (Meta-Learning)")
print("=" * 70)
# Diverse base learners
base_learners = [
('gb', GradientBoostingClassifier(n_estimators=150, max_depth=3, learning_rate=0.05, random_state=42)),
('rf', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)),
('et', ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42)),
('svm', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)),
('knn', KNeighborsClassifier(n_neighbors=5, weights='distance')),
('lda', LinearDiscriminantAnalysis()),
]
# Meta-learner
stacking_clf = StackingClassifier(
estimators=base_learners,
final_estimator=LogisticRegression(C=1.0, max_iter=1000, random_state=42),
cv=5,
stack_method='predict_proba',
n_jobs=-1
)
# Test each base learner + stacking
print(f"\n {'Model':<35s} {'Accuracy':>10s} {'Balanced':>10s} {'AUC':>8s}")
print(f" {'─'*65}")
for name, model in base_learners:
y_pred = cross_val_predict(model, X_pca, y_3class, cv=cv5)
acc = accuracy_score(y_3class, y_pred)
bal = balanced_accuracy_score(y_3class, y_pred)
try:
y_proba = cross_val_predict(model, X_pca, y_3class, cv=cv5, method='predict_proba')
auc = roc_auc_score(y_3class, y_proba, multi_class='ovr')
except:
auc = 0
print(f" {name:<35s} {acc:>9.1%} {bal:>9.1%} {auc:>7.3f}")
results[f'base_{name}_3class'] = {'acc': acc, 'bal_acc': bal, 'auc': auc}
# Stacking
y_pred_stack = cross_val_predict(stacking_clf, X_pca, y_3class, cv=cv5)
acc_stack = accuracy_score(y_3class, y_pred_stack)
bal_stack = balanced_accuracy_score(y_3class, y_pred_stack)
try:
y_proba_stack = cross_val_predict(stacking_clf, X_pca, y_3class, cv=cv5, method='predict_proba')
auc_stack = roc_auc_score(y_3class, y_proba_stack, multi_class='ovr')
except:
auc_stack = 0
print(f" {'STACKING ENSEMBLE':<35s} {acc_stack:>9.1%} {bal_stack:>9.1%} {auc_stack:>7.3f}")
results['stacking_3class'] = {'acc': acc_stack, 'bal_acc': bal_stack, 'auc': auc_stack}
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 3: REPEATED STRATIFIED K-FOLD (Robust estimation)
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 3: Repeated Stratified K-Fold (10x5-fold = 50 runs)")
print("=" * 70)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
best_models = {
'GradientBoosting': GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42),
'ExtraTrees': ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42),
'LDA': LinearDiscriminantAnalysis(),
}
for name, model in best_models.items():
scores = cross_val_score(model, X_pca, y_3class, cv=rskf, scoring='balanced_accuracy')
print(f" {name:<25s} Mean={scores.mean():.1%} Β± {scores.std():.1%} (95% CI: {scores.mean()-1.96*sem(scores):.1%} – {scores.mean()+1.96*sem(scores):.1%})")
results[f'repeated_{name}'] = {'mean': scores.mean(), 'std': scores.std(), 'ci_low': scores.mean()-1.96*sem(scores), 'ci_high': scores.mean()+1.96*sem(scores)}
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 4: LEAVE-ONE-OUT CV (Gold standard for small N)
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 4: Leave-One-Out Cross-Validation (88 folds)")
print("=" * 70)
loo = LeaveOneOut()
for name, model in [
('GradientBoosting', GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=0.05, random_state=42)),
('LDA', LinearDiscriminantAnalysis()),
('KNN-5', KNeighborsClassifier(n_neighbors=5, weights='distance')),
]:
y_pred_loo = cross_val_predict(model, X_pca, y_3class, cv=loo)
acc_loo = accuracy_score(y_3class, y_pred_loo)
bal_loo = balanced_accuracy_score(y_3class, y_pred_loo)
print(f" {name:<25s} Acc={acc_loo:.1%} Balanced={bal_loo:.1%}")
results[f'loo_{name}'] = {'acc': acc_loo, 'bal_acc': bal_loo}
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 5: FEATURE FUSION (Combine basic + deep features)
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 5: Feature Fusion (Basic 851 + Deep 515 β†’ combined)")
print("=" * 70)
# Align subjects
X_basic_raw = df_basic.fillna(0).values
# Both have same 88 subjects in same order
X_combined = np.hstack([X_basic_raw, X_eeg])
print(f" Combined features: {X_combined.shape[1]}")
X_comb_scaled = StandardScaler().fit_transform(X_combined)
pca_comb = PCA(n_components=0.95, random_state=42)
X_comb_pca = pca_comb.fit_transform(X_comb_scaled)
print(f" After PCA 95%: {X_comb_pca.shape[1]} components")
for name, model in [
('GB', GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42)),
('Stacking', stacking_clf),
('LDA', LinearDiscriminantAnalysis()),
]:
y_pred = cross_val_predict(model, X_comb_pca, y_3class, cv=cv5)
acc = accuracy_score(y_3class, y_pred)
bal = balanced_accuracy_score(y_3class, y_pred)
try:
y_proba = cross_val_predict(model, X_comb_pca, y_3class, cv=cv5, method='predict_proba')
auc = roc_auc_score(y_3class, y_proba, multi_class='ovr')
except:
auc = 0
print(f" {name:<25s} Acc={acc:.1%} Balanced={bal:.1%} AUC={auc:.3f}")
results[f'fusion_{name}'] = {'acc': acc, 'bal_acc': bal, 'auc': auc}
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 6: REGULARIZED MODELS (L1/L2/ElasticNet)
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 6: Regularized Models (built-in feature selection)")
print("=" * 70)
# Use full features (no PCA) β€” let regularization do the selection
X_full_scaled = StandardScaler().fit_transform(X_eeg)
reg_models = {
'LogReg L1 (Lasso)': LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=5000, random_state=42),
'LogReg L2 (Ridge)': LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=5000, random_state=42),
'LogReg ElasticNet': LogisticRegression(penalty='elasticnet', C=0.1, l1_ratio=0.5, solver='saga', max_iter=5000, random_state=42),
'SGD ElasticNet': SGDClassifier(loss='log_loss', penalty='elasticnet', alpha=0.01, l1_ratio=0.5, max_iter=5000, random_state=42),
}
for name, model in reg_models.items():
y_pred = cross_val_predict(model, X_full_scaled, y_3class, cv=cv5)
acc = accuracy_score(y_3class, y_pred)
bal = balanced_accuracy_score(y_3class, y_pred)
print(f" {name:<30s} Acc={acc:.1%} Balanced={bal:.1%}")
results[f'reg_{name}'] = {'acc': acc, 'bal_acc': bal}
# L1 with varying regularization
print(f"\n L1 regularization strength sweep:")
for C in [0.001, 0.01, 0.1, 0.5, 1.0, 5.0]:
model = LogisticRegression(penalty='l1', C=C, solver='saga', max_iter=5000, random_state=42)
model.fit(X_full_scaled, y_3class)
n_nonzero = np.sum(np.abs(model.coef_) > 1e-6)
y_pred = cross_val_predict(model, X_full_scaled, y_3class, cv=cv5)
acc = accuracy_score(y_3class, y_pred)
print(f" C={C:<6.3f} Non-zero weights: {n_nonzero:4d}/{X_eeg.shape[1]*3} Acc={acc:.1%}")
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 7: MUTUAL INFORMATION FEATURE SELECTION
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 7: Mutual Information vs ANOVA Feature Selection")
print("=" * 70)
for k in [10, 20, 30, 50]:
# ANOVA
skb_f = SelectKBest(f_classif, k=k)
X_f = skb_f.fit_transform(X_full_scaled, y_3class)
y_pred = cross_val_predict(gb, X_f, y_3class, cv=cv5)
acc_f = accuracy_score(y_3class, y_pred)
# Mutual Information
skb_mi = SelectKBest(mutual_info_classif, k=k)
X_mi = skb_mi.fit_transform(X_full_scaled, y_3class)
y_pred_mi = cross_val_predict(gb, X_mi, y_3class, cv=cv5)
acc_mi = accuracy_score(y_3class, y_pred_mi)
print(f" k={k:3d}: ANOVA={acc_f:.1%} MutualInfo={acc_mi:.1%}")
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 8: PERMUTATION IMPORTANCE (True feature ranking)
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 8: Permutation Importance (unbiased feature ranking)")
print("=" * 70)
# Train on PCA components, get permutation importance
gb_final = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42)
gb_final.fit(X_pca, y_3class)
perm_imp = permutation_importance(gb_final, X_pca, y_3class, n_repeats=30, random_state=42, scoring='balanced_accuracy')
print(f"\n Top 15 PCA components by permutation importance:")
imp_sorted = np.argsort(perm_imp.importances_mean)[::-1]
for i, idx in enumerate(imp_sorted[:15]):
print(f" PC{idx+1:3d}: importance={perm_imp.importances_mean[idx]:.4f} Β± {perm_imp.importances_std[idx]:.4f}")
# ══════════════════════════════════════════════════════════════
# TECHNIQUE 9: BRAAK WITH ADVANCED METHODS
# ══════════════════════════════════════════════════════════════
print(f"\n{'=' * 70}")
print(" TECHNIQUE 9: Braak Predictor β€” Advanced Methods")
print("=" * 70)
feature_cols_braak = [c for c in df_braak.columns if c not in ['donor_label', 'braak_numeric', 'total_cells']]
X_braak = df_braak[feature_cols_braak].fillna(0).values
y_braak = df_braak['braak_numeric'].astype(int).values
y_braak_grouped = np.where(y_braak <= 1, 0, np.where(y_braak <= 3, 1, 2))
X_braak_scaled = StandardScaler().fit_transform(X_braak)
pca_braak = PCA(n_components=0.95, random_state=42)
X_braak_pca = pca_braak.fit_transform(X_braak_scaled)
print(f" Donors: {X_braak.shape[0]}, Features: {X_braak.shape[1]} β†’ PCA: {X_braak_pca.shape[1]}")
cv_braak = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
braak_models = {
'GradientBoosting': GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42),
'ExtraTrees': ExtraTreesClassifier(n_estimators=300, max_depth=5, random_state=42),
'LDA': LinearDiscriminantAnalysis(),
'SVM-RBF': SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42),
'AdaBoost': AdaBoostClassifier(n_estimators=200, learning_rate=0.05, random_state=42),
'BaggingGB': BaggingClassifier(
estimator=GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=42),
n_estimators=20, random_state=42
),
}
stacking_braak = StackingClassifier(
estimators=[
('gb', GradientBoostingClassifier(n_estimators=150, max_depth=3, random_state=42)),
('et', ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42)),
('lda', LinearDiscriminantAnalysis()),
('svm', SVC(kernel='rbf', probability=True, random_state=42)),
],
final_estimator=LogisticRegression(C=1.0, max_iter=1000, random_state=42),
cv=5, n_jobs=-1
)
braak_models['Stacking'] = stacking_braak
print(f"\n {'Model':<25s} {'Accuracy':>10s} {'Balanced':>10s} {'AUC':>8s}")
print(f" {'─'*55}")
for name, model in braak_models.items():
y_pred = cross_val_predict(model, X_braak_pca, y_braak_grouped, cv=cv_braak)
acc = accuracy_score(y_braak_grouped, y_pred)
bal = balanced_accuracy_score(y_braak_grouped, y_pred)
try:
y_proba = cross_val_predict(model, X_braak_pca, y_braak_grouped, cv=cv_braak, method='predict_proba')
auc = roc_auc_score(y_braak_grouped, y_proba, multi_class='ovr')
except:
auc = 0
print(f" {name:<25s} {acc:>9.1%} {bal:>9.1%} {auc:>7.3f}")
results[f'braak_{name}'] = {'acc': acc, 'bal_acc': bal, 'auc': auc}
# ══════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════
print(f"\n\n{'=' * 70}")
print(" SUMMARY β€” BEST RESULTS PER TECHNIQUE")
print("=" * 70)
# Save all results
with open(os.path.join(OUTPUT_DIR, 'advanced_ml_results.json'), 'w') as f:
# Convert numpy types
clean_results = {}
for k, v in results.items():
clean_results[k] = {k2: float(v2) if isinstance(v2, (np.floating, np.integer)) else v2 for k2, v2 in v.items()}
json.dump(clean_results, f, indent=2)
print(f"\n Results saved to {OUTPUT_DIR}/advanced_ml_results.json")
print(f"\n Author: Satyawan Singh β€” Infonova Solutions")