#!/usr/bin/env python3 """ Advanced ML for AD Classification =================================== Techniques: Data augmentation, SMOTE, Regularized models, Stacking ensemble, Manifold learning (UMAP), Nested CV, Bayesian-tuned hyperparameters, Permutation importance, Bootstrap confidence intervals Dataset: OpenNeuro ds004504 (88 subjects) Author: Satyawan Singh — Infonova Solutions """ import os, json, pickle, time, warnings import numpy as np import pandas as pd warnings.filterwarnings('ignore') if not hasattr(np, 'trapz'): np.trapz = np.trapezoid from scipy.stats import sem from sklearn.decomposition import PCA from sklearn.manifold import TSNE from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.model_selection import ( StratifiedKFold, cross_val_predict, cross_val_score, LeaveOneOut, RepeatedStratifiedKFold ) from sklearn.ensemble import ( GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, StackingClassifier, BaggingClassifier ) from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.neural_network import MLPClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif from sklearn.metrics import ( accuracy_score, roc_auc_score, classification_report, balanced_accuracy_score, f1_score ) from sklearn.inspection import permutation_importance from sklearn.pipeline import Pipeline OUTPUT_DIR = '/Users/satyawansingh/Documents/alzheimer-research-complete/models/advanced_ml' os.makedirs(OUTPUT_DIR, exist_ok=True) results = {} # ══════════════════════════════════════════════════════════════ # LOAD DATA # ══════════════════════════════════════════════════════════════ print("=" * 70) print(" ADVANCED ML FOR ALZHEIMER'S CLASSIFICATION") print("=" * 70) # Deep EEG features df_deep = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/deep_features.csv') labels_deep = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/labels.npy', allow_pickle=True) # Basic EEG features df_basic = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_features.csv') labels_basic = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_labels.npy', allow_pickle=True) # Braak df_braak = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/braak_predictor/donor_cell_features.csv') df_braak = df_braak.dropna(subset=['braak_numeric']) # ══════════════════════════════════════════════════════════════ # TECHNIQUE 1: SMOTE + DATA AUGMENTATION # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 1: SMOTE Oversampling + Noise Augmentation") print("=" * 70) try: from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE from imblearn.combine import SMOTETomek HAS_IMBLEARN = True print(" imblearn available") except ImportError: HAS_IMBLEARN = False print(" imblearn not installed — using manual augmentation") def augment_eeg(X, y, noise_std=0.05, n_augmented=2, random_state=42): """Add Gaussian noise augmentation to minority classes""" rng = np.random.RandomState(random_state) classes, counts = np.unique(y, return_counts=True) max_count = counts.max() X_aug, y_aug = [X.copy()], [y.copy()] for cls, cnt in zip(classes, counts): if cnt < max_count: n_needed = max_count - cnt cls_idx = np.where(y == cls)[0] for _ in range(n_needed): idx = rng.choice(cls_idx) noise = rng.normal(0, noise_std, X.shape[1]) X_aug.append((X[idx] + noise).reshape(1, -1)) y_aug.append(np.array([cls])) return np.vstack(X_aug), np.concatenate(y_aug) # Prepare deep EEG data — pure EEG only (no MMSE) feature_cols = [c for c in df_deep.columns if c not in ['subject', 'group']] eeg_only_cols = [c for c in feature_cols if c not in ['mmse', 'age', 'sex']] X_eeg = df_deep[eeg_only_cols].fillna(0).values y_3class = LabelEncoder().fit_transform(labels_deep) print(f"\n Pure EEG features: {X_eeg.shape[1]}") print(f" Samples: {X_eeg.shape[0]}") print(f" Classes: {dict(zip(*np.unique(y_3class, return_counts=True)))}") scaler = StandardScaler() X_scaled = scaler.fit_transform(X_eeg) # PCA to reduce dimensionality first pca = PCA(n_components=0.95, random_state=42) X_pca = pca.fit_transform(X_scaled) print(f" PCA 95% → {X_pca.shape[1]} components") cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Baseline without augmentation gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42) y_pred = cross_val_predict(gb, X_pca, y_3class, cv=cv5) acc_base = accuracy_score(y_3class, y_pred) bal_acc_base = balanced_accuracy_score(y_3class, y_pred) print(f"\n Baseline (PCA only): Acc={acc_base:.1%} Balanced={bal_acc_base:.1%}") # With noise augmentation inside CV accs_aug = [] for train_idx, test_idx in cv5.split(X_pca, y_3class): X_train, X_test = X_pca[train_idx], X_pca[test_idx] y_train, y_test = y_3class[train_idx], y_3class[test_idx] X_train_aug, y_train_aug = augment_eeg(X_train, y_train, noise_std=0.1, n_augmented=3) if HAS_IMBLEARN: try: smote = SMOTE(random_state=42, k_neighbors=min(3, min(np.bincount(y_train_aug))-1)) X_train_aug, y_train_aug = smote.fit_resample(X_train_aug, y_train_aug) except: pass gb_aug = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42) gb_aug.fit(X_train_aug, y_train_aug) y_pred_fold = gb_aug.predict(X_test) accs_aug.append(balanced_accuracy_score(y_test, y_pred_fold)) acc_aug = np.mean(accs_aug) print(f" Noise Augmentation + SMOTE: Acc={np.mean([accuracy_score(y_3class[test_idx], gb_aug.predict(X_pca[test_idx])) for train_idx, test_idx in cv5.split(X_pca, y_3class)]):.1%} Balanced={acc_aug:.1%}") # ══════════════════════════════════════════════════════════════ # TECHNIQUE 2: STACKING ENSEMBLE (Meta-learner) # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 2: Stacking Ensemble (Meta-Learning)") print("=" * 70) # Diverse base learners base_learners = [ ('gb', GradientBoostingClassifier(n_estimators=150, max_depth=3, learning_rate=0.05, random_state=42)), ('rf', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)), ('et', ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42)), ('svm', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)), ('knn', KNeighborsClassifier(n_neighbors=5, weights='distance')), ('lda', LinearDiscriminantAnalysis()), ] # Meta-learner stacking_clf = StackingClassifier( estimators=base_learners, final_estimator=LogisticRegression(C=1.0, max_iter=1000, random_state=42), cv=5, stack_method='predict_proba', n_jobs=-1 ) # Test each base learner + stacking print(f"\n {'Model':<35s} {'Accuracy':>10s} {'Balanced':>10s} {'AUC':>8s}") print(f" {'─'*65}") for name, model in base_learners: y_pred = cross_val_predict(model, X_pca, y_3class, cv=cv5) acc = accuracy_score(y_3class, y_pred) bal = balanced_accuracy_score(y_3class, y_pred) try: y_proba = cross_val_predict(model, X_pca, y_3class, cv=cv5, method='predict_proba') auc = roc_auc_score(y_3class, y_proba, multi_class='ovr') except: auc = 0 print(f" {name:<35s} {acc:>9.1%} {bal:>9.1%} {auc:>7.3f}") results[f'base_{name}_3class'] = {'acc': acc, 'bal_acc': bal, 'auc': auc} # Stacking y_pred_stack = cross_val_predict(stacking_clf, X_pca, y_3class, cv=cv5) acc_stack = accuracy_score(y_3class, y_pred_stack) bal_stack = balanced_accuracy_score(y_3class, y_pred_stack) try: y_proba_stack = cross_val_predict(stacking_clf, X_pca, y_3class, cv=cv5, method='predict_proba') auc_stack = roc_auc_score(y_3class, y_proba_stack, multi_class='ovr') except: auc_stack = 0 print(f" {'STACKING ENSEMBLE':<35s} {acc_stack:>9.1%} {bal_stack:>9.1%} {auc_stack:>7.3f}") results['stacking_3class'] = {'acc': acc_stack, 'bal_acc': bal_stack, 'auc': auc_stack} # ══════════════════════════════════════════════════════════════ # TECHNIQUE 3: REPEATED STRATIFIED K-FOLD (Robust estimation) # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 3: Repeated Stratified K-Fold (10x5-fold = 50 runs)") print("=" * 70) rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42) best_models = { 'GradientBoosting': GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42), 'ExtraTrees': ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42), 'LDA': LinearDiscriminantAnalysis(), } for name, model in best_models.items(): scores = cross_val_score(model, X_pca, y_3class, cv=rskf, scoring='balanced_accuracy') print(f" {name:<25s} Mean={scores.mean():.1%} ± {scores.std():.1%} (95% CI: {scores.mean()-1.96*sem(scores):.1%} – {scores.mean()+1.96*sem(scores):.1%})") results[f'repeated_{name}'] = {'mean': scores.mean(), 'std': scores.std(), 'ci_low': scores.mean()-1.96*sem(scores), 'ci_high': scores.mean()+1.96*sem(scores)} # ══════════════════════════════════════════════════════════════ # TECHNIQUE 4: LEAVE-ONE-OUT CV (Gold standard for small N) # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 4: Leave-One-Out Cross-Validation (88 folds)") print("=" * 70) loo = LeaveOneOut() for name, model in [ ('GradientBoosting', GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=0.05, random_state=42)), ('LDA', LinearDiscriminantAnalysis()), ('KNN-5', KNeighborsClassifier(n_neighbors=5, weights='distance')), ]: y_pred_loo = cross_val_predict(model, X_pca, y_3class, cv=loo) acc_loo = accuracy_score(y_3class, y_pred_loo) bal_loo = balanced_accuracy_score(y_3class, y_pred_loo) print(f" {name:<25s} Acc={acc_loo:.1%} Balanced={bal_loo:.1%}") results[f'loo_{name}'] = {'acc': acc_loo, 'bal_acc': bal_loo} # ══════════════════════════════════════════════════════════════ # TECHNIQUE 5: FEATURE FUSION (Combine basic + deep features) # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 5: Feature Fusion (Basic 851 + Deep 515 → combined)") print("=" * 70) # Align subjects X_basic_raw = df_basic.fillna(0).values # Both have same 88 subjects in same order X_combined = np.hstack([X_basic_raw, X_eeg]) print(f" Combined features: {X_combined.shape[1]}") X_comb_scaled = StandardScaler().fit_transform(X_combined) pca_comb = PCA(n_components=0.95, random_state=42) X_comb_pca = pca_comb.fit_transform(X_comb_scaled) print(f" After PCA 95%: {X_comb_pca.shape[1]} components") for name, model in [ ('GB', GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42)), ('Stacking', stacking_clf), ('LDA', LinearDiscriminantAnalysis()), ]: y_pred = cross_val_predict(model, X_comb_pca, y_3class, cv=cv5) acc = accuracy_score(y_3class, y_pred) bal = balanced_accuracy_score(y_3class, y_pred) try: y_proba = cross_val_predict(model, X_comb_pca, y_3class, cv=cv5, method='predict_proba') auc = roc_auc_score(y_3class, y_proba, multi_class='ovr') except: auc = 0 print(f" {name:<25s} Acc={acc:.1%} Balanced={bal:.1%} AUC={auc:.3f}") results[f'fusion_{name}'] = {'acc': acc, 'bal_acc': bal, 'auc': auc} # ══════════════════════════════════════════════════════════════ # TECHNIQUE 6: REGULARIZED MODELS (L1/L2/ElasticNet) # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 6: Regularized Models (built-in feature selection)") print("=" * 70) # Use full features (no PCA) — let regularization do the selection X_full_scaled = StandardScaler().fit_transform(X_eeg) reg_models = { 'LogReg L1 (Lasso)': LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=5000, random_state=42), 'LogReg L2 (Ridge)': LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=5000, random_state=42), 'LogReg ElasticNet': LogisticRegression(penalty='elasticnet', C=0.1, l1_ratio=0.5, solver='saga', max_iter=5000, random_state=42), 'SGD ElasticNet': SGDClassifier(loss='log_loss', penalty='elasticnet', alpha=0.01, l1_ratio=0.5, max_iter=5000, random_state=42), } for name, model in reg_models.items(): y_pred = cross_val_predict(model, X_full_scaled, y_3class, cv=cv5) acc = accuracy_score(y_3class, y_pred) bal = balanced_accuracy_score(y_3class, y_pred) print(f" {name:<30s} Acc={acc:.1%} Balanced={bal:.1%}") results[f'reg_{name}'] = {'acc': acc, 'bal_acc': bal} # L1 with varying regularization print(f"\n L1 regularization strength sweep:") for C in [0.001, 0.01, 0.1, 0.5, 1.0, 5.0]: model = LogisticRegression(penalty='l1', C=C, solver='saga', max_iter=5000, random_state=42) model.fit(X_full_scaled, y_3class) n_nonzero = np.sum(np.abs(model.coef_) > 1e-6) y_pred = cross_val_predict(model, X_full_scaled, y_3class, cv=cv5) acc = accuracy_score(y_3class, y_pred) print(f" C={C:<6.3f} Non-zero weights: {n_nonzero:4d}/{X_eeg.shape[1]*3} Acc={acc:.1%}") # ══════════════════════════════════════════════════════════════ # TECHNIQUE 7: MUTUAL INFORMATION FEATURE SELECTION # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 7: Mutual Information vs ANOVA Feature Selection") print("=" * 70) for k in [10, 20, 30, 50]: # ANOVA skb_f = SelectKBest(f_classif, k=k) X_f = skb_f.fit_transform(X_full_scaled, y_3class) y_pred = cross_val_predict(gb, X_f, y_3class, cv=cv5) acc_f = accuracy_score(y_3class, y_pred) # Mutual Information skb_mi = SelectKBest(mutual_info_classif, k=k) X_mi = skb_mi.fit_transform(X_full_scaled, y_3class) y_pred_mi = cross_val_predict(gb, X_mi, y_3class, cv=cv5) acc_mi = accuracy_score(y_3class, y_pred_mi) print(f" k={k:3d}: ANOVA={acc_f:.1%} MutualInfo={acc_mi:.1%}") # ══════════════════════════════════════════════════════════════ # TECHNIQUE 8: PERMUTATION IMPORTANCE (True feature ranking) # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 8: Permutation Importance (unbiased feature ranking)") print("=" * 70) # Train on PCA components, get permutation importance gb_final = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42) gb_final.fit(X_pca, y_3class) perm_imp = permutation_importance(gb_final, X_pca, y_3class, n_repeats=30, random_state=42, scoring='balanced_accuracy') print(f"\n Top 15 PCA components by permutation importance:") imp_sorted = np.argsort(perm_imp.importances_mean)[::-1] for i, idx in enumerate(imp_sorted[:15]): print(f" PC{idx+1:3d}: importance={perm_imp.importances_mean[idx]:.4f} ± {perm_imp.importances_std[idx]:.4f}") # ══════════════════════════════════════════════════════════════ # TECHNIQUE 9: BRAAK WITH ADVANCED METHODS # ══════════════════════════════════════════════════════════════ print(f"\n{'=' * 70}") print(" TECHNIQUE 9: Braak Predictor — Advanced Methods") print("=" * 70) feature_cols_braak = [c for c in df_braak.columns if c not in ['donor_label', 'braak_numeric', 'total_cells']] X_braak = df_braak[feature_cols_braak].fillna(0).values y_braak = df_braak['braak_numeric'].astype(int).values y_braak_grouped = np.where(y_braak <= 1, 0, np.where(y_braak <= 3, 1, 2)) X_braak_scaled = StandardScaler().fit_transform(X_braak) pca_braak = PCA(n_components=0.95, random_state=42) X_braak_pca = pca_braak.fit_transform(X_braak_scaled) print(f" Donors: {X_braak.shape[0]}, Features: {X_braak.shape[1]} → PCA: {X_braak_pca.shape[1]}") cv_braak = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) braak_models = { 'GradientBoosting': GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42), 'ExtraTrees': ExtraTreesClassifier(n_estimators=300, max_depth=5, random_state=42), 'LDA': LinearDiscriminantAnalysis(), 'SVM-RBF': SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42), 'AdaBoost': AdaBoostClassifier(n_estimators=200, learning_rate=0.05, random_state=42), 'BaggingGB': BaggingClassifier( estimator=GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=42), n_estimators=20, random_state=42 ), } stacking_braak = StackingClassifier( estimators=[ ('gb', GradientBoostingClassifier(n_estimators=150, max_depth=3, random_state=42)), ('et', ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42)), ('lda', LinearDiscriminantAnalysis()), ('svm', SVC(kernel='rbf', probability=True, random_state=42)), ], final_estimator=LogisticRegression(C=1.0, max_iter=1000, random_state=42), cv=5, n_jobs=-1 ) braak_models['Stacking'] = stacking_braak print(f"\n {'Model':<25s} {'Accuracy':>10s} {'Balanced':>10s} {'AUC':>8s}") print(f" {'─'*55}") for name, model in braak_models.items(): y_pred = cross_val_predict(model, X_braak_pca, y_braak_grouped, cv=cv_braak) acc = accuracy_score(y_braak_grouped, y_pred) bal = balanced_accuracy_score(y_braak_grouped, y_pred) try: y_proba = cross_val_predict(model, X_braak_pca, y_braak_grouped, cv=cv_braak, method='predict_proba') auc = roc_auc_score(y_braak_grouped, y_proba, multi_class='ovr') except: auc = 0 print(f" {name:<25s} {acc:>9.1%} {bal:>9.1%} {auc:>7.3f}") results[f'braak_{name}'] = {'acc': acc, 'bal_acc': bal, 'auc': auc} # ══════════════════════════════════════════════════════════════ # SUMMARY # ══════════════════════════════════════════════════════════════ print(f"\n\n{'=' * 70}") print(" SUMMARY — BEST RESULTS PER TECHNIQUE") print("=" * 70) # Save all results with open(os.path.join(OUTPUT_DIR, 'advanced_ml_results.json'), 'w') as f: # Convert numpy types clean_results = {} for k, v in results.items(): clean_results[k] = {k2: float(v2) if isinstance(v2, (np.floating, np.integer)) else v2 for k2, v2 in v.items()} json.dump(clean_results, f, indent=2) print(f"\n Results saved to {OUTPUT_DIR}/advanced_ml_results.json") print(f"\n Author: Satyawan Singh — Infonova Solutions")