| |
| """ |
| Advanced ML for AD Classification |
| =================================== |
| Techniques: Data augmentation, SMOTE, Regularized models, Stacking ensemble, |
| Manifold learning (UMAP), Nested CV, Bayesian-tuned hyperparameters, |
| Permutation importance, Bootstrap confidence intervals |
| |
| Dataset: OpenNeuro ds004504 (88 subjects) |
| Author: Satyawan Singh β Infonova Solutions |
| """ |
| import os, json, pickle, time, warnings |
| import numpy as np |
| import pandas as pd |
| warnings.filterwarnings('ignore') |
|
|
| if not hasattr(np, 'trapz'): |
| np.trapz = np.trapezoid |
|
|
| from scipy.stats import sem |
| from sklearn.decomposition import PCA |
| from sklearn.manifold import TSNE |
| from sklearn.preprocessing import StandardScaler, LabelEncoder |
| from sklearn.model_selection import ( |
| StratifiedKFold, cross_val_predict, cross_val_score, |
| LeaveOneOut, RepeatedStratifiedKFold |
| ) |
| from sklearn.ensemble import ( |
| GradientBoostingClassifier, RandomForestClassifier, |
| ExtraTreesClassifier, AdaBoostClassifier, StackingClassifier, |
| BaggingClassifier |
| ) |
| from sklearn.svm import SVC |
| from sklearn.linear_model import LogisticRegression, SGDClassifier |
| from sklearn.neighbors import KNeighborsClassifier |
| from sklearn.neural_network import MLPClassifier |
| from sklearn.discriminant_analysis import LinearDiscriminantAnalysis |
| from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif |
| from sklearn.metrics import ( |
| accuracy_score, roc_auc_score, classification_report, |
| balanced_accuracy_score, f1_score |
| ) |
| from sklearn.inspection import permutation_importance |
| from sklearn.pipeline import Pipeline |
|
|
| OUTPUT_DIR = '/Users/satyawansingh/Documents/alzheimer-research-complete/models/advanced_ml' |
| os.makedirs(OUTPUT_DIR, exist_ok=True) |
|
|
| results = {} |
|
|
| |
| |
| |
| print("=" * 70) |
| print(" ADVANCED ML FOR ALZHEIMER'S CLASSIFICATION") |
| print("=" * 70) |
|
|
| |
| df_deep = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/deep_features.csv') |
| labels_deep = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_deep_analysis/labels.npy', allow_pickle=True) |
|
|
| |
| df_basic = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_features.csv') |
| labels_basic = np.load('/Users/satyawansingh/Documents/alzheimer-research-complete/models/eeg_ad_classifier/eeg_labels.npy', allow_pickle=True) |
|
|
| |
| df_braak = pd.read_csv('/Users/satyawansingh/Documents/alzheimer-research-complete/models/braak_predictor/donor_cell_features.csv') |
| df_braak = df_braak.dropna(subset=['braak_numeric']) |
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 1: SMOTE Oversampling + Noise Augmentation") |
| print("=" * 70) |
|
|
| try: |
| from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE |
| from imblearn.combine import SMOTETomek |
| HAS_IMBLEARN = True |
| print(" imblearn available") |
| except ImportError: |
| HAS_IMBLEARN = False |
| print(" imblearn not installed β using manual augmentation") |
|
|
| def augment_eeg(X, y, noise_std=0.05, n_augmented=2, random_state=42): |
| """Add Gaussian noise augmentation to minority classes""" |
| rng = np.random.RandomState(random_state) |
| classes, counts = np.unique(y, return_counts=True) |
| max_count = counts.max() |
| |
| X_aug, y_aug = [X.copy()], [y.copy()] |
| |
| for cls, cnt in zip(classes, counts): |
| if cnt < max_count: |
| n_needed = max_count - cnt |
| cls_idx = np.where(y == cls)[0] |
| for _ in range(n_needed): |
| idx = rng.choice(cls_idx) |
| noise = rng.normal(0, noise_std, X.shape[1]) |
| X_aug.append((X[idx] + noise).reshape(1, -1)) |
| y_aug.append(np.array([cls])) |
| |
| return np.vstack(X_aug), np.concatenate(y_aug) |
|
|
|
|
| |
| feature_cols = [c for c in df_deep.columns if c not in ['subject', 'group']] |
| eeg_only_cols = [c for c in feature_cols if c not in ['mmse', 'age', 'sex']] |
| X_eeg = df_deep[eeg_only_cols].fillna(0).values |
| y_3class = LabelEncoder().fit_transform(labels_deep) |
|
|
| print(f"\n Pure EEG features: {X_eeg.shape[1]}") |
| print(f" Samples: {X_eeg.shape[0]}") |
| print(f" Classes: {dict(zip(*np.unique(y_3class, return_counts=True)))}") |
|
|
| scaler = StandardScaler() |
| X_scaled = scaler.fit_transform(X_eeg) |
|
|
| |
| pca = PCA(n_components=0.95, random_state=42) |
| X_pca = pca.fit_transform(X_scaled) |
| print(f" PCA 95% β {X_pca.shape[1]} components") |
|
|
| cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) |
|
|
| |
| gb = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42) |
| y_pred = cross_val_predict(gb, X_pca, y_3class, cv=cv5) |
| acc_base = accuracy_score(y_3class, y_pred) |
| bal_acc_base = balanced_accuracy_score(y_3class, y_pred) |
| print(f"\n Baseline (PCA only): Acc={acc_base:.1%} Balanced={bal_acc_base:.1%}") |
|
|
| |
| accs_aug = [] |
| for train_idx, test_idx in cv5.split(X_pca, y_3class): |
| X_train, X_test = X_pca[train_idx], X_pca[test_idx] |
| y_train, y_test = y_3class[train_idx], y_3class[test_idx] |
| |
| X_train_aug, y_train_aug = augment_eeg(X_train, y_train, noise_std=0.1, n_augmented=3) |
| |
| if HAS_IMBLEARN: |
| try: |
| smote = SMOTE(random_state=42, k_neighbors=min(3, min(np.bincount(y_train_aug))-1)) |
| X_train_aug, y_train_aug = smote.fit_resample(X_train_aug, y_train_aug) |
| except: |
| pass |
| |
| gb_aug = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42) |
| gb_aug.fit(X_train_aug, y_train_aug) |
| y_pred_fold = gb_aug.predict(X_test) |
| accs_aug.append(balanced_accuracy_score(y_test, y_pred_fold)) |
|
|
| acc_aug = np.mean(accs_aug) |
| print(f" Noise Augmentation + SMOTE: Acc={np.mean([accuracy_score(y_3class[test_idx], gb_aug.predict(X_pca[test_idx])) for train_idx, test_idx in cv5.split(X_pca, y_3class)]):.1%} Balanced={acc_aug:.1%}") |
|
|
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 2: Stacking Ensemble (Meta-Learning)") |
| print("=" * 70) |
|
|
| |
| base_learners = [ |
| ('gb', GradientBoostingClassifier(n_estimators=150, max_depth=3, learning_rate=0.05, random_state=42)), |
| ('rf', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)), |
| ('et', ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42)), |
| ('svm', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)), |
| ('knn', KNeighborsClassifier(n_neighbors=5, weights='distance')), |
| ('lda', LinearDiscriminantAnalysis()), |
| ] |
|
|
| |
| stacking_clf = StackingClassifier( |
| estimators=base_learners, |
| final_estimator=LogisticRegression(C=1.0, max_iter=1000, random_state=42), |
| cv=5, |
| stack_method='predict_proba', |
| n_jobs=-1 |
| ) |
|
|
| |
| print(f"\n {'Model':<35s} {'Accuracy':>10s} {'Balanced':>10s} {'AUC':>8s}") |
| print(f" {'β'*65}") |
|
|
| for name, model in base_learners: |
| y_pred = cross_val_predict(model, X_pca, y_3class, cv=cv5) |
| acc = accuracy_score(y_3class, y_pred) |
| bal = balanced_accuracy_score(y_3class, y_pred) |
| try: |
| y_proba = cross_val_predict(model, X_pca, y_3class, cv=cv5, method='predict_proba') |
| auc = roc_auc_score(y_3class, y_proba, multi_class='ovr') |
| except: |
| auc = 0 |
| print(f" {name:<35s} {acc:>9.1%} {bal:>9.1%} {auc:>7.3f}") |
| results[f'base_{name}_3class'] = {'acc': acc, 'bal_acc': bal, 'auc': auc} |
|
|
| |
| y_pred_stack = cross_val_predict(stacking_clf, X_pca, y_3class, cv=cv5) |
| acc_stack = accuracy_score(y_3class, y_pred_stack) |
| bal_stack = balanced_accuracy_score(y_3class, y_pred_stack) |
| try: |
| y_proba_stack = cross_val_predict(stacking_clf, X_pca, y_3class, cv=cv5, method='predict_proba') |
| auc_stack = roc_auc_score(y_3class, y_proba_stack, multi_class='ovr') |
| except: |
| auc_stack = 0 |
| print(f" {'STACKING ENSEMBLE':<35s} {acc_stack:>9.1%} {bal_stack:>9.1%} {auc_stack:>7.3f}") |
| results['stacking_3class'] = {'acc': acc_stack, 'bal_acc': bal_stack, 'auc': auc_stack} |
|
|
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 3: Repeated Stratified K-Fold (10x5-fold = 50 runs)") |
| print("=" * 70) |
|
|
| rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42) |
|
|
| best_models = { |
| 'GradientBoosting': GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42), |
| 'ExtraTrees': ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42), |
| 'LDA': LinearDiscriminantAnalysis(), |
| } |
|
|
| for name, model in best_models.items(): |
| scores = cross_val_score(model, X_pca, y_3class, cv=rskf, scoring='balanced_accuracy') |
| print(f" {name:<25s} Mean={scores.mean():.1%} Β± {scores.std():.1%} (95% CI: {scores.mean()-1.96*sem(scores):.1%} β {scores.mean()+1.96*sem(scores):.1%})") |
| results[f'repeated_{name}'] = {'mean': scores.mean(), 'std': scores.std(), 'ci_low': scores.mean()-1.96*sem(scores), 'ci_high': scores.mean()+1.96*sem(scores)} |
|
|
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 4: Leave-One-Out Cross-Validation (88 folds)") |
| print("=" * 70) |
|
|
| loo = LeaveOneOut() |
| for name, model in [ |
| ('GradientBoosting', GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=0.05, random_state=42)), |
| ('LDA', LinearDiscriminantAnalysis()), |
| ('KNN-5', KNeighborsClassifier(n_neighbors=5, weights='distance')), |
| ]: |
| y_pred_loo = cross_val_predict(model, X_pca, y_3class, cv=loo) |
| acc_loo = accuracy_score(y_3class, y_pred_loo) |
| bal_loo = balanced_accuracy_score(y_3class, y_pred_loo) |
| print(f" {name:<25s} Acc={acc_loo:.1%} Balanced={bal_loo:.1%}") |
| results[f'loo_{name}'] = {'acc': acc_loo, 'bal_acc': bal_loo} |
|
|
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 5: Feature Fusion (Basic 851 + Deep 515 β combined)") |
| print("=" * 70) |
|
|
| |
| X_basic_raw = df_basic.fillna(0).values |
| |
| X_combined = np.hstack([X_basic_raw, X_eeg]) |
| print(f" Combined features: {X_combined.shape[1]}") |
|
|
| X_comb_scaled = StandardScaler().fit_transform(X_combined) |
| pca_comb = PCA(n_components=0.95, random_state=42) |
| X_comb_pca = pca_comb.fit_transform(X_comb_scaled) |
| print(f" After PCA 95%: {X_comb_pca.shape[1]} components") |
|
|
| for name, model in [ |
| ('GB', GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42)), |
| ('Stacking', stacking_clf), |
| ('LDA', LinearDiscriminantAnalysis()), |
| ]: |
| y_pred = cross_val_predict(model, X_comb_pca, y_3class, cv=cv5) |
| acc = accuracy_score(y_3class, y_pred) |
| bal = balanced_accuracy_score(y_3class, y_pred) |
| try: |
| y_proba = cross_val_predict(model, X_comb_pca, y_3class, cv=cv5, method='predict_proba') |
| auc = roc_auc_score(y_3class, y_proba, multi_class='ovr') |
| except: |
| auc = 0 |
| print(f" {name:<25s} Acc={acc:.1%} Balanced={bal:.1%} AUC={auc:.3f}") |
| results[f'fusion_{name}'] = {'acc': acc, 'bal_acc': bal, 'auc': auc} |
|
|
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 6: Regularized Models (built-in feature selection)") |
| print("=" * 70) |
|
|
| |
| X_full_scaled = StandardScaler().fit_transform(X_eeg) |
|
|
| reg_models = { |
| 'LogReg L1 (Lasso)': LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=5000, random_state=42), |
| 'LogReg L2 (Ridge)': LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=5000, random_state=42), |
| 'LogReg ElasticNet': LogisticRegression(penalty='elasticnet', C=0.1, l1_ratio=0.5, solver='saga', max_iter=5000, random_state=42), |
| 'SGD ElasticNet': SGDClassifier(loss='log_loss', penalty='elasticnet', alpha=0.01, l1_ratio=0.5, max_iter=5000, random_state=42), |
| } |
|
|
| for name, model in reg_models.items(): |
| y_pred = cross_val_predict(model, X_full_scaled, y_3class, cv=cv5) |
| acc = accuracy_score(y_3class, y_pred) |
| bal = balanced_accuracy_score(y_3class, y_pred) |
| print(f" {name:<30s} Acc={acc:.1%} Balanced={bal:.1%}") |
| results[f'reg_{name}'] = {'acc': acc, 'bal_acc': bal} |
|
|
| |
| print(f"\n L1 regularization strength sweep:") |
| for C in [0.001, 0.01, 0.1, 0.5, 1.0, 5.0]: |
| model = LogisticRegression(penalty='l1', C=C, solver='saga', max_iter=5000, random_state=42) |
| model.fit(X_full_scaled, y_3class) |
| n_nonzero = np.sum(np.abs(model.coef_) > 1e-6) |
| y_pred = cross_val_predict(model, X_full_scaled, y_3class, cv=cv5) |
| acc = accuracy_score(y_3class, y_pred) |
| print(f" C={C:<6.3f} Non-zero weights: {n_nonzero:4d}/{X_eeg.shape[1]*3} Acc={acc:.1%}") |
|
|
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 7: Mutual Information vs ANOVA Feature Selection") |
| print("=" * 70) |
|
|
| for k in [10, 20, 30, 50]: |
| |
| skb_f = SelectKBest(f_classif, k=k) |
| X_f = skb_f.fit_transform(X_full_scaled, y_3class) |
| y_pred = cross_val_predict(gb, X_f, y_3class, cv=cv5) |
| acc_f = accuracy_score(y_3class, y_pred) |
| |
| |
| skb_mi = SelectKBest(mutual_info_classif, k=k) |
| X_mi = skb_mi.fit_transform(X_full_scaled, y_3class) |
| y_pred_mi = cross_val_predict(gb, X_mi, y_3class, cv=cv5) |
| acc_mi = accuracy_score(y_3class, y_pred_mi) |
| |
| print(f" k={k:3d}: ANOVA={acc_f:.1%} MutualInfo={acc_mi:.1%}") |
|
|
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 8: Permutation Importance (unbiased feature ranking)") |
| print("=" * 70) |
|
|
| |
| gb_final = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42) |
| gb_final.fit(X_pca, y_3class) |
|
|
| perm_imp = permutation_importance(gb_final, X_pca, y_3class, n_repeats=30, random_state=42, scoring='balanced_accuracy') |
|
|
| print(f"\n Top 15 PCA components by permutation importance:") |
| imp_sorted = np.argsort(perm_imp.importances_mean)[::-1] |
| for i, idx in enumerate(imp_sorted[:15]): |
| print(f" PC{idx+1:3d}: importance={perm_imp.importances_mean[idx]:.4f} Β± {perm_imp.importances_std[idx]:.4f}") |
|
|
|
|
| |
| |
| |
| print(f"\n{'=' * 70}") |
| print(" TECHNIQUE 9: Braak Predictor β Advanced Methods") |
| print("=" * 70) |
|
|
| feature_cols_braak = [c for c in df_braak.columns if c not in ['donor_label', 'braak_numeric', 'total_cells']] |
| X_braak = df_braak[feature_cols_braak].fillna(0).values |
| y_braak = df_braak['braak_numeric'].astype(int).values |
| y_braak_grouped = np.where(y_braak <= 1, 0, np.where(y_braak <= 3, 1, 2)) |
|
|
| X_braak_scaled = StandardScaler().fit_transform(X_braak) |
| pca_braak = PCA(n_components=0.95, random_state=42) |
| X_braak_pca = pca_braak.fit_transform(X_braak_scaled) |
|
|
| print(f" Donors: {X_braak.shape[0]}, Features: {X_braak.shape[1]} β PCA: {X_braak_pca.shape[1]}") |
|
|
| cv_braak = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) |
|
|
| braak_models = { |
| 'GradientBoosting': GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.05, random_state=42), |
| 'ExtraTrees': ExtraTreesClassifier(n_estimators=300, max_depth=5, random_state=42), |
| 'LDA': LinearDiscriminantAnalysis(), |
| 'SVM-RBF': SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42), |
| 'AdaBoost': AdaBoostClassifier(n_estimators=200, learning_rate=0.05, random_state=42), |
| 'BaggingGB': BaggingClassifier( |
| estimator=GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=42), |
| n_estimators=20, random_state=42 |
| ), |
| } |
|
|
| stacking_braak = StackingClassifier( |
| estimators=[ |
| ('gb', GradientBoostingClassifier(n_estimators=150, max_depth=3, random_state=42)), |
| ('et', ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42)), |
| ('lda', LinearDiscriminantAnalysis()), |
| ('svm', SVC(kernel='rbf', probability=True, random_state=42)), |
| ], |
| final_estimator=LogisticRegression(C=1.0, max_iter=1000, random_state=42), |
| cv=5, n_jobs=-1 |
| ) |
| braak_models['Stacking'] = stacking_braak |
|
|
| print(f"\n {'Model':<25s} {'Accuracy':>10s} {'Balanced':>10s} {'AUC':>8s}") |
| print(f" {'β'*55}") |
|
|
| for name, model in braak_models.items(): |
| y_pred = cross_val_predict(model, X_braak_pca, y_braak_grouped, cv=cv_braak) |
| acc = accuracy_score(y_braak_grouped, y_pred) |
| bal = balanced_accuracy_score(y_braak_grouped, y_pred) |
| try: |
| y_proba = cross_val_predict(model, X_braak_pca, y_braak_grouped, cv=cv_braak, method='predict_proba') |
| auc = roc_auc_score(y_braak_grouped, y_proba, multi_class='ovr') |
| except: |
| auc = 0 |
| print(f" {name:<25s} {acc:>9.1%} {bal:>9.1%} {auc:>7.3f}") |
| results[f'braak_{name}'] = {'acc': acc, 'bal_acc': bal, 'auc': auc} |
|
|
|
|
| |
| |
| |
| print(f"\n\n{'=' * 70}") |
| print(" SUMMARY β BEST RESULTS PER TECHNIQUE") |
| print("=" * 70) |
|
|
| |
| with open(os.path.join(OUTPUT_DIR, 'advanced_ml_results.json'), 'w') as f: |
| |
| clean_results = {} |
| for k, v in results.items(): |
| clean_results[k] = {k2: float(v2) if isinstance(v2, (np.floating, np.integer)) else v2 for k2, v2 in v.items()} |
| json.dump(clean_results, f, indent=2) |
|
|
| print(f"\n Results saved to {OUTPUT_DIR}/advanced_ml_results.json") |
| print(f"\n Author: Satyawan Singh β Infonova Solutions") |
|
|