""" Process all 438 SANDI dev files through the full 6-dimension pipeline. Uses Apple MPS acceleration for WhisperX where supported. Usage: python run_sandi_438.py Output: ../EDA/data/sandi_438_full_pipeline.csv """ import sys, os, time, warnings import numpy as np import pandas as pd warnings.filterwarnings("ignore") sys.path.insert(0, os.path.dirname(__file__)) from pathlib import Path from scipy.stats import spearmanr, kendalltau from scipy.optimize import minimize from sklearn.model_selection import KFold BASE = Path(__file__).parent.parent # ── Load file list ── filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv") print(f"Files to process: {len(filelist)}") # ── Load pipeline modules ── print("Loading models...", flush=True) from pipeline.vad import run_vad from pipeline.transcribe import transcribe_and_align from pipeline.placement import classify_pauses from pipeline.fa_features import compute_fa_features from pipeline.syntactic_features import compute_syntactic_features from models.inference import predict from pipeline.composite import compute_composite print("Models loaded.") # ── Process all files ── results = [] errors = 0 start_time = time.time() for idx, row in filelist.iterrows(): audio_path = row['audio_path'] if not os.path.exists(audio_path): audio_path = str(BASE / audio_path) if not os.path.exists(audio_path): errors += 1 continue n = idx + 1 elapsed = time.time() - start_time rate = n / max(elapsed, 1) eta = (len(filelist) - n) / max(rate, 0.01) if n % 10 == 0 or n <= 3 or n == len(filelist): print(f" [{n}/{len(filelist)}] {row['file_id']} (expert={row['expert_score']}) " f"[{elapsed/60:.1f}m elapsed, ~{eta/60:.0f}m remaining]", flush=True) try: vad = run_vad(audio_path) tx = transcribe_and_align(audio_path) words = tx['words'] vad['mlu'] = round(len(words) / max(vad['speech_segments'], 1), 2) placement = classify_pauses(words, vad) fa = compute_fa_features(words, vad['total_duration_sec']) syn = compute_syntactic_features(words, tx['transcript']) all_features = {**vad, **placement, **fa, **syn} predictions = predict(all_features) composite = compute_composite(all_features, predictions) results.append({ 'file_id': row['file_id'], 'expert_score': row['expert_score'], 'composite_raw': composite['composite_raw'], 'composite_percentile': composite['composite_percentile'], 'fluency_band': composite['fluency_band'], 'speech_ratio': vad['speech_ratio'], 'mlu': vad['mlu'], 'word_count': len(words), 'pause_count': vad['pause_count'], 'mean_pause_dur': vad['mean_pause_duration_sec'], 'long_pause_ratio': vad['long_pause_ratio'], 'short_pause_share': vad['short_pause_share'], 'boundary_pause_ratio': placement['boundary_pause_ratio'], 'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'], 'articulation_pred': predictions.get('articulation_ordinal_pred', np.nan), 'articulation_label': predictions.get('articulation_ordinal_label', '?'), 'pause_freq_label': predictions.get('pause_freq_ordinal_label', '?'), 'pause_dur_label': predictions.get('pause_dur_ordinal_label', '?'), 'pause_place_label': predictions.get('pause_place_ordinal_label', '?'), 'cognitive_load_label': predictions.get('cognitive_load_ordinal_label', '?'), 'utt_constraints_label': predictions.get('utterance_constraints_ordinal_label', '?'), 'prop_unplanned': predictions.get('prop_unplanned_pred', np.nan), 'prop_planned': predictions.get('prop_planned_pred', np.nan), 'prop_neutral': predictions.get('prop_neutral_pred', np.nan), 'dim_continuity': composite['dim_continuity'], 'dim_pause_quality': composite['dim_pause_quality'], 'dim_articulation': composite['dim_articulation'], 'dim_dominance': composite['dim_dominance'], 'dim_placement': composite['dim_placement'], 'dim_word_precision': composite['dim_word_precision'], }) except Exception as e: errors += 1 if errors <= 10: print(f" ERROR {row['file_id']}: {str(e)[:80]}") total_time = time.time() - start_time print(f"\nDone: {len(results)}/{len(filelist)} processed, {errors} errors, {total_time/60:.1f} minutes") # ── Save results ── sf = pd.DataFrame(results) out_path = BASE / "EDA/data/sandi_438_full_pipeline.csv" sf.to_csv(out_path, index=False) print(f"Saved: {out_path}") # ══════════════════════════════════════════════════════════════ # ANALYSIS # ══════════════════════════════════════════════════════════════ dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation', 'dim_dominance', 'dim_placement', 'dim_word_precision'] print(f"\n{'='*70}") print(f"SANDI FULL VALIDATION — 6-DIMENSION PIPELINE (N={len(sf)})") print(f"{'='*70}") # 1. Overall correlation rho, p = spearmanr(sf['expert_score'], sf['composite_raw']) tau, tp = kendalltau(sf['expert_score'], sf['composite_raw']) print(f"\n Spearman rho: {rho:.3f} (p={p:.2e})") print(f" Kendall tau: {tau:.3f} (p={tp:.2e})") # 2. Per-dimension print(f"\n Per-dimension correlations with expert score:") for d in dims: v = sf[['expert_score', d]].dropna() if len(v) > 10: r, p_ = spearmanr(v['expert_score'], v[d]) sig = '***' if p_ < 0.001 else '**' if p_ < 0.01 else '*' if p_ < 0.05 else 'ns' print(f" {d:<25s} rho={r:+.3f} {sig}") # 3. Raw features print(f"\n Raw feature correlations:") for feat in ['speech_ratio', 'mlu', 'mean_pause_dur', 'long_pause_ratio', 'mid_clause_pause_ratio']: r, _ = spearmanr(sf['expert_score'], sf[feat]) print(f" {feat:<25s} rho={r:+.3f}") # 4. Band agreement def expert_band(score): if score < 3.0: return 'LOW' elif score < 4.5: return 'MEDIUM' else: return 'HIGH' sf['expert_band'] = sf['expert_score'].apply(expert_band) agree = (sf['expert_band'] == sf['fluency_band']).sum() print(f"\n Band agreement: {agree}/{len(sf)} ({agree/len(sf):.1%})") print(f"\n Confusion matrix:") print(f" {'':>15} Pipeline→ {'LOW':>6} {'MED':>6} {'HIGH':>6}") for eb in ['LOW', 'MEDIUM', 'HIGH']: row = [] for pb in ['LOW', 'MEDIUM', 'HIGH']: n = ((sf['expert_band'] == eb) & (sf['fluency_band'] == pb)).sum() row.append(n) print(f" Expert {eb:>6}: {row[0]:>6} {row[1]:>6} {row[2]:>6}") # 5. Expert score by pipeline band print(f"\n Expert score by pipeline band:") for pb in ['LOW', 'MEDIUM', 'HIGH']: sub = sf[sf['fluency_band'] == pb] if len(sub) > 0: print(f" {pb}: n={len(sub)}, expert_mean={sub['expert_score'].mean():.2f} " f"[{sub['expert_score'].min():.1f}-{sub['expert_score'].max():.1f}]") # 6. Weight optimization print(f"\n{'='*70}") print(f"WEIGHT OPTIMIZATION (z-scored dimensions)") print(f"{'='*70}") dim_data = sf[dims].copy() for d in dims: mu, sigma = dim_data[d].mean(), dim_data[d].std() if sigma > 0: dim_data[d] = (dim_data[d] - mu) / sigma else: dim_data[d] = 0 X = dim_data.values y = sf['expert_score'].values eq_w = np.ones(6) / 6 lit_w = np.array([2, 2, 1, 1, 3, 2], dtype=float) lit_w /= lit_w.sum() def neg_rho(w): wn = np.abs(w) / np.abs(w).sum() r, _ = spearmanr(X @ wn, y) return -r if not np.isnan(r) else 0 best_w, best_rho = None, -1 for seed in range(200): rng = np.random.default_rng(seed) w0 = rng.dirichlet(np.ones(6)) res = minimize(neg_rho, w0, method='Nelder-Mead', options={'maxiter': 10000, 'xatol': 1e-8}) r = -res.fun if r > best_rho: best_rho = r best_w = np.abs(res.x) / np.abs(res.x).sum() print(f" Optimized rho: {best_rho:.3f}") # Cross-validate kf = KFold(n_splits=5, shuffle=True, random_state=42) cv = {'equal': [], 'literature': [], 'optimized': []} for tri, tei in kf.split(X): bw, br = None, -1 for seed in range(50): rng = np.random.default_rng(seed) def nr(w): wn = np.abs(w) / np.abs(w).sum() r, _ = spearmanr(X[tri] @ wn, y[tri]) return -r if not np.isnan(r) else 0 res = minimize(nr, rng.dirichlet(np.ones(6)), method='Nelder-Mead', options={'maxiter': 5000}) if -res.fun > br: br = -res.fun bw = np.abs(res.x) / np.abs(res.x).sum() for label, w in [('equal', eq_w), ('literature', lit_w), ('optimized', bw)]: r, _ = spearmanr(X[tei] @ w, y[tei]) cv[label].append(r if not np.isnan(r) else 0) print(f"\n 5-fold CV (out-of-fold rho):") print(f" {'Method':<15s} {'Mean':>7s} {'Std':>7s} Folds") for label in ['equal', 'literature', 'optimized']: rhos = cv[label] print(f" {label:<15s} {np.mean(rhos):>7.3f} {np.std(rhos):>7.3f} {[round(r, 3) for r in rhos]}") print(f"\n Weight comparison:") print(f" {'Dimension':<25s} {'Equal':>8s} {'Lit':>8s} {'Opt':>8s}") for i, d in enumerate(dims): print(f" {d:<25s} {eq_w[i]:>8.3f} {lit_w[i]:>8.3f} {best_w[i]:>8.3f}") # Recommendation co = np.mean(cv['optimized']) cl = np.mean(cv['literature']) ce = np.mean(cv['equal']) print(f"\n Equal={ce:.3f} Literature={cl:.3f} Optimized={co:.3f}") if co > cl + 0.03: print(f" → USE OPTIMIZED (+{co - cl:.3f})") elif cl > ce + 0.03: print(f" → KEEP LITERATURE (+{cl - ce:.3f})") else: print(f" → Differences < 0.03 — keep literature for interpretability") # Compare with 4-dim Colab colab = pd.read_csv(BASE / "EDA/data/sandi_validation_full.csv") if len(colab) > 0: merged = sf.merge(colab[['file_id', 'composite_raw']], on='file_id', suffixes=('_6dim', '_4dim')) rho_6, _ = spearmanr(merged['expert_score'], merged['composite_raw_6dim']) rho_4, _ = spearmanr(merged['expert_score'], merged['composite_raw_4dim']) print(f"\n 4-dim (Colab) vs 6-dim (local) on {len(merged)} files:") print(f" 4-dim rho: {rho_4:.3f}") print(f" 6-dim rho: {rho_6:.3f}") print(f" Gain: {rho_6 - rho_4:+.3f}") print(f"\nTotal processing time: {total_time / 60:.1f} minutes")