Spaces:
Sleeping
Sleeping
| """ | |
| Process all 438 SANDI dev files through the full 6-dimension pipeline. | |
| Uses Apple MPS acceleration for WhisperX where supported. | |
| Usage: python run_sandi_438.py | |
| Output: ../EDA/data/sandi_438_full_pipeline.csv | |
| """ | |
| import sys, os, time, warnings | |
| import numpy as np | |
| import pandas as pd | |
| warnings.filterwarnings("ignore") | |
| sys.path.insert(0, os.path.dirname(__file__)) | |
| from pathlib import Path | |
| from scipy.stats import spearmanr, kendalltau | |
| from scipy.optimize import minimize | |
| from sklearn.model_selection import KFold | |
| BASE = Path(__file__).parent.parent | |
# ── Load file list ──
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"Files to process: {len(filelist)}")

# ── Load pipeline modules ──
# NOTE(review): the surrounding prints suggest these imports load model
# weights at import time — confirm in the pipeline package.
print("Loading models...", flush=True)
from pipeline.vad import run_vad
from pipeline.transcribe import transcribe_and_align
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.")
# ── Process all files ──
results = []
errors = 0
start_time = time.time()
for idx, row in filelist.iterrows():
    # Resolve the audio path: try it as given, then relative to the repo root.
    audio_path = row['audio_path']
    if not os.path.exists(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        errors += 1
        if errors <= 10:  # was a silent skip; report (capped like ERROR below)
            print(f" MISSING {row['file_id']}: {audio_path}")
        continue
    # Progress / ETA bookkeeping. idx is a 0-based RangeIndex position
    # because filelist comes straight from read_csv.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)                    # files/sec, guard t≈0
    eta = (len(filelist) - n) / max(rate, 0.01)   # guard a zero rate
    if n % 10 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {row['file_id']} (expert={row['expert_score']}) "
              f"[{elapsed/60:.1f}m elapsed, ~{eta/60:.0f}m remaining]", flush=True)
    try:
        # Full 6-dimension pipeline: VAD → transcription/alignment →
        # pause placement → forced-alignment features → syntactic features
        # → per-dimension model predictions → composite score.
        vad = run_vad(audio_path)
        tx = transcribe_and_align(audio_path)
        words = tx['words']
        # Words per speech segment (MLU proxy); guard against 0 segments.
        vad['mlu'] = round(len(words) / max(vad['speech_segments'], 1), 2)
        placement = classify_pauses(words, vad)
        fa = compute_fa_features(words, vad['total_duration_sec'])
        syn = compute_syntactic_features(words, tx['transcript'])
        all_features = {**vad, **placement, **fa, **syn}
        predictions = predict(all_features)
        composite = compute_composite(all_features, predictions)
        results.append({
            'file_id': row['file_id'],
            'expert_score': row['expert_score'],
            'composite_raw': composite['composite_raw'],
            'composite_percentile': composite['composite_percentile'],
            'fluency_band': composite['fluency_band'],
            'speech_ratio': vad['speech_ratio'],
            'mlu': vad['mlu'],
            'word_count': len(words),
            'pause_count': vad['pause_count'],
            'mean_pause_dur': vad['mean_pause_duration_sec'],
            'long_pause_ratio': vad['long_pause_ratio'],
            'short_pause_share': vad['short_pause_share'],
            'boundary_pause_ratio': placement['boundary_pause_ratio'],
            'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
            'articulation_pred': predictions.get('articulation_ordinal_pred', np.nan),
            'articulation_label': predictions.get('articulation_ordinal_label', '?'),
            'pause_freq_label': predictions.get('pause_freq_ordinal_label', '?'),
            'pause_dur_label': predictions.get('pause_dur_ordinal_label', '?'),
            'pause_place_label': predictions.get('pause_place_ordinal_label', '?'),
            'cognitive_load_label': predictions.get('cognitive_load_ordinal_label', '?'),
            'utt_constraints_label': predictions.get('utterance_constraints_ordinal_label', '?'),
            'prop_unplanned': predictions.get('prop_unplanned_pred', np.nan),
            'prop_planned': predictions.get('prop_planned_pred', np.nan),
            'prop_neutral': predictions.get('prop_neutral_pred', np.nan),
            'dim_continuity': composite['dim_continuity'],
            'dim_pause_quality': composite['dim_pause_quality'],
            'dim_articulation': composite['dim_articulation'],
            'dim_dominance': composite['dim_dominance'],
            'dim_placement': composite['dim_placement'],
            'dim_word_precision': composite['dim_word_precision'],
        })
    except Exception as e:
        # Batch run: record the failure and keep going. Cap the spam at 10.
        errors += 1
        if errors <= 10:
            print(f" ERROR {row['file_id']}: {str(e)[:80]}")
total_time = time.time() - start_time
print(f"\nDone: {len(results)}/{len(filelist)} processed, {errors} errors, {total_time/60:.1f} minutes")
# ── Save results ──
sf = pd.DataFrame(results)
out_path = BASE / "EDA/data/sandi_438_full_pipeline.csv"
sf.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
# ──────────────────────────────────────────────────────────────
# ANALYSIS
# ──────────────────────────────────────────────────────────────
# The six composite sub-dimensions produced by compute_composite().
dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']
print(f"\n{'='*70}")
print(f"SANDI FULL VALIDATION — 6-DIMENSION PIPELINE (N={len(sf)})")
print(f"{'='*70}")
# 1. Overall rank correlation of the composite against expert scores.
rho, p = spearmanr(sf['expert_score'], sf['composite_raw'])
tau, tp = kendalltau(sf['expert_score'], sf['composite_raw'])
print(f"\n Spearman rho: {rho:.3f} (p={p:.2e})")
print(f" Kendall tau: {tau:.3f} (p={tp:.2e})")
# 2. Per-dimension correlations (skip dimensions with <= 10 valid rows).
print(f"\n Per-dimension correlations with expert score:")
for d in dims:
    v = sf[['expert_score', d]].dropna()
    if len(v) > 10:
        r, p_ = spearmanr(v['expert_score'], v[d])
        sig = '***' if p_ < 0.001 else '**' if p_ < 0.01 else '*' if p_ < 0.05 else 'ns'
        print(f" {d:<25s} rho={r:+.3f} {sig}")
# 3. Raw (un-modeled) feature correlations for reference.
print(f"\n Raw feature correlations:")
for feat in ['speech_ratio', 'mlu', 'mean_pause_dur', 'long_pause_ratio', 'mid_clause_pause_ratio']:
    r, _ = spearmanr(sf['expert_score'], sf[feat])
    print(f" {feat:<25s} rho={r:+.3f}")
# 4. Band agreement
def expert_band(score):
    """Map a numeric expert score to a LOW / MEDIUM / HIGH fluency band.

    Boundaries: score < 3.0 -> LOW, 3.0 <= score < 4.5 -> MEDIUM,
    score >= 4.5 -> HIGH.
    """
    if score < 3.0:
        return 'LOW'
    if score < 4.5:
        return 'MEDIUM'
    return 'HIGH'
sf['expert_band'] = sf['expert_score'].apply(expert_band)
# Exact band agreement between expert and pipeline.
agree = (sf['expert_band'] == sf['fluency_band']).sum()
print(f"\n Band agreement: {agree}/{len(sf)} ({agree/len(sf):.1%})")
# Confusion matrix: rows = expert band, columns = pipeline band.
print(f"\n Confusion matrix:")
print(f" {'':>15} Pipeline→ {'LOW':>6} {'MED':>6} {'HIGH':>6}")
for eb in ['LOW', 'MEDIUM', 'HIGH']:
    counts = [((sf['expert_band'] == eb) & (sf['fluency_band'] == pb)).sum()
              for pb in ['LOW', 'MEDIUM', 'HIGH']]
    print(f" Expert {eb:>6}: {counts[0]:>6} {counts[1]:>6} {counts[2]:>6}")
# 5. Expert score distribution within each pipeline-assigned band.
print(f"\n Expert score by pipeline band:")
for pb in ['LOW', 'MEDIUM', 'HIGH']:
    sub = sf[sf['fluency_band'] == pb]
    if len(sub) > 0:
        print(f" {pb}: n={len(sub)}, expert_mean={sub['expert_score'].mean():.2f} "
              f"[{sub['expert_score'].min():.1f}-{sub['expert_score'].max():.1f}]")
# 6. Weight optimization over the six (z-scored) dimensions.
print(f"\n{'='*70}")
print(f"WEIGHT OPTIMIZATION (z-scored dimensions)")
print(f"{'='*70}")
# Z-score each dimension; a zero-variance column collapses to 0.
dim_data = sf[dims].copy()
for d in dims:
    mu, sigma = dim_data[d].mean(), dim_data[d].std()
    if sigma > 0:
        dim_data[d] = (dim_data[d] - mu) / sigma
    else:
        dim_data[d] = 0
X = dim_data.values
y = sf['expert_score'].values
# Baselines: uniform weights, and a fixed "literature" weighting
# (same order as dims; placement is weighted highest).
eq_w = np.ones(6) / 6
lit_w = np.array([2, 2, 1, 1, 3, 2], dtype=float)
lit_w /= lit_w.sum()

def neg_rho(w):
    """Negative Spearman rho of the weighted composite vs expert score.

    Weights are normalized (abs, sum-to-1) so the unconstrained
    Nelder-Mead search stays on the simplex; NaN rho maps to 0.
    """
    wn = np.abs(w) / np.abs(w).sum()
    r, _ = spearmanr(X @ wn, y)
    return -r if not np.isnan(r) else 0

# Multi-start search: 200 Dirichlet-random starting points.
best_w, best_rho = None, -1
for seed in range(200):
    rng = np.random.default_rng(seed)
    w0 = rng.dirichlet(np.ones(6))
    res = minimize(neg_rho, w0, method='Nelder-Mead', options={'maxiter': 10000, 'xatol': 1e-8})
    r = -res.fun
    if r > best_rho:
        best_rho = r
        best_w = np.abs(res.x) / np.abs(res.x).sum()
print(f" Optimized rho: {best_rho:.3f}")
# Cross-validate: re-fit weights on each training fold, score out-of-fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv = {'equal': [], 'literature': [], 'optimized': []}
for tri, tei in kf.split(X):

    def nr(w, tri=tri):
        """Fold-local objective: negative Spearman rho on the training rows.

        Defined once per fold (it depends only on tri, not the seed);
        tri is bound as a default to avoid late-binding surprises.
        """
        wn = np.abs(w) / np.abs(w).sum()
        r, _ = spearmanr(X[tri] @ wn, y[tri])
        return -r if not np.isnan(r) else 0

    # Multi-start (50 seeds) Nelder-Mead on the training fold.
    bw, br = None, -1
    for seed in range(50):
        rng = np.random.default_rng(seed)
        res = minimize(nr, rng.dirichlet(np.ones(6)), method='Nelder-Mead', options={'maxiter': 5000})
        if -res.fun > br:
            br = -res.fun
            bw = np.abs(res.x) / np.abs(res.x).sum()
    # Evaluate all three weightings on the held-out fold.
    for label, w in [('equal', eq_w), ('literature', lit_w), ('optimized', bw)]:
        r, _ = spearmanr(X[tei] @ w, y[tei])
        cv[label].append(r if not np.isnan(r) else 0)
print(f"\n 5-fold CV (out-of-fold rho):")
print(f" {'Method':<15s} {'Mean':>7s} {'Std':>7s} Folds")
for label in ['equal', 'literature', 'optimized']:
    rhos = cv[label]
    print(f" {label:<15s} {np.mean(rhos):>7.3f} {np.std(rhos):>7.3f} {[round(r, 3) for r in rhos]}")
| print(f"\n Weight comparison:") | |
| print(f" {'Dimension':<25s} {'Equal':>8s} {'Lit':>8s} {'Opt':>8s}") | |
| for i, d in enumerate(dims): | |
| print(f" {d:<25s} {eq_w[i]:>8.3f} {lit_w[i]:>8.3f} {best_w[i]:>8.3f}") | |
| # Recommendation | |
| co = np.mean(cv['optimized']) | |
| cl = np.mean(cv['literature']) | |
| ce = np.mean(cv['equal']) | |
| print(f"\n Equal={ce:.3f} Literature={cl:.3f} Optimized={co:.3f}") | |
| if co > cl + 0.03: | |
| print(f" β USE OPTIMIZED (+{co - cl:.3f})") | |
| elif cl > ce + 0.03: | |
| print(f" β KEEP LITERATURE (+{cl - ce:.3f})") | |
| else: | |
| print(f" β Differences < 0.03 β keep literature for interpretability") | |
# Compare with the 4-dim Colab baseline. The comparison CSV is optional:
# guard its existence so a missing file cannot crash the script after the
# expensive processing run.
colab_path = BASE / "EDA/data/sandi_validation_full.csv"
if colab_path.exists():
    colab = pd.read_csv(colab_path)
    if len(colab) > 0:
        merged = sf.merge(colab[['file_id', 'composite_raw']], on='file_id', suffixes=('_6dim', '_4dim'))
        rho_6, _ = spearmanr(merged['expert_score'], merged['composite_raw_6dim'])
        rho_4, _ = spearmanr(merged['expert_score'], merged['composite_raw_4dim'])
        print(f"\n 4-dim (Colab) vs 6-dim (local) on {len(merged)} files:")
        print(f" 4-dim rho: {rho_4:.3f}")
        print(f" 6-dim rho: {rho_6:.3f}")
        print(f" Gain: {rho_6 - rho_4:+.3f}")
print(f"\nTotal processing time: {total_time / 60:.1f} minutes")