# fluency-benchmark / run_sandi_438.py
# Author: keshavgautam03
# Initial deploy: fluency benchmark app (commit 1e81b0d)
"""
Process all 438 SANDI dev files through the full 6-dimension pipeline.
Uses Apple MPS acceleration for WhisperX where supported.
Usage: python run_sandi_438.py
Output: ../EDA/data/sandi_438_full_pipeline.csv
"""
import sys, os, time, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(__file__))
from pathlib import Path
from scipy.stats import spearmanr, kendalltau
from scipy.optimize import minimize
from sklearn.model_selection import KFold
# Project root: this script is assumed to live one directory below it.
BASE = Path(__file__).parent.parent
# ── Load file list ──
# Per the usage below, the CSV must provide: file_id, audio_path, expert_score.
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"Files to process: {len(filelist)}")
# ── Load pipeline modules ──
# NOTE(review): the progress prints suggest model weights are loaded as an
# import side effect of these modules -- confirm in pipeline/ and models/.
print("Loading models...", flush=True)
from pipeline.vad import run_vad
from pipeline.transcribe import transcribe_and_align
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.")
# ── Process all files ──
results = []      # one summary dict per successfully processed file
errors = 0        # counts both missing-audio skips and pipeline exceptions
start_time = time.time()
for idx, row in filelist.iterrows():
    audio_path = row['audio_path']
    # Paths may be stored relative to the project root: try as given, then rooted at BASE.
    if not os.path.exists(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        errors += 1
        continue
    # Progress / ETA bookkeeping.
    # NOTE(review): n = idx + 1 assumes filelist has a default RangeIndex;
    # with a filtered/reindexed frame the counter and ETA would be wrong.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)                  # files per second (guards divide-by-zero)
    eta = (len(filelist) - n) / max(rate, 0.01) # seconds remaining estimate
    if n % 10 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {row['file_id']} (expert={row['expert_score']}) "
              f"[{elapsed/60:.1f}m elapsed, ~{eta/60:.0f}m remaining]", flush=True)
    try:
        # Full 6-dimension pipeline:
        # VAD -> transcription/alignment -> pause placement -> FA features ->
        # syntactic features -> per-dimension predictions -> composite score.
        vad = run_vad(audio_path)
        tx = transcribe_and_align(audio_path)
        words = tx['words']
        # Mean-length-of-utterance proxy: words per speech segment.
        # assumes vad['speech_segments'] is a segment count -- TODO confirm in pipeline.vad
        vad['mlu'] = round(len(words) / max(vad['speech_segments'], 1), 2)
        placement = classify_pauses(words, vad)
        fa = compute_fa_features(words, vad['total_duration_sec'])
        syn = compute_syntactic_features(words, tx['transcript'])
        # Later dicts win on key collisions (syn > fa > placement > vad).
        all_features = {**vad, **placement, **fa, **syn}
        predictions = predict(all_features)
        composite = compute_composite(all_features, predictions)
        # Flatten everything into one row; .get(..., default) tolerates
        # predictors that did not emit a given key.
        results.append({
            'file_id': row['file_id'],
            'expert_score': row['expert_score'],
            'composite_raw': composite['composite_raw'],
            'composite_percentile': composite['composite_percentile'],
            'fluency_band': composite['fluency_band'],
            'speech_ratio': vad['speech_ratio'],
            'mlu': vad['mlu'],
            'word_count': len(words),
            'pause_count': vad['pause_count'],
            'mean_pause_dur': vad['mean_pause_duration_sec'],
            'long_pause_ratio': vad['long_pause_ratio'],
            'short_pause_share': vad['short_pause_share'],
            'boundary_pause_ratio': placement['boundary_pause_ratio'],
            'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
            'articulation_pred': predictions.get('articulation_ordinal_pred', np.nan),
            'articulation_label': predictions.get('articulation_ordinal_label', '?'),
            'pause_freq_label': predictions.get('pause_freq_ordinal_label', '?'),
            'pause_dur_label': predictions.get('pause_dur_ordinal_label', '?'),
            'pause_place_label': predictions.get('pause_place_ordinal_label', '?'),
            'cognitive_load_label': predictions.get('cognitive_load_ordinal_label', '?'),
            'utt_constraints_label': predictions.get('utterance_constraints_ordinal_label', '?'),
            'prop_unplanned': predictions.get('prop_unplanned_pred', np.nan),
            'prop_planned': predictions.get('prop_planned_pred', np.nan),
            'prop_neutral': predictions.get('prop_neutral_pred', np.nan),
            'dim_continuity': composite['dim_continuity'],
            'dim_pause_quality': composite['dim_pause_quality'],
            'dim_articulation': composite['dim_articulation'],
            'dim_dominance': composite['dim_dominance'],
            'dim_placement': composite['dim_placement'],
            'dim_word_precision': composite['dim_word_precision'],
        })
    except Exception as e:
        # Best-effort batch run: keep going on per-file failures,
        # echoing only the first 10 errors to avoid log spam.
        errors += 1
        if errors <= 10:
            print(f" ERROR {row['file_id']}: {str(e)[:80]}")
total_time = time.time() - start_time
print(f"\nDone: {len(results)}/{len(filelist)} processed, {errors} errors, {total_time/60:.1f} minutes")
# Abort before the analysis section if nothing was processed: an empty frame
# would write a meaningless CSV and then crash spearmanr/kendalltau below.
if not results:
    sys.exit("No files processed successfully; aborting before analysis.")
# ── Save results ──
sf = pd.DataFrame(results)
out_path = BASE / "EDA/data/sandi_438_full_pipeline.csv"
sf.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
# ══════════════════════════════════════════════════════════════
# ANALYSIS
# ══════════════════════════════════════════════════════════════
# The six composite dimensions emitted by compute_composite().
dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']
print(f"\n{'='*70}")
print(f"SANDI FULL VALIDATION β€” 6-DIMENSION PIPELINE (N={len(sf)})")
print(f"{'='*70}")
# 1. Overall correlation: rank agreement between expert score and raw composite.
rho, p = spearmanr(sf['expert_score'], sf['composite_raw'])
tau, tp = kendalltau(sf['expert_score'], sf['composite_raw'])
print(f"\n Spearman rho: {rho:.3f} (p={p:.2e})")
print(f" Kendall tau: {tau:.3f} (p={tp:.2e})")
# 2. Per-dimension correlations (reported only with >10 non-null pairs).
print(f"\n Per-dimension correlations with expert score:")
for d in dims:
    v = sf[['expert_score', d]].dropna()
    if len(v) > 10:
        r, p_ = spearmanr(v['expert_score'], v[d])
        # Conventional significance stars for the p-value.
        sig = '***' if p_ < 0.001 else '**' if p_ < 0.01 else '*' if p_ < 0.05 else 'ns'
        print(f" {d:<25s} rho={r:+.3f} {sig}")
# 3. Raw (un-modeled) feature correlations, for sanity-checking the composite.
print(f"\n Raw feature correlations:")
for feat in ['speech_ratio', 'mlu', 'mean_pause_dur', 'long_pause_ratio', 'mid_clause_pause_ratio']:
    r, _ = spearmanr(sf['expert_score'], sf[feat])
    print(f" {feat:<25s} rho={r:+.3f}")
# 4. Band agreement
def expert_band(score):
    """Bucket a numeric expert fluency score into a coarse band.

    LOW for score < 3.0, MEDIUM for 3.0 <= score < 4.5, HIGH for score >= 4.5.
    """
    if score >= 4.5:
        return 'HIGH'
    if score >= 3.0:
        return 'MEDIUM'
    return 'LOW'
# Exact band agreement between expert-derived and pipeline-assigned bands.
sf['expert_band'] = sf['expert_score'].apply(expert_band)
agree = (sf['expert_band'] == sf['fluency_band']).sum()
print(f"\n Band agreement: {agree}/{len(sf)} ({agree/len(sf):.1%})")
# 3x3 confusion matrix: expert bands as rows, pipeline bands as columns.
print(f"\n Confusion matrix:")
print(f" {'':>15} Pipeline→ {'LOW':>6} {'MED':>6} {'HIGH':>6}")
for eb in ['LOW', 'MEDIUM', 'HIGH']:
    row = []  # counts for this expert band across the three pipeline bands
    for pb in ['LOW', 'MEDIUM', 'HIGH']:
        n = ((sf['expert_band'] == eb) & (sf['fluency_band'] == pb)).sum()
        row.append(n)
    print(f" Expert {eb:>6}: {row[0]:>6} {row[1]:>6} {row[2]:>6}")
# 5. Expert score distribution within each pipeline-assigned band.
print(f"\n Expert score by pipeline band:")
for pb in ['LOW', 'MEDIUM', 'HIGH']:
    sub = sf[sf['fluency_band'] == pb]
    if len(sub) > 0:
        print(f" {pb}: n={len(sub)}, expert_mean={sub['expert_score'].mean():.2f} "
              f"[{sub['expert_score'].min():.1f}-{sub['expert_score'].max():.1f}]")
# 6. Weight optimization
print(f"\n{'='*70}")
print(f"WEIGHT OPTIMIZATION (z-scored dimensions)")
print(f"{'='*70}")
# Standardize each dimension column; constant (zero-spread) columns are
# zeroed out so they contribute nothing to any weighted composite.
dim_data = sf[dims].copy()
for col in dims:
    spread = dim_data[col].std()
    if spread > 0:
        dim_data[col] = (dim_data[col] - dim_data[col].mean()) / spread
    else:
        dim_data[col] = 0
X = dim_data.values            # (n_files, 6) z-scored dimension matrix
y = sf['expert_score'].values  # expert targets
# Baseline weight vectors: uniform, and a literature-informed prior.
eq_w = np.full(6, 1 / 6)
lit_w = np.array([2.0, 2.0, 1.0, 1.0, 3.0, 2.0])
lit_w = lit_w / lit_w.sum()
def neg_rho(w):
    """Optimizer objective: negative Spearman rho between the weighted
    composite X @ wn and the expert scores y, where wn = |w| / sum(|w|)
    projects the raw search point onto non-negative, sum-to-one weights.

    Returns 0 for degenerate cases: an all-zero weight vector (previously
    a 0/0 producing NaN weights, masked only by the global warning filter)
    or an undefined correlation.
    """
    mag = np.abs(w)
    total = mag.sum()
    if total == 0:
        return 0
    r, _ = spearmanr(X @ (mag / total), y)
    return -r if not np.isnan(r) else 0
# Multi-start Nelder-Mead: the rank-based objective is flat/noisy, so restart
# from 200 Dirichlet-sampled weight vectors and keep the best result.
best_w, best_rho = None, -1
for seed in range(200):
    rng = np.random.default_rng(seed)  # fixed seeds -> reproducible search
    w0 = rng.dirichlet(np.ones(6))     # random point on the 6-weight simplex
    res = minimize(neg_rho, w0, method='Nelder-Mead', options={'maxiter': 10000, 'xatol': 1e-8})
    r = -res.fun
    if r > best_rho:
        best_rho = r
        # Re-apply the same |w|/sum normalization the objective uses.
        best_w = np.abs(res.x) / np.abs(res.x).sum()
print(f" Optimized rho: {best_rho:.3f}")
# Cross-validate: re-run a smaller (50-restart) weight search on each training
# fold, then score all three weighting schemes on the held-out fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv = {'equal': [], 'literature': [], 'optimized': []}
for tri, tei in kf.split(X):
    bw, br = None, -1  # best fold weights / best training rho so far
    for seed in range(50):
        rng = np.random.default_rng(seed)
        def nr(w):
            # Same objective as neg_rho, restricted to the training rows tri.
            wn = np.abs(w) / np.abs(w).sum()
            r, _ = spearmanr(X[tri] @ wn, y[tri])
            return -r if not np.isnan(r) else 0
        res = minimize(nr, rng.dirichlet(np.ones(6)), method='Nelder-Mead', options={'maxiter': 5000})
        if -res.fun > br:
            br = -res.fun
            bw = np.abs(res.x) / np.abs(res.x).sum()
    # Out-of-fold evaluation on the test rows tei; NaN rho counts as 0.
    for label, w in [('equal', eq_w), ('literature', lit_w), ('optimized', bw)]:
        r, _ = spearmanr(X[tei] @ w, y[tei])
        cv[label].append(r if not np.isnan(r) else 0)
print(f"\n 5-fold CV (out-of-fold rho):")
print(f" {'Method':<15s} {'Mean':>7s} {'Std':>7s} Folds")
for label in ['equal', 'literature', 'optimized']:
    rhos = cv[label]
    print(f" {label:<15s} {np.mean(rhos):>7.3f} {np.std(rhos):>7.3f} {[round(r, 3) for r in rhos]}")
# Side-by-side view of the three weighting schemes, per dimension.
print(f"\n Weight comparison:")
print(f" {'Dimension':<25s} {'Equal':>8s} {'Lit':>8s} {'Opt':>8s}")
for dim_idx, dim_name in enumerate(dims):
    print(f" {dim_name:<25s} {eq_w[dim_idx]:>8.3f} {lit_w[dim_idx]:>8.3f} {best_w[dim_idx]:>8.3f}")
# Recommendation: prefer the fancier scheme only when it clearly wins
# out-of-fold (margin of 0.03 in mean rho).
ce, cl, co = (np.mean(cv[k]) for k in ('equal', 'literature', 'optimized'))
print(f"\n Equal={ce:.3f} Literature={cl:.3f} Optimized={co:.3f}")
if co > cl + 0.03:
    print(f" β†’ USE OPTIMIZED (+{co - cl:.3f})")
elif cl > ce + 0.03:
    print(f" β†’ KEEP LITERATURE (+{cl - ce:.3f})")
else:
    print(f" β†’ Differences < 0.03 β€” keep literature for interpretability")
# Compare with the earlier 4-dimension Colab run, when its CSV is available.
# The comparison is optional: previously an unconditional read_csv here raised
# FileNotFoundError when the baseline was absent, killing the whole report
# (including the final timing line). Guard it instead of crashing.
colab_path = BASE / "EDA/data/sandi_validation_full.csv"
if colab_path.exists():
    colab = pd.read_csv(colab_path)
    if len(colab) > 0:
        # Inner join on file_id; suffixes disambiguate the two composite columns.
        merged = sf.merge(colab[['file_id', 'composite_raw']], on='file_id', suffixes=('_6dim', '_4dim'))
        rho_6, _ = spearmanr(merged['expert_score'], merged['composite_raw_6dim'])
        rho_4, _ = spearmanr(merged['expert_score'], merged['composite_raw_4dim'])
        print(f"\n 4-dim (Colab) vs 6-dim (local) on {len(merged)} files:")
        print(f" 4-dim rho: {rho_4:.3f}")
        print(f" 6-dim rho: {rho_6:.3f}")
        print(f" Gain: {rho_6 - rho_4:+.3f}")
else:
    print(f"\n 4-dim Colab baseline not found ({colab_path}); skipping comparison.")
print(f"\nTotal processing time: {total_time / 60:.1f} minutes")