# fluency-benchmark / run_sandi_438.py
# Author: keshavgautam03
# Initial deploy: fluency benchmark app (commit 1e81b0d)
"""
Process all 438 SANDI dev files through the full 6-dimension pipeline.
Uses Apple MPS acceleration for WhisperX where supported.
Usage: python run_sandi_438.py
Output: ../EDA/data/sandi_438_full_pipeline.csv
"""
import sys, os, time, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(__file__))
from pathlib import Path
from scipy.stats import spearmanr, kendalltau
from scipy.optimize import minimize
from sklearn.model_selection import KFold
# Project root: this script is assumed to live one directory below it.
BASE = Path(__file__).parent.parent
# ── Load file list ──
# Per the usage below, the CSV must provide: file_id, audio_path, expert_score.
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"Files to process: {len(filelist)}")
# ── Load pipeline modules ──
# NOTE(review): the progress prints suggest model weights are loaded as an
# import side effect of these modules -- confirm in pipeline/ and models/.
print("Loading models...", flush=True)
from pipeline.vad import run_vad
from pipeline.transcribe import transcribe_and_align
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.")
# ── Process all files ──
results = []      # one summary dict per successfully processed file
errors = 0        # counts both missing-audio skips and pipeline exceptions
start_time = time.time()
for idx, row in filelist.iterrows():
    audio_path = row['audio_path']
    # Paths may be stored relative to the project root: try as given, then rooted at BASE.
    if not os.path.exists(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        errors += 1
        continue
    # Progress / ETA bookkeeping.
    # NOTE(review): n = idx + 1 assumes filelist has a default RangeIndex;
    # with a filtered/reindexed frame the counter and ETA would be wrong.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)                  # files per second (guards divide-by-zero)
    eta = (len(filelist) - n) / max(rate, 0.01) # seconds remaining estimate
    if n % 10 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {row['file_id']} (expert={row['expert_score']}) "
              f"[{elapsed/60:.1f}m elapsed, ~{eta/60:.0f}m remaining]", flush=True)
    try:
        # Full 6-dimension pipeline:
        # VAD -> transcription/alignment -> pause placement -> FA features ->
        # syntactic features -> per-dimension predictions -> composite score.
        vad = run_vad(audio_path)
        tx = transcribe_and_align(audio_path)
        words = tx['words']
        # Mean-length-of-utterance proxy: words per speech segment.
        # assumes vad['speech_segments'] is a segment count -- TODO confirm in pipeline.vad
        vad['mlu'] = round(len(words) / max(vad['speech_segments'], 1), 2)
        placement = classify_pauses(words, vad)
        fa = compute_fa_features(words, vad['total_duration_sec'])
        syn = compute_syntactic_features(words, tx['transcript'])
        # Later dicts win on key collisions (syn > fa > placement > vad).
        all_features = {**vad, **placement, **fa, **syn}
        predictions = predict(all_features)
        composite = compute_composite(all_features, predictions)
        # Flatten everything into one row; .get(..., default) tolerates
        # predictors that did not emit a given key.
        results.append({
            'file_id': row['file_id'],
            'expert_score': row['expert_score'],
            'composite_raw': composite['composite_raw'],
            'composite_percentile': composite['composite_percentile'],
            'fluency_band': composite['fluency_band'],
            'speech_ratio': vad['speech_ratio'],
            'mlu': vad['mlu'],
            'word_count': len(words),
            'pause_count': vad['pause_count'],
            'mean_pause_dur': vad['mean_pause_duration_sec'],
            'long_pause_ratio': vad['long_pause_ratio'],
            'short_pause_share': vad['short_pause_share'],
            'boundary_pause_ratio': placement['boundary_pause_ratio'],
            'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
            'articulation_pred': predictions.get('articulation_ordinal_pred', np.nan),
            'articulation_label': predictions.get('articulation_ordinal_label', '?'),
            'pause_freq_label': predictions.get('pause_freq_ordinal_label', '?'),
            'pause_dur_label': predictions.get('pause_dur_ordinal_label', '?'),
            'pause_place_label': predictions.get('pause_place_ordinal_label', '?'),
            'cognitive_load_label': predictions.get('cognitive_load_ordinal_label', '?'),
            'utt_constraints_label': predictions.get('utterance_constraints_ordinal_label', '?'),
            'prop_unplanned': predictions.get('prop_unplanned_pred', np.nan),
            'prop_planned': predictions.get('prop_planned_pred', np.nan),
            'prop_neutral': predictions.get('prop_neutral_pred', np.nan),
            'dim_continuity': composite['dim_continuity'],
            'dim_pause_quality': composite['dim_pause_quality'],
            'dim_articulation': composite['dim_articulation'],
            'dim_dominance': composite['dim_dominance'],
            'dim_placement': composite['dim_placement'],
            'dim_word_precision': composite['dim_word_precision'],
        })
    except Exception as e:
        # Best-effort batch run: keep going on per-file failures,
        # echoing only the first 10 errors to avoid log spam.
        errors += 1
        if errors <= 10:
            print(f" ERROR {row['file_id']}: {str(e)[:80]}")
total_time = time.time() - start_time
print(f"\nDone: {len(results)}/{len(filelist)} processed, {errors} errors, {total_time/60:.1f} minutes")
# Abort before the analysis section if nothing was processed: an empty frame
# would write a meaningless CSV and then crash spearmanr/kendalltau below.
if not results:
    sys.exit("No files processed successfully; aborting before analysis.")
# ── Save results ──
sf = pd.DataFrame(results)
out_path = BASE / "EDA/data/sandi_438_full_pipeline.csv"
sf.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
# ══════════════════════════════════════════════════════════════
# ANALYSIS
# ══════════════════════════════════════════════════════════════
# The six composite dimensions emitted by compute_composite().
dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']
print(f"\n{'='*70}")
print(f"SANDI FULL VALIDATION β€” 6-DIMENSION PIPELINE (N={len(sf)})")
print(f"{'='*70}")
# 1. Overall correlation: rank agreement between expert score and raw composite.
rho, p = spearmanr(sf['expert_score'], sf['composite_raw'])
tau, tp = kendalltau(sf['expert_score'], sf['composite_raw'])
print(f"\n Spearman rho: {rho:.3f} (p={p:.2e})")
print(f" Kendall tau: {tau:.3f} (p={tp:.2e})")
# 2. Per-dimension correlations (reported only with >10 non-null pairs).
print(f"\n Per-dimension correlations with expert score:")
for d in dims:
    v = sf[['expert_score', d]].dropna()
    if len(v) > 10:
        r, p_ = spearmanr(v['expert_score'], v[d])
        # Conventional significance stars for the p-value.
        sig = '***' if p_ < 0.001 else '**' if p_ < 0.01 else '*' if p_ < 0.05 else 'ns'
        print(f" {d:<25s} rho={r:+.3f} {sig}")
# 3. Raw (un-modeled) feature correlations, for sanity-checking the composite.
print(f"\n Raw feature correlations:")
for feat in ['speech_ratio', 'mlu', 'mean_pause_dur', 'long_pause_ratio', 'mid_clause_pause_ratio']:
    r, _ = spearmanr(sf['expert_score'], sf[feat])
    print(f" {feat:<25s} rho={r:+.3f}")
# 4. Band agreement
def expert_band(score):
    """Bucket a numeric expert fluency score into a coarse band.

    LOW for score < 3.0, MEDIUM for 3.0 <= score < 4.5, HIGH for score >= 4.5.
    """
    if score >= 4.5:
        return 'HIGH'
    if score >= 3.0:
        return 'MEDIUM'
    return 'LOW'
# Exact band agreement between expert-derived and pipeline-assigned bands.
sf['expert_band'] = sf['expert_score'].apply(expert_band)
agree = (sf['expert_band'] == sf['fluency_band']).sum()
print(f"\n Band agreement: {agree}/{len(sf)} ({agree/len(sf):.1%})")
# 3x3 confusion matrix: expert bands as rows, pipeline bands as columns.
print(f"\n Confusion matrix:")
print(f" {'':>15} Pipeline→ {'LOW':>6} {'MED':>6} {'HIGH':>6}")
for eb in ['LOW', 'MEDIUM', 'HIGH']:
    row = []  # counts for this expert band across the three pipeline bands
    for pb in ['LOW', 'MEDIUM', 'HIGH']:
        n = ((sf['expert_band'] == eb) & (sf['fluency_band'] == pb)).sum()
        row.append(n)
    print(f" Expert {eb:>6}: {row[0]:>6} {row[1]:>6} {row[2]:>6}")
# 5. Expert score distribution within each pipeline-assigned band.
print(f"\n Expert score by pipeline band:")
for pb in ['LOW', 'MEDIUM', 'HIGH']:
    sub = sf[sf['fluency_band'] == pb]
    if len(sub) > 0:
        print(f" {pb}: n={len(sub)}, expert_mean={sub['expert_score'].mean():.2f} "
              f"[{sub['expert_score'].min():.1f}-{sub['expert_score'].max():.1f}]")
# 6. Weight optimization
print(f"\n{'='*70}")
print(f"WEIGHT OPTIMIZATION (z-scored dimensions)")
print(f"{'='*70}")
# Standardize each dimension column; constant (zero-spread) columns are
# zeroed out so they contribute nothing to any weighted composite.
dim_data = sf[dims].copy()
for col in dims:
    spread = dim_data[col].std()
    if spread > 0:
        dim_data[col] = (dim_data[col] - dim_data[col].mean()) / spread
    else:
        dim_data[col] = 0
X = dim_data.values            # (n_files, 6) z-scored dimension matrix
y = sf['expert_score'].values  # expert targets
# Baseline weight vectors: uniform, and a literature-informed prior.
eq_w = np.full(6, 1 / 6)
lit_w = np.array([2.0, 2.0, 1.0, 1.0, 3.0, 2.0])
lit_w = lit_w / lit_w.sum()
def neg_rho(w):
    """Optimizer objective: negative Spearman rho between the weighted
    composite X @ wn and the expert scores y, where wn = |w| / sum(|w|)
    projects the raw search point onto non-negative, sum-to-one weights.

    Returns 0 for degenerate cases: an all-zero weight vector (previously
    a 0/0 producing NaN weights, masked only by the global warning filter)
    or an undefined correlation.
    """
    mag = np.abs(w)
    total = mag.sum()
    if total == 0:
        return 0
    r, _ = spearmanr(X @ (mag / total), y)
    return -r if not np.isnan(r) else 0
# Multi-start Nelder-Mead: the rank-based objective is flat/noisy, so restart
# from 200 Dirichlet-sampled weight vectors and keep the best result.
best_w, best_rho = None, -1
for seed in range(200):
    rng = np.random.default_rng(seed)  # fixed seeds -> reproducible search
    w0 = rng.dirichlet(np.ones(6))     # random point on the 6-weight simplex
    res = minimize(neg_rho, w0, method='Nelder-Mead', options={'maxiter': 10000, 'xatol': 1e-8})
    r = -res.fun
    if r > best_rho:
        best_rho = r
        # Re-apply the same |w|/sum normalization the objective uses.
        best_w = np.abs(res.x) / np.abs(res.x).sum()
print(f" Optimized rho: {best_rho:.3f}")
# Cross-validate: re-run a smaller (50-restart) weight search on each training
# fold, then score all three weighting schemes on the held-out fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv = {'equal': [], 'literature': [], 'optimized': []}
for tri, tei in kf.split(X):
    bw, br = None, -1  # best fold weights / best training rho so far
    for seed in range(50):
        rng = np.random.default_rng(seed)
        def nr(w):
            # Same objective as neg_rho, restricted to the training rows tri.
            wn = np.abs(w) / np.abs(w).sum()
            r, _ = spearmanr(X[tri] @ wn, y[tri])
            return -r if not np.isnan(r) else 0
        res = minimize(nr, rng.dirichlet(np.ones(6)), method='Nelder-Mead', options={'maxiter': 5000})
        if -res.fun > br:
            br = -res.fun
            bw = np.abs(res.x) / np.abs(res.x).sum()
    # Out-of-fold evaluation on the test rows tei; NaN rho counts as 0.
    for label, w in [('equal', eq_w), ('literature', lit_w), ('optimized', bw)]:
        r, _ = spearmanr(X[tei] @ w, y[tei])
        cv[label].append(r if not np.isnan(r) else 0)
print(f"\n 5-fold CV (out-of-fold rho):")
print(f" {'Method':<15s} {'Mean':>7s} {'Std':>7s} Folds")
for label in ['equal', 'literature', 'optimized']:
    rhos = cv[label]
    print(f" {label:<15s} {np.mean(rhos):>7.3f} {np.std(rhos):>7.3f} {[round(r, 3) for r in rhos]}")
# Side-by-side view of the three weighting schemes, per dimension.
print(f"\n Weight comparison:")
print(f" {'Dimension':<25s} {'Equal':>8s} {'Lit':>8s} {'Opt':>8s}")
for dim_idx, dim_name in enumerate(dims):
    print(f" {dim_name:<25s} {eq_w[dim_idx]:>8.3f} {lit_w[dim_idx]:>8.3f} {best_w[dim_idx]:>8.3f}")
# Recommendation: prefer the fancier scheme only when it clearly wins
# out-of-fold (margin of 0.03 in mean rho).
ce, cl, co = (np.mean(cv[k]) for k in ('equal', 'literature', 'optimized'))
print(f"\n Equal={ce:.3f} Literature={cl:.3f} Optimized={co:.3f}")
if co > cl + 0.03:
    print(f" β†’ USE OPTIMIZED (+{co - cl:.3f})")
elif cl > ce + 0.03:
    print(f" β†’ KEEP LITERATURE (+{cl - ce:.3f})")
else:
    print(f" β†’ Differences < 0.03 β€” keep literature for interpretability")
# Compare with the earlier 4-dimension Colab run, when its CSV is available.
# The comparison is optional: previously an unconditional read_csv here raised
# FileNotFoundError when the baseline was absent, killing the whole report
# (including the final timing line). Guard it instead of crashing.
colab_path = BASE / "EDA/data/sandi_validation_full.csv"
if colab_path.exists():
    colab = pd.read_csv(colab_path)
    if len(colab) > 0:
        # Inner join on file_id; suffixes disambiguate the two composite columns.
        merged = sf.merge(colab[['file_id', 'composite_raw']], on='file_id', suffixes=('_6dim', '_4dim'))
        rho_6, _ = spearmanr(merged['expert_score'], merged['composite_raw_6dim'])
        rho_4, _ = spearmanr(merged['expert_score'], merged['composite_raw_4dim'])
        print(f"\n 4-dim (Colab) vs 6-dim (local) on {len(merged)} files:")
        print(f" 4-dim rho: {rho_4:.3f}")
        print(f" 6-dim rho: {rho_6:.3f}")
        print(f" Gain: {rho_6 - rho_4:+.3f}")
else:
    print(f"\n 4-dim Colab baseline not found ({colab_path}); skipping comparison.")
print(f"\nTotal processing time: {total_time / 60:.1f} minutes")