""" Hybrid VAD A/B Test: Silero vs MarbleNet vs Hybrid on SANDI 438. Design: Transcription is done ONCE (WhisperX), cached per file. Only the VAD component is swapped across modes. This isolates the single variable under test (VAD accuracy) and avoids redundant transcription. Statistical validation: 1. Spearman rho vs expert scores (overall + per-dimension) 2. Quintile separation (Q1-Q5 spread, Q2-Q3-Q4 discrimination) 3. Bootstrap significance test (hybrid vs silero baseline) 4. Effect sizes (Cohen's d per quintile pair) Usage: Step 1 (transcribe): python run_hybrid_vad_test.py --transcribe Step 2 (evaluate): python run_hybrid_vad_test.py --evaluate Or both: python run_hybrid_vad_test.py """ import sys, os, time, warnings, json, pickle, argparse import numpy as np import pandas as pd warnings.filterwarnings("ignore") sys.path.insert(0, os.path.dirname(__file__)) from pathlib import Path from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal BASE = Path(__file__).parent.parent CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache" CACHE_DIR.mkdir(exist_ok=True) # ── Args ── parser = argparse.ArgumentParser() parser.add_argument("--transcribe", action="store_true", help="Run transcription step only") parser.add_argument("--evaluate", action="store_true", help="Run evaluation step only (requires cached transcriptions)") args, _ = parser.parse_known_args() if not args.transcribe and not args.evaluate: args.transcribe = True args.evaluate = True # ── Load file list ── filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv") print(f"SANDI files: {len(filelist)}") # ═══════════════════════════════════════════════════════════════════ # STEP 1: Transcribe and cache (only needs ffmpeg — run WITHOUT NeMo) # ═══════════════════════════════════════════════════════════════════ if args.transcribe: print("\n─── STEP 1: Transcription (cached) ───") # Monkey-patch whisperx.load_audio to use torchaudio instead of ffmpeg # (NeMo install broke ffmpeg's OpenGL dependency on this system) import torchaudio import whisperx _WHISPERX_SR = 16000 def _torchaudio_load(file: str, sr: int = _WHISPERX_SR): wav, orig_sr = torchaudio.load(file) if wav.shape[0] > 1: wav = wav.mean(dim=0, keepdim=True) if orig_sr != sr: wav = torchaudio.functional.resample(wav, orig_sr, sr) return wav.squeeze().numpy() whisperx.load_audio = _torchaudio_load # Also patch the lazy-loaded audio module if it's already imported try: from whisperx import audio as _wx_audio _wx_audio.load_audio = _torchaudio_load except (ImportError, AttributeError): pass from pipeline.transcribe import transcribe_and_align tx_errors = 0 tx_done = 0 for idx, row in filelist.iterrows(): file_id = row['file_id'] cache_path = CACHE_DIR / f"{file_id}_tx.pkl" if cache_path.exists(): tx_done += 1 continue audio_path = row['audio_path'] if not os.path.isabs(audio_path): audio_path = str(BASE / audio_path) if not os.path.exists(audio_path): tx_errors += 1 continue try: tx = transcribe_and_align(audio_path) with open(cache_path, 'wb') as f: pickle.dump(tx, f) tx_done += 1 if tx_done % 50 == 0: print(f" Transcribed {tx_done}...", flush=True) except Exception as e: tx_errors += 1 if tx_errors <= 3: print(f" TX ERROR {file_id}: {e}") print(f" Transcription: {tx_done} cached, {tx_errors} errors") # ═══════════════════════════════════════════════════════════════════ # STEP 2: VAD evaluation (uses NeMo/MarbleNet — no ffmpeg needed) # ═══════════════════════════════════════════════════════════════════ if args.evaluate: print("\n─── STEP 2: VAD Evaluation (3 modes) ───") print("Loading models...", flush=True) from pipeline.hybrid_vad import run_hybrid_vad from pipeline.placement import classify_pauses from pipeline.fa_features import compute_fa_features from pipeline.syntactic_features import compute_syntactic_features from models.inference import predict from pipeline.composite import compute_composite print("Models loaded.\n") MODES = ["silero", "marblenet", "hybrid"] all_results = {m: [] for m in MODES} errors = {m: 0 for m in MODES} skipped = 0 start_time = time.time() for idx, row in filelist.iterrows(): file_id = row['file_id'] cache_path = CACHE_DIR / f"{file_id}_tx.pkl" # Skip if no cached transcription if not cache_path.exists(): skipped += 1 continue audio_path = row['audio_path'] if not os.path.isabs(audio_path): audio_path = str(BASE / audio_path) if not os.path.exists(audio_path): skipped += 1 continue # Load cached transcription with open(cache_path, 'rb') as f: tx = pickle.load(f) words = tx['words'] word_count = len(words) n = idx + 1 elapsed = time.time() - start_time rate = n / max(elapsed, 1) eta = (len(filelist) - n) / max(rate, 0.01) if n % 50 == 0 or n <= 3 or n == len(filelist): print(f" [{n}/{len(filelist)}] {file_id} " f"[{elapsed/60:.1f}m elapsed, ~{eta/60:.0f}m remaining]", flush=True) for mode in MODES: try: vad = run_hybrid_vad(audio_path, mode=mode) vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2) placement = classify_pauses(words, vad) fa = compute_fa_features(words, vad['total_duration_sec']) syn = compute_syntactic_features(words, tx['transcript']) all_features = {**vad, **placement, **fa, **syn} predictions = predict(all_features) composite = compute_composite(all_features, predictions) all_results[mode].append({ 'file_id': file_id, 'expert_score': row['expert_score'], 'composite_raw': composite['composite_raw'], 'composite_percentile': composite['composite_percentile'], 'fluency_band': composite['fluency_band'], 'speech_ratio': vad['speech_ratio'], 'mlu': vad['mlu'], 'word_count': word_count, 'pause_count': vad['pause_count'], 'mean_pause_dur': vad['mean_pause_duration_sec'], 'long_pause_ratio': vad['long_pause_ratio'], 'short_pause_share': vad['short_pause_share'], 'speech_segments': vad['speech_segments'], 'speech_duration_sec': vad['speech_duration_sec'], 'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'], 'boundary_pause_ratio': placement['boundary_pause_ratio'], 'dim_continuity': composite['dim_continuity'], 'dim_pause_quality': composite['dim_pause_quality'], 'dim_articulation': composite['dim_articulation'], 'dim_dominance': composite['dim_dominance'], 'dim_placement': composite['dim_placement'], 'dim_word_precision': composite['dim_word_precision'], }) except Exception as e: errors[mode] += 1 if errors[mode] <= 3: print(f" ERROR [{mode}] {file_id}: {e}", flush=True) total_time = time.time() - start_time print(f"\nDone in {total_time/60:.1f} minutes (skipped {skipped} without transcription)") for m in MODES: print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors") # ── Save per-mode results ── dfs = {} for mode in MODES: df = pd.DataFrame(all_results[mode]) out_path = BASE / f"EDA/data/sandi_438_vad_{mode}.csv" df.to_csv(out_path, index=False) dfs[mode] = df print(f"Saved: {out_path}") # Verify we have data min_n = min(len(dfs[m]) for m in MODES) if min_n == 0: print("\nERROR: No results to analyze. Run --transcribe first.") sys.exit(1) # ══════════════════════════════════════════════════════════════════ # STATISTICAL ANALYSIS # ══════════════════════════════════════════════════════════════════ def section(title): print(f"\n{'='*75}") print(f" {title}") print(f"{'='*75}") dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation', 'dim_dominance', 'dim_placement', 'dim_word_precision'] # ── 1. Overall correlation with expert scores ── section("1. OVERALL CORRELATION WITH EXPERT SCORES") print(f"\n {'Mode':<12s} {'Spearman ρ':>12s} {'p-value':>12s} {'Kendall τ':>12s} {'p-value':>12s} {'N':>5s}") print(f" {'-'*65}") for mode in MODES: df = dfs[mode] rho, p_rho = spearmanr(df['expert_score'], df['composite_raw']) tau, p_tau = kendalltau(df['expert_score'], df['composite_raw']) print(f" {mode:<12s} {rho:>12.4f} {p_rho:>12.2e} {tau:>12.4f} {p_tau:>12.2e} {len(df):>5d}") # ── 2. Per-dimension correlations ── section("2. PER-DIMENSION CORRELATIONS WITH EXPERT SCORES") for mode in MODES: df = dfs[mode] print(f"\n [{mode}]") for d in dims: v = df[['expert_score', d]].dropna() if len(v) > 10: r, p_ = spearmanr(v['expert_score'], v[d]) sig = '***' if p_ < 0.001 else '**' if p_ < 0.01 else '*' if p_ < 0.05 else 'ns' print(f" {d:<25s} ρ={r:+.4f} {sig}") # ── 3. Raw VAD feature differences between modes ── section("3. RAW VAD FEATURE COMPARISON ACROSS MODES") vad_features = ['speech_ratio', 'mlu', 'mean_pause_dur', 'long_pause_ratio', 'pause_count', 'speech_segments', 'speech_duration_sec'] merged = dfs['silero'][['file_id', 'expert_score']].copy() for feat in vad_features: for mode in MODES: merged = merged.merge( dfs[mode][['file_id', feat]].rename(columns={feat: f'{feat}_{mode}'}), on='file_id', how='inner' ) print(f"\n Feature deltas (hybrid - silero), N={len(merged)}:") print(f" {'Feature':<25s} {'Mean Δ':>10s} {'Median Δ':>10s} {'Std Δ':>10s} {'ρ_hybrid':>10s} {'ρ_silero':>10s}") print(f" {'-'*75}") for feat in vad_features: delta = merged[f'{feat}_hybrid'] - merged[f'{feat}_silero'] r_h, _ = spearmanr(merged['expert_score'], merged[f'{feat}_hybrid']) r_s, _ = spearmanr(merged['expert_score'], merged[f'{feat}_silero']) print(f" {feat:<25s} {delta.mean():>+10.4f} {delta.median():>+10.4f} {delta.std():>10.4f} " f"{r_h:>+10.3f} {r_s:>+10.3f}") # ── 4. Quintile separation analysis ── section("4. QUINTILE SEPARATION (Q1=lowest, Q5=highest expert score)") for mode in MODES: df = dfs[mode].copy() df['quintile'] = pd.qcut(df['expert_score'], 5, labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5'], duplicates='drop') print(f"\n [{mode}]") print(f" {'Quintile':<8s} {'N':>4s} {'Expert':>8s} {'Composite':>10s} {'Speech%':>8s} " f"{'MLU':>8s} {'PauseDur':>9s} {'LPR':>8s}") print(f" {'-'*75}") q_means = {} for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']: sub = df[df['quintile'] == q] if len(sub) == 0: continue q_means[q] = sub['composite_raw'].mean() print(f" {q:<8s} {len(sub):>4d} {sub['expert_score'].mean():>8.2f} " f"{sub['composite_raw'].mean():>+10.4f} {sub['speech_ratio'].mean():>8.3f} " f"{sub['mlu'].mean():>8.2f} {sub['mean_pause_dur'].mean():>9.4f} " f"{sub['long_pause_ratio'].mean():>8.4f}") if 'Q5' in q_means and 'Q1' in q_means: spread = q_means['Q5'] - q_means['Q1'] print(f" Q5-Q1 spread: {spread:+.4f}") # Adjacent quintile separation (Cohen's d) print(f"\n Adjacent quintile Cohen's d:") pairs = [('Q1', 'Q2'), ('Q2', 'Q3'), ('Q3', 'Q4'), ('Q4', 'Q5')] for q_lo, q_hi in pairs: a = df[df['quintile'] == q_lo]['composite_raw'] b = df[df['quintile'] == q_hi]['composite_raw'] if len(a) == 0 or len(b) == 0: continue pooled_std = np.sqrt((a.std()**2 + b.std()**2) / 2) d = (b.mean() - a.mean()) / pooled_std if pooled_std > 0 else 0 u, p = mannwhitneyu(a, b, alternative='less') sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns' print(f" {q_lo}→{q_hi}: d={d:+.3f} U-test p={p:.4f} {sig}") # Kruskal-Wallis across all quintiles groups = [df[df['quintile'] == q]['composite_raw'].values for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5'] if len(df[df['quintile'] == q]) > 0] if len(groups) >= 2: H, p_kw = kruskal(*groups) print(f" Kruskal-Wallis H={H:.2f}, p={p_kw:.2e}") # ── 5. Bootstrap significance test: hybrid vs silero ── section("5. BOOTSTRAP SIGNIFICANCE TEST (Hybrid vs Silero)") common = dfs['silero'][['file_id', 'expert_score', 'composite_raw']].merge( dfs['hybrid'][['file_id', 'composite_raw']], on='file_id', suffixes=('_silero', '_hybrid') ) common = common.merge( dfs['marblenet'][['file_id', 'composite_raw']].rename( columns={'composite_raw': 'composite_raw_marblenet'}), on='file_id' ) n_boot = 10000 rng = np.random.default_rng(42) expert = common['expert_score'].values comp_s = common['composite_raw_silero'].values comp_h = common['composite_raw_hybrid'].values comp_m = common['composite_raw_marblenet'].values N = len(common) rho_s_obs, _ = spearmanr(expert, comp_s) rho_h_obs, _ = spearmanr(expert, comp_h) rho_m_obs, _ = spearmanr(expert, comp_m) delta_obs = rho_h_obs - rho_s_obs boot_rho_s = [] boot_rho_h = [] boot_rho_m = [] boot_deltas_hs = [] boot_deltas_ms = [] for _ in range(n_boot): idx = rng.choice(N, N, replace=True) rs, _ = spearmanr(expert[idx], comp_s[idx]) rh, _ = spearmanr(expert[idx], comp_h[idx]) rm, _ = spearmanr(expert[idx], comp_m[idx]) boot_rho_s.append(rs) boot_rho_h.append(rh) boot_rho_m.append(rm) boot_deltas_hs.append(rh - rs) boot_deltas_ms.append(rm - rs) boot_deltas_hs = np.array(boot_deltas_hs) boot_deltas_ms = np.array(boot_deltas_ms) boot_rho_s = np.array(boot_rho_s) boot_rho_h = np.array(boot_rho_h) boot_rho_m = np.array(boot_rho_m) p_hybrid = (boot_deltas_hs <= 0).mean() p_marble = (boot_deltas_ms <= 0).mean() print(f"\n Paired comparison on N={N} common files:") print(f"\n {'Mode':<12s} {'ρ observed':>12s} {'95% CI':>20s}") print(f" {'-'*50}") for label, obs, boot in [('silero', rho_s_obs, boot_rho_s), ('marblenet', rho_m_obs, boot_rho_m), ('hybrid', rho_h_obs, boot_rho_h)]: ci_lo, ci_hi = np.percentile(boot, [2.5, 97.5]) print(f" {label:<12s} {obs:>12.4f} [{ci_lo:.4f}, {ci_hi:.4f}]") print(f"\n Δρ (hybrid - silero): {delta_obs:+.4f} " f"95% CI [{np.percentile(boot_deltas_hs, 2.5):+.4f}, {np.percentile(boot_deltas_hs, 97.5):+.4f}] " f"p={p_hybrid:.4f}") print(f" Δρ (marble - silero): {rho_m_obs - rho_s_obs:+.4f} " f"95% CI [{np.percentile(boot_deltas_ms, 2.5):+.4f}, {np.percentile(boot_deltas_ms, 97.5):+.4f}] " f"p={p_marble:.4f}") if p_hybrid < 0.05: print(f"\n HYBRID is SIGNIFICANTLY better than Silero (p={p_hybrid:.4f})") elif delta_obs > 0: print(f"\n ~ Hybrid shows improvement but NOT significant (p={p_hybrid:.4f})") else: print(f"\n Hybrid shows NO improvement over Silero") # ── 6. Band agreement ── section("6. FLUENCY BAND AGREEMENT WITH EXPERT BANDS") def expert_band(score): if score < 3.0: return 'LOW' elif score < 4.5: return 'MEDIUM' else: return 'HIGH' for mode in MODES: df = dfs[mode].copy() df['expert_band'] = df['expert_score'].apply(expert_band) agree = (df['expert_band'] == df['fluency_band']).sum() print(f"\n [{mode}] Agreement: {agree}/{len(df)} ({agree/len(df):.1%})") print(f" {'':>15} Pipeline→ {'LOW':>6} {'MED':>6} {'HIGH':>6}") for eb in ['LOW', 'MEDIUM', 'HIGH']: row = [] for pb in ['LOW', 'MEDIUM', 'HIGH']: n = ((df['expert_band'] == eb) & (df['fluency_band'] == pb)).sum() row.append(n) print(f" Expert {eb:>6}: {row[0]:>6} {row[1]:>6} {row[2]:>6}") # ── 7. Per-dimension improvement breakdown ── section("7. PER-DIMENSION Δρ (hybrid vs silero)") dim_merged = dfs['silero'][['file_id', 'expert_score'] + dims].merge( dfs['hybrid'][['file_id'] + dims], on='file_id', suffixes=('_silero', '_hybrid') ) print(f"\n {'Dimension':<25s} {'ρ_silero':>10s} {'ρ_hybrid':>10s} {'Δρ':>8s} {'Direction':>10s}") print(f" {'-'*70}") for d in dims: rs, _ = spearmanr(dim_merged['expert_score'], dim_merged[f'{d}_silero']) rh, _ = spearmanr(dim_merged['expert_score'], dim_merged[f'{d}_hybrid']) delta = rh - rs direction = '^ better' if delta > 0.005 else 'v worse' if delta < -0.005 else '= same' print(f" {d:<25s} {rs:>+10.4f} {rh:>+10.4f} {delta:>+8.4f} {direction:>10s}") # ── 8. Summary verdict ── section("SUMMARY VERDICT") print(f"\n Baseline (Silero): rho = {rho_s_obs:.4f}") print(f" MarbleNet only: rho = {rho_m_obs:.4f} (delta = {rho_m_obs - rho_s_obs:+.4f})") print(f" Hybrid (S+M): rho = {rho_h_obs:.4f} (delta = {rho_h_obs - rho_s_obs:+.4f})") print(f" Bootstrap p (hybrid): {p_hybrid:.4f}") # Quintile discrimination summary for mode in MODES: df = dfs[mode].copy() df['quintile'] = pd.qcut(df['expert_score'], 5, labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5'], duplicates='drop') q_means = {} for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']: sub = df[df['quintile'] == q] if len(sub) > 0: q_means[q] = sub['composite_raw'].mean() if len(q_means) >= 4: q23 = q_means.get('Q3', 0) - q_means.get('Q2', 0) q34 = q_means.get('Q4', 0) - q_means.get('Q3', 0) spread = q_means.get('Q5', 0) - q_means.get('Q1', 0) print(f"\n [{mode}] Q2->Q3 gap: {q23:+.4f} Q3->Q4 gap: {q34:+.4f} " f"Q5-Q1 spread: {spread:+.4f}") if rho_h_obs > rho_s_obs and p_hybrid < 0.05: print(f"\n -> RECOMMENDATION: Adopt hybrid VAD (+{rho_h_obs - rho_s_obs:.4f} rho, p={p_hybrid:.4f})") elif rho_h_obs > rho_s_obs: print(f"\n -> RECOMMENDATION: Hybrid shows promise (+{rho_h_obs - rho_s_obs:.4f} rho) but needs " f"larger corpus for significance (p={p_hybrid:.4f})") else: print(f"\n -> RECOMMENDATION: Keep Silero baseline — hybrid VAD does not improve correlation") print(f"\nTotal evaluation time: {total_time/60:.1f} minutes")