"""Selective Feature Fusion VAD Test on SANDI 438.

4 VAD modes: silero (baseline), ten, marblenet, fusion (selective)
Uses cached transcriptions from prior run.

Reports:
  1. Spearman rho/Kendall tau vs expert scores
  2. Rank-order accuracy (Spearman + pairwise ordering)
  3. Quintile segregation with Cohen's d and U-tests
  4. Classification accuracy (band agreement + confusion matrix)
  5. Per-dimension correlation breakdown
  6. Bootstrap significance tests (each mode vs the silero baseline)
  7. Ordering analysis: how many pairs correctly ordered

Usage:
    python run_selective_fusion_test.py
"""
import os
import pickle
import sys
import time
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
# Make the sibling `pipeline` / `models` packages importable when run as a script.
sys.path.insert(0, os.path.dirname(__file__))

from pathlib import Path

from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal

BASE = Path(__file__).parent.parent
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"

filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"SANDI files: {len(filelist)}")

# ── Verify transcription cache ──
cached = sum(
    (CACHE_DIR / f"{file_id}_tx.pkl").exists() for file_id in filelist['file_id']
)
print(f"Cached transcriptions: {cached}/{len(filelist)}")
if cached < len(filelist) * 0.5:
    print("ERROR: Run run_hybrid_vad_test.py --transcribe first")
    sys.exit(1)

# ── Load pipeline modules ──
# Imported only after the cache check so a missing cache fails fast.
print("Loading models...", flush=True)
from pipeline.selective_fusion_vad import run_selective_vad
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.\n")

MODES = ["silero", "ten", "marblenet", "fusion"]

all_results = {m: [] for m in MODES}
errors = {m: 0 for m in MODES}
start_time = time.time()

# Positional counter (not the DataFrame index label) drives progress/ETA.
for n, (_, row) in enumerate(filelist.iterrows(), start=1):
    file_id = row['file_id']
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    if not cache_path.exists():
        continue

    audio_path = row['audio_path']
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        continue

    # NOTE: pickle can execute arbitrary code on load. This is acceptable only
    # because the cache files are produced locally by the transcription run;
    # never point CACHE_DIR at untrusted data.
    with open(cache_path, 'rb') as f:
        tx = pickle.load(f)
    words = tx['words']
    word_count = len(words)

    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)
    eta = (len(filelist) - n) / max(rate, 0.01)
    if n % 50 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {file_id} "
              f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)

    for mode in MODES:
        # Best-effort per (file, mode): a failure is counted and the first few
        # are printed, but the run continues with the remaining modes/files.
        try:
            vad = run_selective_vad(audio_path, mode=mode)
            vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
            placement = classify_pauses(words, vad)
            fa = compute_fa_features(words, vad['total_duration_sec'])
            syn = compute_syntactic_features(words, tx['transcript'])
            all_features = {**vad, **placement, **fa, **syn}
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)
            all_results[mode].append({
                'file_id': file_id,
                'expert_score': row['expert_score'],
                'composite_raw': composite['composite_raw'],
                'composite_percentile': composite['composite_percentile'],
                'fluency_band': composite['fluency_band'],
                'speech_ratio': vad['speech_ratio'],
                'mlu': vad['mlu'],
                'word_count': word_count,
                'pause_count': vad['pause_count'],
                'mean_pause_dur': vad['mean_pause_duration_sec'],
                'long_pause_ratio': vad['long_pause_ratio'],
                'short_pause_share': vad.get('short_pause_share', 0),
                'speech_segments': vad['speech_segments'],
                'speech_duration_sec': vad['speech_duration_sec'],
                'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
                'boundary_pause_ratio': placement['boundary_pause_ratio'],
                'dim_continuity': composite['dim_continuity'],
                'dim_pause_quality': composite['dim_pause_quality'],
                'dim_articulation': composite['dim_articulation'],
                'dim_dominance': composite['dim_dominance'],
                'dim_placement': composite['dim_placement'],
                'dim_word_precision': composite['dim_word_precision'],
            })
        except Exception as e:
            errors[mode] += 1
            if errors[mode] <= 3:
                print(f" ERROR [{mode}] {file_id}: {e}", flush=True)

total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for m in MODES:
    print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors")

dfs = {}
for mode in MODES:
    df = pd.DataFrame(all_results[mode])
    out_path = BASE / f"EDA/data/sandi_438_vad_{mode}.csv"
    df.to_csv(out_path, index=False)
    dfs[mode] = df
    print(f"Saved: {out_path}")

# Every analysis below indexes result columns; a mode with no rows has no
# columns at all, so stop here with a clear message instead of a KeyError.
empty_modes = [m for m in MODES if dfs[m].empty]
if empty_modes:
    print(f"ERROR: no files were processed for mode(s): {', '.join(empty_modes)}")
    sys.exit(1)


# ══════════════════════════════════════════════════════════════════
# ANALYSIS
# ══════════════════════════════════════════════════════════════════
def section(title):
    """Print *title* framed by two 80-character '=' rules."""
    w = 80
    print(f"\n{'='*w}")
    print(f" {title}")
    print(f"{'='*w}")


dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']

# ── 1. Overall correlation ──
section("1. OVERALL CORRELATION WITH EXPERT SCORES")
print(f"\n {'Mode':<12s} {'Spearman':>10s} {'p':>10s} {'Kendall':>10s} {'p':>10s} {'N':>5s}")
print(f" {'-'*55}")

# Compute every mode first so the BEST marker reflects the true maximum,
# not merely the best mode seen so far.
rhos = {}
corr_rows = {}
for mode in MODES:
    df = dfs[mode]
    rho, p_rho = spearmanr(df['expert_score'], df['composite_raw'])
    tau, p_tau = kendalltau(df['expert_score'], df['composite_raw'])
    rhos[mode] = rho
    corr_rows[mode] = (rho, p_rho, tau, p_tau, len(df))

best_overall = max(rhos, key=rhos.get)
for mode in MODES:
    rho, p_rho, tau, p_tau, n_files = corr_rows[mode]
    marker = ' <-- BEST' if mode == best_overall else ''
    print(f" {mode:<12s} {rho:>+10.4f} {p_rho:>10.2e} {tau:>+10.4f} {p_tau:>10.2e} {n_files:>5d}{marker}")

# ── 2. Pairwise ordering accuracy ──
section("2. PAIRWISE ORDERING ACCURACY")
print(" (What % of speaker pairs are ordered correctly by the pipeline?)")
for mode in MODES:
    df = dfs[mode]
    exp = df['expert_score'].values
    comp = df['composite_raw'].values

    # All unordered pairs (i < j) at once.
    first, second = np.triu_indices(len(exp), k=1)
    e_diff = exp[first] - exp[second]
    c_diff = comp[first] - comp[second]

    # Pairs with equal expert scores carry no ordering information.
    tied = int((e_diff == 0).sum())
    concordant = int((e_diff * c_diff > 0).sum())
    # Everything else, including composite ties between speakers whose expert
    # scores differ, counts against the pipeline.
    discordant = len(e_diff) - tied - concordant

    total_pairs = concordant + discordant
    accuracy = concordant / total_pairs * 100 if total_pairs > 0 else 0
    print(f" {mode:<12s}: {concordant}/{total_pairs} pairs correct ({accuracy:.1f}%)"
          f" tied={tied}")

# ── 3. Quintile segregation ──
section("3. QUINTILE SEGREGATION")
for mode in MODES:
    df = dfs[mode].copy()
    # Heavily tied expert scores can collapse quantile edges; with
    # duplicates='drop' that yields fewer than five bins, so bin labels are
    # derived from the bins that actually exist rather than hard-coded.
    bin_codes = pd.qcut(df['expert_score'], 5, labels=False, duplicates='drop')
    df['quintile'] = bin_codes.map(lambda c: f"Q{int(c) + 1}", na_action='ignore')
    q_labels = [f"Q{int(c) + 1}" for c in sorted(bin_codes.dropna().unique())]

    print(f"\n [{mode}]")
    print(f" {'Q':>4s} {'N':>4s} {'Expert':>8s} {'Composite':>10s} {'SR':>7s} "
          f"{'MLU':>7s} {'PauseDur':>9s} {'LPR':>7s}")
    print(f" {'-'*65}")

    q_means = {}
    for q in q_labels:
        sub = df[df['quintile'] == q]
        q_means[q] = sub['composite_raw'].mean()
        print(f" {q:>4s} {len(sub):>4d} {sub['expert_score'].mean():>8.2f} "
              f"{q_means[q]:>+10.4f} {sub['speech_ratio'].mean():>7.3f} "
              f"{sub['mlu'].mean():>7.2f} {sub['mean_pause_dur'].mean():>9.4f} "
              f"{sub['long_pause_ratio'].mean():>7.4f}")

    # Adjacent quintile effect sizes
    print(f"\n Cohen's d (adjacent quintiles):")
    for q_lo, q_hi in zip(q_labels, q_labels[1:]):
        a = df.loc[df['quintile'] == q_lo, 'composite_raw']
        b = df.loc[df['quintile'] == q_hi, 'composite_raw']
        # Pooled SD as the root mean of the two sample variances.
        pooled_std = np.sqrt((a.std()**2 + b.std()**2) / 2)
        d = (b.mean() - a.mean()) / pooled_std if pooled_std > 0 else 0
        # One-sided: is the lower quintile's composite stochastically smaller?
        u, p = mannwhitneyu(a, b, alternative='less')
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
        print(f" {q_lo}->{q_hi}: d={d:+.3f} p={p:.4f} {sig}")

    if len(q_means) >= 2:
        qs = sorted(q_means.keys())
        spread = q_means[qs[-1]] - q_means[qs[0]]
        print(f" {qs[-1]}-{qs[0]} spread: {spread:+.4f}")

    groups = [df.loc[df['quintile'] == q, 'composite_raw'].values for q in q_labels]
    if len(groups) >= 2:
        H, p_kw = kruskal(*groups)
        print(f" Kruskal-Wallis H={H:.2f}, p={p_kw:.2e}")

# ── 4. Band classification accuracy ──
section("4. FLUENCY BAND CLASSIFICATION")


def expert_band(score):
    """Map an expert score to a band: <3.0 LOW, <4.5 MEDIUM, otherwise HIGH."""
    if score < 3.0:
        return 'LOW'
    elif score < 4.5:
        return 'MEDIUM'
    else:
        return 'HIGH'


for mode in MODES:
    df = dfs[mode].copy()
    df['expert_band'] = df['expert_score'].apply(expert_band)
    agree = (df['expert_band'] == df['fluency_band']).sum()
    total = len(df)
    print(f"\n [{mode}] Accuracy: {agree}/{total} ({agree/total:.1%})")
    print(f" {'':>15} Pipeline-> {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    # Confusion matrix: rows are expert bands, columns are pipeline bands.
    for eb in ['LOW', 'MEDIUM', 'HIGH']:
        counts = [
            ((df['expert_band'] == eb) & (df['fluency_band'] == pb)).sum()
            for pb in ['LOW', 'MEDIUM', 'HIGH']
        ]
        print(f" Expert {eb:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")

# ── 5. Per-dimension correlations ──
section("5. PER-DIMENSION CORRELATIONS")
print(f"\n {'Dimension':<25s}", end="")
for mode in MODES:
    print(f" {mode:>10s}", end="")
print()
print(f" {'-'*70}")
for d in dims:
    print(f" {d:<25s}", end="")
    for mode in MODES:
        v = dfs[mode][['expert_score', d]].dropna()
        r, p = spearmanr(v['expert_score'], v[d])
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ' ns'
        print(f" {r:>+7.4f}{sig}", end="")
    print()

# ── 6. Bootstrap significance ──
section("6. BOOTSTRAP SIGNIFICANCE TEST")

# Find common files across all modes
common_ids = set(dfs[MODES[0]]['file_id'])
for mode in MODES[1:]:
    common_ids &= set(dfs[mode]['file_id'])
common_ids = sorted(common_ids)

base_df = dfs['silero'].set_index('file_id').loc[common_ids]
expert = base_df['expert_score'].values
N = len(expert)

comp = {}
for mode in MODES:
    comp[mode] = dfs[mode].set_index('file_id').loc[common_ids]['composite_raw'].values

# Observed rho per mode on the SAME common files, so deltas against the
# baseline are genuinely paired.
paired_rhos = {mode: spearmanr(expert, comp[mode])[0] for mode in MODES}

n_boot = 10000
rng = np.random.default_rng(42)
boot_rho = {m: np.empty(n_boot) for m in MODES}

for b in range(n_boot):
    # One resample of file indices shared by every mode (paired bootstrap).
    sample = rng.choice(N, N, replace=True)
    for mode in MODES:
        boot_rho[mode][b], _ = spearmanr(expert[sample], comp[mode][sample])

boot_delta = {m: boot_rho[m] - boot_rho['silero'] for m in MODES if m != 'silero'}

print(f"\n Paired comparison on N={N} common files:")
print(f"\n {'Mode':<12s} {'rho':>8s} {'95% CI':>20s} {'delta':>8s} {'p vs silero':>12s}")
print(f" {'-'*65}")
for mode in MODES:
    r_obs = paired_rhos[mode]
    ci = np.percentile(boot_rho[mode], [2.5, 97.5])
    if mode == 'silero':
        print(f" {mode:<12s} {r_obs:>+8.4f} [{ci[0]:.4f}, {ci[1]:.4f}] {'baseline':>8s} {'--':>12s}")
    else:
        delta = r_obs - paired_rhos['silero']
        # One-sided p: share of resamples where the mode does not beat silero.
        p_val = (boot_delta[mode] <= 0).mean()
        print(f" {mode:<12s} {r_obs:>+8.4f} [{ci[0]:.4f}, {ci[1]:.4f}] {delta:>+8.4f} p={p_val:.4f}")

# ── 7. Score ordering analysis ──
section("7. SCORE ORDERING ANALYSIS")
print(" (Files sorted by expert score — how well does pipeline preserve order?)")
for mode in MODES:
    # Stable sort keeps tied expert scores in file order, so the consecutive
    # pair counts below are reproducible.
    df = dfs[mode].sort_values('expert_score', kind='stable').reset_index(drop=True)

    # Compute rank correlation on sorted data
    exp_rank = df['expert_score'].rank(method='average')
    comp_rank = df['composite_raw'].rank(method='average')
    rho_rank, _ = spearmanr(exp_rank, comp_rank)

    # Monotonicity: how many consecutive pairs are in correct order?
    exp_sorted = df['expert_score'].to_numpy()
    comp_sorted = df['composite_raw'].to_numpy()
    # Only neighbours whose expert score strictly increases are scored.
    rising = exp_sorted[1:] > exp_sorted[:-1]
    total_consecutive = int(rising.sum())
    correct_order = int((rising & (comp_sorted[1:] >= comp_sorted[:-1])).sum())
    mono_pct = correct_order / total_consecutive * 100 if total_consecutive > 0 else 0
    print(f" {mode:<12s}: rank rho={rho_rank:.4f} "
          f"monotonic pairs: {correct_order}/{total_consecutive} ({mono_pct:.1f}%)")

# ── 8. Summary ──
section("FINAL SUMMARY")
# The summary is stated for the N common files, so it uses the paired rhos
# that the bootstrap p-values were computed against.
best_mode = max(paired_rhos, key=paired_rhos.get)
print(f"\n Results on SANDI dev N={N}:")
for mode in MODES:
    marker = ' <-- BEST' if mode == best_mode else ''
    print(f" {mode:<12s}: rho={paired_rhos[mode]:+.4f}{marker}")

print(f"\n Best mode: {best_mode} (rho={paired_rhos[best_mode]:.4f})")
if best_mode != 'silero':
    delta = paired_rhos[best_mode] - paired_rhos['silero']
    p_val = (boot_delta[best_mode] <= 0).mean()
    print(f" Improvement over baseline: {delta:+.4f} (bootstrap p={p_val:.4f})")
    if p_val < 0.05:
        print(f" -> SIGNIFICANT improvement. Adopt {best_mode}.")
    else:
        print(f" -> Not significant. More data needed or recalibrate pipeline.")
else:
    print(f" -> Silero baseline remains the best. No VAD swap needed.")

print(f"\nTotal time: {total_time/60:.1f} minutes")