# NOTE: the original paste carried a "Spaces: Running" page header here —
# Hugging Face Space UI residue, not part of the script.
| """ | |
| Selective Feature Fusion VAD Test on SANDI 438. | |
| 4 VAD modes: silero (baseline), ten, marblenet, fusion (selective) | |
| Uses cached transcriptions from prior run. | |
| Reports: | |
| 1. Spearman rho/Kendall tau vs expert scores | |
| 2. Rank-order accuracy (Spearman + pairwise ordering) | |
| 3. Quintile segregation with Cohen's d and U-tests | |
| 4. Classification accuracy (band agreement + confusion matrix) | |
| 5. Per-dimension correlation breakdown | |
| 6. Bootstrap significance tests (fusion vs baseline) | |
| 7. Ordering analysis: how many pairs correctly ordered | |
| Usage: python run_selective_fusion_test.py | |
| """ | |
| import sys, os, time, warnings, pickle | |
| import numpy as np | |
| import pandas as pd | |
| warnings.filterwarnings("ignore") | |
| sys.path.insert(0, os.path.dirname(__file__)) | |
| from pathlib import Path | |
| from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal | |
| BASE = Path(__file__).parent.parent | |
| CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache" | |
| filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv") | |
| print(f"SANDI files: {len(filelist)}") | |
# ── Verify transcription cache ──
# Count the files whose transcription pickle already exists on disk.
cached = 0
for _, rec in filelist.iterrows():
    if (CACHE_DIR / f"{rec['file_id']}_tx.pkl").exists():
        cached += 1
print(f"Cached transcriptions: {cached}/{len(filelist)}")

# Refuse to run unless at least half the corpus has been transcribed.
if not cached >= len(filelist) * 0.5:
    print("ERROR: Run run_hybrid_vad_test.py --transcribe first")
    sys.exit(1)
# ── Load pipeline modules ──
# Importing these triggers model loading, hence the progress prints.
print("Loading models...", flush=True)
from pipeline.selective_fusion_vad import run_selective_vad
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.\n")

# The four VAD configurations compared throughout this script.
MODES = ["silero", "ten", "marblenet", "fusion"]
all_results = {m: [] for m in MODES}  # per-mode list of per-file result dicts
errors = {m: 0 for m in MODES}        # per-mode count of failed files
start_time = time.time()
# ── Main loop: score every cached file under each of the four VAD modes ──
for idx, row in filelist.iterrows():
    file_id = row['file_id']
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    # Skip files whose transcription was never cached.
    if not cache_path.exists():
        continue
    audio_path = row['audio_path']
    # Relative audio paths in the filelist are resolved against the repo root.
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        continue
    # NOTE(review): pickle.load is only safe because this cache is written by
    # our own run_hybrid_vad_test.py — never point it at untrusted files.
    with open(cache_path, 'rb') as f:
        tx = pickle.load(f)
    words = tx['words']
    word_count = len(words)
    # Progress/ETA bookkeeping. Assumes filelist has a default RangeIndex,
    # so idx + 1 is the 1-based file number — TODO confirm if index changes.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)
    eta = (len(filelist) - n) / max(rate, 0.01)
    # Print for the first 3 files, every 50th, and the last one.
    if n % 50 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {file_id} "
              f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)
    for mode in MODES:
        try:
            # VAD is the only stage that differs between modes; the cached
            # transcription (words, transcript) is shared by all four.
            vad = run_selective_vad(audio_path, mode=mode)
            # Mean length of utterance: words per detected speech segment
            # (max(..., 1) guards against zero segments).
            vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
            placement = classify_pauses(words, vad)
            fa = compute_fa_features(words, vad['total_duration_sec'])
            syn = compute_syntactic_features(words, tx['transcript'])
            # Later dicts win on key collisions: vad < placement < fa < syn.
            all_features = {**vad, **placement, **fa, **syn}
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)
            all_results[mode].append({
                'file_id': file_id,
                'expert_score': row['expert_score'],
                'composite_raw': composite['composite_raw'],
                'composite_percentile': composite['composite_percentile'],
                'fluency_band': composite['fluency_band'],
                'speech_ratio': vad['speech_ratio'],
                'mlu': vad['mlu'],
                'word_count': word_count,
                'pause_count': vad['pause_count'],
                'mean_pause_dur': vad['mean_pause_duration_sec'],
                'long_pause_ratio': vad['long_pause_ratio'],
                'short_pause_share': vad.get('short_pause_share', 0),
                'speech_segments': vad['speech_segments'],
                'speech_duration_sec': vad['speech_duration_sec'],
                'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
                'boundary_pause_ratio': placement['boundary_pause_ratio'],
                'dim_continuity': composite['dim_continuity'],
                'dim_pause_quality': composite['dim_pause_quality'],
                'dim_articulation': composite['dim_articulation'],
                'dim_dominance': composite['dim_dominance'],
                'dim_placement': composite['dim_placement'],
                'dim_word_precision': composite['dim_word_precision'],
            })
        except Exception as e:
            errors[mode] += 1
            # Only surface the first few errors per mode to keep the log short.
            if errors[mode] <= 3:
                print(f" ERROR [{mode}] {file_id}: {e}", flush=True)
total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for m in MODES:
    print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors")

# Persist one CSV per VAD mode and keep the DataFrames for the analysis below.
dfs = {}
for mode in MODES:
    frame = pd.DataFrame(all_results[mode])
    out_path = BASE / f"EDA/data/sandi_438_vad_{mode}.csv"
    frame.to_csv(out_path, index=False)
    dfs[mode] = frame
    print(f"Saved: {out_path}")
# ──────────────────────────────────────────────────────────────────
# ANALYSIS
# ──────────────────────────────────────────────────────────────────
def section(title):
    """Print *title* framed by two 80-character '=' rules."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)
# Composite sub-dimension columns, used again in section 5.
dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']

# ── 1. Overall correlation ──
section("1. OVERALL CORRELATION WITH EXPERT SCORES")
print(f"\n {'Mode':<12s} {'Spearman':>10s} {'p':>10s} {'Kendall':>10s} {'p':>10s} {'N':>5s}")
print(f" {'-'*55}")
# BUG FIX: the old code compared each rho to the running max of the modes
# printed so far, so the first row (silero) was always flagged '<-- BEST'
# and several rows could be flagged. Compute every rho first, then mark
# only the overall best.
rhos = {}
corr_stats = {}
for mode in MODES:
    df = dfs[mode]
    rho, p_rho = spearmanr(df['expert_score'], df['composite_raw'])
    tau, p_tau = kendalltau(df['expert_score'], df['composite_raw'])
    rhos[mode] = rho
    corr_stats[mode] = (rho, p_rho, tau, p_tau, len(df))
best_rho = max(rhos.values())
for mode in MODES:
    rho, p_rho, tau, p_tau, n_files = corr_stats[mode]
    marker = ' <-- BEST' if rho == best_rho else ''
    print(f" {mode:<12s} {rho:>+10.4f} {p_rho:>10.2e} {tau:>+10.4f} {p_tau:>10.2e} {n_files:>5d}{marker}")
# ── 2. Pairwise ordering accuracy ──
section("2. PAIRWISE ORDERING ACCURACY")
print(" (What % of speaker pairs are ordered correctly by the pipeline?)")
for mode in MODES:
    df = dfs[mode]
    exp = df['expert_score'].values
    comp = df['composite_raw'].values
    # All unordered pairs (i < j), vectorised instead of the O(n^2) loop.
    ii, jj = np.triu_indices(len(exp), k=1)
    e_diff = exp[ii] - exp[jj]
    c_diff = comp[ii] - comp[jj]
    # Same classification as before: expert ties are 'tied'; a pair is
    # concordant when both differences share a sign; everything else
    # (including composite ties on non-tied experts) is discordant.
    tied = int(np.count_nonzero(e_diff == 0))
    concordant = int(np.count_nonzero(e_diff * c_diff > 0))
    discordant = int(e_diff.size) - tied - concordant
    total_pairs = concordant + discordant
    accuracy = concordant / total_pairs * 100 if total_pairs > 0 else 0
    print(f" {mode:<12s}: {concordant}/{total_pairs} pairs correct ({accuracy:.1f}%)"
          f" tied={tied}")
# ── 3. Quintile segregation ──
section("3. QUINTILE SEGREGATION")
for mode in MODES:
    df = dfs[mode].copy()
    # ROBUSTNESS FIX: qcut(..., labels=[5 names], duplicates='drop') raises
    # ValueError whenever duplicate quantile edges drop a bin (labels no
    # longer match bin count). Bin first, then name whatever bins survive;
    # output is identical to the old code when all 5 bins exist.
    binned = pd.qcut(df['expert_score'], 5, duplicates='drop')
    df['quintile'] = binned.cat.codes.map(
        lambda c: f"Q{c + 1}" if c >= 0 else np.nan)
    print(f"\n [{mode}]")
    print(f" {'Q':>4s} {'N':>4s} {'Expert':>8s} {'Composite':>10s} {'SR':>7s} "
          f"{'MLU':>7s} {'PauseDur':>9s} {'LPR':>7s}")
    print(f" {'-'*65}")
    q_means = {}
    for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']:
        sub = df[df['quintile'] == q]
        if len(sub) == 0:
            continue
        q_means[q] = sub['composite_raw'].mean()
        print(f" {q:>4s} {len(sub):>4d} {sub['expert_score'].mean():>8.2f} "
              f"{sub['composite_raw'].mean():>+10.4f} {sub['speech_ratio'].mean():>7.3f} "
              f"{sub['mlu'].mean():>7.2f} {sub['mean_pause_dur'].mean():>9.4f} "
              f"{sub['long_pause_ratio'].mean():>7.4f}")
    # Adjacent quintile effect sizes (one-sided: lower quintile < higher).
    print(f"\n Cohen's d (adjacent quintiles):")
    pairs = [('Q1', 'Q2'), ('Q2', 'Q3'), ('Q3', 'Q4'), ('Q4', 'Q5')]
    for q_lo, q_hi in pairs:
        a = df[df['quintile'] == q_lo]['composite_raw']
        b = df[df['quintile'] == q_hi]['composite_raw']
        if len(a) == 0 or len(b) == 0:
            continue
        pooled_std = np.sqrt((a.std()**2 + b.std()**2) / 2)
        d = (b.mean() - a.mean()) / pooled_std if pooled_std > 0 else 0
        u, p = mannwhitneyu(a, b, alternative='less')
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
        print(f" {q_lo}->{q_hi}: d={d:+.3f} p={p:.4f} {sig}")
    if len(q_means) >= 2:
        qs = sorted(q_means.keys())
        spread = q_means[qs[-1]] - q_means[qs[0]]
        print(f" Q5-Q1 spread: {spread:+.4f}")
    # Omnibus test across all populated quintiles.
    groups = [df[df['quintile'] == q]['composite_raw'].values
              for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
              if len(df[df['quintile'] == q]) > 0]
    if len(groups) >= 2:
        H, p_kw = kruskal(*groups)
        print(f" Kruskal-Wallis H={H:.2f}, p={p_kw:.2e}")
# ── 4. Band classification accuracy ──
section("4. FLUENCY BAND CLASSIFICATION")
def expert_band(score):
    """Map a numeric expert score onto the pipeline's LOW/MEDIUM/HIGH bands."""
    if score >= 4.5:
        return 'HIGH'
    if score >= 3.0:
        return 'MEDIUM'
    return 'LOW'
for mode in MODES:
    band_df = dfs[mode].copy()
    band_df['expert_band'] = band_df['expert_score'].apply(expert_band)
    agree = (band_df['expert_band'] == band_df['fluency_band']).sum()
    total = len(band_df)
    print(f"\n [{mode}] Accuracy: {agree}/{total} ({agree/total:.1%})")
    # 3x3 confusion matrix: expert band (rows) vs pipeline band (columns).
    print(f" {'':>15} Pipeline-> {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    for eb in ['LOW', 'MEDIUM', 'HIGH']:
        counts = [((band_df['expert_band'] == eb) &
                   (band_df['fluency_band'] == pb)).sum()
                  for pb in ['LOW', 'MEDIUM', 'HIGH']]
        print(f" Expert {eb:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")
# ── 5. Per-dimension correlations ──
section("5. PER-DIMENSION CORRELATIONS")
# Header row: dimension column plus one column per mode.
header = f"\n {'Dimension':<25s}" + "".join(f" {mode:>10s}" for mode in MODES)
print(header)
print(f" {'-'*70}")
for dim in dims:
    cells = [f" {dim:<25s}"]
    for mode in MODES:
        valid = dfs[mode][['expert_score', dim]].dropna()
        r, p = spearmanr(valid['expert_score'], valid[dim])
        stars = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ' ns'
        cells.append(f" {r:>+7.4f}{stars}")
    print("".join(cells))
# ── 6. Bootstrap significance ──
section("6. BOOTSTRAP SIGNIFICANCE TEST")
# Find common files across all modes: a paired bootstrap requires every
# mode to have scored the same speakers.
common_ids = set(dfs[MODES[0]]['file_id'])
for mode in MODES[1:]:
    common_ids &= set(dfs[mode]['file_id'])
common_ids = sorted(common_ids)
base_df = dfs['silero'].set_index('file_id').loc[common_ids]
expert = base_df['expert_score'].values
N = len(expert)
# Composite scores per mode, aligned to the same common_ids order.
comp = {}
for mode in MODES:
    comp[mode] = dfs[mode].set_index('file_id').loc[common_ids]['composite_raw'].values
n_boot = 10000
rng = np.random.default_rng(42)  # fixed seed -> reproducible CIs/p-values
boot_rho = {m: [] for m in MODES}
boot_delta = {m: [] for m in MODES if m != 'silero'}
for _ in range(n_boot):
    # One shared resample per replicate keeps the mode comparison paired.
    idx = rng.choice(N, N, replace=True)
    for mode in MODES:
        r, _ = spearmanr(expert[idx], comp[mode][idx])
        boot_rho[mode].append(r)
    for mode in [m for m in MODES if m != 'silero']:
        boot_delta[mode].append(boot_rho[mode][-1] - boot_rho['silero'][-1])
for mode in MODES:
    boot_rho[mode] = np.array(boot_rho[mode])
for mode in boot_delta:
    boot_delta[mode] = np.array(boot_delta[mode])
print(f"\n Paired comparison on N={N} common files:")
print(f"\n {'Mode':<12s} {'rho':>8s} {'95% CI':>20s} {'delta':>8s} {'p vs silero':>12s}")
print(f" {'-'*65}")
for mode in MODES:
    # Observed (non-bootstrap) rho on the common subset.
    r_obs, _ = spearmanr(expert, comp[mode])
    ci = np.percentile(boot_rho[mode], [2.5, 97.5])
    if mode == 'silero':
        print(f" {mode:<12s} {r_obs:>+8.4f} [{ci[0]:.4f}, {ci[1]:.4f}] {'baseline':>8s} {'--':>12s}")
    else:
        delta = r_obs - rhos['silero']
        # One-sided bootstrap p: fraction of replicates in which this mode
        # failed to beat the silero baseline.
        p_val = (boot_delta[mode] <= 0).mean()
        print(f" {mode:<12s} {r_obs:>+8.4f} [{ci[0]:.4f}, {ci[1]:.4f}] {delta:>+8.4f} p={p_val:.4f}")
# ── 7. Score ordering analysis ──
section("7. SCORE ORDERING ANALYSIS")
# FIX: restored the mojibake'd arrow ('β') in the caption below.
print(" (Files sorted by expert score -> how well does pipeline preserve order?)")
for mode in MODES:
    ordered = dfs[mode].sort_values('expert_score').reset_index(drop=True)
    # Rank correlation (spearman on ranks == spearman on raw values).
    exp_rank = ordered['expert_score'].rank(method='average')
    comp_rank = ordered['composite_raw'].rank(method='average')
    rho_rank, _ = spearmanr(exp_rank, comp_rank)
    # Monotonicity: among consecutive files with strictly increasing expert
    # scores, how often does the composite not decrease?
    exp_vals = ordered['expert_score'].to_numpy()
    comp_vals = ordered['composite_raw'].to_numpy()
    correct_order = 0
    total_consecutive = 0
    for k in range(1, len(ordered)):
        if exp_vals[k] > exp_vals[k - 1]:
            total_consecutive += 1
            if comp_vals[k] >= comp_vals[k - 1]:
                correct_order += 1
    mono_pct = correct_order / total_consecutive * 100 if total_consecutive > 0 else 0
    print(f" {mode:<12s}: rank rho={rho_rank:.4f} "
          f"monotonic pairs: {correct_order}/{total_consecutive} ({mono_pct:.1f}%)")
# ── 8. Summary ──
section("FINAL SUMMARY")
best_mode = max(rhos, key=rhos.get)
print(f"\n Results on SANDI dev N={N}:")
for mode in MODES:
    marker = ' <-- BEST' if mode == best_mode else ''
    print(f" {mode:<12s}: rho={rhos[mode]:+.4f}{marker}")
print(f"\n Best mode: {best_mode} (rho={rhos[best_mode]:.4f})")
# Verdict: either the baseline held, or quantify the challenger's gain.
if best_mode == 'silero':
    print(f" -> Silero baseline remains the best. No VAD swap needed.")
else:
    delta = rhos[best_mode] - rhos['silero']
    p_val = (boot_delta[best_mode] <= 0).mean()
    print(f" Improvement over baseline: +{delta:.4f} (bootstrap p={p_val:.4f})")
    if p_val < 0.05:
        print(f" -> SIGNIFICANT improvement. Adopt {best_mode}.")
    else:
        print(f" -> Not significant. More data needed or recalibrate pipeline.")
print(f"\nTotal time: {total_time/60:.1f} minutes")