r"""
Hybrid V3 VAD-Only Benchmark: isolated test on SANDI dev-438.

VAD-only comparison: only the VAD component is swapped. VAD features and
banding are computed directly, without the full composite pipeline (avoids
the mord dependency).

Uses cached transcriptions for MLU computation and the ``expert_score``
column of the SANDI file-list CSV as the reference.

Usage:
    PYTHONPATH=/tmp/pylibs:/tmp/pip_libs \
    /opt/anaconda3/envs/deeplearning/bin/python -u run_hybrid_v3_test.py
"""
import os
import pickle
import sys
import time
import warnings

import numpy as np
import pandas as pd

# Installed before the remaining imports so import-time warnings are muted too.
warnings.filterwarnings("ignore")
# Puts this script's directory first on the import path
# (presumably so that `pipeline` resolves — confirm against the repo layout).
sys.path.insert(0, os.path.dirname(__file__))

from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu

BASE = Path(__file__).parent.parent
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"

MODES = ["silero", "hybrid_v2", "hybrid_v3"]
BANDS = ["LOW", "MEDIUM", "HIGH"]


# ── Banding functions (isolated — NO composite.py dependency) ──

def _band(sr, mlu, lpr, high_mlu, high_sr, high_lpr):
    """Shared banding rule; only the HIGH thresholds differ between variants."""
    if mlu >= high_mlu and sr >= high_sr and lpr <= high_lpr:
        return "HIGH"
    if (mlu < 3 and sr < 0.55) or mlu < 2 or sr < 0.35:
        return "LOW"
    return "MEDIUM"


def band_production(sr, mlu, lpr):
    """Current production banding (mirrors composite.py).

    Args:
        sr: speech ratio.
        mlu: words per speech segment.
        lpr: long-pause ratio.

    Returns:
        "HIGH", "LOW" or "MEDIUM".
    """
    return _band(sr, mlu, lpr, high_mlu=7, high_sr=0.75, high_lpr=0.15)


def band_relaxed(sr, mlu, lpr):
    """Relaxed HIGH threshold (V3 experiment); LOW rules match production."""
    return _band(sr, mlu, lpr, high_mlu=5.5, high_sr=0.70, high_lpr=0.20)


def expert_band(score):
    """Map an expert score to a band: <3.0 LOW, <4.5 MEDIUM, otherwise HIGH.

    Note: a NaN score fails both comparisons and therefore lands in 'HIGH'.
    """
    if score < 3.0:
        return 'LOW'
    elif score < 4.5:
        return 'MEDIUM'
    else:
        return 'HIGH'


# ── Composite-like score from VAD features (no models needed) ──

def vad_composite(sr, mlu, lpr, pf):
    """Simple composite score from VAD features for ranking.

    Weighted sum of speech ratio (0.35), MLU capped at 15 (0.30),
    1 - long-pause ratio (0.20) and 1 - pause frequency floored at 0 (0.15).
    """
    return sr * 0.35 + min(mlu / 15.0, 1.0) * 0.30 + (1 - lpr) * 0.20 + max(0, 1 - pf) * 0.15


# ── Reporting helpers ──

def section(title):
    """Print a banner introducing a report section."""
    print(f"\n{'='*80}")
    print(f" {title}")
    print(f"{'='*80}")


def band_metrics(df, pred_col):
    """Compare predicted bands in ``df[pred_col]`` against ``df['expert_band']``.

    Returns:
        dict with 'acc' (accuracy), 'macro_f1' (mean per-band F1 over BANDS),
        'mh_fp' (expert MEDIUM predicted HIGH) and 'hm_fn' (expert HIGH
        predicted MEDIUM). An empty frame yields zeros instead of raising.
    """
    expert = df['expert_band']
    pred = df[pred_col]
    total = len(df)
    acc = (expert == pred).sum() / total if total else 0.0
    f1s = []
    for band in BANDS:
        tp = ((expert == band) & (pred == band)).sum()
        fp = ((expert != band) & (pred == band)).sum()
        fn = ((expert == band) & (pred != band)).sum()
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1s.append(2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0)
    return {
        'acc': acc,
        'macro_f1': np.mean(f1s),
        'mh_fp': int(((expert == 'MEDIUM') & (pred == 'HIGH')).sum()),
        'hm_fn': int(((expert == 'HIGH') & (pred == 'MEDIUM')).sum()),
    }


def cohens_d(a, b):
    """Cohen's d of ``b`` relative to ``a`` using the unweighted pooled std.

    Returns None when either group has fewer than two values, and 0 when the
    pooled standard deviation is not positive.
    """
    if len(a) <= 1 or len(b) <= 1:
        return None
    pooled = np.sqrt((a.std()**2 + b.std()**2) / 2)
    return (b.mean() - a.mean()) / pooled if pooled > 0 else 0


def _sig_stars(p_val):
    """Conventional significance marker for a p-value."""
    return "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else "ns"


# ── Process all files ──

def _collect_results(filelist, run_hybrid_vad):
    """Run every VAD mode on every file that has a cached transcription.

    Files without a cache entry or without an audio file on disk are skipped.
    A failure in one mode is counted and reported (first five per mode) and
    does not stop the run.

    Returns:
        (all_results, errors, v3_diagnostics, total_time_sec)
    """
    all_results = {m: [] for m in MODES}
    errors = {m: 0 for m in MODES}
    v3_diagnostics = []
    total = len(filelist)
    start_time = time.time()

    # enumerate() keeps the progress counter independent of the index labels.
    for n, (_, row) in enumerate(filelist.iterrows(), start=1):
        file_id = row['file_id']
        cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
        if not cache_path.exists():
            continue

        audio_path = row['audio_path']
        if not os.path.isabs(audio_path):
            audio_path = str(BASE / audio_path)
        if not os.path.exists(audio_path):
            continue

        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only point CACHE_DIR at caches this project generated itself.
        with open(cache_path, 'rb') as f:
            tx = pickle.load(f)
        word_count = len(tx['words'])

        elapsed = time.time() - start_time
        rate = max(n / max(elapsed, 0.01), 0.01)
        eta = (total - n) / rate
        if n % 50 == 0 or n <= 3 or n == total:
            print(f" [{n}/{total}] {file_id} "
                  f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)

        for mode in MODES:
            try:
                vad = run_hybrid_vad(audio_path, mode=mode,
                                     diagnostics=(mode == "hybrid_v3"))
                # Guard against zero segments when deriving words per segment.
                seg_count = max(vad['speech_segments'], 1)
                mlu = round(word_count / seg_count, 2)
                vad['mlu'] = mlu

                sr = vad['speech_ratio']
                lpr = vad['long_pause_ratio']
                pf = vad.get('pause_frequency_per_sec', 0)

                # Capture V3 diagnostics
                if mode == "hybrid_v3" and "_diagnostics" in vad:
                    diag = vad["_diagnostics"]
                    v3_diagnostics.append({
                        "file_id": file_id,
                        "energy_recoveries": diag.get("energy_recoveries", 0),
                        "merges_performed": diag.get("merges_performed", 0),
                        "merges_refused_mlu": diag.get("merges_refused_mlu", 0),
                        "silero_segments": diag.get("silero_segments", 0),
                        "final_segments": diag.get("final_segments", 0),
                        "vad_ms": diag.get("vad_ms", 0),
                    })

                all_results[mode].append({
                    'file_id': file_id,
                    'expert_score': row['expert_score'],
                    'expert_band': expert_band(row['expert_score']),
                    'speech_ratio': sr,
                    'mlu': mlu,
                    'word_count': word_count,
                    'pause_count': vad['pause_count'],
                    'mean_pause_dur': vad['mean_pause_duration_sec'],
                    'long_pause_ratio': lpr,
                    'short_pause_share': vad.get('short_pause_share', 0),
                    'speech_segments': vad['speech_segments'],
                    'speech_duration_sec': vad['speech_duration_sec'],
                    'pause_freq': pf,
                    'band_prod': band_production(sr, mlu, lpr),
                    'band_relaxed': band_relaxed(sr, mlu, lpr),
                    'composite_vad': vad_composite(sr, mlu, lpr, pf),
                })
            except Exception as e:
                # Best-effort benchmark: count the failure and keep going.
                errors[mode] += 1
                if errors[mode] <= 5:
                    print(f" ERROR [{mode}] {file_id}: {e}", flush=True)

    return all_results, errors, v3_diagnostics, time.time() - start_time


def _bootstrap_report(dfs, modes, n_boot=10000, seed=42):
    """Print bootstrap CIs for Spearman ρ and one-sided p-values vs silero.

    Only files present in every analysed mode are used. The section is
    skipped when the silero baseline is missing or too few files are shared.
    """
    if 'silero' not in modes:
        print("\n Skipped: the silero baseline produced no results.")
        return

    common_ids = set(dfs['silero']['file_id'])
    for mode in modes:
        common_ids &= set(dfs[mode]['file_id'])
    common_ids = sorted(common_ids)
    n_common = len(common_ids)
    if n_common < 3:
        print(f"\n Skipped: only {n_common} file(s) were processed by every mode.")
        return

    expert_arr = dfs['silero'].set_index('file_id').loc[common_ids]['expert_score'].values
    comp_arr = {m: dfs[m].set_index('file_id').loc[common_ids]['composite_vad'].values
                for m in modes}

    rng = np.random.default_rng(seed)
    boot_rho = {m: np.zeros(n_boot) for m in modes}
    for i in range(n_boot):
        # The same resample is applied to every mode so the deltas are paired.
        sample = rng.choice(n_common, n_common, replace=True)
        for mode in modes:
            r, _ = spearmanr(expert_arr[sample], comp_arr[mode][sample])
            boot_rho[mode][i] = r

    print(f"\n N={n_common} paired files, {n_boot} bootstrap iterations\n")
    print(f" {'Mode':<12s} {'ρ obs':>10s} {'95% CI':>22s} "
          f"{'Δρ vs silero':>12s} {'p':>10s} {'Sig':>5s}")
    print(f" {'-'*76}")

    rho_obs = {}
    for mode in modes:  # silero comes first, so its ρ is known for the deltas
        r, _ = spearmanr(expert_arr, comp_arr[mode])
        rho_obs[mode] = r
        ci = np.percentile(boot_rho[mode], [2.5, 97.5])
        if mode == "silero":
            print(f" {mode:<12s} {r:>+10.4f} [{ci[0]:.4f}, {ci[1]:.4f}] "
                  f"{'baseline':>12s} {'--':>10s} {'--':>5s}")
        else:
            delta = r - rho_obs['silero']
            boot_delta = boot_rho[mode] - boot_rho['silero']
            # Share of resamples in which the mode does not beat the baseline.
            p_val = (boot_delta <= 0).mean()
            sig = _sig_stars(p_val)
            print(f" {mode:<12s} {r:>+10.4f} [{ci[0]:.4f}, {ci[1]:.4f}] "
                  f"{delta:>+12.4f} {p_val:>10.4f} {sig:>5s}")

    # V3 vs V2 specifically
    if "hybrid_v2" in comp_arr and "hybrid_v3" in comp_arr:
        boot_v3_v2 = boot_rho['hybrid_v3'] - boot_rho['hybrid_v2']
        p_v3_v2 = (boot_v3_v2 <= 0).mean()
        delta_v3_v2 = rho_obs['hybrid_v3'] - rho_obs['hybrid_v2']
        sig = _sig_stars(p_v3_v2)
        print(f"\n V3 vs V2: Δρ = {delta_v3_v2:+.4f} p = {p_v3_v2:.4f} ({sig})")


def main():
    """Run the benchmark end to end and print the full report."""
    filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
    print(f"SANDI files: {len(filelist)}")

    cached = sum((CACHE_DIR / f"{fid}_tx.pkl").exists() for fid in filelist['file_id'])
    print(f"Cached transcriptions: {cached}/{len(filelist)}")

    print("Loading VAD models...", flush=True)
    from pipeline.hybrid_vad import run_hybrid_vad
    print("VAD models loaded.\n")

    all_results, errors, v3_diagnostics, total_time = _collect_results(filelist, run_hybrid_vad)

    print(f"\nDone in {total_time/60:.1f} minutes")
    for m in MODES:
        print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors")

    dfs = {mode: pd.DataFrame(all_results[mode]) for mode in MODES}

    # A mode with zero rows yields a frame without columns; analysing it would
    # raise KeyError/ZeroDivisionError and throw away the other modes' results.
    modes = [m for m in MODES if not dfs[m].empty]
    skipped = [m for m in MODES if m not in modes]
    if skipped:
        print(f" WARNING: no results for {', '.join(skipped)} — excluded from the analysis")
    if not modes:
        sys.exit("No files were processed successfully for any mode — nothing to analyse.")

    # ══════════════════════════════════════════════════════════════════
    # 1. OVERALL METRICS (production banding)
    # ══════════════════════════════════════════════════════════════════
    section("1. OVERALL METRICS (production banding)")

    metrics = {}
    for mode in modes:
        df = dfs[mode]
        rho, _ = spearmanr(df['expert_score'], df['composite_vad'])
        m = band_metrics(df, 'band_prod')
        metrics[mode] = {'rho': rho, **m}
        marker = " ◄ V3" if mode == "hybrid_v3" else ""
        print(f" {mode:<12s} ρ={rho:+.4f} acc={m['acc']:.1%} F1={m['macro_f1']:.3f} "
              f"MED→HIGH={m['mh_fp']} HIGH→MED={m['hm_fn']}{marker}")

    # ══════════════════════════════════════════════════════════════════
    # 2. OVERALL METRICS (relaxed banding — experimental)
    # ══════════════════════════════════════════════════════════════════
    section("2. OVERALL METRICS (relaxed banding — experimental)")

    metrics_relaxed = {}
    for mode in modes:
        m = band_metrics(dfs[mode], 'band_relaxed')
        metrics_relaxed[mode] = m
        marker = " ◄ V3" if mode == "hybrid_v3" else ""
        print(f" {mode:<12s} acc={m['acc']:.1%} F1={m['macro_f1']:.3f} "
              f"MED→HIGH={m['mh_fp']} HIGH→MED={m['hm_fn']}{marker}")

    # ══════════════════════════════════════════════════════════════════
    # 3. CONFUSION MATRICES
    # ══════════════════════════════════════════════════════════════════
    section("3. CONFUSION MATRICES (production banding)")

    for mode in modes:
        df = dfs[mode]
        agree = (df['expert_band'] == df['band_prod']).sum()
        print(f"\n [{mode}] Accuracy: {agree}/{len(df)} ({agree/len(df):.1%})")
        print(f" {'':>16} Pipeline→ {'LOW':>5} {'MED':>5} {'HIGH':>5}")
        for eb in BANDS:
            counts = [((df['expert_band'] == eb) & (df['band_prod'] == pb)).sum()
                      for pb in BANDS]
            print(f" Expert {eb:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")

    # ══════════════════════════════════════════════════════════════════
    # 4. SEGMENT AND MLU COMPARISON
    # ══════════════════════════════════════════════════════════════════
    section("4. SEGMENT COUNT & MLU COMPARISON")

    print(f"\n {'Mode':<12s} {'Segments':>10s} {'MLU':>10s} {'SR':>10s} "
          f"{'PauseDur':>10s} {'LPR':>10s}")
    print(f" {'-'*65}")
    for mode in modes:
        df = dfs[mode]
        print(f" {mode:<12s} {df['speech_segments'].mean():>10.2f} {df['mlu'].mean():>10.2f} "
              f"{df['speech_ratio'].mean():>10.4f} {df['mean_pause_dur'].mean():>10.4f} "
              f"{df['long_pause_ratio'].mean():>10.4f}")

    # ══════════════════════════════════════════════════════════════════
    # 5. MLU INFLATION CHECK (hard failure mode)
    # ══════════════════════════════════════════════════════════════════
    section("5. MLU INFLATION CHECK")

    for mode in modes:
        df = dfs[mode]
        mean_mlu = df['mlu'].mean()
        max_mlu = df['mlu'].max()
        p95_mlu = df['mlu'].quantile(0.95)
        status = "✅ SAFE" if mean_mlu < 9.0 else "⚠ WARNING" if mean_mlu < 12.0 else "❌ REJECT"
        print(f" {mode:<12s} mean={mean_mlu:.2f} p95={p95_mlu:.2f} max={max_mlu:.2f} {status}")

    # ══════════════════════════════════════════════════════════════════
    # 6. PER-BAND FEATURE ANALYSIS
    # ══════════════════════════════════════════════════════════════════
    section("6. PER-BAND FEATURE COMPARISON")

    for band in BANDS:
        print(f"\n [{band}]")
        print(f" {'Mode':<12s} {'N':>4s} {'Segments':>10s} {'MLU':>10s} "
              f"{'SR':>10s} {'LPR':>10s}")
        print(f" {'-'*55}")
        for mode in modes:
            df = dfs[mode]
            sub = df[df['expert_band'] == band]
            print(f" {mode:<12s} {len(sub):>4d} {sub['speech_segments'].mean():>10.2f} "
                  f"{sub['mlu'].mean():>10.2f} {sub['speech_ratio'].mean():>10.4f} "
                  f"{sub['long_pause_ratio'].mean():>10.4f}")

    # ══════════════════════════════════════════════════════════════════
    # 7. V3 DIAGNOSTICS SUMMARY
    # ══════════════════════════════════════════════════════════════════
    section("7. V3 DIAGNOSTICS")

    if v3_diagnostics:
        diag_df = pd.DataFrame(v3_diagnostics)
        print(f" Total files: {len(diag_df)}")
        print(f" Energy recoveries: total={diag_df['energy_recoveries'].sum()}, "
              f"mean={diag_df['energy_recoveries'].mean():.2f}/file")
        print(f" Merges performed: total={diag_df['merges_performed'].sum()}, "
              f"mean={diag_df['merges_performed'].mean():.2f}/file")
        print(f" Merges refused (MLU): total={diag_df['merges_refused_mlu'].sum()}")
        print(f" Segment change: Silero={diag_df['silero_segments'].mean():.2f} → "
              f"V3={diag_df['final_segments'].mean():.2f}")
        if 'vad_ms' in diag_df.columns:
            print(f" V3 latency: mean={diag_df['vad_ms'].mean():.0f}ms, "
                  f"p95={diag_df['vad_ms'].quantile(0.95):.0f}ms")
    else:
        print(" No V3 diagnostics collected.")

    # ══════════════════════════════════════════════════════════════════
    # 8. BOOTSTRAP SIGNIFICANCE TEST
    # ══════════════════════════════════════════════════════════════════
    section("8. BOOTSTRAP SIGNIFICANCE (composite_vad vs expert_score)")

    _bootstrap_report(dfs, modes)

    # ══════════════════════════════════════════════════════════════════
    # 9. COHEN'S D (composite score separation between bands)
    # ══════════════════════════════════════════════════════════════════
    section("9. BAND SEPARATION (Cohen's d on composite_vad)")

    print(f"\n {'Mode':<12s} {'LOW→MED d':>12s} {'MED→HIGH d':>12s} {'LOW→HIGH d':>12s}")
    print(f" {'-'*52}")
    for mode in modes:
        df = dfs[mode]
        parts = []
        for (bl, bh) in [("LOW", "MEDIUM"), ("MEDIUM", "HIGH"), ("LOW", "HIGH")]:
            d = cohens_d(df[df['expert_band'] == bl]['composite_vad'],
                         df[df['expert_band'] == bh]['composite_vad'])
            parts.append(f"{'--':>12s}" if d is None else f"{d:>+12.3f}")
        print(f" {mode:<12s} {' '.join(parts)}")

    # ══════════════════════════════════════════════════════════════════
    # 10. VERDICT
    # ══════════════════════════════════════════════════════════════════
    section("10. VERDICT")

    print(f"\n Dataset: SANDI dev-438, N={len(dfs['silero'])}")

    # Summary table
    print(f"\n ┌────────────┬────────────┬────────────┬──────────────┬──────────────┬──────────────┐")
    print(f" │ {'Mode':<10s} │ {'Spear. ρ':>10s} │ {'Accuracy':>10s} │ {'Macro F1':>12s} │ {'MED→HIGH':>12s} │ {'HIGH→MED':>12s} │")
    print(f" ├────────────┼────────────┼────────────┼──────────────┼──────────────┼──────────────┤")
    for mode in modes:
        m = metrics[mode]
        print(f" │ {mode:<10s} │ {m['rho']:>+10.4f} │ {m['acc']:>10.1%} │ {m['macro_f1']:>12.3f} │ {m['mh_fp']:>12d} │ {m['hm_fn']:>12d} │")
    print(f" └────────────┴────────────┴────────────┴──────────────┴──────────────┴──────────────┘")

    can_compare = 'hybrid_v3' in modes and 'silero' in modes
    if can_compare:
        # MLU safety check
        mlu_v3 = dfs['hybrid_v3']['mlu'].mean()
        mlu_sil = dfs['silero']['mlu'].mean()
        if mlu_v3 > 9.0:
            print(f"\n ⚠ V3 MLU ({mlu_v3:.2f}) exceeds safety threshold — MLU inflation detected")
        else:
            print(f"\n ✅ V3 MLU ({mlu_v3:.2f}) within safe range (Silero: {mlu_sil:.2f})")

        # MED→HIGH check
        mh_v3 = metrics['hybrid_v3']['mh_fp']
        mh_sil = metrics['silero']['mh_fp']
        if mh_v3 > 65:
            print(f" ⚠ V3 MED→HIGH FP ({mh_v3}) exceeds boundary (65)")
        else:
            print(f" ✅ V3 MED→HIGH FP ({mh_v3}) within bounds (Silero: {mh_sil})")
    else:
        print("\n V3-vs-Silero checks skipped: missing results for hybrid_v3 and/or silero")

    # Relaxed banding verdict
    print(f"\n RELAXED BANDING COMPARISON:")
    for mode in modes:
        m_p = metrics[mode]
        m_r = metrics_relaxed[mode]
        delta_acc = (m_r['acc'] - m_p['acc']) * 100
        delta_fp = m_r['mh_fp'] - m_p['mh_fp']
        delta_fn = m_p['hm_fn'] - m_r['hm_fn']
        print(f" {mode:<12s} Δacc={delta_acc:+.1f}pp ΔFP={delta_fp:+d} "
              f"H→M recovered={delta_fn}")

    # Recommendation
    if can_compare:
        rho_v3 = metrics['hybrid_v3']['rho']
        rho_sil = metrics['silero']['rho']
        acc_v3 = metrics['hybrid_v3']['acc']
        acc_sil = metrics['silero']['acc']

        print(f"\n Δρ(V3-Silero) = {rho_v3 - rho_sil:+.4f}")
        print(f" Δacc(V3-Silero) = {(acc_v3 - acc_sil)*100:+.1f}pp")

        if rho_v3 >= rho_sil and acc_v3 >= acc_sil:
            print(f" → V3 IMPROVES on all metrics — candidate for promotion")
        elif rho_v3 >= rho_sil - 0.01:
            print(f" → V3 maintains correlation, review accuracy trade-off")
        else:
            print(f" → V3 shows regression — investigate before promoting")

    print(f"\n Total processing time: {total_time/60:.1f} minutes")
    print(f"\n{'='*80}")
    print(" BENCHMARK COMPLETE")
    print(f"{'='*80}\n")


if __name__ == "__main__":
    main()