"""
Full VAD Comparison: Hybrid Pipeline vs Silero VAD on SANDI dev-438.

Compares 4 VAD modes across 3 fluency bands (LOW/MEDIUM/HIGH):
  1. Band-segregation performance (precision, recall, F1)
  2. Confusion matrix analysis
  3. Threshold crossing analysis
  4. Segment behavior analysis (per-band feature distributions)
  5. Downstream fluency impact (per-dimension, composite)
  6. Bootstrap significance testing
  7. VAD feature deltas vs Silero (per expert band)
  8. Summary verdict

Uses pre-computed CSVs from prior runs — no heavy VAD reprocessing.

Usage:
    python run_full_vad_comparison.py
"""

import json
import sys
import os
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal, ks_2samp

BASE = Path(__file__).parent.parent

# ══════════════════════════════════════════════════════════════════
# CONFIGURATION
# ══════════════════════════════════════════════════════════════════

MODES = ["silero", "hybrid", "marblenet", "fusion"]
BASELINE = "silero"
# Order in which the non-baseline modes are reported in section 7.
DELTA_MODES = ["hybrid", "fusion", "marblenet"]
CSV_PATHS = {m: BASE / f"EDA/data/sandi_438_vad_{m}.csv" for m in MODES}

BANDS = ["LOW", "MEDIUM", "HIGH"]
DIMS = ["dim_continuity", "dim_pause_quality", "dim_articulation",
        "dim_dominance", "dim_placement", "dim_word_precision"]
VAD_FEATURES = ["speech_ratio", "mlu", "mean_pause_dur", "long_pause_ratio",
                "pause_count", "speech_segments", "speech_duration_sec",
                "short_pause_share"]

OUT_PATH = Path("/tmp/vad_comparison_report.json")


# ══════════════════════════════════════════════════════════════════
# SMALL HELPERS
# ══════════════════════════════════════════════════════════════════

def expert_band(score):
    """Map an expert score to "LOW" (<3.0), "MEDIUM" (<4.5) or "HIGH".

    A NaN score fails both comparisons and therefore falls through to "HIGH".
    """
    if score < 3.0:
        return "LOW"
    if score < 4.5:
        return "MEDIUM"
    return "HIGH"


def section(title):
    """Print a banner announcing a report section."""
    print(f"\n{'=' * 80}")
    print(f" {title}")
    print(f"{'=' * 80}")


def significance_stars(p, ns="ns"):
    """Return "***" / "**" / "*" for p < 0.001 / 0.01 / 0.05, else *ns*.

    A NaN p-value fails every comparison and yields *ns*.
    """
    if p < 0.001:
        return "***"
    if p < 0.01:
        return "**"
    if p < 0.05:
        return "*"
    return ns


def percent(count, total):
    """Return ``count / total`` as a percentage, or 0.0 when *total* is 0."""
    return count / total * 100 if total else 0.0


def band_metrics(df, band):
    """Compute one-vs-rest classification metrics for *band*.

    *df* must have ``expert_band`` (reference) and ``fluency_band``
    (pipeline prediction) columns. Returns a dict with integer ``tp``,
    ``fp``, ``fn``, ``support``, ``predicted`` and float ``precision``,
    ``recall``, ``f1``; ratios with an empty denominator are 0.0.
    """
    is_expert = df["expert_band"] == band
    is_pred = df["fluency_band"] == band
    tp = int((is_expert & is_pred).sum())
    fp = int((~is_expert & is_pred).sum())
    fn = int((is_expert & ~is_pred).sum())
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom > 0 else 0.0
    return {
        "tp": tp, "fp": fp, "fn": fn,
        "precision": precision, "recall": recall, "f1": f1,
        "support": int(is_expert.sum()), "predicted": int(is_pred.sum()),
    }


# ══════════════════════════════════════════════════════════════════
# DATA LOADING
# ══════════════════════════════════════════════════════════════════

def load_data():
    """Load one CSV per mode, add ``expert_band`` and check file_id alignment.

    Exits with status 1 if any CSV is missing. A file_id mismatch against the
    baseline only produces a warning.
    """
    dfs = {}
    for mode, path in CSV_PATHS.items():
        if not path.exists():
            print(f"ERROR: Missing {path}")
            sys.exit(1)
        dfs[mode] = pd.read_csv(path)
        print(f" Loaded {mode}: N={len(dfs[mode])}")

    for df in dfs.values():
        df["expert_band"] = df["expert_score"].apply(expert_band)

    base_ids = set(dfs[BASELINE]["file_id"])
    for mode in dfs:
        if mode != BASELINE and set(dfs[mode]["file_id"]) != base_ids:
            print(f"WARNING: file_id mismatch between silero and {mode}")
    return dfs


# ══════════════════════════════════════════════════════════════════
# 1. BAND-SEGREGATION PERFORMANCE
# ══════════════════════════════════════════════════════════════════

def report_band_segregation(dfs):
    """Print and return per-mode accuracy plus per-band precision/recall/F1."""
    section("1. BAND-SEGREGATION PERFORMANCE (Precision / Recall / F1)")
    out = {}
    for mode, df in dfs.items():
        agree = (df["expert_band"] == df["fluency_band"]).sum()
        accuracy = agree / len(df)
        mode_results = {"overall_accuracy": round(accuracy, 4), "bands": {}}

        print(f"\n [{mode}] Overall Accuracy: {agree}/{len(df)} ({accuracy:.1%})")
        print(f" {'Band':<8s} {'Precision':>10s} {'Recall':>10s} {'F1':>10s} "
              f"{'Support':>8s} {'Predicted':>10s}")
        print(f" {'-' * 60}")

        f1_scores = []
        for band in BANDS:
            m = band_metrics(df, band)
            f1_scores.append(m["f1"])
            mode_results["bands"][band] = {
                "precision": round(m["precision"], 4),
                "recall": round(m["recall"], 4),
                "f1": round(m["f1"], 4),
                "support": m["support"],
                "predicted": m["predicted"],
                "tp": m["tp"], "fp": m["fp"], "fn": m["fn"],
            }
            print(f" {band:<8s} {m['precision']:>10.3f} {m['recall']:>10.3f} "
                  f"{m['f1']:>10.3f} "
                  f"{m['support']:>8d} {m['predicted']:>10d}")

        macro_f1 = np.mean(f1_scores)
        mode_results["macro_f1"] = round(macro_f1, 4)
        print(f" {'MACRO':>8s} {'':>10s} {'':>10s} {macro_f1:>10.3f}")
        out[mode] = mode_results
    return out


# ══════════════════════════════════════════════════════════════════
# 2. CONFUSION MATRIX ANALYSIS
# ══════════════════════════════════════════════════════════════════

def report_confusion_matrices(dfs):
    """Print and return per-mode confusion matrices (expert rows, pipeline columns)."""
    section("2. CONFUSION MATRICES")
    out = {}
    for mode, df in dfs.items():
        print(f"\n [{mode}]")
        print(f" {'':>16s} Pipeline → {'LOW':>6s} {'MEDIUM':>8s} {'HIGH':>6s} {'Total':>6s}")
        print(f" {'-' * 55}")

        cm = {}
        for eb in BANDS:
            row = {
                pb: int(((df["expert_band"] == eb) & (df["fluency_band"] == pb)).sum())
                for pb in BANDS
            }
            cm[eb] = row
            total = sum(row.values())
            cells = []
            for pb, n in row.items():
                # Bracket the diagonal (correct classifications).
                marker = f"[{n}]" if pb == eb else f" {n} "
                cells.append(f"{marker:>6s}" if pb != "MEDIUM" else f"{marker:>8s}")
            print(f" Expert {eb:>6s}: {''.join(cells)} {total:>6d}")
        out[mode] = cm

        print(f"\n Error patterns:")
        for eb in BANDS:
            row_total = sum(cm[eb].values())
            for pb in BANDS:
                n = cm[eb][pb]
                if eb != pb and n > 0:
                    pct = n / row_total * 100
                    print(f" {eb} → {pb}: {n} ({pct:.1f}% of {eb} samples)")
    return out


# ══════════════════════════════════════════════════════════════════
# 3. THRESHOLD CROSSING ANALYSIS
# ══════════════════════════════════════════════════════════════════

def report_threshold_crossings(dfs):
    """Print and return how expert-HIGH samples sit against the HIGH thresholds.

    Also reports the MEDIUM↔HIGH misclassifications and the mean MLU / SR / LPR
    of the misclassified samples.
    """
    section("3. THRESHOLD CROSSING ANALYSIS")
    print(" Band thresholds: MLU≥7 + SR≥0.75 + LPR≤0.15 → HIGH")
    print(" MLU<3 + SR<0.55 | MLU<2 | SR<0.35 → LOW")
    print(" Otherwise → MEDIUM\n")

    out = {}
    for mode, df in dfs.items():
        high_expert = df[df["expert_band"] == "HIGH"]
        n_high = len(high_expert)

        # HIGH threshold: MLU≥7, SR≥0.75, LPR≤0.15
        meets_mlu = high_expert["mlu"] >= 7
        meets_sr = high_expert["speech_ratio"] >= 0.75
        meets_lpr = high_expert["long_pause_ratio"] <= 0.15
        n_mlu = int(meets_mlu.sum())
        n_sr = int(meets_sr.sum())
        n_lpr = int(meets_lpr.sum())
        n_all = int((meets_mlu & meets_sr & meets_lpr).sum())

        med_as_high = df[(df["expert_band"] == "MEDIUM") & (df["fluency_band"] == "HIGH")]
        high_as_med = df[(df["expert_band"] == "HIGH") & (df["fluency_band"] == "MEDIUM")]
        n_med_as_high = len(med_as_high)
        n_high_as_med = len(high_as_med)

        mode_tc = {
            "high_expert_meeting_mlu7": n_mlu,
            "high_expert_meeting_sr75": n_sr,
            "high_expert_meeting_lpr15": n_lpr,
            "high_expert_meeting_all_thresholds": n_all,
            "medium_misclassified_as_high": int(n_med_as_high),
            "high_misclassified_as_medium": int(n_high_as_med),
        }

        print(f" [{mode}]")
        print(f" Expert-HIGH (N={n_high}) meeting thresholds:")
        # percent() guards against an empty expert-HIGH group.
        print(f" MLU≥7: {n_mlu}/{n_high} ({percent(n_mlu, n_high):.1f}%)")
        print(f" SR≥0.75: {n_sr}/{n_high} ({percent(n_sr, n_high):.1f}%)")
        print(f" LPR≤0.15: {n_lpr}/{n_high} ({percent(n_lpr, n_high):.1f}%)")
        print(f" ALL 3: {n_all}/{n_high} ({percent(n_all, n_high):.1f}%)")
        print(f" Errors:")
        print(f" MEDIUM→HIGH: {n_med_as_high} false positives")
        print(f" HIGH→MEDIUM: {n_high_as_med} false negatives")

        for label, key, rows in (("MEDIUM→HIGH", "med_as_high", med_as_high),
                                 ("HIGH→MEDIUM", "high_as_med", high_as_med)):
            if len(rows) == 0:
                continue
            mlu = rows["mlu"].mean()
            sr = rows["speech_ratio"].mean()
            lpr = rows["long_pause_ratio"].mean()
            print(f" {label} error features (mean):")
            print(f" MLU={mlu:.2f} SR={sr:.3f} LPR={lpr:.3f}")
            mode_tc[f"{key}_mean_mlu"] = round(mlu, 2)
            mode_tc[f"{key}_mean_sr"] = round(sr, 3)
            mode_tc[f"{key}_mean_lpr"] = round(lpr, 3)

        print()
        out[mode] = mode_tc
    return out


# ══════════════════════════════════════════════════════════════════
# 4. SEGMENT BEHAVIOR ANALYSIS
# ══════════════════════════════════════════════════════════════════

def report_segment_behavior(dfs):
    """Print and return per-band mean/std of each VAD feature with a Kruskal-Wallis test."""
    section("4. SEGMENT BEHAVIOR BY BAND (per-band feature distributions)")
    out = {}
    for mode, df in dfs.items():
        mode_sb = {}
        print(f"\n [{mode}]")
        print(f" {'Feature':<22s}", end="")
        for band in BANDS:
            print(f" {band + ' (mean±std)':>18s}", end="")
        print(f" {'KW-H':>8s} {'p':>10s}")
        print(f" {'-' * 88}")

        for feat in VAD_FEATURES:
            if feat not in df.columns:
                continue
            feat_data = {}
            cells = []
            groups = []
            for band in BANDS:
                sub = df[df["expert_band"] == band][feat].dropna()
                m, s = sub.mean(), sub.std()
                feat_data[band] = {"mean": round(m, 4), "std": round(s, 4), "n": len(sub)}
                cells.append(f"{m:>8.3f} ± {s:>6.3f}")
                # Empty bands are left out of the test rather than failing it.
                if len(sub) > 0:
                    groups.append(sub.values)

            # Kruskal-Wallis across the non-empty bands
            if len(groups) >= 2:
                H, p_kw = kruskal(*groups)
            else:
                H, p_kw = 0, 1.0
            sig = significance_stars(p_kw)
            feat_data["kruskal_H"] = round(H, 2)
            feat_data["kruskal_p"] = round(p_kw, 6)

            print(f" {feat:<22s}", end="")
            for cell in cells:
                print(f" {cell:>18s}", end="")
            print(f" {H:>8.2f} {p_kw:>10.2e} {sig}")
            mode_sb[feat] = feat_data
        out[mode] = mode_sb
    return out


# ══════════════════════════════════════════════════════════════════
# 5. DOWNSTREAM FLUENCY IMPACT
# ══════════════════════════════════════════════════════════════════

def overall_correlation_table(dfs):
    """Print the 5a table and return ``{mode: Spearman rho}``.

    All correlations are computed before anything is printed so that the
    "best" marker is decided against the complete set of modes.
    """
    print("\n 5a. OVERALL CORRELATION WITH EXPERT SCORES\n")
    print(f" {'Mode':<12s} {'Spearman ρ':>12s} {'p':>12s} "
          f"{'Kendall τ':>12s} {'p':>12s}")
    print(f" {'-' * 62}")

    stats = {}
    for mode, df in dfs.items():
        rho, p_rho = spearmanr(df["expert_score"], df["composite_raw"])
        tau, p_tau = kendalltau(df["expert_score"], df["composite_raw"])
        stats[mode] = (rho, p_rho, tau, p_tau)

    rhos = {mode: vals[0] for mode, vals in stats.items()}
    top_rho = max(rhos.values())
    for mode, (rho, p_rho, tau, p_tau) in stats.items():
        best = " ◄" if rho == top_rho else ""
        print(f" {mode:<12s} {rho:>+12.4f} {p_rho:>12.2e} "
              f"{tau:>+12.4f} {p_tau:>12.2e}{best}")
    return rhos


def per_band_correlation_table(dfs):
    """Print the 5b table and return ``{mode: {band: rho or None}}``."""
    print("\n 5b. PER-BAND SPEARMAN CORRELATIONS (composite vs expert)\n")
    print(f" {'Mode':<12s}", end="")
    for band in BANDS:
        print(f" {band:>12s}", end="")
    print()
    print(f" {'-' * 50}")

    out = {}
    for mode, df in dfs.items():
        band_corrs = {}
        print(f" {mode:<12s}", end="")
        for band in BANDS:
            sub = df[df["expert_band"] == band]
            if len(sub) > 5:
                r, p = spearmanr(sub["expert_score"], sub["composite_raw"])
                sig = significance_stars(p, ns="")
                band_corrs[band] = round(r, 4)
                print(f" {r:>+8.4f}{sig:>4s}", end="")
            else:
                band_corrs[band] = None
                # The cut-off above is N > 5, so the skipped case is N ≤ 5.
                print(f" {'N≤5':>12s}", end="")
        print()
        out[mode] = band_corrs
    return out


def per_dimension_correlation_table(dfs):
    """Print the 5c table and return ``{dimension: {mode: rho or None}}``."""
    print("\n 5c. PER-DIMENSION CORRELATIONS WITH EXPERT SCORES\n")
    print(f" {'Dimension':<25s}", end="")
    for mode in dfs:
        print(f" {mode:>12s}", end="")
    print()
    print(f" {'-' * 75}")

    out = {}
    for dim in DIMS:
        dim_corrs = {}
        print(f" {dim:<25s}", end="")
        for mode, df in dfs.items():
            # A missing dimension column is reported like "too few samples".
            if dim in df.columns:
                v = df[["expert_score", dim]].dropna()
            else:
                v = df.iloc[0:0]
            if len(v) > 10:
                r, p = spearmanr(v["expert_score"], v[dim])
                sig = significance_stars(p, ns=" ns")
                dim_corrs[mode] = round(r, 4)
                print(f" {r:>+8.4f}{sig}", end="")
            else:
                dim_corrs[mode] = None
                print(f" {'--':>12s}", end="")
        print()
        out[dim] = dim_corrs
    return out


def band_separation_table(dfs):
    """Print the 5d table and return Cohen's d / Mann-Whitney p per band pair."""
    print("\n 5d. COMPOSITE SCORE SEPARATION (Cohen's d between adjacent bands)\n")
    print(f" {'Mode':<12s} {'LOW→MED d':>12s} {'p':>10s} "
          f"{'MED→HIGH d':>12s} {'p':>10s} {'LOW→HIGH d':>12s}")
    print(f" {'-' * 72}")

    pairs = [("LOW", "MEDIUM"), ("MEDIUM", "HIGH"), ("LOW", "HIGH")]
    out = {}
    for mode, df in dfs.items():
        sep = {}
        cells = []
        for band_lo, band_hi in pairs:
            a = df[df["expert_band"] == band_lo]["composite_raw"]
            b = df[df["expert_band"] == band_hi]["composite_raw"]
            if len(a) > 1 and len(b) > 1:
                pooled = np.sqrt((a.std() ** 2 + b.std() ** 2) / 2)
                d = (b.mean() - a.mean()) / pooled if pooled > 0 else 0
                # One-sided: is the lower band's composite stochastically smaller?
                _, p = mannwhitneyu(a, b, alternative="less")
                sep[f"{band_lo}_to_{band_hi}"] = {"cohens_d": round(d, 3), "p": round(p, 6)}
                cells.append(f"{d:>+12.3f} {p:>10.4f}")
            else:
                cells.append(f"{'--':>12s} {'--':>10s}")
        # The LOW→HIGH column shows only the effect size, not the p-value.
        low_high_d = cells[2].split()[0]
        print(f" {mode:<12s} {' '.join(cells[:2])} {low_high_d:>12s}")
        out[mode] = sep
    return out


def report_fluency_impact(dfs):
    """Run sub-sections 5a–5d; return ``(results_dict, rhos)``."""
    section("5. DOWNSTREAM FLUENCY IMPACT")
    impact = {}
    rhos = overall_correlation_table(dfs)
    impact["overall_correlations"] = {m: {"spearman": round(rhos[m], 4)} for m in rhos}
    impact["per_band_correlations"] = per_band_correlation_table(dfs)
    impact["per_dimension"] = per_dimension_correlation_table(dfs)
    impact["band_separation"] = band_separation_table(dfs)
    return impact, rhos


# ══════════════════════════════════════════════════════════════════
# 6. BOOTSTRAP SIGNIFICANCE TESTING
# ══════════════════════════════════════════════════════════════════

def report_bootstrap(dfs, n_boot=10000, seed=42):
    """Paired bootstrap of Spearman rho versus the baseline, overall and per band.

    All modes are aligned on the file_ids they share. The observed delta and
    the bootstrap deltas are both computed on that aligned data. The p-value
    is the fraction of resamples where the mode does not beat the baseline.
    """
    section(f"6. BOOTSTRAP SIGNIFICANCE TESTING ({n_boot:,} iterations)")
    modes = list(dfs)
    challengers = [m for m in modes if m != BASELINE]

    # Align all modes by file_id
    common_ids = set(dfs[BASELINE]["file_id"])
    for mode in challengers:
        common_ids &= set(dfs[mode]["file_id"])
    common_ids = sorted(common_ids)
    n_common = len(common_ids)

    base_aligned = dfs[BASELINE].set_index("file_id").loc[common_ids]
    expert_arr = base_aligned["expert_score"].values
    band_labels = base_aligned["expert_band"].values
    comp_arr = {
        mode: dfs[mode].set_index("file_id").loc[common_ids]["composite_raw"].values
        for mode in modes
    }

    rng = np.random.default_rng(seed)
    boot_rho = {m: np.zeros(n_boot) for m in modes}
    for i in range(n_boot):
        idx = rng.choice(n_common, n_common, replace=True)
        for mode in modes:
            r, _ = spearmanr(expert_arr[idx], comp_arr[mode][idx])
            boot_rho[mode][i] = r
    # Every mode saw the same resamples, so the deltas are paired.
    boot_delta = {m: boot_rho[m] - boot_rho[BASELINE] for m in challengers}

    obs_rho = {}
    for mode in modes:
        obs_rho[mode], _ = spearmanr(expert_arr, comp_arr[mode])

    # 6a. Overall bootstrap
    print(f"\n 6a. OVERALL (N={n_common})\n")
    print(f" {'Mode':<12s} {'ρ obs':>10s} {'95% CI':>22s} "
          f"{'Δρ vs silero':>12s} {'p':>10s} {'Sig':>5s}")
    print(f" {'-' * 76}")

    out = {"overall": {}}
    for mode in modes:
        r_obs = obs_rho[mode]
        ci = np.percentile(boot_rho[mode], [2.5, 97.5])
        entry = {
            "rho": round(r_obs, 4),
            "ci_lo": round(ci[0], 4),
            "ci_hi": round(ci[1], 4),
        }
        if mode == BASELINE:
            print(f" {mode:<12s} {r_obs:>+10.4f} [{ci[0]:.4f}, {ci[1]:.4f}] "
                  f"{'baseline':>12s} {'--':>10s} {'--':>5s}")
        else:
            delta = r_obs - obs_rho[BASELINE]
            p_val = (boot_delta[mode] <= 0).mean()
            sig = significance_stars(p_val)
            print(f" {mode:<12s} {r_obs:>+10.4f} [{ci[0]:.4f}, {ci[1]:.4f}] "
                  f"{delta:>+12.4f} {p_val:>10.4f} {sig:>5s}")
            entry["delta_vs_silero"] = round(delta, 4)
            entry["p_vs_silero"] = round(p_val, 4)
        out["overall"][mode] = entry

    # 6b. Per-band bootstrap
    print(f"\n 6b. PER-BAND BOOTSTRAP\n")
    out["per_band"] = {}
    n_band_boot = min(n_boot, 5000)

    for band in BANDS:
        band_idx = np.where(band_labels == band)[0]
        n_band = len(band_idx)
        if n_band < 10:
            print(f" [{band}] N={n_band} — too few for bootstrap\n")
            continue

        print(f" [{band}] N={n_band}")
        print(f" {'Mode':<12s} {'ρ obs':>10s} {'95% CI':>22s} "
              f"{'Δρ':>8s} {'p':>8s}")
        print(f" {'-' * 65}")

        band_expert = expert_arr[band_idx]
        band_base = comp_arr[BASELINE][band_idx]
        r_base, _ = spearmanr(band_expert, band_base)
        band_results = {}

        for mode in modes:
            band_comp = comp_arr[mode][band_idx]
            r_obs, _ = spearmanr(band_expert, band_comp)

            # Band-level bootstrap for the confidence interval
            band_boot = np.zeros(n_band_boot)
            for i in range(n_band_boot):
                bi = rng.choice(n_band, n_band, replace=True)
                br, _ = spearmanr(band_expert[bi], band_comp[bi])
                band_boot[i] = br
            ci = np.percentile(band_boot, [2.5, 97.5])
            band_results[mode] = {"rho": round(r_obs, 4),
                                  "ci_lo": round(ci[0], 4),
                                  "ci_hi": round(ci[1], 4)}

            if mode == BASELINE:
                print(f" {mode:<12s} {r_obs:>+10.4f} [{ci[0]:.4f}, {ci[1]:.4f}] "
                      f"{'base':>8s} {'--':>8s}")
                continue

            delta = r_obs - r_base
            # Separate paired bootstrap for the p-value of this band
            d_boot = np.zeros(n_band_boot)
            for i in range(n_band_boot):
                bi = rng.choice(n_band, n_band, replace=True)
                rs, _ = spearmanr(band_expert[bi], band_base[bi])
                rm, _ = spearmanr(band_expert[bi], band_comp[bi])
                d_boot[i] = rm - rs
            p_val = (d_boot <= 0).mean()
            band_results[mode]["delta"] = round(delta, 4)
            band_results[mode]["p"] = round(p_val, 4)
            print(f" {mode:<12s} {r_obs:>+10.4f} [{ci[0]:.4f}, {ci[1]:.4f}] "
                  f"{delta:>+8.4f} {p_val:>8.4f}")

        print()
        out["per_band"][band] = band_results
    return out


# ══════════════════════════════════════════════════════════════════
# 7. FEATURE DISTRIBUTION SHIFT ACROSS MODES (per band)
# ══════════════════════════════════════════════════════════════════

def report_feature_deltas(dfs):
    """Print and return per-band VAD feature deltas of each mode versus the baseline.

    Rows are paired on file_id. Baseline rows whose file_id is missing from
    the other mode are dropped rather than raising.
    """
    section("7. VAD FEATURE DELTAS: HYBRID/FUSION vs SILERO (per expert band)")
    out = {}
    base = dfs[BASELINE]

    for other_mode in DELTA_MODES:
        print(f"\n [{other_mode} - silero]\n")
        other = dfs[other_mode].set_index("file_id")
        paired = base[base["file_id"].isin(other.index)]

        merged = paired[["file_id", "expert_band"]].copy()
        for feat in VAD_FEATURES:
            if feat in paired.columns and feat in other.columns:
                merged[f"{feat}_silero"] = paired[feat].values
                merged[f"{feat}_other"] = other.loc[merged["file_id"], feat].values

        print(f" {'Feature':<22s} {'Band':<8s} {'Δ mean':>10s} "
              f"{'Δ median':>10s} {'KS stat':>10s} {'KS p':>10s}")
        print(f" {'-' * 76}")

        mode_deltas = {}
        for feat in VAD_FEATURES:
            sc = f"{feat}_silero"
            oc = f"{feat}_other"
            if sc not in merged.columns:
                continue
            feat_deltas = {}
            for band in BANDS:
                sub = merged[merged["expert_band"] == band]
                if len(sub) < 3:
                    continue
                delta = sub[oc] - sub[sc]
                ks_stat, ks_p = ks_2samp(sub[sc].values, sub[oc].values)
                sig = significance_stars(ks_p, ns="")
                feat_deltas[band] = {
                    "delta_mean": round(delta.mean(), 4),
                    "delta_median": round(delta.median(), 4),
                    "ks_stat": round(ks_stat, 4),
                    "ks_p": round(ks_p, 6),
                }
                print(f" {feat:<22s} {band:<8s} {delta.mean():>+10.4f} "
                      f"{delta.median():>+10.4f} {ks_stat:>10.4f} {ks_p:>10.4f} {sig}")
            mode_deltas[feat] = feat_deltas
        out[f"{other_mode}_vs_silero"] = mode_deltas
    return out


# ══════════════════════════════════════════════════════════════════
# 8. SUMMARY VERDICT
# ══════════════════════════════════════════════════════════════════

def report_verdict(results, rhos, n_total, band_dist):
    """Print the summary table, key findings and recommendations; return the verdict.

    *results* must already hold the "band_segregation", "confusion_matrices"
    and "bootstrap" entries. *rhos* maps mode to the overall Spearman rho
    from sub-section 5a.
    """
    section("8. SUMMARY VERDICT")
    seg = results["band_segregation"]
    boot_overall = results["bootstrap"]["overall"]
    modes = list(rhos)

    print(f"\n Dataset: SANDI dev-438 (N={n_total})")
    print(f" Expert bands: LOW={band_dist.get('LOW',0)}, "
          f"MEDIUM={band_dist.get('MEDIUM',0)}, HIGH={band_dist.get('HIGH',0)}")

    print(f"\n ┌────────────┬────────────┬────────────┬──────────────────┐")
    print(f" │ {'Mode':<10s} │ {'Spear. ρ':>10s} │ {'Accuracy':>10s} │ {'Macro F1':>16s} │")
    print(f" ├────────────┼────────────┼────────────┼──────────────────┤")

    top_rho = max(rhos.values())
    top_acc = max(seg[m]["overall_accuracy"] for m in modes)
    top_f1 = max(seg[m]["macro_f1"] for m in modes)
    for mode in modes:
        acc = seg[mode]["overall_accuracy"]
        mf1 = seg[mode]["macro_f1"]
        rho = rhos[mode]
        best_markers = []
        if rho == top_rho:
            best_markers.append("ρ")
        if acc == top_acc:
            best_markers.append("acc")
        if mf1 == top_f1:
            best_markers.append("F1")
        marker = " ◄ " + ",".join(best_markers) if best_markers else ""
        print(f" │ {mode:<10s} │ {rho:>+10.4f} │ {acc:>10.1%} │ {mf1:>10.3f}{marker:>6s} │")

    print(f" └────────────┴────────────┴────────────┴──────────────────┘")

    # Determine winner
    best_accuracy_mode = max(modes, key=lambda m: seg[m]["overall_accuracy"])
    best_rho_mode = max(modes, key=lambda m: rhos[m])
    best_f1_mode = max(modes, key=lambda m: seg[m]["macro_f1"])

    print(f"\n Best by accuracy: {best_accuracy_mode} "
          f"({seg[best_accuracy_mode]['overall_accuracy']:.1%})")
    print(f" Best by Spearman: {best_rho_mode} ({rhos[best_rho_mode]:.4f})")
    print(f" Best by macro-F1: {best_f1_mode} "
          f"({seg[best_f1_mode]['macro_f1']:.3f})")

    # Key findings
    print(f"\n KEY FINDINGS:")
    for mode in modes:
        if mode == BASELINE:
            continue
        bsr = boot_overall.get(mode, {})
        delta = bsr.get("delta_vs_silero", 0)
        p = bsr.get("p_vs_silero", 1)
        sig = "SIGNIFICANT" if p < 0.05 else "NOT significant"
        direction = "better" if delta > 0 else "worse"
        print(f" {mode} vs silero: Δρ={delta:+.4f} ({direction}, {sig}, p={p:.4f})")

    # Recommendations
    print(f"\n RECOMMENDATIONS:")
    beats_baseline = rhos[best_rho_mode] > rhos[BASELINE]
    best_p = boot_overall.get(best_rho_mode, {}).get("p_vs_silero", 1)
    if beats_baseline and best_p < 0.05:
        print(f" → ADOPT {best_rho_mode}: statistically significant improvement in Spearman ρ")
    elif beats_baseline:
        print(f" → {best_rho_mode} shows promise but is NOT significant — needs larger corpus")
        print(f" → KEEP Silero baseline for now")
    else:
        print(f" → KEEP Silero baseline — no alternative improves correlation")

    if best_accuracy_mode != BASELINE:
        acc_sil = seg[BASELINE]["overall_accuracy"]
        acc_best = seg[best_accuracy_mode]["overall_accuracy"]
        if acc_best > acc_sil:
            print(f" → For band classification, consider {best_accuracy_mode} "
                  f"(+{(acc_best - acc_sil)*100:.1f}pp accuracy)")

    # Hybrid-specific diagnosis
    hybrid_high_fp = results["confusion_matrices"]["hybrid"]["MEDIUM"]["HIGH"]
    silero_high_fp = results["confusion_matrices"]["silero"]["MEDIUM"]["HIGH"]
    if hybrid_high_fp > silero_high_fp * 1.5:
        print(f" ⚠ Hybrid over-classifies MEDIUM as HIGH ({hybrid_high_fp} vs {silero_high_fp} for silero)")
        print(f" → Hybrid's MarbleNet refinement merges too many segments, inflating MLU")

    return {
        "best_by_accuracy": best_accuracy_mode,
        "best_by_spearman": best_rho_mode,
        "best_by_macro_f1": best_f1_mode,
        "baseline": "silero",
    }


# ══════════════════════════════════════════════════════════════════
# ENTRY POINT
# ══════════════════════════════════════════════════════════════════

def main():
    """Run every report section in order and save the machine-readable results."""
    print("=" * 80)
    print(" HYBRID VAD vs SILERO VAD — FULL COMPARISON ON SANDI DEV-438")
    print("=" * 80)

    dfs = load_data()

    n_total = len(dfs[BASELINE])
    print(f"\n Total samples: {n_total}")
    band_dist = dfs[BASELINE]["expert_band"].value_counts()
    print(f" Expert band distribution: LOW={band_dist.get('LOW',0)}, "
          f"MEDIUM={band_dist.get('MEDIUM',0)}, HIGH={band_dist.get('HIGH',0)}")

    results = {}  # machine-readable output
    results["band_segregation"] = report_band_segregation(dfs)
    results["confusion_matrices"] = report_confusion_matrices(dfs)
    results["threshold_crossings"] = report_threshold_crossings(dfs)
    results["segment_behavior"] = report_segment_behavior(dfs)
    results["fluency_impact"], rhos = report_fluency_impact(dfs)
    results["bootstrap"] = report_bootstrap(dfs)
    results["feature_deltas"] = report_feature_deltas(dfs)
    results["verdict"] = report_verdict(results, rhos, n_total, band_dist)

    # ── Save results ──
    # default=str stringifies any value json cannot encode natively.
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\n Full results saved to: {OUT_PATH}")

    print(f"\n{'=' * 80}")
    print(" COMPARISON COMPLETE")
    print(f"{'=' * 80}\n")


if __name__ == "__main__":
    main()