"""
Full VAD Comparison: Hybrid Pipeline vs Silero VAD on SANDI dev-438.

Compares 4 VAD modes across 3 fluency bands (LOW/MEDIUM/HIGH):
  1. Band-segregation performance (precision, recall, F1)
  2. Confusion matrix analysis
  3. Threshold crossing analysis
  4. Segment behavior analysis (per-band feature distributions)
  5. Downstream fluency impact (per-dimension, composite)
  6. Bootstrap significance testing
  7. VAD feature deltas vs Silero (per expert band)
  8. Summary verdict

Uses pre-computed CSVs from prior runs — no heavy VAD reprocessing.

Usage:
    python run_full_vad_comparison.py
"""

import json
import sys
import os
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal, ks_2samp

# Parent of the directory that contains this script; CSV paths are resolved
# relative to it.
BASE = Path(__file__).parent.parent

# ══════════════════════════════════════════════════════════════════
# DATA LOADING
# ══════════════════════════════════════════════════════════════════

MODES = ["silero", "hybrid", "marblenet", "fusion"]
CSV_PATHS = {m: BASE / f"EDA/data/sandi_438_vad_{m}.csv" for m in MODES}

print("=" * 80)
print("  HYBRID VAD vs SILERO VAD — FULL COMPARISON ON SANDI DEV-438")
print("=" * 80)

# One DataFrame per VAD mode, keyed by mode name. The script stops at the
# first CSV that is not on disk.
dfs = {}
for mode in MODES:
    csv_file = CSV_PATHS[mode]
    if csv_file.exists():
        dfs[mode] = pd.read_csv(csv_file)
        print(f"  Loaded {mode}: N={len(dfs[mode])}")
        continue
    print(f"ERROR: Missing {csv_file}")
    sys.exit(1)


def expert_band(score):
    """Map an expert fluency score to its band label.

    Args:
        score: Numeric expert score.

    Returns:
        "LOW" for scores below 3.0, "MEDIUM" for scores in [3.0, 4.5),
        "HIGH" for scores of 4.5 and above, or None when the score is NaN.
    """
    # NaN fails every ordered comparison, so without this guard a missing
    # score would fall through both tests below and be labelled "HIGH".
    # NaN is the only value that is not equal to itself.
    if score != score:
        return None
    if score < 3.0:
        return "LOW"
    if score < 4.5:
        return "MEDIUM"
    return "HIGH"


# Label every row of every mode's table with the band derived from its
# expert score.
for mode in MODES:
    mode_scores = dfs[mode]["expert_score"]
    dfs[mode]["expert_band"] = mode_scores.apply(expert_band)

# Each non-baseline mode should cover exactly the same file_ids as silero;
# a difference is reported but does not stop the run.
base_ids = set(dfs["silero"]["file_id"])
for mode in MODES[1:]:
    if base_ids != set(dfs[mode]["file_id"]):
        print(f"WARNING: file_id mismatch between silero and {mode}")

# Sample count and expert band distribution are taken from the silero table.
N = len(dfs["silero"])
print(f"\n  Total samples: {N}")
band_dist = dfs["silero"]["expert_band"].value_counts()
print(
    f"  Expert band distribution: LOW={band_dist.get('LOW',0)}, "
    f"MEDIUM={band_dist.get('MEDIUM',0)}, HIGH={band_dist.get('HIGH',0)}"
)

# Band labels in the order used for every table in the report.
BANDS = ["LOW", "MEDIUM", "HIGH"]

# Column names of the per-dimension scores correlated in section 5c.
DIMS = [
    "dim_continuity",
    "dim_pause_quality",
    "dim_articulation",
    "dim_dominance",
    "dim_placement",
    "dim_word_precision",
]

# Column names of the VAD features analysed in sections 4 and 7.
VAD_FEATURES = [
    "speech_ratio",
    "mlu",
    "mean_pause_dur",
    "long_pause_ratio",
    "pause_count",
    "speech_segments",
    "speech_duration_sec",
    "short_pause_share",
]

# Machine-readable output; each section adds its own key and the whole dict
# is written to JSON at the end of the script.
results = {}


def section(title):
    """Print a section heading: a blank line, then *title* between two rules.

    Args:
        title: Heading text; it is printed indented by two spaces.
    """
    rule = "=" * 80
    print("", rule, f"  {title}", rule, sep="\n")


# ══════════════════════════════════════════════════════════════════
# 1. BAND-SEGREGATION PERFORMANCE
# ══════════════════════════════════════════════════════════════════

section("1. BAND-SEGREGATION PERFORMANCE (Precision / Recall / F1)")

results["band_segregation"] = {}

# For every mode, compare the pipeline's band ("fluency_band") with the
# expert band, treating each band in turn as the positive class.
for mode in MODES:
    df = dfs[mode]
    truth = df["expert_band"]
    pred = df["fluency_band"]
    n_rows = len(df)

    agree = (truth == pred).sum()
    accuracy = agree / n_rows
    mode_results = {"overall_accuracy": round(accuracy, 4), "bands": {}}

    print(f"\n  [{mode}] Overall Accuracy: {agree}/{n_rows} ({accuracy:.1%})")
    print(f"  {'Band':<8s}  {'Precision':>10s}  {'Recall':>10s}  {'F1':>10s}  "
          f"{'Support':>8s}  {'Predicted':>10s}")
    print(f"  {'-' * 60}")

    f1_scores = []
    for band in BANDS:
        is_truth = truth == band
        is_pred = pred == band

        tp = (is_truth & is_pred).sum()
        fp = (~is_truth & is_pred).sum()
        fn = (is_truth & ~is_pred).sum()
        support = is_truth.sum()      # equals tp + fn
        predicted = is_pred.sum()     # equals tp + fp

        # A band that is never predicted (or never present) scores 0.0
        # instead of dividing by zero.
        precision = tp / predicted if predicted > 0 else 0.0
        recall = tp / support if support > 0 else 0.0
        pr_sum = precision + recall
        f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

        f1_scores.append(f1)
        mode_results["bands"][band] = {
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1": round(f1, 4),
            "support": int(support),
            "predicted": int(predicted),
            "tp": int(tp), "fp": int(fp), "fn": int(fn),
        }

        print(f"  {band:<8s}  {precision:>10.3f}  {recall:>10.3f}  {f1:>10.3f}  "
              f"{support:>8d}  {predicted:>10d}")

    # Unweighted mean of the per-band F1 scores.
    macro_f1 = np.mean(f1_scores)
    mode_results["macro_f1"] = round(macro_f1, 4)
    print(f"  {'MACRO':>8s}  {'':>10s}  {'':>10s}  {macro_f1:>10.3f}")

    results["band_segregation"][mode] = mode_results


# ══════════════════════════════════════════════════════════════════
# 2. CONFUSION MATRIX ANALYSIS
# ══════════════════════════════════════════════════════════════════

section("2. CONFUSION MATRICES")

results["confusion_matrices"] = {}

# Rows are expert bands, columns are pipeline bands; the diagonal cell of
# each printed row is wrapped in brackets.
for mode in MODES:
    df = dfs[mode]
    expert_col = df["expert_band"]
    pipeline_col = df["fluency_band"]

    print(f"\n  [{mode}]")
    print(f"  {'':>16s} Pipeline →  {'LOW':>6s} {'MEDIUM':>8s} {'HIGH':>6s}  {'Total':>6s}")
    print(f"  {'-' * 55}")

    cm = {}
    for eb in BANDS:
        in_row = expert_col == eb
        cm[eb] = {
            col_band: int((in_row & (pipeline_col == col_band)).sum())
            for col_band in BANDS
        }

        cells = []
        for pb, count in cm[eb].items():
            text = f"[{count}]" if pb == eb else f" {count} "
            # The MEDIUM column is two characters wider than the others.
            width = 8 if pb == "MEDIUM" else 6
            cells.append(f"{text:>{width}s}")
        row_total = sum(cm[eb].values())
        print(f"  Expert {eb:>6s}:  {''.join(cells)}  {row_total:>6d}")

    results["confusion_matrices"][mode] = cm

    # List every non-empty off-diagonal cell as a share of its expert row.
    print(f"\n  Error patterns:")
    for eb in BANDS:
        row_total = sum(cm[eb].values())
        for pb in BANDS:
            n_err = cm[eb][pb]
            if pb == eb or n_err <= 0:
                continue
            pct = n_err / row_total * 100
            print(f"    {eb} → {pb}: {n_err} ({pct:.1f}% of {eb} samples)")


# ══════════════════════════════════════════════════════════════════
# 3. THRESHOLD CROSSING ANALYSIS
# ══════════════════════════════════════════════════════════════════

section("3. THRESHOLD CROSSING ANALYSIS")
print("  Band thresholds: MLU≥7 + SR≥0.75 + LPR≤0.15 → HIGH")
print("                   MLU<3 + SR<0.55 | MLU<2 | SR<0.35 → LOW")
print("                   Otherwise → MEDIUM\n")

results["threshold_crossings"] = {}


def _share_text(count, total):
    """Render ``count/total (pct%)``; the percentage is 0.0 when total is 0."""
    pct = count / total * 100 if total else 0.0
    return f"{count}/{total} ({pct:.1f}%)"


for mode in MODES:
    df = dfs[mode]

    # How many expert-HIGH samples satisfy each HIGH threshold
    # (MLU≥7, SR≥0.75, LPR≤0.15), individually and all together.
    expert_is_high = df["expert_band"] == "HIGH"
    high_expert = df[expert_is_high]
    n_high = len(high_expert)

    meets_mlu = high_expert["mlu"] >= 7
    meets_sr = high_expert["speech_ratio"] >= 0.75
    meets_lpr = high_expert["long_pause_ratio"] <= 0.15

    high_meeting_mlu = int(meets_mlu.sum())
    high_meeting_sr = int(meets_sr.sum())
    high_meeting_lpr = int(meets_lpr.sum())
    high_meeting_all = int((meets_mlu & meets_sr & meets_lpr).sum())

    # Errors across the HIGH/MEDIUM boundary, in both directions.
    med_as_high = df[(df["expert_band"] == "MEDIUM") & (df["fluency_band"] == "HIGH")]
    high_as_med = df[expert_is_high & (df["fluency_band"] == "MEDIUM")]
    n_med_as_high = len(med_as_high)
    n_high_as_med = len(high_as_med)

    mode_tc = {
        "high_expert_meeting_mlu7": high_meeting_mlu,
        "high_expert_meeting_sr75": high_meeting_sr,
        "high_expert_meeting_lpr15": high_meeting_lpr,
        "high_expert_meeting_all_thresholds": high_meeting_all,
        "medium_misclassified_as_high": n_med_as_high,
        "high_misclassified_as_medium": n_high_as_med,
    }

    print(f"  [{mode}]")
    print(f"    Expert-HIGH (N={n_high}) meeting thresholds:")
    print(f"      MLU≥7:  {_share_text(high_meeting_mlu, n_high)}")
    print(f"      SR≥0.75: {_share_text(high_meeting_sr, n_high)}")
    print(f"      LPR≤0.15: {_share_text(high_meeting_lpr, n_high)}")
    print(f"      ALL 3:  {_share_text(high_meeting_all, n_high)}")
    print(f"    Errors:")
    print(f"      MEDIUM→HIGH: {n_med_as_high} false positives")
    print(f"      HIGH→MEDIUM: {n_high_as_med} false negatives")

    # Mean threshold features of each error group; empty groups are skipped.
    error_groups = (
        ("MEDIUM→HIGH", "med_as_high", med_as_high),
        ("HIGH→MEDIUM", "high_as_med", high_as_med),
    )
    for label, key, errors in error_groups:
        if len(errors) == 0:
            continue
        mean_mlu = errors["mlu"].mean()
        mean_sr = errors["speech_ratio"].mean()
        mean_lpr = errors["long_pause_ratio"].mean()
        print(f"    {label} error features (mean):")
        print(f"      MLU={mean_mlu:.2f}  SR={mean_sr:.3f}  LPR={mean_lpr:.3f}")
        mode_tc[f"{key}_mean_mlu"] = round(mean_mlu, 2)
        mode_tc[f"{key}_mean_sr"] = round(mean_sr, 3)
        mode_tc[f"{key}_mean_lpr"] = round(mean_lpr, 3)

    print()
    results["threshold_crossings"][mode] = mode_tc


# ══════════════════════════════════════════════════════════════════
# 4. SEGMENT BEHAVIOR ANALYSIS
# ══════════════════════════════════════════════════════════════════

section("4. SEGMENT BEHAVIOR BY BAND (per-band feature distributions)")

results["segment_behavior"] = {}

# For every mode and VAD feature: mean/std/n per expert band, plus a
# Kruskal-Wallis test across the bands that have data.
for mode in MODES:
    df = dfs[mode]
    mode_sb = {}
    print(f"\n  [{mode}]")
    band_headers = "".join(f"  {band + ' (mean±std)':>18s}" for band in BANDS)
    print(f"  {'Feature':<22s}{band_headers}  {'KW-H':>8s}  {'p':>10s}")
    print(f"  {'-' * 88}")

    for feat in VAD_FEATURES:
        if feat not in df.columns:
            continue
        feat_data = {}
        cells = []
        groups = []
        for band in BANDS:
            sub = df[df["expert_band"] == band][feat].dropna()
            m, s = sub.mean(), sub.std()
            feat_data[band] = {"mean": round(m, 4), "std": round(s, 4), "n": len(sub)}
            cells.append(f"{m:>8.3f} ± {s:>6.3f}")
            # Only non-empty bands take part in the test.
            if len(sub) > 0:
                groups.append(sub.values)

        # Kruskal-Wallis across bands. Fewer than two groups, or input the
        # test rejects with ValueError (presumably degenerate data such as
        # all values identical), falls back to H=0, p=1 so one feature
        # cannot abort the whole report.
        if len(groups) >= 2:
            try:
                H, p_kw = kruskal(*groups)
            except ValueError:
                H, p_kw = 0, 1.0
        else:
            H, p_kw = 0, 1.0

        sig = "***" if p_kw < 0.001 else "**" if p_kw < 0.01 else "*" if p_kw < 0.05 else "ns"
        feat_data["kruskal_H"] = round(H, 2)
        feat_data["kruskal_p"] = round(p_kw, 6)

        band_cells = "".join(f"  {cell:>18s}" for cell in cells)
        print(f"  {feat:<22s}{band_cells}  {H:>8.2f}  {p_kw:>10.2e} {sig}")

        mode_sb[feat] = feat_data

    results["segment_behavior"][mode] = mode_sb


# ══════════════════════════════════════════════════════════════════
# 5. DOWNSTREAM FLUENCY IMPACT
# ══════════════════════════════════════════════════════════════════

section("5. DOWNSTREAM FLUENCY IMPACT")

results["fluency_impact"] = {}

# 5a. Overall Spearman correlation
print("\n  5a. OVERALL CORRELATION WITH EXPERT SCORES\n")
print(f"  {'Mode':<12s}  {'Spearman ρ':>12s}  {'p':>12s}  "
      f"{'Kendall τ':>12s}  {'p':>12s}")
print(f"  {'-' * 62}")

# Compute every mode's correlations first. The "best" marker can only be
# decided once all rhos are known; deciding it while the dict is still being
# filled would always mark the first mode.
rhos = {}
overall_rows = []
for mode in MODES:
    df = dfs[mode]
    rho, p_rho = spearmanr(df["expert_score"], df["composite_raw"])
    tau, p_tau = kendalltau(df["expert_score"], df["composite_raw"])
    rhos[mode] = rho
    overall_rows.append((mode, rho, p_rho, tau, p_tau))

top_rho = max(rhos.values())
for mode, rho, p_rho, tau, p_tau in overall_rows:
    best = " ◄" if rho == top_rho else ""
    print(f"  {mode:<12s}  {rho:>+12.4f}  {p_rho:>12.2e}  "
          f"{tau:>+12.4f}  {p_tau:>12.2e}{best}")

results["fluency_impact"]["overall_correlations"] = {
    m: {"spearman": round(rhos[m], 4)} for m in MODES
}

# 5b. Per-band correlations
print("\n  5b. PER-BAND SPEARMAN CORRELATIONS (composite vs expert)\n")
print(f"  {'Mode':<12s}" + "".join(f"  {band:>12s}" for band in BANDS))
print(f"  {'-' * 50}")

# Spearman rho of composite vs expert score inside each expert band.
# Bands with 5 or fewer rows are not correlated and are stored as None.
results["fluency_impact"]["per_band_correlations"] = {}
for mode in MODES:
    df = dfs[mode]
    band_corrs = {}
    cells = []
    for band in BANDS:
        sub = df[df["expert_band"] == band]
        if len(sub) > 5:
            r, p = spearmanr(sub["expert_score"], sub["composite_raw"])
            sig = "***" if p < 0.001 else "**" if p < 0.01 else "*" if p < 0.05 else ""
            band_corrs[band] = round(r, 4)
            cells.append(f"  {r:>+8.4f}{sig:>4s}")
        else:
            band_corrs[band] = None
            # The guard above skips N == 5 as well, so the placeholder says
            # "≤" rather than "<".
            cells.append(f"  {'N≤5':>12s}")
    print(f"  {mode:<12s}" + "".join(cells))
    results["fluency_impact"]["per_band_correlations"][mode] = band_corrs

# 5c. Per-dimension correlations
print("\n  5c. PER-DIMENSION CORRELATIONS WITH EXPERT SCORES\n")
print(f"  {'Dimension':<25s}" + "".join(f"  {mode:>12s}" for mode in MODES))
print(f"  {'-' * 75}")

# One row per dimension column, one cell per mode. A mode with 10 or fewer
# complete (expert_score, dimension) pairs gets a "--" cell and a None entry.
results["fluency_impact"]["per_dimension"] = {}
for d in DIMS:
    dim_corrs = {}
    row_text = f"  {d:<25s}"
    for mode in MODES:
        v = dfs[mode][["expert_score", d]].dropna()
        if len(v) <= 10:
            dim_corrs[mode] = None
            row_text += f"  {'--':>12s}"
            continue

        r, p = spearmanr(v["expert_score"], v[d])
        if p < 0.001:
            sig = "***"
        elif p < 0.01:
            sig = "**"
        elif p < 0.05:
            sig = "*"
        else:
            sig = " ns"
        dim_corrs[mode] = round(r, 4)
        row_text += f"  {r:>+8.4f}{sig}"
    print(row_text)
    results["fluency_impact"]["per_dimension"][d] = dim_corrs

# 5d. Composite score separation (Cohen's d between adjacent expert bands)
print("\n  5d. COMPOSITE SCORE SEPARATION (Cohen's d between adjacent bands)\n")
print(f"  {'Mode':<12s}  {'LOW→MED d':>12s}  {'p':>10s}  "
      f"{'MED→HIGH d':>12s}  {'p':>10s}  {'LOW→HIGH d':>12s}")
print(f"  {'-' * 72}")

# For each band pair: effect size of the composite score (higher band minus
# lower band, divided by the root of the averaged variances) and a one-sided
# Mann-Whitney U test that the lower band's scores are smaller.
# The LOW→HIGH column prints only d; its p-value is still stored in results.
results["fluency_impact"]["band_separation"] = {}
for mode in MODES:
    df = dfs[mode]
    sep = {}
    cells = []  # one (d_text, p_text) tuple per band pair
    for band_lo, band_hi in (("LOW", "MEDIUM"), ("MEDIUM", "HIGH"), ("LOW", "HIGH")):
        a = df.loc[df["expert_band"] == band_lo, "composite_raw"]
        b = df.loc[df["expert_band"] == band_hi, "composite_raw"]
        if len(a) <= 1 or len(b) <= 1:
            cells.append((f"{'--':>12s}", f"{'--':>10s}"))
            continue

        pooled = np.sqrt((a.std() ** 2 + b.std() ** 2) / 2)
        d = (b.mean() - a.mean()) / pooled if pooled > 0 else 0
        _, p = mannwhitneyu(a, b, alternative="less")
        sep[f"{band_lo}_to_{band_hi}"] = {"cohens_d": round(d, 3), "p": round(p, 6)}
        cells.append((f"{d:>+12.3f}", f"{p:>10.4f}"))

    adjacent = "  ".join(f"{d_text}  {p_text}" for d_text, p_text in cells[:2])
    print(f"  {mode:<12s}  {adjacent}  {cells[2][0].strip():>12s}")
    results["fluency_impact"]["band_separation"][mode] = sep


# ══════════════════════════════════════════════════════════════════
# 6. BOOTSTRAP SIGNIFICANCE TESTING
# ══════════════════════════════════════════════════════════════════

section("6. BOOTSTRAP SIGNIFICANCE TESTING (10,000 iterations)")

# Align all modes by file_id: keep only ids present in every mode, in sorted
# order, so position i refers to the same file in every array below.
common_ids = set(dfs["silero"]["file_id"])
for mode in MODES[1:]:
    common_ids &= set(dfs[mode]["file_id"])
common_ids = sorted(common_ids)
NC = len(common_ids)

expert_arr = dfs["silero"].set_index("file_id").loc[common_ids]["expert_score"].values
comp_arr = {
    mode: dfs[mode].set_index("file_id").loc[common_ids]["composite_raw"].values
    for mode in MODES
}

n_boot = 10000
rng = np.random.default_rng(42)
boot_rho = {m: np.zeros(n_boot) for m in MODES}

# Paired bootstrap: every mode is scored on the same resampled rows in each
# iteration, so per-iteration differences between modes are meaningful.
for i in range(n_boot):
    idx = rng.choice(NC, NC, replace=True)
    expert_sample = expert_arr[idx]
    for mode in MODES:
        r, _ = spearmanr(expert_sample, comp_arr[mode][idx])
        boot_rho[mode][i] = r

# Per-iteration rho difference of each alternative mode against silero.
boot_delta = {
    m: boot_rho[m] - boot_rho["silero"] for m in MODES if m != "silero"
}

# Observed correlations on the aligned arrays. The silero baseline for the
# delta comes from these same arrays, so both sides of the difference are
# computed on identical rows.
obs_rho = {}
for mode in MODES:
    obs_rho[mode], _ = spearmanr(expert_arr, comp_arr[mode])

# 6a. Overall bootstrap
print(f"\n  6a. OVERALL (N={NC})\n")
print(f"  {'Mode':<12s}  {'ρ obs':>10s}  {'95% CI':>22s}  "
      f"{'Δρ vs silero':>12s}  {'p':>10s}  {'Sig':>5s}")
print(f"  {'-' * 76}")

results["bootstrap"] = {"overall": {}}
for mode in MODES:
    r_obs = obs_rho[mode]
    ci = np.percentile(boot_rho[mode], [2.5, 97.5])
    entry = {
        "rho": round(r_obs, 4),
        "ci_lo": round(ci[0], 4), "ci_hi": round(ci[1], 4),
    }
    if mode == "silero":
        print(f"  {mode:<12s}  {r_obs:>+10.4f}  [{ci[0]:.4f}, {ci[1]:.4f}]  "
              f"{'baseline':>12s}  {'--':>10s}  {'--':>5s}")
    else:
        delta = r_obs - obs_rho["silero"]
        # One-sided p: share of resamples in which the mode did not beat
        # silero.
        p_val = (boot_delta[mode] <= 0).mean()
        sig = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else "ns"
        print(f"  {mode:<12s}  {r_obs:>+10.4f}  [{ci[0]:.4f}, {ci[1]:.4f}]  "
              f"{delta:>+12.4f}  {p_val:>10.4f}  {sig:>5s}")
        entry["delta_vs_silero"] = round(delta, 4)
        entry["p_vs_silero"] = round(p_val, 4)
    results["bootstrap"]["overall"][mode] = entry

# 6b. Per-band bootstrap
print(f"\n  6b. PER-BAND BOOTSTRAP\n")

results["bootstrap"]["per_band"] = {}

# Expert band of every aligned sample; it is the same for all bands, so it
# is looked up once instead of once per band.
aligned_bands = dfs["silero"].set_index("file_id").loc[common_ids]["expert_band"].values
# Band-level bootstraps are capped at 5000 iterations.
n_band_boot = min(n_boot, 5000)

for band in BANDS:
    band_idx = np.where(aligned_bands == band)[0]
    n_band = len(band_idx)

    if n_band < 10:
        print(f"  [{band}] N={n_band} — too few for bootstrap\n")
        continue

    print(f"  [{band}] N={n_band}")
    print(f"  {'Mode':<12s}  {'ρ obs':>10s}  {'95% CI':>22s}  "
          f"{'Δρ':>8s}  {'p':>8s}")
    print(f"  {'-' * 65}")

    band_expert = expert_arr[band_idx]
    # The silero slice and its observed rho are constant within a band, so
    # they are computed once here, not per mode or per bootstrap iteration.
    band_silero = comp_arr["silero"][band_idx]
    r_sil, _ = spearmanr(band_expert, band_silero)
    band_results = {}

    for mode in MODES:
        band_comp = comp_arr[mode][band_idx]
        r_obs, _ = spearmanr(band_expert, band_comp)

        # Band-level bootstrap for the confidence interval of rho.
        band_boot = np.zeros(n_band_boot)
        for i in range(n_band_boot):
            bi = rng.choice(n_band, n_band, replace=True)
            band_boot[i], _ = spearmanr(band_expert[bi], band_comp[bi])

        ci = np.percentile(band_boot, [2.5, 97.5])
        band_results[mode] = {"rho": round(r_obs, 4),
                              "ci_lo": round(ci[0], 4),
                              "ci_hi": round(ci[1], 4)}

        if mode == "silero":
            print(f"  {mode:<12s}  {r_obs:>+10.4f}  [{ci[0]:.4f}, {ci[1]:.4f}]  "
                  f"{'base':>8s}  {'--':>8s}")
            continue

        delta = r_obs - r_sil
        # Second, paired bootstrap for the p-value: silero and this mode are
        # scored on the same resampled rows in each iteration.
        d_boot = np.zeros(n_band_boot)
        for i in range(n_band_boot):
            bi = rng.choice(n_band, n_band, replace=True)
            expert_sample = band_expert[bi]
            rs, _ = spearmanr(expert_sample, band_silero[bi])
            rm, _ = spearmanr(expert_sample, band_comp[bi])
            d_boot[i] = rm - rs
        p_val = (d_boot <= 0).mean()
        band_results[mode]["delta"] = round(delta, 4)
        band_results[mode]["p"] = round(p_val, 4)
        print(f"  {mode:<12s}  {r_obs:>+10.4f}  [{ci[0]:.4f}, {ci[1]:.4f}]  "
              f"{delta:>+8.4f}  {p_val:>8.4f}")

    print()
    results["bootstrap"]["per_band"][band] = band_results


# ══════════════════════════════════════════════════════════════════
# 7. FEATURE DISTRIBUTION SHIFT ACROSS MODES (per band)
# ══════════════════════════════════════════════════════════════════

section("7. VAD FEATURE DELTAS: HYBRID/FUSION vs SILERO (per expert band)")

results["feature_deltas"] = {}

# Paired comparison of every VAD feature against silero, per expert band.
for other_mode in ["hybrid", "fusion", "marblenet"]:
    print(f"\n  [{other_mode} - silero]\n")
    mode_deltas = {}

    # Only features present in both tables can be compared.
    shared_feats = [
        feat for feat in VAD_FEATURES
        if feat in dfs["silero"].columns and feat in dfs[other_mode].columns
    ]
    # Inner join on file_id: a file missing from either table is left out of
    # the comparison. A label lookup with .loc would raise KeyError for it,
    # even though the earlier file_id check only warns about mismatches.
    merged = dfs["silero"][["file_id", "expert_band"] + shared_feats].merge(
        dfs[other_mode][["file_id"] + shared_feats],
        on="file_id",
        how="inner",
        suffixes=("_silero", "_other"),
    )

    print(f"  {'Feature':<22s}  {'Band':<8s}  {'Δ mean':>10s}  "
          f"{'Δ median':>10s}  {'KS stat':>10s}  {'KS p':>10s}")
    print(f"  {'-' * 76}")

    for feat in VAD_FEATURES:
        sc = f"{feat}_silero"
        oc = f"{feat}_other"
        if sc not in merged.columns:
            continue

        feat_deltas = {}
        for band in BANDS:
            sub = merged[merged["expert_band"] == band]
            # Bands with fewer than 3 paired rows are skipped.
            if len(sub) < 3:
                continue
            delta = sub[oc] - sub[sc]
            ks_stat, ks_p = ks_2samp(sub[sc].values, sub[oc].values)
            sig = "***" if ks_p < 0.001 else "**" if ks_p < 0.01 else "*" if ks_p < 0.05 else ""
            feat_deltas[band] = {
                "delta_mean": round(delta.mean(), 4),
                "delta_median": round(delta.median(), 4),
                "ks_stat": round(ks_stat, 4),
                "ks_p": round(ks_p, 6),
            }
            print(f"  {feat:<22s}  {band:<8s}  {delta.mean():>+10.4f}  "
                  f"{delta.median():>+10.4f}  {ks_stat:>10.4f}  {ks_p:>10.4f} {sig}")

        mode_deltas[feat] = feat_deltas
    results["feature_deltas"][f"{other_mode}_vs_silero"] = mode_deltas


# ══════════════════════════════════════════════════════════════════
# 8. SUMMARY VERDICT
# ══════════════════════════════════════════════════════════════════

section("8. SUMMARY VERDICT")

print(f"\n  Dataset: SANDI dev-438 (N={N})")
print(f"  Expert bands: LOW={band_dist.get('LOW',0)}, "
      f"MEDIUM={band_dist.get('MEDIUM',0)}, HIGH={band_dist.get('HIGH',0)}")

# Best value of each headline metric across modes; a row is marked for
# every metric on which it equals the best.
seg_results = results["band_segregation"]
top_rho = max(rhos.values())
top_acc = max(seg_results[m]["overall_accuracy"] for m in MODES)
top_f1 = max(seg_results[m]["macro_f1"] for m in MODES)

print(f"\n  ┌────────────┬────────────┬────────────┬──────────────────┐")
print(f"  │ {'Mode':<10s} │ {'Spear. ρ':>10s} │ {'Accuracy':>10s} │ {'Macro F1':>16s} │")
print(f"  ├────────────┼────────────┼────────────┼──────────────────┤")
for mode in MODES:
    acc = seg_results[mode]["overall_accuracy"]
    mf1 = seg_results[mode]["macro_f1"]
    rho = rhos[mode]
    candidates = (("ρ", rho == top_rho), ("acc", acc == top_acc), ("F1", mf1 == top_f1))
    best_markers = [label for label, is_best in candidates if is_best]
    marker = " ◄ " + ",".join(best_markers) if best_markers else ""
    print(f"  │ {mode:<10s} │ {rho:>+10.4f} │ {acc:>10.1%} │ {mf1:>10.3f}{marker:>6s} │")
print(f"  └────────────┴────────────┴────────────┴──────────────────┘")

# Determine winner
best_accuracy_mode = max(MODES, key=lambda m: seg_results[m]["overall_accuracy"])
best_rho_mode = max(MODES, key=lambda m: rhos[m])
best_f1_mode = max(MODES, key=lambda m: seg_results[m]["macro_f1"])

print(f"\n  Best by accuracy:  {best_accuracy_mode} "
      f"({seg_results[best_accuracy_mode]['overall_accuracy']:.1%})")
print(f"  Best by Spearman:  {best_rho_mode} ({rhos[best_rho_mode]:.4f})")
print(f"  Best by macro-F1:  {best_f1_mode} "
      f"({seg_results[best_f1_mode]['macro_f1']:.3f})")

# Key findings: bootstrap delta and p-value of each alternative vs silero.
print(f"\n  KEY FINDINGS:")
boot_overall = results["bootstrap"]["overall"]
for mode in MODES:
    if mode == "silero":
        continue
    bsr = boot_overall.get(mode, {})
    delta = bsr.get("delta_vs_silero", 0)
    p = bsr.get("p_vs_silero", 1)
    sig = "SIGNIFICANT" if p < 0.05 else "NOT significant"
    direction = "better" if delta > 0 else "worse"
    print(f"    {mode} vs silero: Δρ={delta:+.4f} ({direction}, {sig}, p={p:.4f})")

# Recommendations: adopt the best-rho mode only if it beats silero and its
# bootstrap p-value is below 0.05.
print(f"\n  RECOMMENDATIONS:")
beats_silero = rhos[best_rho_mode] > rhos["silero"]
best_rho_p = boot_overall.get(best_rho_mode, {}).get("p_vs_silero", 1)
if beats_silero and best_rho_p < 0.05:
    print(f"    → ADOPT {best_rho_mode}: statistically significant improvement in Spearman ρ")
elif beats_silero:
    print(f"    → {best_rho_mode} shows promise but is NOT significant — needs larger corpus")
    print(f"    → KEEP Silero baseline for now")
else:
    print(f"    → KEEP Silero baseline — no alternative improves correlation")

acc_sil = seg_results["silero"]["overall_accuracy"]
acc_best = seg_results[best_accuracy_mode]["overall_accuracy"]
if best_accuracy_mode != "silero" and acc_best > acc_sil:
    print(f"    → For band classification, consider {best_accuracy_mode} "
          f"(+{(acc_best - acc_sil)*100:.1f}pp accuracy)")

# Hybrid-specific diagnosis: warn when hybrid has more than 1.5x silero's
# count of expert-MEDIUM samples classified as HIGH.
hybrid_high_fp = results["confusion_matrices"]["hybrid"]["MEDIUM"]["HIGH"]
silero_high_fp = results["confusion_matrices"]["silero"]["MEDIUM"]["HIGH"]
if hybrid_high_fp > silero_high_fp * 1.5:
    print(f"    ⚠ Hybrid over-classifies MEDIUM as HIGH ({hybrid_high_fp} vs {silero_high_fp} for silero)")
    print(f"    → Hybrid's MarbleNet refinement merges too many segments, inflating MLU")

results["verdict"] = {
    "best_by_accuracy": best_accuracy_mode,
    "best_by_spearman": best_rho_mode,
    "best_by_macro_f1": best_f1_mode,
    "baseline": "silero",
}

# ── Save results ──
# Write the accumulated results dict as indented JSON; values json cannot
# encode natively are converted with str().
out_path = Path("/tmp/vad_comparison_report.json")
with out_path.open("w") as report_file:
    json.dump(results, report_file, indent=2, default=str)
print(f"\n  Full results saved to: {out_path}")

closing_rule = "=" * 80
print(f"\n{closing_rule}\n  COMPARISON COMPLETE\n{closing_rule}\n")