"""
Hybrid V3 VAD-Only Benchmark: Isolated test on SANDI dev-438.

VAD-only comparison: Only swaps the VAD component, computes VAD features
+ banding directly without the full composite pipeline (avoids mord dependency).

Uses cached transcriptions for MLU computation; expert scores and audio
paths are read from the SANDI dev-438 file list CSV.

Usage: PYTHONPATH=/tmp/pylibs:/tmp/pip_libs \
       /opt/anaconda3/envs/deeplearning/bin/python -u run_hybrid_v3_test.py
"""

import sys, os, time, warnings, pickle
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(__file__))

from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu

# BASE is the parent of the directory that contains this script; the cached
# transcriptions and the file list both live under BASE/EDA/data.
BASE = Path(__file__).parent.parent
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"

filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
total_files = len(filelist)
print(f"SANDI files: {total_files}")

# Count the files that already have a pickled transcription in the cache.
cached = sum(
    (CACHE_DIR / f"{file_id}_tx.pkl").exists()
    for file_id in filelist['file_id']
)
print(f"Cached transcriptions: {cached}/{total_files}")

# Imported here, between the two status messages, rather than at file top.
print("Loading VAD models...", flush=True)
from pipeline.hybrid_vad import run_hybrid_vad
print("VAD models loaded.\n")


# ── Banding functions (isolated — NO composite.py dependency) ──

def band_production(sr, mlu, lpr):
    """Assign the production band (mirrors composite.py).

    Args:
        sr: Speech ratio.
        mlu: Mean length of utterance (words per speech segment).
        lpr: Long-pause ratio.

    Returns:
        "HIGH" when mlu >= 7, sr >= 0.75 and lpr <= 0.15 all hold;
        "LOW" when (mlu < 3 and sr < 0.55), or mlu < 2, or sr < 0.35;
        "MEDIUM" otherwise.
    """
    # HIGH requires every criterion at once and is checked first.
    if mlu >= 7 and sr >= 0.75 and lpr <= 0.15:
        return "HIGH"

    # Any one of the three LOW triggers is sufficient.
    if (mlu < 3 and sr < 0.55) or mlu < 2 or sr < 0.35:
        return "LOW"

    return "MEDIUM"


def band_relaxed(sr, mlu, lpr):
    """Assign the band using the relaxed HIGH threshold (V3 experiment).

    The LOW rules are the same as in the production banding; only the HIGH
    gate is loosened to mlu >= 5.5, sr >= 0.70 and lpr <= 0.20.

    Args:
        sr: Speech ratio.
        mlu: Mean length of utterance (words per speech segment).
        lpr: Long-pause ratio.

    Returns:
        One of "HIGH", "LOW" or "MEDIUM".
    """
    qualifies_high = mlu >= 5.5 and sr >= 0.70 and lpr <= 0.20
    if qualifies_high:
        return "HIGH"

    qualifies_low = (mlu < 3 and sr < 0.55) or mlu < 2 or sr < 0.35
    return "LOW" if qualifies_low else "MEDIUM"


def expert_band(score):
    """Map an expert score to a band label.

    Scores below 3.0 are 'LOW', scores below 4.5 are 'MEDIUM', and everything
    else (including values that compare false against both cut-offs) is 'HIGH'.
    """
    # Cut-offs are exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < upper_bound:
            return label
    return 'HIGH'


# ── Composite-like score from VAD features (no models needed) ──

def vad_composite(sr, mlu, lpr, pf):
    """Combine VAD features into a single weighted score used for ranking.

    Args:
        sr: Speech ratio (weight 0.35).
        mlu: Mean length of utterance; divided by 15 and capped at 1.0
            (weight 0.30).
        lpr: Long-pause ratio; contributes as ``1 - lpr`` (weight 0.20).
        pf: Pause frequency; contributes as ``1 - pf`` floored at 0
            (weight 0.15).

    Returns:
        The weighted sum of the four terms.
    """
    speech_term = sr * 0.35
    mlu_term = min(mlu / 15.0, 1.0) * 0.30
    long_pause_term = (1 - lpr) * 0.20
    pause_freq_term = max(0, 1 - pf) * 0.15
    # Summed left to right, in the same order as the weights are listed.
    return speech_term + mlu_term + long_pause_term + pause_freq_term


# ── Process all files ──

MODES = ["silero", "hybrid_v2", "hybrid_v3"]
all_results = {m: [] for m in MODES}   # mode -> list of per-file feature rows
errors = {m: 0 for m in MODES}         # mode -> number of files that raised
v3_diagnostics = []                    # hybrid_v3 diagnostics rows
start_time = time.time()

# Counters copied out of the hybrid_v3 "_diagnostics" payload; a missing key
# is recorded as 0.
V3_DIAG_KEYS = (
    "energy_recoveries",
    "merges_performed",
    "merges_refused_mlu",
    "silero_segments",
    "final_segments",
    "vad_ms",
)

n_files = len(filelist)

for idx, row in filelist.iterrows():
    file_id = row['file_id']

    # Files without a cached transcription or without audio on disk are
    # skipped silently; they count as neither processed nor errored.
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    if not cache_path.exists():
        continue

    audio_path = row['audio_path']
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        continue

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point CACHE_DIR at caches produced by a trusted run.
    with open(cache_path, 'rb') as f:
        tx = pickle.load(f)
    words = tx['words']
    word_count = len(words)

    # Progress and ETA are derived from the row label, so skipped rows still
    # advance the counter.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = max(n / max(elapsed, 0.01), 0.01)
    eta = (n_files - n) / rate

    if n % 50 == 0 or n <= 3 or n == n_files:
        print(f"  [{n}/{n_files}] {file_id} "
              f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)

    for mode in MODES:
        try:
            vad = run_hybrid_vad(audio_path, mode=mode,
                                 diagnostics=(mode == "hybrid_v3"))
            # MLU = cached word count / VAD segment count (at least 1 segment
            # is assumed to avoid dividing by zero).
            seg_count = max(vad['speech_segments'], 1)
            mlu = round(word_count / seg_count, 2)
            vad['mlu'] = mlu

            sr = vad['speech_ratio']
            lpr = vad['long_pause_ratio']
            pf = vad.get('pause_frequency_per_sec', 0)

            result_row = {
                'file_id': file_id,
                'expert_score': row['expert_score'],
                'expert_band': expert_band(row['expert_score']),
                'speech_ratio': sr,
                'mlu': mlu,
                'word_count': word_count,
                'pause_count': vad['pause_count'],
                'mean_pause_dur': vad['mean_pause_duration_sec'],
                'long_pause_ratio': lpr,
                'short_pause_share': vad.get('short_pause_share', 0),
                'speech_segments': vad['speech_segments'],
                'speech_duration_sec': vad['speech_duration_sec'],
                'pause_freq': pf,
                'band_prod': band_production(sr, mlu, lpr),
                'band_relaxed': band_relaxed(sr, mlu, lpr),
                'composite_vad': vad_composite(sr, mlu, lpr, pf),
            }

            # Capture V3 diagnostics
            diag_row = None
            if mode == "hybrid_v3" and "_diagnostics" in vad:
                diag = vad["_diagnostics"]
                diag_row = {
                    "file_id": file_id,
                    **{key: diag.get(key, 0) for key in V3_DIAG_KEYS},
                }

            # Record both rows only once everything above has succeeded, so a
            # file that ends up counted as an error never leaves a stray
            # diagnostics entry behind.
            all_results[mode].append(result_row)
            if diag_row is not None:
                v3_diagnostics.append(diag_row)
        except Exception as e:
            # Best effort: count the failure, show the first five per mode,
            # and carry on with the remaining modes and files.
            errors[mode] += 1
            if errors[mode] <= 5:
                print(f"    ERROR [{mode}] {file_id}: {e}", flush=True)

total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for m in MODES:
    print(f"  {m}: {len(all_results[m])} processed, {errors[m]} errors")

# One DataFrame per mode; every report section below reads from these.
dfs = {mode: pd.DataFrame(all_results[mode]) for mode in MODES}
BANDS = ["LOW", "MEDIUM", "HIGH"]


def section(title):
    """Print *title* as a banner framed by two 80-character '=' rules.

    A blank line precedes the top rule and the title is indented two spaces.
    """
    rule = "=" * 80
    print("\n" + rule, f"  {title}", rule, sep="\n")


# ══════════════════════════════════════════════════════════════════
# 1. OVERALL METRICS (production banding)
# ══════════════════════════════════════════════════════════════════

def _band_metrics(df, pred_col):
    """Score one predicted-band column against the expert bands.

    Shared by the production and relaxed banding reports so both are computed
    by exactly the same code.

    Args:
        df: Per-file results holding an 'expert_band' column and *pred_col*.
        pred_col: Name of the predicted-band column ('band_prod' or
            'band_relaxed').

    Returns:
        dict with
          'acc'      - fraction of rows whose predicted band equals the expert band,
          'macro_f1' - unweighted mean of the per-band F1 over BANDS,
          'mh_fp'    - rows with expert MEDIUM predicted HIGH (int),
          'hm_fn'    - rows with expert HIGH predicted MEDIUM (int).
    """
    truth = df['expert_band']
    pred = df[pred_col]

    acc = (truth == pred).sum() / len(df)

    f1s = []
    for band in BANDS:
        tp = ((truth == band) & (pred == band)).sum()
        fp = ((truth != band) & (pred == band)).sum()
        fn = ((truth == band) & (pred != band)).sum()
        # A band that is never predicted / never present scores 0, not NaN.
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1s.append(2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0)

    return {
        'acc': acc,
        'macro_f1': np.mean(f1s),
        'mh_fp': int(((truth == 'MEDIUM') & (pred == 'HIGH')).sum()),
        'hm_fn': int(((truth == 'HIGH') & (pred == 'MEDIUM')).sum()),
    }


section("1. OVERALL METRICS (production banding)")

# metrics[mode] -> {'rho', 'acc', 'macro_f1', 'mh_fp', 'hm_fn'}; read again by
# the verdict section.
metrics = {}
for mode in MODES:
    df = dfs[mode]
    # Rank correlation between the expert score and the VAD-only composite.
    rho, _ = spearmanr(df['expert_score'], df['composite_vad'])
    scores = _band_metrics(df, 'band_prod')
    metrics[mode] = {'rho': rho, **scores}

    marker = " ◄ V3" if mode == "hybrid_v3" else ""
    print(f"  {mode:<12s}  ρ={rho:+.4f}  acc={scores['acc']:.1%}  F1={scores['macro_f1']:.3f}  "
          f"MED→HIGH={scores['mh_fp']}  HIGH→MED={scores['hm_fn']}{marker}")


# ══════════════════════════════════════════════════════════════════
# 2. OVERALL METRICS (relaxed banding — experimental)
# ══════════════════════════════════════════════════════════════════

section("2. OVERALL METRICS (relaxed banding — experimental)")

# metrics_relaxed[mode] -> {'acc', 'macro_f1', 'mh_fp', 'hm_fn'}; no rho here
# because the composite score does not depend on the banding rule.
metrics_relaxed = {}
for mode in MODES:
    scores = _band_metrics(dfs[mode], 'band_relaxed')
    metrics_relaxed[mode] = scores

    marker = " ◄ V3" if mode == "hybrid_v3" else ""
    print(f"  {mode:<12s}  acc={scores['acc']:.1%}  F1={scores['macro_f1']:.3f}  "
          f"MED→HIGH={scores['mh_fp']}  HIGH→MED={scores['hm_fn']}{marker}")


# ══════════════════════════════════════════════════════════════════
# 3. CONFUSION MATRICES
# ══════════════════════════════════════════════════════════════════

section("3. CONFUSION MATRICES (production banding)")

# One 3x3 table per mode: rows are expert bands, columns are pipeline bands.
for mode in MODES:
    df = dfs[mode]
    expert_col = df['expert_band']
    pipeline_col = df['band_prod']
    n_agree = (expert_col == pipeline_col).sum()
    n_rows = len(df)
    print(f"\n  [{mode}] Accuracy: {n_agree}/{n_rows} ({n_agree/n_rows:.1%})")
    print(f"  {'':>16} Pipeline→  {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    for eb in BANDS:
        in_expert_band = expert_col == eb
        counts = [(in_expert_band & (pipeline_col == pb)).sum() for pb in BANDS]
        print(f"  Expert {eb:>6}:     {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")


# ══════════════════════════════════════════════════════════════════
# 4. SEGMENT AND MLU COMPARISON
# ══════════════════════════════════════════════════════════════════

section("4. SEGMENT COUNT & MLU COMPARISON")

# (column heading, DataFrame column, float format) for each reported mean.
summary_columns = (
    ('Segments', 'speech_segments', '.2f'),
    ('MLU', 'mlu', '.2f'),
    ('SR', 'speech_ratio', '.4f'),
    ('PauseDur', 'mean_pause_dur', '.4f'),
    ('LPR', 'long_pause_ratio', '.4f'),
)

headings = "  ".join(f"{title:>10s}" for title, _, _ in summary_columns)
print(f"\n  {'Mode':<12s}  {headings}")
print("  " + "-" * 65)
for mode in MODES:
    df = dfs[mode]
    cells = "  ".join(f"{df[col].mean():>10{spec}}"
                      for _, col, spec in summary_columns)
    print(f"  {mode:<12s}  {cells}")


# ══════════════════════════════════════════════════════════════════
# 5. MLU INFLATION CHECK (hard failure mode)
# ══════════════════════════════════════════════════════════════════

section("5. MLU INFLATION CHECK")

# Status is driven by the mean MLU only: below 9.0 is safe, below 12.0 is a
# warning, anything else is rejected.
for mode in MODES:
    mlu_values = dfs[mode]['mlu']
    mean_mlu = mlu_values.mean()
    if mean_mlu < 9.0:
        status = "✅ SAFE"
    elif mean_mlu < 12.0:
        status = "⚠ WARNING"
    else:
        status = "❌ REJECT"
    print(f"  {mode:<12s}  mean={mean_mlu:.2f}  p95={mlu_values.quantile(0.95):.2f}  "
          f"max={mlu_values.max():.2f}  {status}")


# ══════════════════════════════════════════════════════════════════
# 6. PER-BAND FEATURE ANALYSIS
# ══════════════════════════════════════════════════════════════════

section("6. PER-BAND FEATURE COMPARISON")

# For each expert band, print one row per mode with the mean VAD features of
# the files in that band.
for band in BANDS:
    print(f"\n  [{band}]")
    print(f"  {'Mode':<12s}  {'N':>4s}  {'Segments':>10s}  {'MLU':>10s}  "
          f"{'SR':>10s}  {'LPR':>10s}")
    print("  " + "-" * 55)
    for mode in MODES:
        mode_df = dfs[mode]
        sub = mode_df.loc[mode_df['expert_band'] == band]
        seg_mean = sub['speech_segments'].mean()
        mlu_mean = sub['mlu'].mean()
        sr_mean = sub['speech_ratio'].mean()
        lpr_mean = sub['long_pause_ratio'].mean()
        print(f"  {mode:<12s}  {len(sub):>4d}  {seg_mean:>10.2f}  "
              f"{mlu_mean:>10.2f}  {sr_mean:>10.4f}  {lpr_mean:>10.4f}")


# ══════════════════════════════════════════════════════════════════
# 7. V3 DIAGNOSTICS SUMMARY
# ══════════════════════════════════════════════════════════════════

section("7. V3 DIAGNOSTICS")

if not v3_diagnostics:
    print("  No V3 diagnostics collected.")
else:
    diag_df = pd.DataFrame(v3_diagnostics)
    recoveries = diag_df['energy_recoveries']
    merges = diag_df['merges_performed']
    refused = diag_df['merges_refused_mlu']
    silero_mean = diag_df['silero_segments'].mean()
    final_mean = diag_df['final_segments'].mean()

    print(f"  Total files: {len(diag_df)}")
    print(f"  Energy recoveries:  total={recoveries.sum()}, "
          f"mean={recoveries.mean():.2f}/file")
    print(f"  Merges performed:   total={merges.sum()}, "
          f"mean={merges.mean():.2f}/file")
    print(f"  Merges refused (MLU): total={refused.sum()}")
    print(f"  Segment change: Silero={silero_mean:.2f} → "
          f"V3={final_mean:.2f}")
    if 'vad_ms' in diag_df.columns:
        latency = diag_df['vad_ms']
        print(f"  V3 latency: mean={latency.mean():.0f}ms, "
              f"p95={latency.quantile(0.95):.0f}ms")


# ══════════════════════════════════════════════════════════════════
# 8. BOOTSTRAP SIGNIFICANCE TEST
# ══════════════════════════════════════════════════════════════════

section("8. BOOTSTRAP SIGNIFICANCE (composite_vad vs expert_score)")


def _sig_stars(p):
    """Return the significance marker for *p*: '***', '**', '*' or 'ns'."""
    for cutoff, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p < cutoff:
            return stars
    return "ns"


# Keep only the files that every mode processed, so the comparison is paired.
id_sets = [set(dfs[m]['file_id']) for m in MODES]
common_ids = sorted(set.intersection(*id_sets))
NC = len(common_ids)

expert_arr = dfs['silero'].set_index('file_id').loc[common_ids, 'expert_score'].to_numpy()
comp_arr = {
    m: dfs[m].set_index('file_id').loc[common_ids, 'composite_vad'].to_numpy()
    for m in MODES
}

n_boot = 10000
rng = np.random.default_rng(42)
boot_rho = {m: np.zeros(n_boot) for m in MODES}

# Each iteration draws one resample of file positions and reuses it for every
# mode, which keeps the per-mode rho values paired.
for i in range(n_boot):
    sample = rng.choice(NC, NC, replace=True)
    expert_sample = expert_arr[sample]
    for mode in MODES:
        rho_b, _ = spearmanr(expert_sample, comp_arr[mode][sample])
        boot_rho[mode][i] = rho_b

print(f"\n  N={NC} paired files, {n_boot} bootstrap iterations\n")
print(f"  {'Mode':<12s}  {'ρ obs':>10s}  {'95% CI':>22s}  "
      f"{'Δρ vs silero':>12s}  {'p':>10s}  {'Sig':>5s}")
print("  " + "-" * 76)

# Observed (non-resampled) rho per mode on the paired files.
rho_obs = {}
for mode in MODES:
    rho_obs[mode], _ = spearmanr(expert_arr, comp_arr[mode])

for mode in MODES:
    ci_lo, ci_hi = np.percentile(boot_rho[mode], [2.5, 97.5])
    lead = f"  {mode:<12s}  {rho_obs[mode]:>+10.4f}  [{ci_lo:.4f}, {ci_hi:.4f}]  "
    if mode == "silero":
        print(lead + f"{'baseline':>12s}  {'--':>10s}  {'--':>5s}")
        continue
    delta = rho_obs[mode] - rho_obs['silero']
    # p = share of resamples in which this mode does not beat silero.
    p_val = ((boot_rho[mode] - boot_rho['silero']) <= 0).mean()
    print(lead + f"{delta:>+12.4f}  {p_val:>10.4f}  {_sig_stars(p_val):>5s}")

# V3 vs V2 specifically
if "hybrid_v2" in comp_arr and "hybrid_v3" in comp_arr:
    p_v3_v2 = ((boot_rho['hybrid_v3'] - boot_rho['hybrid_v2']) <= 0).mean()
    delta_v3_v2 = rho_obs['hybrid_v3'] - rho_obs['hybrid_v2']
    print(f"\n  V3 vs V2:  Δρ = {delta_v3_v2:+.4f}  p = {p_v3_v2:.4f}  ({_sig_stars(p_v3_v2)})")


# ══════════════════════════════════════════════════════════════════
# 9. COHEN'S D (composite score separation between bands)
# ══════════════════════════════════════════════════════════════════

section("9. BAND SEPARATION (Cohen's d on composite_vad)")

print(f"\n  {'Mode':<12s}  {'LOW→MED d':>12s}  {'MED→HIGH d':>12s}  {'LOW→HIGH d':>12s}")
print(f"  {'-'*52}")
for mode in MODES:
    df = dfs[mode]
    parts = []
    for (bl, bh) in [("LOW", "MEDIUM"), ("MEDIUM", "HIGH"), ("LOW", "HIGH")]:
        a = df[df['expert_band'] == bl]['composite_vad']
        b = df[df['expert_band'] == bh]['composite_vad']
        n_a, n_b = len(a), len(b)
        # Both groups need at least two rows for a sample variance.
        if n_a > 1 and n_b > 1:
            # Pooled SD weighted by each group's degrees of freedom. The
            # expert bands need not be the same size, and a plain average of
            # the two variances equals this only when n_a == n_b.
            pooled = np.sqrt(
                ((n_a - 1) * a.var() + (n_b - 1) * b.var()) / (n_a + n_b - 2)
            )
            # Positive d means the higher band has the higher composite score.
            d = (b.mean() - a.mean()) / pooled if pooled > 0 else 0
            parts.append(f"{d:>+12.3f}")
        else:
            parts.append(f"{'--':>12s}")
    print(f"  {mode:<12s}  {'  '.join(parts)}")


# ══════════════════════════════════════════════════════════════════
# 10. VERDICT
# ══════════════════════════════════════════════════════════════════

section("10. VERDICT")

print(f"\n  Dataset: SANDI dev-438, N={len(dfs['silero'])}")

# Summary table (production banding metrics per mode)
print("\n  ┌────────────┬────────────┬────────────┬──────────────┬──────────────┬──────────────┐")
header_cells = [
    f"{'Mode':<10s}", f"{'Spear. ρ':>10s}", f"{'Accuracy':>10s}",
    f"{'Macro F1':>12s}", f"{'MED→HIGH':>12s}", f"{'HIGH→MED':>12s}",
]
print("  │ " + " │ ".join(header_cells) + " │")
print("  ├────────────┼────────────┼────────────┼──────────────┼──────────────┼──────────────┤")
for mode in MODES:
    m = metrics[mode]
    row_cells = [
        f"{mode:<10s}", f"{m['rho']:>+10.4f}", f"{m['acc']:>10.1%}",
        f"{m['macro_f1']:>12.3f}", f"{m['mh_fp']:>12d}", f"{m['hm_fn']:>12d}",
    ]
    print("  │ " + " │ ".join(row_cells) + " │")
print("  └────────────┴────────────┴────────────┴──────────────┴──────────────┴──────────────┘")

# MLU safety check: a mean V3 MLU above 9.0 is reported as inflation.
mlu_v3 = dfs['hybrid_v3']['mlu'].mean()
mlu_sil = dfs['silero']['mlu'].mean()
mlu_message = (
    f"\n  ⚠ V3 MLU ({mlu_v3:.2f}) exceeds safety threshold — MLU inflation detected"
    if mlu_v3 > 9.0
    else f"\n  ✅ V3 MLU ({mlu_v3:.2f}) within safe range (Silero: {mlu_sil:.2f})"
)
print(mlu_message)

# MED→HIGH check: more than 65 expert-MEDIUM files banded HIGH is out of bounds.
mh_v3 = metrics['hybrid_v3']['mh_fp']
mh_sil = metrics['silero']['mh_fp']
fp_message = (
    f"  ⚠ V3 MED→HIGH FP ({mh_v3}) exceeds boundary (65)"
    if mh_v3 > 65
    else f"  ✅ V3 MED→HIGH FP ({mh_v3}) within bounds (Silero: {mh_sil})"
)
print(fp_message)

# Relaxed banding verdict: change relative to production banding, per mode.
print("\n  RELAXED BANDING COMPARISON:")
for mode in MODES:
    m_p = metrics[mode]
    m_r = metrics_relaxed[mode]
    delta_acc = (m_r['acc'] - m_p['acc']) * 100
    delta_fp = m_r['mh_fp'] - m_p['mh_fp']
    delta_fn = m_p['hm_fn'] - m_r['hm_fn']
    print(f"  {mode:<12s}  Δacc={delta_acc:+.1f}pp  ΔFP={delta_fp:+d}  "
          f"H→M recovered={delta_fn}")

# Recommendation, based on production-banding rho and accuracy only.
rho_v3 = metrics['hybrid_v3']['rho']
rho_sil = metrics['silero']['rho']
acc_v3 = metrics['hybrid_v3']['acc']
acc_sil = metrics['silero']['acc']

print(f"\n  Δρ(V3-Silero) = {rho_v3 - rho_sil:+.4f}")
print(f"  Δacc(V3-Silero) = {(acc_v3 - acc_sil)*100:+.1f}pp")

if rho_v3 >= rho_sil and acc_v3 >= acc_sil:
    recommendation = "  → V3 IMPROVES on all metrics — candidate for promotion"
elif rho_v3 >= rho_sil - 0.01:
    recommendation = "  → V3 maintains correlation, review accuracy trade-off"
else:
    recommendation = "  → V3 shows regression — investigate before promoting"
print(recommendation)

closing_rule = "=" * 80
print(f"\n  Total processing time: {total_time/60:.1f} minutes")
print(f"\n{closing_rule}")
print("  BENCHMARK COMPLETE")
print(f"{closing_rule}\n")