"""
Hybrid VAD A/B Test: Silero vs MarbleNet vs Hybrid on SANDI 438.

Design: Transcription is done ONCE (WhisperX), cached per file.
Only the VAD component is swapped across modes. This isolates the
single variable under test (VAD accuracy) and avoids redundant
transcription.

Statistical validation:
  1. Spearman rho vs expert scores (overall + per-dimension)
  2. Quintile separation (Q1-Q5 spread, Q2-Q3-Q4 discrimination)
  3. Bootstrap significance test (hybrid vs silero baseline)
  4. Effect sizes (Cohen's d per quintile pair)

Usage:
  Step 1 (transcribe): python run_hybrid_vad_test.py --transcribe
  Step 2 (evaluate):   python run_hybrid_vad_test.py --evaluate
  Or both:             python run_hybrid_vad_test.py
"""

import sys, os, time, warnings, json, pickle, argparse
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(__file__))

from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal

BASE = Path(__file__).parent.parent
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"
CACHE_DIR.mkdir(exist_ok=True)

# ── Args ──
parser = argparse.ArgumentParser()
parser.add_argument("--transcribe", action="store_true", help="Run transcription step only")
parser.add_argument("--evaluate", action="store_true", help="Run evaluation step only (requires cached transcriptions)")
args, _ = parser.parse_known_args()
if not args.transcribe and not args.evaluate:
    args.transcribe = True
    args.evaluate = True

# ── Load file list ──
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"SANDI files: {len(filelist)}")


# ═══════════════════════════════════════════════════════════════════
# STEP 1: Transcribe and cache (only needs ffmpeg — run WITHOUT NeMo)
# ═══════════════════════════════════════════════════════════════════

if args.transcribe:
    print("\n─── STEP 1: Transcription (cached) ───")

    # Monkey-patch whisperx.load_audio to use torchaudio instead of ffmpeg
    # (NeMo install broke ffmpeg's OpenGL dependency on this system)
    import torchaudio
    import whisperx
    _WHISPERX_SR = 16000
    def _torchaudio_load(file: str, sr: int = _WHISPERX_SR):
        wav, orig_sr = torchaudio.load(file)
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        if orig_sr != sr:
            wav = torchaudio.functional.resample(wav, orig_sr, sr)
        return wav.squeeze().numpy()
    whisperx.load_audio = _torchaudio_load
    # Also patch the lazy-loaded audio module if it's already imported
    try:
        from whisperx import audio as _wx_audio
        _wx_audio.load_audio = _torchaudio_load
    except (ImportError, AttributeError):
        pass

    from pipeline.transcribe import transcribe_and_align

    tx_errors = 0
    tx_done = 0
    for idx, row in filelist.iterrows():
        file_id = row['file_id']
        cache_path = CACHE_DIR / f"{file_id}_tx.pkl"

        if cache_path.exists():
            tx_done += 1
            continue

        audio_path = row['audio_path']
        if not os.path.isabs(audio_path):
            audio_path = str(BASE / audio_path)
        if not os.path.exists(audio_path):
            tx_errors += 1
            continue

        try:
            tx = transcribe_and_align(audio_path)
            with open(cache_path, 'wb') as f:
                pickle.dump(tx, f)
            tx_done += 1
            if tx_done % 50 == 0:
                print(f"  Transcribed {tx_done}...", flush=True)
        except Exception as e:
            tx_errors += 1
            if tx_errors <= 3:
                print(f"  TX ERROR {file_id}: {e}")

    print(f"  Transcription: {tx_done} cached, {tx_errors} errors")


# ═══════════════════════════════════════════════════════════════════
# STEP 2: VAD evaluation (uses NeMo/MarbleNet — no ffmpeg needed)
# ═══════════════════════════════════════════════════════════════════

if args.evaluate:
    print("\n─── STEP 2: VAD Evaluation (3 modes) ───")
    print("Loading models...", flush=True)

    from pipeline.hybrid_vad import run_hybrid_vad
    from pipeline.placement import classify_pauses
    from pipeline.fa_features import compute_fa_features
    from pipeline.syntactic_features import compute_syntactic_features
    from models.inference import predict
    from pipeline.composite import compute_composite
    print("Models loaded.\n")

    MODES = ["silero", "marblenet", "hybrid"]
    all_results = {m: [] for m in MODES}
    errors = {m: 0 for m in MODES}
    skipped = 0
    start_time = time.time()

    for idx, row in filelist.iterrows():
        file_id = row['file_id']
        cache_path = CACHE_DIR / f"{file_id}_tx.pkl"

        # Skip if no cached transcription
        if not cache_path.exists():
            skipped += 1
            continue

        audio_path = row['audio_path']
        if not os.path.isabs(audio_path):
            audio_path = str(BASE / audio_path)
        if not os.path.exists(audio_path):
            skipped += 1
            continue

        # Load cached transcription
        with open(cache_path, 'rb') as f:
            tx = pickle.load(f)
        words = tx['words']
        word_count = len(words)

        n = idx + 1
        elapsed = time.time() - start_time
        rate = n / max(elapsed, 1)
        eta = (len(filelist) - n) / max(rate, 0.01)

        if n % 50 == 0 or n <= 3 or n == len(filelist):
            print(f"  [{n}/{len(filelist)}] {file_id} "
                  f"[{elapsed/60:.1f}m elapsed, ~{eta/60:.0f}m remaining]", flush=True)

        for mode in MODES:
            try:
                vad = run_hybrid_vad(audio_path, mode=mode)
                vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
                placement = classify_pauses(words, vad)
                fa = compute_fa_features(words, vad['total_duration_sec'])
                syn = compute_syntactic_features(words, tx['transcript'])
                all_features = {**vad, **placement, **fa, **syn}
                predictions = predict(all_features)
                composite = compute_composite(all_features, predictions)

                all_results[mode].append({
                    'file_id': file_id,
                    'expert_score': row['expert_score'],
                    'composite_raw': composite['composite_raw'],
                    'composite_percentile': composite['composite_percentile'],
                    'fluency_band': composite['fluency_band'],
                    'speech_ratio': vad['speech_ratio'],
                    'mlu': vad['mlu'],
                    'word_count': word_count,
                    'pause_count': vad['pause_count'],
                    'mean_pause_dur': vad['mean_pause_duration_sec'],
                    'long_pause_ratio': vad['long_pause_ratio'],
                    'short_pause_share': vad['short_pause_share'],
                    'speech_segments': vad['speech_segments'],
                    'speech_duration_sec': vad['speech_duration_sec'],
                    'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
                    'boundary_pause_ratio': placement['boundary_pause_ratio'],
                    'dim_continuity': composite['dim_continuity'],
                    'dim_pause_quality': composite['dim_pause_quality'],
                    'dim_articulation': composite['dim_articulation'],
                    'dim_dominance': composite['dim_dominance'],
                    'dim_placement': composite['dim_placement'],
                    'dim_word_precision': composite['dim_word_precision'],
                })
            except Exception as e:
                errors[mode] += 1
                if errors[mode] <= 3:
                    print(f"    ERROR [{mode}] {file_id}: {e}", flush=True)

    total_time = time.time() - start_time
    print(f"\nDone in {total_time/60:.1f} minutes (skipped {skipped} without transcription)")
    for m in MODES:
        print(f"  {m}: {len(all_results[m])} processed, {errors[m]} errors")

    # ── Save per-mode results ──
    dfs = {}
    for mode in MODES:
        df = pd.DataFrame(all_results[mode])
        out_path = BASE / f"EDA/data/sandi_438_vad_{mode}.csv"
        df.to_csv(out_path, index=False)
        dfs[mode] = df
        print(f"Saved: {out_path}")

    # Verify we have data
    min_n = min(len(dfs[m]) for m in MODES)
    if min_n == 0:
        print("\nERROR: No results to analyze. Run --transcribe first.")
        sys.exit(1)

    # ══════════════════════════════════════════════════════════════════
    # STATISTICAL ANALYSIS
    # ══════════════════════════════════════════════════════════════════

    def section(title):
        print(f"\n{'='*75}")
        print(f"  {title}")
        print(f"{'='*75}")

    dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
            'dim_dominance', 'dim_placement', 'dim_word_precision']

    # ── 1. Overall correlation with expert scores ──
    section("1. OVERALL CORRELATION WITH EXPERT SCORES")

    print(f"\n  {'Mode':<12s}  {'Spearman ρ':>12s}  {'p-value':>12s}  {'Kendall τ':>12s}  {'p-value':>12s}  {'N':>5s}")
    print(f"  {'-'*65}")
    for mode in MODES:
        df = dfs[mode]
        rho, p_rho = spearmanr(df['expert_score'], df['composite_raw'])
        tau, p_tau = kendalltau(df['expert_score'], df['composite_raw'])
        print(f"  {mode:<12s}  {rho:>12.4f}  {p_rho:>12.2e}  {tau:>12.4f}  {p_tau:>12.2e}  {len(df):>5d}")

    # ── 2. Per-dimension correlations ──
    section("2. PER-DIMENSION CORRELATIONS WITH EXPERT SCORES")

    for mode in MODES:
        df = dfs[mode]
        print(f"\n  [{mode}]")
        for d in dims:
            v = df[['expert_score', d]].dropna()
            if len(v) > 10:
                r, p_ = spearmanr(v['expert_score'], v[d])
                sig = '***' if p_ < 0.001 else '**' if p_ < 0.01 else '*' if p_ < 0.05 else 'ns'
                print(f"    {d:<25s}  ρ={r:+.4f}  {sig}")

    # ── 3. Raw VAD feature differences between modes ──
    section("3. RAW VAD FEATURE COMPARISON ACROSS MODES")

    vad_features = ['speech_ratio', 'mlu', 'mean_pause_dur', 'long_pause_ratio',
                    'pause_count', 'speech_segments', 'speech_duration_sec']

    merged = dfs['silero'][['file_id', 'expert_score']].copy()
    for feat in vad_features:
        for mode in MODES:
            merged = merged.merge(
                dfs[mode][['file_id', feat]].rename(columns={feat: f'{feat}_{mode}'}),
                on='file_id', how='inner'
            )

    print(f"\n  Feature deltas (hybrid - silero), N={len(merged)}:")
    print(f"  {'Feature':<25s}  {'Mean Δ':>10s}  {'Median Δ':>10s}  {'Std Δ':>10s}  {'ρ_hybrid':>10s}  {'ρ_silero':>10s}")
    print(f"  {'-'*75}")
    for feat in vad_features:
        delta = merged[f'{feat}_hybrid'] - merged[f'{feat}_silero']
        r_h, _ = spearmanr(merged['expert_score'], merged[f'{feat}_hybrid'])
        r_s, _ = spearmanr(merged['expert_score'], merged[f'{feat}_silero'])
        print(f"  {feat:<25s}  {delta.mean():>+10.4f}  {delta.median():>+10.4f}  {delta.std():>10.4f}  "
              f"{r_h:>+10.3f}  {r_s:>+10.3f}")

    # ── 4. Quintile separation analysis ──
    section("4. QUINTILE SEPARATION (Q1=lowest, Q5=highest expert score)")

    for mode in MODES:
        df = dfs[mode].copy()
        df['quintile'] = pd.qcut(df['expert_score'], 5, labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5'],
                                  duplicates='drop')
        print(f"\n  [{mode}]")
        print(f"  {'Quintile':<8s}  {'N':>4s}  {'Expert':>8s}  {'Composite':>10s}  {'Speech%':>8s}  "
              f"{'MLU':>8s}  {'PauseDur':>9s}  {'LPR':>8s}")
        print(f"  {'-'*75}")

        q_means = {}
        for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']:
            sub = df[df['quintile'] == q]
            if len(sub) == 0:
                continue
            q_means[q] = sub['composite_raw'].mean()
            print(f"  {q:<8s}  {len(sub):>4d}  {sub['expert_score'].mean():>8.2f}  "
                  f"{sub['composite_raw'].mean():>+10.4f}  {sub['speech_ratio'].mean():>8.3f}  "
                  f"{sub['mlu'].mean():>8.2f}  {sub['mean_pause_dur'].mean():>9.4f}  "
                  f"{sub['long_pause_ratio'].mean():>8.4f}")

        if 'Q5' in q_means and 'Q1' in q_means:
            spread = q_means['Q5'] - q_means['Q1']
            print(f"  Q5-Q1 spread: {spread:+.4f}")

        # Adjacent quintile separation (Cohen's d)
        print(f"\n  Adjacent quintile Cohen's d:")
        pairs = [('Q1', 'Q2'), ('Q2', 'Q3'), ('Q3', 'Q4'), ('Q4', 'Q5')]
        for q_lo, q_hi in pairs:
            a = df[df['quintile'] == q_lo]['composite_raw']
            b = df[df['quintile'] == q_hi]['composite_raw']
            if len(a) == 0 or len(b) == 0:
                continue
            pooled_std = np.sqrt((a.std()**2 + b.std()**2) / 2)
            d = (b.mean() - a.mean()) / pooled_std if pooled_std > 0 else 0
            u, p = mannwhitneyu(a, b, alternative='less')
            sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
            print(f"    {q_lo}→{q_hi}: d={d:+.3f}  U-test p={p:.4f} {sig}")

        # Kruskal-Wallis across all quintiles
        groups = [df[df['quintile'] == q]['composite_raw'].values
                  for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
                  if len(df[df['quintile'] == q]) > 0]
        if len(groups) >= 2:
            H, p_kw = kruskal(*groups)
            print(f"  Kruskal-Wallis H={H:.2f}, p={p_kw:.2e}")

    # ── 5. Bootstrap significance test: hybrid vs silero ──
    section("5. BOOTSTRAP SIGNIFICANCE TEST (Hybrid vs Silero)")

    common = dfs['silero'][['file_id', 'expert_score', 'composite_raw']].merge(
        dfs['hybrid'][['file_id', 'composite_raw']],
        on='file_id', suffixes=('_silero', '_hybrid')
    )
    common = common.merge(
        dfs['marblenet'][['file_id', 'composite_raw']].rename(
            columns={'composite_raw': 'composite_raw_marblenet'}),
        on='file_id'
    )

    n_boot = 10000
    rng = np.random.default_rng(42)
    expert = common['expert_score'].values
    comp_s = common['composite_raw_silero'].values
    comp_h = common['composite_raw_hybrid'].values
    comp_m = common['composite_raw_marblenet'].values
    N = len(common)

    rho_s_obs, _ = spearmanr(expert, comp_s)
    rho_h_obs, _ = spearmanr(expert, comp_h)
    rho_m_obs, _ = spearmanr(expert, comp_m)

    delta_obs = rho_h_obs - rho_s_obs

    boot_rho_s = []
    boot_rho_h = []
    boot_rho_m = []
    boot_deltas_hs = []
    boot_deltas_ms = []

    for _ in range(n_boot):
        idx = rng.choice(N, N, replace=True)
        rs, _ = spearmanr(expert[idx], comp_s[idx])
        rh, _ = spearmanr(expert[idx], comp_h[idx])
        rm, _ = spearmanr(expert[idx], comp_m[idx])
        boot_rho_s.append(rs)
        boot_rho_h.append(rh)
        boot_rho_m.append(rm)
        boot_deltas_hs.append(rh - rs)
        boot_deltas_ms.append(rm - rs)

    boot_deltas_hs = np.array(boot_deltas_hs)
    boot_deltas_ms = np.array(boot_deltas_ms)
    boot_rho_s = np.array(boot_rho_s)
    boot_rho_h = np.array(boot_rho_h)
    boot_rho_m = np.array(boot_rho_m)

    p_hybrid = (boot_deltas_hs <= 0).mean()
    p_marble = (boot_deltas_ms <= 0).mean()

    print(f"\n  Paired comparison on N={N} common files:")
    print(f"\n  {'Mode':<12s}  {'ρ observed':>12s}  {'95% CI':>20s}")
    print(f"  {'-'*50}")
    for label, obs, boot in [('silero', rho_s_obs, boot_rho_s),
                              ('marblenet', rho_m_obs, boot_rho_m),
                              ('hybrid', rho_h_obs, boot_rho_h)]:
        ci_lo, ci_hi = np.percentile(boot, [2.5, 97.5])
        print(f"  {label:<12s}  {obs:>12.4f}  [{ci_lo:.4f}, {ci_hi:.4f}]")

    print(f"\n  Δρ (hybrid - silero):  {delta_obs:+.4f}  "
          f"95% CI [{np.percentile(boot_deltas_hs, 2.5):+.4f}, {np.percentile(boot_deltas_hs, 97.5):+.4f}]  "
          f"p={p_hybrid:.4f}")
    print(f"  Δρ (marble - silero):  {rho_m_obs - rho_s_obs:+.4f}  "
          f"95% CI [{np.percentile(boot_deltas_ms, 2.5):+.4f}, {np.percentile(boot_deltas_ms, 97.5):+.4f}]  "
          f"p={p_marble:.4f}")

    if p_hybrid < 0.05:
        print(f"\n  HYBRID is SIGNIFICANTLY better than Silero (p={p_hybrid:.4f})")
    elif delta_obs > 0:
        print(f"\n  ~ Hybrid shows improvement but NOT significant (p={p_hybrid:.4f})")
    else:
        print(f"\n  Hybrid shows NO improvement over Silero")

    # ── 6. Band agreement ──
    section("6. FLUENCY BAND AGREEMENT WITH EXPERT BANDS")

    def expert_band(score):
        if score < 3.0: return 'LOW'
        elif score < 4.5: return 'MEDIUM'
        else: return 'HIGH'

    for mode in MODES:
        df = dfs[mode].copy()
        df['expert_band'] = df['expert_score'].apply(expert_band)
        agree = (df['expert_band'] == df['fluency_band']).sum()
        print(f"\n  [{mode}] Agreement: {agree}/{len(df)} ({agree/len(df):.1%})")
        print(f"  {'':>15} Pipeline→ {'LOW':>6} {'MED':>6} {'HIGH':>6}")
        for eb in ['LOW', 'MEDIUM', 'HIGH']:
            row = []
            for pb in ['LOW', 'MEDIUM', 'HIGH']:
                n = ((df['expert_band'] == eb) & (df['fluency_band'] == pb)).sum()
                row.append(n)
            print(f"  Expert {eb:>6}:  {row[0]:>6} {row[1]:>6} {row[2]:>6}")

    # ── 7. Per-dimension improvement breakdown ──
    section("7. PER-DIMENSION Δρ (hybrid vs silero)")

    dim_merged = dfs['silero'][['file_id', 'expert_score'] + dims].merge(
        dfs['hybrid'][['file_id'] + dims],
        on='file_id', suffixes=('_silero', '_hybrid')
    )

    print(f"\n  {'Dimension':<25s}  {'ρ_silero':>10s}  {'ρ_hybrid':>10s}  {'Δρ':>8s}  {'Direction':>10s}")
    print(f"  {'-'*70}")
    for d in dims:
        rs, _ = spearmanr(dim_merged['expert_score'], dim_merged[f'{d}_silero'])
        rh, _ = spearmanr(dim_merged['expert_score'], dim_merged[f'{d}_hybrid'])
        delta = rh - rs
        direction = '^ better' if delta > 0.005 else 'v worse' if delta < -0.005 else '= same'
        print(f"  {d:<25s}  {rs:>+10.4f}  {rh:>+10.4f}  {delta:>+8.4f}  {direction:>10s}")

    # ── 8. Summary verdict ──
    section("SUMMARY VERDICT")

    print(f"\n  Baseline (Silero):    rho = {rho_s_obs:.4f}")
    print(f"  MarbleNet only:       rho = {rho_m_obs:.4f}  (delta = {rho_m_obs - rho_s_obs:+.4f})")
    print(f"  Hybrid (S+M):         rho = {rho_h_obs:.4f}  (delta = {rho_h_obs - rho_s_obs:+.4f})")
    print(f"  Bootstrap p (hybrid): {p_hybrid:.4f}")

    # Quintile discrimination summary
    for mode in MODES:
        df = dfs[mode].copy()
        df['quintile'] = pd.qcut(df['expert_score'], 5, labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5'],
                                  duplicates='drop')
        q_means = {}
        for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']:
            sub = df[df['quintile'] == q]
            if len(sub) > 0:
                q_means[q] = sub['composite_raw'].mean()
        if len(q_means) >= 4:
            q23 = q_means.get('Q3', 0) - q_means.get('Q2', 0)
            q34 = q_means.get('Q4', 0) - q_means.get('Q3', 0)
            spread = q_means.get('Q5', 0) - q_means.get('Q1', 0)
            print(f"\n  [{mode}] Q2->Q3 gap: {q23:+.4f}  Q3->Q4 gap: {q34:+.4f}  "
                  f"Q5-Q1 spread: {spread:+.4f}")

    if rho_h_obs > rho_s_obs and p_hybrid < 0.05:
        print(f"\n  -> RECOMMENDATION: Adopt hybrid VAD (+{rho_h_obs - rho_s_obs:.4f} rho, p={p_hybrid:.4f})")
    elif rho_h_obs > rho_s_obs:
        print(f"\n  -> RECOMMENDATION: Hybrid shows promise (+{rho_h_obs - rho_s_obs:.4f} rho) but needs "
              f"larger corpus for significance (p={p_hybrid:.4f})")
    else:
        print(f"\n  -> RECOMMENDATION: Keep Silero baseline — hybrid VAD does not improve correlation")

    print(f"\nTotal evaluation time: {total_time/60:.1f} minutes")