"""
Selective Feature Fusion VAD Test on SANDI 438.

4 VAD modes: silero (baseline), ten, marblenet, fusion (selective)
Uses cached transcriptions from prior run.

Reports:
  1. Spearman rho / Kendall tau vs expert scores
  2. Pairwise ordering accuracy (share of speaker pairs ordered correctly)
  3. Quintile segregation with Cohen's d, Mann-Whitney U and Kruskal-Wallis tests
  4. Classification accuracy (band agreement + confusion matrix)
  5. Per-dimension correlation breakdown
  6. Bootstrap significance tests (each mode vs the silero baseline)
  7. Score ordering analysis: consecutive expert-sorted pairs kept in order

Usage: python run_selective_fusion_test.py
"""

import sys, os, time, warnings, pickle
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(__file__))

from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal

# BASE is the parent of the directory that contains this script; all data
# paths below are resolved against it.
BASE = Path(__file__).parent.parent
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"

filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"SANDI files: {len(filelist)}")

# ── Verify transcription cache ──
# One flag per listed file: True when its "<file_id>_tx.pkl" exists on disk.
cache_hits = [
    (CACHE_DIR / f"{row['file_id']}_tx.pkl").exists()
    for _, row in filelist.iterrows()
]
cached = sum(cache_hits)
print(f"Cached transcriptions: {cached}/{len(filelist)}")

# Refuse to run when fewer than half of the listed files have a cached
# transcription.
if cached < len(filelist) * 0.5:
    print("ERROR: Run run_hybrid_vad_test.py --transcribe first")
    sys.exit(1)

# ── Load pipeline modules ──
# These project imports sit after the cache check above rather than at the
# top of the file, so the script can exit on a missing cache before any of
# them is imported (presumably because importing them loads models — the
# progress messages suggest so; confirm against the pipeline package).
print("Loading models...", flush=True)
from pipeline.selective_fusion_vad import run_selective_vad
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.\n")

MODES = ["silero", "ten", "marblenet", "fusion"]
all_results = {mode_name: [] for mode_name in MODES}  # result rows per VAD mode
errors = dict.fromkeys(MODES, 0)                       # failure count per VAD mode
start_time = time.time()

# Run every VAD mode over every file that has both a cached transcription and
# an audio file on disk; files missing either are skipped silently.
for idx, row in filelist.iterrows():
    file_id = row['file_id']
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    if not cache_path.exists():
        continue

    # Relative audio paths from the file list are resolved against BASE.
    audio_path = row['audio_path']
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        continue

    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file — acceptable only for caches written by this project itself.
    with open(cache_path, 'rb') as cache_file:
        tx = pickle.load(cache_file)
    words = tx['words']
    word_count = len(words)

    # Progress and ETA treat the DataFrame index label as a 0-based position.
    position = idx + 1
    elapsed_sec = time.time() - start_time
    files_per_sec = position / max(elapsed_sec, 1)
    eta_sec = (len(filelist) - position) / max(files_per_sec, 0.01)

    # Report the first three files, every 50th file, and the last one.
    if position <= 3 or position % 50 == 0 or position == len(filelist):
        print(f"  [{position}/{len(filelist)}] {file_id} "
              f"[{elapsed_sec/60:.1f}m, ~{eta_sec/60:.0f}m left]", flush=True)

    for mode in MODES:
        # Any failure for one (file, mode) pair is counted and skipped; only
        # the first three failures per mode are printed.
        try:
            vad = run_selective_vad(audio_path, mode=mode)
            # 'mlu' is set before the dict is handed on: words per speech
            # segment, with the divisor clamped to at least 1.
            vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
            placement = classify_pauses(words, vad)
            fa = compute_fa_features(words, vad['total_duration_sec'])
            syn = compute_syntactic_features(words, tx['transcript'])
            all_features = {**vad, **placement, **fa, **syn}
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)

            # Keys are inserted in the order of the CSV columns written later.
            record = {'file_id': file_id, 'expert_score': row['expert_score']}
            for key in ('composite_raw', 'composite_percentile', 'fluency_band'):
                record[key] = composite[key]
            record['speech_ratio'] = vad['speech_ratio']
            record['mlu'] = vad['mlu']
            record['word_count'] = word_count
            record['pause_count'] = vad['pause_count']
            record['mean_pause_dur'] = vad['mean_pause_duration_sec']
            record['long_pause_ratio'] = vad['long_pause_ratio']
            record['short_pause_share'] = vad.get('short_pause_share', 0)
            for key in ('speech_segments', 'speech_duration_sec'):
                record[key] = vad[key]
            for key in ('mid_clause_pause_ratio', 'boundary_pause_ratio'):
                record[key] = placement[key]
            for key in ('dim_continuity', 'dim_pause_quality', 'dim_articulation',
                        'dim_dominance', 'dim_placement', 'dim_word_precision'):
                record[key] = composite[key]
            all_results[mode].append(record)
        except Exception as e:
            errors[mode] += 1
            if errors[mode] <= 3:
                print(f"    ERROR [{mode}] {file_id}: {e}", flush=True)

total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for mode_name, rows in all_results.items():
    print(f"  {mode_name}: {len(rows)} processed, {errors[mode_name]} errors")

# Build one DataFrame per VAD mode, write it to its own CSV under EDA/data,
# and keep it in `dfs` for the analysis sections below.
dfs = {}
for mode_name, rows in all_results.items():
    out_path = BASE / f"EDA/data/sandi_438_vad_{mode_name}.csv"
    dfs[mode_name] = pd.DataFrame(rows)
    dfs[mode_name].to_csv(out_path, index=False)
    print(f"Saved: {out_path}")


# ══════════════════════════════════════════════════════════════════
# ANALYSIS
# ══════════════════════════════════════════════════════════════════

def section(title):
    """Print a report banner: a blank line, then `title` (indented two
    spaces) framed above and below by an 80-character '=' rule."""
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"  {title}")
    print(rule)

# Composite sub-score columns correlated one by one in section 5.
dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']

# ── 1. Overall correlation ──
section("1. OVERALL CORRELATION WITH EXPERT SCORES")
print(f"\n  {'Mode':<12s}  {'Spearman':>10s}  {'p':>10s}  {'Kendall':>10s}  {'p':>10s}  {'N':>5s}")
print(f"  {'-'*55}")

# First pass: compute the statistics for every mode. `rhos` is reused by the
# later sections, so it keeps one Spearman rho per mode.
rhos = {}
corr_rows = []
for mode in MODES:
    df = dfs[mode]
    rho, p_rho = spearmanr(df['expert_score'], df['composite_raw'])
    tau, p_tau = kendalltau(df['expert_score'], df['composite_raw'])
    rhos[mode] = rho
    corr_rows.append((mode, rho, p_rho, tau, p_tau, len(df)))

# Second pass: print. The BEST marker can only be decided once all rhos are
# known; deciding it inside the first loop would flag every running maximum
# (always including the first mode). NaN rhos never qualify as best.
finite_rhos = [value for value in rhos.values() if not np.isnan(value)]
best_rho = max(finite_rhos) if finite_rhos else None
for mode, rho, p_rho, tau, p_tau, n_rows in corr_rows:
    marker = ' <-- BEST' if best_rho is not None and rho == best_rho else ''
    print(f"  {mode:<12s}  {rho:>+10.4f}  {p_rho:>10.2e}  {tau:>+10.4f}  {p_tau:>10.2e}  {n_rows:>5d}{marker}")

# ── 2. Pairwise ordering accuracy ──
section("2. PAIRWISE ORDERING ACCURACY")
print("  (What % of speaker pairs are ordered correctly by the pipeline?)")
for mode in MODES:
    scores = dfs[mode]
    expert_vals = scores['expert_score'].values
    pipeline_vals = scores['composite_raw'].values

    # Every unordered pair (i < j) exactly once: the strict upper triangle.
    first, second = np.triu_indices(len(expert_vals), k=1)
    expert_gap = expert_vals[first] - expert_vals[second]
    pipeline_gap = pipeline_vals[first] - pipeline_vals[second]

    # Pairs with equal expert scores are set aside as ties. Of the rest, a
    # pair is concordant when both gaps have the same sign; everything else
    # (including equal pipeline scores) counts as discordant.
    expert_tie = expert_gap == 0
    same_direction = ~expert_tie & (expert_gap * pipeline_gap > 0)
    tied = int(expert_tie.sum())
    concordant = int(same_direction.sum())
    discordant = len(first) - tied - concordant

    total_pairs = concordant + discordant
    accuracy = concordant / total_pairs * 100 if total_pairs > 0 else 0
    print(f"  {mode:<12s}: {concordant}/{total_pairs} pairs correct ({accuracy:.1f}%)"
          f"  tied={tied}")

# ── 3. Quintile segregation ──
section("3. QUINTILE SEGREGATION")
quintile_labels = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
for mode in MODES:
    df = dfs[mode].copy()
    # Ask qcut for integer bin codes and name the bins afterwards. Passing
    # five fixed labels together with duplicates='drop' raises ValueError as
    # soon as tied expert scores collapse a bin edge, because the label count
    # no longer matches the number of bins.
    bin_codes = pd.qcut(df['expert_score'], 5, labels=False, duplicates='drop')
    df['quintile'] = bin_codes.map(lambda code: f"Q{int(code) + 1}",
                                   na_action='ignore')
    n_bins = int(df['quintile'].nunique())

    print(f"\n  [{mode}]")
    if n_bins < 5:
        print(f"  NOTE: tied expert scores collapsed the quintiles into {n_bins} bins")
    print(f"  {'Q':>4s}  {'N':>4s}  {'Expert':>8s}  {'Composite':>10s}  {'SR':>7s}  "
          f"{'MLU':>7s}  {'PauseDur':>9s}  {'LPR':>7s}")
    print(f"  {'-'*65}")

    # Per-bin means; bins that do not exist (or are empty) are skipped.
    q_means = {}
    for q in quintile_labels:
        sub = df[df['quintile'] == q]
        if sub.empty:
            continue
        q_means[q] = sub['composite_raw'].mean()
        print(f"  {q:>4s}  {len(sub):>4d}  {sub['expert_score'].mean():>8.2f}  "
              f"{sub['composite_raw'].mean():>+10.4f}  {sub['speech_ratio'].mean():>7.3f}  "
              f"{sub['mlu'].mean():>7.2f}  {sub['mean_pause_dur'].mean():>9.4f}  "
              f"{sub['long_pause_ratio'].mean():>7.4f}")

    # Adjacent quintile effect sizes
    print(f"\n  Cohen's d (adjacent quintiles):")
    for q_lo, q_hi in zip(quintile_labels, quintile_labels[1:]):
        a = df[df['quintile'] == q_lo]['composite_raw']
        b = df[df['quintile'] == q_hi]['composite_raw']
        if len(a) == 0 or len(b) == 0:
            continue
        # Pooled SD as the root of the mean of the two variances.
        pooled_std = np.sqrt((a.std()**2 + b.std()**2) / 2)
        d = (b.mean() - a.mean()) / pooled_std if pooled_std > 0 else 0
        # One-sided test: is the lower bin's composite stochastically smaller?
        u, p = mannwhitneyu(a, b, alternative='less')
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
        print(f"    {q_lo}->{q_hi}: d={d:+.3f}  p={p:.4f} {sig}")

    # Spread between the highest and lowest bins that actually exist; the
    # label names those bins, so it stays truthful when bins were dropped.
    if len(q_means) >= 2:
        present = sorted(q_means)
        lo_q, hi_q = present[0], present[-1]
        spread = q_means[hi_q] - q_means[lo_q]
        print(f"  {hi_q}-{lo_q} spread: {spread:+.4f}")

    groups = [df[df['quintile'] == q]['composite_raw'].values
              for q in quintile_labels
              if len(df[df['quintile'] == q]) > 0]
    if len(groups) >= 2:
        H, p_kw = kruskal(*groups)
        print(f"  Kruskal-Wallis H={H:.2f}, p={p_kw:.2e}")

# ── 4. Band classification accuracy ──
# Banner for the band-agreement report; the helper and the per-mode loop
# that produce the report follow directly below.
section("4. FLUENCY BAND CLASSIFICATION")

def expert_band(score):
    """Map a numeric expert score to a band label.

    Returns 'LOW' for scores below 3.0, 'MEDIUM' for scores below 4.5, and
    'HIGH' for everything else (including values that compare below neither
    cut-off, such as NaN).
    """
    for upper_bound, label in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < upper_bound:
            return label
    return 'HIGH'

# Per mode: overall agreement between the expert-derived band and the
# pipeline's 'fluency_band', then a 3x3 confusion matrix (rows = expert band,
# columns = pipeline band).
band_order = ['LOW', 'MEDIUM', 'HIGH']
for mode in MODES:
    banded = dfs[mode].copy()
    banded['expert_band'] = banded['expert_score'].apply(expert_band)
    expert_col = banded['expert_band']
    pipeline_col = banded['fluency_band']

    agree = (expert_col == pipeline_col).sum()
    total = len(banded)
    print(f"\n  [{mode}] Accuracy: {agree}/{total} ({agree/total:.1%})")
    print(f"  {'':>15} Pipeline->  {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    for eb in band_order:
        in_expert_band = expert_col == eb
        counts = [(in_expert_band & (pipeline_col == pb)).sum() for pb in band_order]
        print(f"  Expert {eb:>6}:     {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")

# ── 5. Per-dimension correlations ──
section("5. PER-DIMENSION CORRELATIONS")
header_cells = "".join(f"  {mode:>10s}" for mode in MODES)
print(f"\n  {'Dimension':<25s}{header_cells}")
print(f"  {'-'*70}")
for d in dims:
    cells = []
    for mode in MODES:
        # Rows where either the expert score or this dimension is missing are
        # dropped before correlating.
        v = dfs[mode][['expert_score', d]].dropna()
        r, p = spearmanr(v['expert_score'], v[d])
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ' ns'
        # The marker is padded to three characters so every cell is exactly as
        # wide as its 10-character header; unpadded '*' and '**' markers would
        # shift all following columns to the left.
        cells.append(f"  {r:>+7.4f}{sig:<3s}")
    print(f"  {d:<25s}{''.join(cells)}")

# ── 6. Bootstrap significance ──
section("6. BOOTSTRAP SIGNIFICANCE TEST")

# Restrict every mode to the files present in all modes' results, so each
# comparison against the silero baseline is paired file-by-file.
common_ids = set(dfs[MODES[0]]['file_id'])
for mode in MODES[1:]:
    common_ids &= set(dfs[mode]['file_id'])
common_ids = sorted(common_ids)

base_df = dfs['silero'].set_index('file_id').loc[common_ids]
expert = base_df['expert_score'].values
N = len(expert)
comp = {}
for mode in MODES:
    comp[mode] = dfs[mode].set_index('file_id').loc[common_ids]['composite_raw'].values

# Observed rho of every mode on the common files. Deltas are taken against the
# silero rho from this same subset, so the reported delta and the bootstrap
# p-value describe the same sample. (Using the section-1 silero rho, which is
# computed on all silero rows, mixes two samples whenever a mode dropped files.)
obs_rho = {}
for mode in MODES:
    obs_rho[mode], _ = spearmanr(expert, comp[mode])

n_boot = 10000
rng = np.random.default_rng(42)  # fixed seed: results are reproducible
boot_rho = {m: np.empty(n_boot) for m in MODES}

# Each replicate resamples files with replacement and reuses the same indices
# for every mode, which keeps the comparison paired.
for b in range(n_boot):
    idx = rng.choice(N, N, replace=True)
    for mode in MODES:
        boot_rho[mode][b], _ = spearmanr(expert[idx], comp[mode][idx])

# Per-replicate rho difference to the baseline, for every non-baseline mode.
boot_delta = {m: boot_rho[m] - boot_rho['silero'] for m in MODES if m != 'silero'}

print(f"\n  Paired comparison on N={N} common files:")
print(f"\n  {'Mode':<12s}  {'rho':>8s}  {'95% CI':>20s}  {'delta':>8s}  {'p vs silero':>12s}")
print(f"  {'-'*65}")
for mode in MODES:
    r_obs = obs_rho[mode]
    ci = np.percentile(boot_rho[mode], [2.5, 97.5])
    if mode == 'silero':
        print(f"  {mode:<12s}  {r_obs:>+8.4f}  [{ci[0]:.4f}, {ci[1]:.4f}]  {'baseline':>8s}  {'--':>12s}")
    else:
        delta = r_obs - obs_rho['silero']
        # One-sided p: share of replicates in which the mode does not beat
        # the baseline.
        p_val = (boot_delta[mode] <= 0).mean()
        print(f"  {mode:<12s}  {r_obs:>+8.4f}  [{ci[0]:.4f}, {ci[1]:.4f}]  {delta:>+8.4f}  p={p_val:.4f}")

# ── 7. Score ordering analysis ──
section("7. SCORE ORDERING ANALYSIS")
print("  (Files sorted by expert score — how well does pipeline preserve order?)")

for mode in MODES:
    ranked = dfs[mode].sort_values('expert_score').reset_index(drop=True)

    # Spearman correlation computed on the average ranks of both scores.
    rho_rank, _ = spearmanr(ranked['expert_score'].rank(method='average'),
                            ranked['composite_raw'].rank(method='average'))

    # Monotonicity over neighbours in expert order: only neighbours whose
    # expert score strictly increases are counted, and such a pair is correct
    # when the composite does not decrease.
    expert_sorted = ranked['expert_score'].values
    pipeline_sorted = ranked['composite_raw'].values
    expert_rises = expert_sorted[1:] > expert_sorted[:-1]
    pipeline_holds = pipeline_sorted[1:] >= pipeline_sorted[:-1]
    total_consecutive = int(expert_rises.sum())
    correct_order = int((expert_rises & pipeline_holds).sum())

    mono_pct = correct_order / total_consecutive * 100 if total_consecutive > 0 else 0
    print(f"  {mode:<12s}: rank rho={rho_rank:.4f}  "
          f"monotonic pairs: {correct_order}/{total_consecutive} ({mono_pct:.1f}%)")

# ── 8. Summary ──
section("FINAL SUMMARY")

# NOTE(review): `rhos` holds the section-1 correlations (each mode's own
# result rows), while `N` and the bootstrap p-value come from the common-file
# subset of section 6 — the two agree only when no mode dropped a file.
best_mode = max(rhos, key=lambda name: rhos[name])
print(f"\n  Results on SANDI dev N={N}:")
for mode in MODES:
    flag = ' <-- BEST' if mode == best_mode else ''
    print(f"    {mode:<12s}: rho={rhos[mode]:+.4f}{flag}")

print(f"\n  Best mode: {best_mode} (rho={rhos[best_mode]:.4f})")

if best_mode == 'silero':
    print("  -> Silero baseline remains the best. No VAD swap needed.")
else:
    gain = rhos[best_mode] - rhos['silero']
    # Share of bootstrap replicates in which the best mode fails to beat silero.
    gain_p = (boot_delta[best_mode] <= 0).mean()
    print(f"  Improvement over baseline: +{gain:.4f} (bootstrap p={gain_p:.4f})")
    verdict = (f"  -> SIGNIFICANT improvement. Adopt {best_mode}."
               if gain_p < 0.05
               else "  -> Not significant. More data needed or recalibrate pipeline.")
    print(verdict)

print(f"\nTotal time: {total_time/60:.1f} minutes")