"""
Hybrid V2 Simulation: Model the effect of the anti-merging fix analytically.

METHODOLOGY:
The counterfactual analysis proved that MLU inflation is 100% caused by
segment merging, not by boundary refinement:
  - With Silero segments, hybrid MLU = Silero MLU exactly
  - The boundary refinement extends speech by only 0.58s on average

This script simulates what hybrid_v2 would produce by:
1. Using hybrid's speech boundaries (speech_ratio, speech_duration_sec)
2. Using Silero's segment count (no merging)
3. Recomputing dim_continuity, the weighted composite, and the fluency band
   (all other dimension scores are carried over unchanged from the Silero run)

This is mathematically equivalent to hybrid_v2's constrained refinement +
evidence-gated merge, because V2 preserves Silero's segment count while
keeping MarbleNet's boundary precision.
"""

import sys, os
import numpy as np
import pandas as pd

sys.path.insert(0, os.path.dirname(__file__))

from pathlib import Path
from scipy.stats import spearmanr, mannwhitneyu

# Directory two levels above this script; the input CSV paths are relative to it.
BASE = Path(__file__).parent.parent

# Load existing data: one per-file feature table for each VAD mode.
sil = pd.read_csv(BASE / "EDA/data/sandi_438_vad_silero.csv")
hyb = pd.read_csv(BASE / "EDA/data/sandi_438_vad_hybrid.csv")

# Verify alignment. Columns from `hyb` are later copied into a copy of `sil`,
# so both tables must list the same files in the same order. An explicit raise
# is used instead of `assert` so that the check still runs under `python -O`,
# which strips assert statements.
if list(sil['file_id']) != list(hyb['file_id']):
    raise ValueError("File order mismatch")
N = len(sil)
print(f"SANDI files: {N}")


def expert_band(score):
    """Map a numeric expert score to a band label.

    Scores below 3.0 are 'LOW', scores from 3.0 up to (but not including)
    4.5 are 'MEDIUM', and everything else is 'HIGH'. A NaN score fails both
    "less than" comparisons and therefore also falls through to 'HIGH'.
    """
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < upper_bound:
            return label
    return 'HIGH'


# ══════════════════════════════════════════════════════════════════
# CONSTRUCT HYBRID_V2 DATA
# ══════════════════════════════════════════════════════════════════
#
# V2 = hybrid boundaries + Silero segmentation
#
# Features affected by VAD:
# - speech_ratio: from hybrid (slightly improved boundary precision)
# - speech_duration_sec: from hybrid
# - mlu: word_count / Silero_segments (same segments, same words)
#
# Features PRESERVED from Silero (segmentation-dependent):
# - speech_segments, pause_count, mean_pause_dur, long_pause_ratio, short_pause_share
# - mid_clause_pause_ratio, boundary_pause_ratio, dim_placement
#
# Features NOT affected (from transcription/FA — cached):
# - dim_articulation, dim_dominance, dim_word_precision
# - All FA features

# Start from the Silero table: every column that is not overwritten below
# keeps its Silero value (segment counts, pause features, cached dimensions).
v2 = sil.copy()
v2_label = "hybrid_v2"

# Boundary-derived features are taken from the hybrid run.
for boundary_col in ('speech_ratio', 'speech_duration_sec'):
    v2[boundary_col] = hyb[boundary_col]

# MLU stays at the Silero value because the segmentation is preserved.
# On a copy of `sil` this assignment changes nothing; it is kept to make the
# intent explicit.
v2['mlu'] = sil['mlu']

# Population statistics (used when recomputing the dimensions) and the
# benchmark distribution are read from the saved-model directory that sits
# next to this script.
import json
SAVED_DIR = Path(__file__).parent / "saved_models"

stats = json.loads((SAVED_DIR / "population_stats.json").read_text())
benchmark_dist = np.load(SAVED_DIR / "benchmark_distribution.npy")


def zscore(value, mean, std):
    """Standardise ``value`` against ``mean`` and ``std``.

    Returns ``(value - mean) / std``. When ``std`` is zero or NaN the score
    is undefined, and 0.0 is returned instead.
    """
    spread_is_unusable = std == 0 or np.isnan(std)
    return 0.0 if spread_is_unusable else (value - mean) / std


# Recompute composite for v2
def recompute_composite(row, stats):
    """Recompute dimension scores, the composite and the band for one file.

    Only ``dim_continuity`` is recalculated, from the z-scores of
    ``speech_ratio`` and ``mlu`` plus half the z-score of a derived speech
    rate. The other five dimension scores are read straight from ``row``.

    Args:
        row: Mapping of feature name to value for a single file. It is only
            accessed through ``.get``, so missing keys fall back to defaults.
        stats: Mapping with ``'means'`` and ``'stds'`` sub-mappings keyed by
            feature name.

    Returns:
        dict holding the six ``dim_*`` scores and ``composite_raw`` (each
        rounded to 4 decimals) plus ``fluency_band`` ('LOW', 'MEDIUM' or
        'HIGH').
    """
    means = stats['means']
    stds = stats['stds']

    def feature_z(name):
        # z-score of a row feature against the population statistics
        return zscore(row.get(name, 0), means.get(name, 0), stds.get(name, 1))

    # Total duration is recovered as speech duration / speech ratio; the
    # ratio is floored at 0.01 to avoid dividing by (near) zero.
    total_dur = row.get('speech_duration_sec', 1) / max(row.get('speech_ratio', 0.01), 0.01)

    # Word count falls back to MLU * segment count when it is absent or zero.
    n_words = row.get('word_count', 0)
    if n_words == 0:
        n_words = row.get('mlu', 0) * row.get('speech_segments', 1)

    rate = n_words / max(total_dur, 0.01)
    rate_z = zscore(rate, means.get('speech_rate', 1.0), stds.get('speech_rate', 0.5))

    continuity = feature_z('speech_ratio') + feature_z('mlu') + 0.5 * rate_z

    # All dimensions except continuity are carried over from the row as-is.
    unrounded = (
        ('dim_continuity', continuity),
        ('dim_pause_quality', row.get('dim_pause_quality', 0)),
        ('dim_articulation', row.get('dim_articulation', 0)),
        ('dim_dominance', row.get('dim_dominance', 0)),
        ('dim_placement', row.get('dim_placement', 0)),
        ('dim_word_precision', row.get('dim_word_precision', 0)),
    )
    dims = {name: round(value, 4) for name, value in unrounded}

    # Weighted mean of the (already rounded) dimension scores.
    weights = {
        'dim_continuity': 3.0,
        'dim_pause_quality': 3.0,
        'dim_placement': 2.0,
        'dim_articulation': 2.0,
        'dim_dominance': 2.0,
        'dim_word_precision': 1.0,
    }
    total_w = sum(weights.values())
    composite_raw = sum(dims[name] * weights[name] / total_w for name in dims)

    # Rule-based band: the HIGH rule is checked first, then any LOW rule.
    sr = row.get('speech_ratio', 0)
    mlu = row.get('mlu', 0)
    lpr = row.get('long_pause_ratio', 0)

    meets_high = mlu >= 7 and sr >= 0.75 and lpr <= 0.15
    meets_low = (mlu < 3 and sr < 0.55) or mlu < 2 or sr < 0.35
    if meets_high:
        band = 'HIGH'
    elif meets_low:
        band = 'LOW'
    else:
        band = 'MEDIUM'

    result = dict(dims)
    result['composite_raw'] = round(composite_raw, 4)
    result['fluency_band'] = band
    return result


# Apply recomputation: one result dict per row of v2, in row order
new_dims = [recompute_composite(dict(row), stats) for _, row in v2.iterrows()]

# Write every recomputed column back into v2. `.values` hands over a plain
# array, so the assignment is positional rather than index-aligned.
new_df = pd.DataFrame(new_dims)
for col in new_df.columns:
    v2[col] = new_df[col].values

# Derive the expert band for all three tables so they can be compared
for frame in (v2, sil, hyb):
    frame['expert_band'] = frame['expert_score'].apply(expert_band)

BANDS = ['LOW', 'MEDIUM', 'HIGH']
dfs = {'silero': sil, 'hybrid_v1': hyb, 'hybrid_v2': v2}


def section(title):
    print(f"\n{'='*80}")
    print(f"  {title}")
    print(f"{'='*80}")


# ══════════════════════════════════════════════════════════════════
# 1. OVERALL METRICS
# ══════════════════════════════════════════════════════════════════
section("1. OVERALL METRICS")

print(f"\n  {'Mode':<12s}  {'ρ':>8s}  {'Accuracy':>10s}  {'Macro F1':>10s}")
print(f"  {'-'*45}")

# Suffix printed after each row; any mode not listed here is the baseline.
mode_markers = {"hybrid_v2": " ◄ NEW", "hybrid_v1": " (old)"}

rhos = {}  # Spearman ρ per mode, read again by the final verdict section
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    expert_labels = df['expert_band']
    pipeline_labels = df['fluency_band']

    rho, _ = spearmanr(df['expert_score'], df['composite_raw'])
    rhos[mode] = rho

    agree = (expert_labels == pipeline_labels).sum()
    acc = agree / len(df)

    # One-vs-rest F1 per band, averaged without weighting (macro F1)
    f1s = []
    for band in BANDS:
        expert_hit = expert_labels == band
        pipeline_hit = pipeline_labels == band
        tp = (expert_hit & pipeline_hit).sum()
        fp = (~expert_hit & pipeline_hit).sum()
        fn = (expert_hit & ~pipeline_hit).sum()
        n_predicted = tp + fp
        n_actual = tp + fn
        prec = tp / n_predicted if n_predicted > 0 else 0
        rec = tp / n_actual if n_actual > 0 else 0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
        f1s.append(f1)
    macro_f1 = np.mean(f1s)

    marker = mode_markers.get(mode, " (baseline)")
    print(f"  {mode:<12s}  {rho:>+8.4f}  {acc:>10.1%}  {macro_f1:>10.3f}{marker}")


# ══════════════════════════════════════════════════════════════════
# 2. CONFUSION MATRICES
# ══════════════════════════════════════════════════════════════════
section("2. CONFUSION MATRICES")

# One 3x3 matrix per mode: rows are expert bands, columns are pipeline bands.
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    agree = (df['expert_band'] == df['fluency_band']).sum()
    print(f"\n  [{mode}] Accuracy: {agree}/{len(df)} ({agree/len(df):.1%})")
    print(f"  {'':>16} Pipeline→  {'LOW':>5} {'MED':>5} {'HIGH':>5}")

    # Arrow that flags the rows of the new mode. It was previously computed
    # but never included in the printed line.
    marker = " ←" if mode == "hybrid_v2" else ""
    for eb in BANDS:
        row = [
            ((df['expert_band'] == eb) & (df['fluency_band'] == pb)).sum()
            for pb in BANDS
        ]
        print(f"  Expert {eb:>6}:     {row[0]:>5} {row[1]:>5} {row[2]:>5}{marker}")


# ══════════════════════════════════════════════════════════════════
# 3. BAND-LEVEL P/R/F1
# ══════════════════════════════════════════════════════════════════
section("3. PER-BAND PRECISION / RECALL / F1")

# One-vs-rest precision, recall and F1 for each expert band, per mode.
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    expert_labels = df['expert_band']
    pipeline_labels = df['fluency_band']

    print(f"\n  [{mode}]")
    print(f"  {'Band':<8s}  {'Prec':>8s}  {'Recall':>8s}  {'F1':>8s}  {'Support':>8s}")
    print(f"  {'-'*45}")
    for band in BANDS:
        expert_hit = expert_labels == band
        pipeline_hit = pipeline_labels == band
        tp = (expert_hit & pipeline_hit).sum()
        fp = (~expert_hit & pipeline_hit).sum()
        fn = (expert_hit & ~pipeline_hit).sum()
        n_predicted = tp + fp
        n_actual = tp + fn
        # Ratios with an empty denominator are reported as 0
        prec = tp / n_predicted if n_predicted > 0 else 0
        rec = tp / n_actual if n_actual > 0 else 0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
        sup = expert_hit.sum()  # number of files the experts put in this band
        print(f"  {band:<8s}  {prec:>8.3f}  {rec:>8.3f}  {f1:>8.3f}  {sup:>8d}")


# ══════════════════════════════════════════════════════════════════
# 4. KEY METRICS COMPARISON
# ══════════════════════════════════════════════════════════════════
section("4. SEGMENT COUNT & MLU COMPARISON")

print(f"\n  {'Mode':<12s}  {'Segments':>10s}  {'MLU':>10s}  {'SR':>10s}  {'PauseDur':>10s}  {'LPR':>10s}")
print(f"  {'-'*65}")

# Column means of the segmentation-related features, one row per mode.
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    cells = [
        f"{mode:<12s}",
        f"{df['speech_segments'].mean():>10.2f}",
        f"{df['mlu'].mean():>10.2f}",
        f"{df['speech_ratio'].mean():>10.4f}",
        f"{df['mean_pause_dur'].mean():>10.4f}",
        f"{df['long_pause_ratio'].mean():>10.4f}",
    ]
    print("  " + "  ".join(cells))


# ══════════════════════════════════════════════════════════════════
# 5. MEDIUM→HIGH FALSE POSITIVE REDUCTION
# ══════════════════════════════════════════════════════════════════
section("5. MEDIUM→HIGH FALSE POSITIVE ANALYSIS")

# Counts of the three misclassification types of interest, per mode.
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    exp_band = df['expert_band']
    pipe_band = df['fluency_band']

    mh = ((exp_band == 'MEDIUM') & (pipe_band == 'HIGH')).sum()  # over-rated
    hm = ((exp_band == 'HIGH') & (pipe_band == 'MEDIUM')).sum()  # under-rated
    lr = ((exp_band == 'LOW') & (pipe_band != 'LOW')).sum()      # LOW missed
    print(f"  {mode:<12s}  MED→HIGH: {mh:>4d}  HIGH→MED: {hm:>4d}  LOW misclassified: {lr:>4d}")


# ══════════════════════════════════════════════════════════════════
# 6. THRESHOLD CROSSINGS
# ══════════════════════════════════════════════════════════════════
section("6. THRESHOLD CROSSINGS (HIGH rule: MLU≥7 + SR≥0.75 + LPR≤0.15)")

# For expert-MEDIUM files: how many satisfy all three HIGH-rule thresholds,
# next to how many the pipeline actually labelled HIGH.
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    is_medium = df['expert_band'] == 'MEDIUM'
    med = df[is_medium]

    passes_high_rule = (
        (med['mlu'] >= 7)
        & (med['speech_ratio'] >= 0.75)
        & (med['long_pause_ratio'] <= 0.15)
    )
    n_all3 = passes_high_rule.sum()
    n_mh = (is_medium & (df['fluency_band'] == 'HIGH')).sum()
    print(f"  {mode:<12s}  MEDIUM meeting all 3: {n_all3:>4d}/{len(med)}  MEDIUM→HIGH: {n_mh:>4d}")


# ══════════════════════════════════════════════════════════════════
# 7. BAND SEPARATION (Cohen's d)
# ══════════════════════════════════════════════════════════════════
section("7. COMPOSITE SCORE SEPARATION (Cohen's d)")

print(f"\n  {'Mode':<12s}  {'LOW→MED':>10s}  {'MED→HIGH':>10s}  {'LOW→HIGH':>10s}")
print(f"  {'-'*50}")

band_pairs = [('LOW', 'MEDIUM'), ('MEDIUM', 'HIGH'), ('LOW', 'HIGH')]
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    parts = []
    for lo, hi in band_pairs:
        a = df.loc[df['expert_band'] == lo, 'composite_raw']
        b = df.loc[df['expert_band'] == hi, 'composite_raw']

        # Pooled SD is the root of the plain average of the two variances.
        # NOTE(review): this is not weighted by group size — confirm that is
        # intended if the bands have very different numbers of files.
        var_lo = a.std() ** 2
        var_hi = b.std() ** 2
        pooled = np.sqrt((var_lo + var_hi) / 2)

        # A zero or NaN pooled SD (e.g. an empty band) is reported as d = 0
        if pooled > 0:
            d = (b.mean() - a.mean()) / pooled
        else:
            d = 0
        parts.append(f"{d:>+10.3f}")
    print(f"  {mode:<12s}  {'  '.join(parts)}")


# ══════════════════════════════════════════════════════════════════
# 8. PER-DIMENSION CORRELATION
# ══════════════════════════════════════════════════════════════════
section("8. PER-DIMENSION CORRELATIONS")

dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']

print(f"\n  {'Dimension':<25s}  {'silero':>10s}  {'hybrid_v2':>10s}  {'hybrid_v1':>10s}")
print(f"  {'-'*60}")

# Spearman correlation of each dimension with the expert score, per mode.
# Rows with a missing value in either column are dropped first.
for d in dims:
    parts = []
    for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
        df = dfs[mode]
        v = df[['expert_score', d]].dropna()
        r, p = spearmanr(v['expert_score'], v[d])

        # Significance stars; anything at or above 0.05 (or NaN) is ' ns'
        if p < 0.001:
            sig = '***'
        elif p < 0.01:
            sig = '**'
        elif p < 0.05:
            sig = '*'
        else:
            sig = ' ns'
        parts.append(f"{r:>+7.4f}{sig}")
    print(f"  {d:<25s}  {'  '.join(parts)}")


# ══════════════════════════════════════════════════════════════════
# 9. BOOTSTRAP SIGNIFICANCE
# ══════════════════════════════════════════════════════════════════
# Number of bootstrap resamples. The section title is derived from this value
# so the printed iteration count always matches what is actually run (the
# title previously claimed 10,000 while 2,000 were executed).
n_boot = 2000  # reduced for speed; 2K is sufficient for CI estimation
section(f"9. BOOTSTRAP: hybrid_v2 vs silero ({n_boot:,} iterations)")

expert = sil['expert_score'].values
comp_s = sil['composite_raw'].values
comp_v = v2['composite_raw'].values

# Point estimates on the full sample
rho_s, _ = spearmanr(expert, comp_s)
rho_v, _ = spearmanr(expert, comp_v)

# Paired bootstrap: the same resampled file indices are used for both modes,
# so each delta compares the two composites on an identical set of files.
rng = np.random.default_rng(42)  # fixed seed for reproducible output
deltas = np.zeros(n_boot)
for i in range(n_boot):
    idx = rng.choice(N, N, replace=True)
    rs, _ = spearmanr(expert[idx], comp_s[idx])
    rv, _ = spearmanr(expert[idx], comp_v[idx])
    deltas[i] = rv - rs

# One-sided p-value: share of resamples in which V2 did not beat Silero
p_val = (deltas <= 0).mean()
ci = np.percentile(deltas, [2.5, 97.5])

print(f"\n  Silero:     ρ = {rho_s:.4f}")
print(f"  Hybrid V2:  ρ = {rho_v:.4f}")
print(f"  Δρ = {rho_v - rho_s:+.4f}  95% CI [{ci[0]:+.4f}, {ci[1]:+.4f}]  p={p_val:.4f}")

if p_val < 0.05:
    print(f"  → SIGNIFICANT improvement")
elif rho_v >= rho_s:
    print(f"  → Improved but not significant")
else:
    print(f"  → No improvement detected")


# ══════════════════════════════════════════════════════════════════
# 10. FINAL VERDICT
# ══════════════════════════════════════════════════════════════════
section("FINAL VERDICT")

df_sil, df_v2, df_v1 = (dfs[key] for key in ('silero', 'hybrid_v2', 'hybrid_v1'))

# Band agreement rate (accuracy) per mode
acc_s, acc_v, acc_v1 = (
    (frame['expert_band'] == frame['fluency_band']).sum() / len(frame)
    for frame in (df_sil, df_v2, df_v1)
)

# Expert-MEDIUM files that the pipeline labelled HIGH, per mode
mh_s, mh_v, mh_v1 = (
    ((frame['expert_band'] == 'MEDIUM') & (frame['fluency_band'] == 'HIGH')).sum()
    for frame in (df_sil, df_v2, df_v1)
)

# Column means shown in the last two rows of the summary table
seg_s, seg_v, seg_v1 = (
    frame['speech_segments'].mean() for frame in (df_sil, df_v2, df_v1)
)
mlu_s, mlu_v, mlu_v1 = (
    frame['mlu'].mean() for frame in (df_sil, df_v2, df_v1)
)

print(f"""
  ┌──────────────────────────┬───────────┬───────────┬───────────┐
  │ Metric                   │   Silero  │ Hybrid V2 │ Hybrid V1 │
  ├──────────────────────────┼───────────┼───────────┼───────────┤
  │ Spearman ρ               │  {rhos['silero']:+.4f}  │  {rhos['hybrid_v2']:+.4f}  │  {rhos['hybrid_v1']:+.4f}  │
  │ Accuracy                 │   {acc_s:.1%}   │   {acc_v:.1%}   │   {acc_v1:.1%}   │
  │ MEDIUM→HIGH FP           │   {mh_s:>5d}   │   {mh_v:>5d}   │   {mh_v1:>5d}   │
  │ Mean segments            │   {seg_s:>5.2f}   │   {seg_v:>5.2f}   │   {seg_v1:>5.2f}   │
  │ Mean MLU                 │   {mlu_s:>5.2f}   │   {mlu_v:>5.2f}   │   {mlu_v1:>5.2f}   │
  └──────────────────────────┴───────────┴───────────┴───────────┘
""")

# Each line below is printed only when V2 is strictly better than V1 on that
# metric. The ρ check uses a strict `>` (it was `>=`) so a tie is no longer
# reported as "IMPROVED", matching the accuracy check and the recommendation
# logic further down.
print("  CHANGES FROM V1 TO V2:")
if mh_v < mh_v1:
    # mh_v1 is necessarily positive here, so the percentage is well-defined
    print(f"  ✅ MEDIUM→HIGH false positives REDUCED: {mh_v1} → {mh_v} ({mh_v1-mh_v} fewer, {(mh_v1-mh_v)/mh_v1*100:.0f}% reduction)")
if rhos['hybrid_v2'] > rhos['hybrid_v1']:
    print(f"  ✅ Spearman ρ IMPROVED: {rhos['hybrid_v1']:+.4f} → {rhos['hybrid_v2']:+.4f}")
if acc_v > acc_v1:
    print(f"  ✅ Accuracy IMPROVED: {acc_v1:.1%} → {acc_v:.1%}")

# Any strictly higher ρ counts as an improvement; the ±0.005 equivalence
# tolerance is only consulted when V2 is not above Silero.
print("\n  COMPARISON TO SILERO BASELINE:")
if rhos['hybrid_v2'] > rhos['silero']:
    print(f"  ✅ ρ improvement over Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f} (Δ={rhos['hybrid_v2']-rhos['silero']:+.4f})")
elif abs(rhos['hybrid_v2'] - rhos['silero']) < 0.005:
    print(f"  → ρ equivalent to Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f}")
else:
    print(f"  ⚠ ρ slightly below Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f}")

if mh_v <= mh_s:
    print(f"  ✅ MEDIUM→HIGH FP at or below Silero: {mh_v} vs {mh_s}")
else:
    print(f"  → MEDIUM→HIGH FP: {mh_v} (Silero: {mh_s})")

if acc_v >= acc_s:
    print(f"  ✅ Accuracy at or above Silero: {acc_v:.1%} vs {acc_s:.1%}")
else:
    print(f"  → Accuracy: {acc_v:.1%} (Silero: {acc_s:.1%})")

# Recommendation: V2 is a "safe upgrade" when it is within 0.01 of Silero on
# ρ and accuracy and has no more MEDIUM→HIGH false positives. Otherwise it is
# a secondary option if it strictly beats V1 on both false positives and ρ.
print(f"\n  RECOMMENDATION:")
if rhos['hybrid_v2'] >= rhos['silero'] - 0.01 and mh_v <= mh_s and acc_v >= acc_s - 0.01:
    print(f"  → hybrid_v2 is a safe upgrade: fixes V1 inflation without degrading baseline")
elif mh_v < mh_v1 and rhos['hybrid_v2'] > rhos['hybrid_v1']:
    print(f"  → hybrid_v2 fixes V1's problems but doesn't beat Silero — safe as secondary option")
else:
    print(f"  → Keep Silero as primary baseline")