"""
Process all 438 SANDI dev files through the full 6-dimension pipeline.
Uses Apple MPS acceleration for WhisperX where supported.

Usage: python run_sandi_438.py
Output: ../EDA/data/sandi_438_full_pipeline.csv
"""

import sys, os, time, warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(__file__))

from pathlib import Path
from scipy.stats import spearmanr, kendalltau
from scipy.optimize import minimize
from sklearn.model_selection import KFold

# Parent of the directory that contains this script; every data path used
# below is resolved against it.
BASE = Path(__file__).parent.parent

# ── Load file list ──
# The file list is a CSV with one row per recording to process.
filelist_csv = BASE / "EDA/data/sandi_dev_438_filelist.csv"
filelist = pd.read_csv(filelist_csv)
print(f"Files to process: {len(filelist)}")

# ── Load pipeline modules ──
# These project-local imports are resolved through the sys.path entry added
# near the top of the file (the script's own directory). They are placed here,
# between the two banner prints, rather than with the other imports.
# NOTE(review): presumably importing these modules loads model weights as a
# side effect (hence the "Loading models..." banner) — confirm before
# reordering or hoisting them.
print("Loading models...", flush=True)
from pipeline.vad import run_vad
from pipeline.transcribe import transcribe_and_align
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.")

# ── Process all files ──
# Every row of `filelist` is run through, in order: run_vad →
# transcribe_and_align → classify_pauses → compute_fa_features →
# compute_syntactic_features → predict → compute_composite. One flat result
# dict is appended to `results` per successfully processed file. Missing audio
# files and pipeline exceptions are counted in `errors` and the run continues
# (best effort), so a single bad file cannot abort the whole batch.
results = []
errors = 0
start_time = time.time()
total_files = len(filelist)
max_reported = 10  # only the first few problems are printed, to keep the log readable

# enumerate() supplies the 1-based position, so progress and ETA do not depend
# on the DataFrame index labels happening to be 0..N-1.
for n, (_, row) in enumerate(filelist.iterrows(), start=1):
    audio_path = row['audio_path']
    if not os.path.exists(audio_path):
        # Fall back to interpreting the listed path relative to BASE.
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        # Report the skipped file instead of only bumping the counter, so the
        # final "N errors" figure can be traced back to concrete files.
        errors += 1
        if errors <= max_reported:
            print(f"    MISSING {row['file_id']}: {audio_path}")
        continue

    elapsed = time.time() - start_time
    # Clamp the denominators so the very first iterations do not produce a
    # division by (almost) zero or an absurd ETA.
    rate = n / max(elapsed, 1)
    eta = (total_files - n) / max(rate, 0.01)

    # Progress line for the first three files, every tenth file, and the last.
    if n % 10 == 0 or n <= 3 or n == total_files:
        print(f"  [{n}/{total_files}] {row['file_id']} (expert={row['expert_score']}) "
              f"[{elapsed/60:.1f}m elapsed, ~{eta/60:.0f}m remaining]", flush=True)

    try:
        vad = run_vad(audio_path)
        tx = transcribe_and_align(audio_path)
        words = tx['words']
        # Words per speech segment; stored on the VAD dict so it is merged
        # into `all_features` below. max(..., 1) avoids dividing by zero.
        vad['mlu'] = round(len(words) / max(vad['speech_segments'], 1), 2)
        placement = classify_pauses(words, vad)
        fa = compute_fa_features(words, vad['total_duration_sec'])
        syn = compute_syntactic_features(words, tx['transcript'])
        # On key collisions the later dict wins (syn > fa > placement > vad).
        all_features = {**vad, **placement, **fa, **syn}
        predictions = predict(all_features)
        composite = compute_composite(all_features, predictions)

        results.append({
            'file_id': row['file_id'],
            'expert_score': row['expert_score'],
            'composite_raw': composite['composite_raw'],
            'composite_percentile': composite['composite_percentile'],
            'fluency_band': composite['fluency_band'],
            'speech_ratio': vad['speech_ratio'],
            'mlu': vad['mlu'],
            'word_count': len(words),
            'pause_count': vad['pause_count'],
            'mean_pause_dur': vad['mean_pause_duration_sec'],
            'long_pause_ratio': vad['long_pause_ratio'],
            'short_pause_share': vad['short_pause_share'],
            'boundary_pause_ratio': placement['boundary_pause_ratio'],
            'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
            # Model outputs are optional: absent keys become NaN / '?'.
            'articulation_pred': predictions.get('articulation_ordinal_pred', np.nan),
            'articulation_label': predictions.get('articulation_ordinal_label', '?'),
            'pause_freq_label': predictions.get('pause_freq_ordinal_label', '?'),
            'pause_dur_label': predictions.get('pause_dur_ordinal_label', '?'),
            'pause_place_label': predictions.get('pause_place_ordinal_label', '?'),
            'cognitive_load_label': predictions.get('cognitive_load_ordinal_label', '?'),
            'utt_constraints_label': predictions.get('utterance_constraints_ordinal_label', '?'),
            'prop_unplanned': predictions.get('prop_unplanned_pred', np.nan),
            'prop_planned': predictions.get('prop_planned_pred', np.nan),
            'prop_neutral': predictions.get('prop_neutral_pred', np.nan),
            'dim_continuity': composite['dim_continuity'],
            'dim_pause_quality': composite['dim_pause_quality'],
            'dim_articulation': composite['dim_articulation'],
            'dim_dominance': composite['dim_dominance'],
            'dim_placement': composite['dim_placement'],
            'dim_word_precision': composite['dim_word_precision'],
        })
    except Exception as e:
        # Deliberately broad: this is the batch boundary, and any per-file
        # failure should be counted and skipped rather than stop the run.
        errors += 1
        if errors <= max_reported:
            print(f"    ERROR {row['file_id']}: {str(e)[:80]}")

total_time = time.time() - start_time
print(f"\nDone: {len(results)}/{len(filelist)} processed, {errors} errors, {total_time/60:.1f} minutes")

# ── Save results ──
# Stop here when nothing was processed. A DataFrame built from an empty list
# has no columns, so the analysis below would fail with a KeyError on
# 'expert_score', and writing it would replace any earlier good output file
# with an empty one.
if not results:
    sys.exit("No files were processed successfully; nothing to save or analyse.")

# One row per successfully processed file, written without the index column.
sf = pd.DataFrame(results)
out_path = BASE / "EDA/data/sandi_438_full_pipeline.csv"
sf.to_csv(out_path, index=False)
print(f"Saved: {out_path}")

# ══════════════════════════════════════════════════════════════
# ANALYSIS
# ══════════════════════════════════════════════════════════════

# Names of the six composite dimension columns; reused by the weight
# optimisation further down, so the order here fixes the weight order there.
dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']

print(f"\n{'='*70}")
print(f"SANDI FULL VALIDATION — 6-DIMENSION PIPELINE (N={len(sf)})")
print(f"{'='*70}")

# 1. Overall correlation
# Rows with a missing value are dropped first, as in the per-dimension section
# below: the scipy rank correlations default to nan_policy='propagate', so a
# single NaN would otherwise turn the whole statistic into NaN.
scored = sf[['expert_score', 'composite_raw']].dropna()
rho, p = spearmanr(scored['expert_score'], scored['composite_raw'])
tau, tp = kendalltau(scored['expert_score'], scored['composite_raw'])
print(f"\n  Spearman rho:  {rho:.3f} (p={p:.2e})")
print(f"  Kendall tau:   {tau:.3f} (p={tp:.2e})")

# 2. Per-dimension
# Spearman rho of each dimension against the expert score, with significance
# stars. Dimensions with 10 or fewer complete pairs are not reported.
print(f"\n  Per-dimension correlations with expert score:")
for d in dims:
    v = sf[['expert_score', d]].dropna()
    if len(v) > 10:
        r, p_ = spearmanr(v['expert_score'], v[d])
        sig = '***' if p_ < 0.001 else '**' if p_ < 0.01 else '*' if p_ < 0.05 else 'ns'
        print(f"    {d:<25s}  rho={r:+.3f}  {sig}")

# 3. Raw features
# Same complete-pairs treatment as above so one missing feature value does not
# blank out the correlation for that feature.
print(f"\n  Raw feature correlations:")
for feat in ['speech_ratio', 'mlu', 'mean_pause_dur', 'long_pause_ratio', 'mid_clause_pause_ratio']:
    v = sf[['expert_score', feat]].dropna()
    r, _ = spearmanr(v['expert_score'], v[feat])
    print(f"    {feat:<25s}  rho={r:+.3f}")

# 4. Band agreement
def expert_band(score):
    """Map a numeric expert score onto a coarse band label.

    Scores below 3.0 are 'LOW', scores from 3.0 up to (but excluding) 4.5 are
    'MEDIUM', and everything else is 'HIGH'. A value that compares false
    against both cut-offs (e.g. NaN) therefore also lands in 'HIGH'.
    """
    # Cut-offs are checked in ascending order; the first one the score falls
    # under decides the band.
    for upper_bound, band in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < upper_bound:
            return band
    return 'HIGH'

# Band labels in display order, shared by the confusion matrix and the
# per-band summary below.
band_order = ['LOW', 'MEDIUM', 'HIGH']

# Derive the expert band per file and count exact matches with the pipeline's
# own band.
sf['expert_band'] = sf['expert_score'].apply(expert_band)
band_match = sf['expert_band'] == sf['fluency_band']
agree = band_match.sum()
print(f"\n  Band agreement: {agree}/{len(sf)} ({agree/len(sf):.1%})")

# Rows are expert bands, columns are pipeline bands.
print(f"\n  Confusion matrix:")
print(f"  {'':>15} Pipeline→ {'LOW':>6} {'MED':>6} {'HIGH':>6}")
for eb in band_order:
    is_expert_band = sf['expert_band'] == eb
    counts = [(is_expert_band & (sf['fluency_band'] == pb)).sum() for pb in band_order]
    print(f"  Expert {eb:>6}:  {counts[0]:>6} {counts[1]:>6} {counts[2]:>6}")

# 5. Expert score by pipeline band
# Count, mean and range of expert scores inside each pipeline band; bands with
# no files are omitted.
print(f"\n  Expert score by pipeline band:")
for pb in band_order:
    band_scores = sf.loc[sf['fluency_band'] == pb, 'expert_score']
    if len(band_scores) == 0:
        continue
    print(f"    {pb}: n={len(band_scores)}, expert_mean={band_scores.mean():.2f} "
          f"[{band_scores.min():.1f}-{band_scores.max():.1f}]")

# 6. Weight optimization
banner = '=' * 70
print(f"\n{banner}")
print(f"WEIGHT OPTIMIZATION (z-scored dimensions)")
print(f"{banner}")

# Standardise each dimension column (pandas' default sample std). A column
# whose std is not strictly positive — constant, or undefined — is replaced by
# zeros so it cannot contribute to any weighted sum.
dim_data = sf[dims].copy()
for col in dims:
    center = dim_data[col].mean()
    spread = dim_data[col].std()
    dim_data[col] = ((dim_data[col] - center) / spread) if spread > 0 else 0

# Design matrix (files × dimensions, column order = `dims`) and target vector.
X = dim_data.values
y = sf['expert_score'].values

# Baseline weightings, both normalised to sum to 1: uniform, and a fixed
# hand-set vector labelled "literature" in the reports below.
eq_w = np.ones(6) / 6
lit_raw = np.array([2, 2, 1, 1, 3, 2], dtype=float)
lit_w = lit_raw / lit_raw.sum()

def neg_rho(w):
    """Objective for the optimiser: negated Spearman rho of a weighted score.

    The candidate weights are made non-negative and rescaled to sum to 1, the
    module-level matrix ``X`` is projected onto them, and the result is rank
    correlated with the module-level target ``y``. The correlation is negated
    so that minimising this function maximises rho. An undefined (NaN)
    correlation is reported as 0.
    """
    magnitudes = np.abs(w)
    weights = magnitudes / magnitudes.sum()
    rho_w, _ = spearmanr(X @ weights, y)
    if np.isnan(rho_w):
        return 0
    return -rho_w

# Multi-start search on the full data set: 200 Nelder-Mead runs, each started
# from a Dirichlet draw seeded by the run number, keeping the weights of the
# run that reaches the highest rho.
best_w, best_rho = None, -1
for seed in range(200):
    start_point = np.random.default_rng(seed).dirichlet(np.ones(6))
    res = minimize(
        neg_rho,
        start_point,
        method='Nelder-Mead',
        options={'maxiter': 10000, 'xatol': 1e-8},
    )
    run_rho = -res.fun  # the objective is the negated rho
    if run_rho > best_rho:
        best_rho = run_rho
        # Store the weights in the same normalised form the objective used.
        best_w = np.abs(res.x) / np.abs(res.x).sum()

print(f"  Optimized rho: {best_rho:.3f}")

# Cross-validate
# 5 shuffled folds with a fixed seed. In each fold the weights are re-fitted on
# the training rows only (50 Nelder-Mead restarts), then all three weightings
# are scored on the held-out rows.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv = {'equal': [], 'literature': [], 'optimized': []}
for tri, tei in kf.split(X):
    X_train, y_train = X[tri], y[tri]
    X_test, y_test = X[tei], y[tei]

    def nr(w):
        """Negated Spearman rho on this fold's training rows (NaN → 0)."""
        wn = np.abs(w) / np.abs(w).sum()
        fold_rho, _ = spearmanr(X_train @ wn, y_train)
        return 0 if np.isnan(fold_rho) else -fold_rho

    bw, br = None, -1
    for seed in range(50):
        rng = np.random.default_rng(seed)
        res = minimize(nr, rng.dirichlet(np.ones(6)), method='Nelder-Mead', options={'maxiter': 5000})
        fitted_rho = -res.fun
        if fitted_rho > br:
            br = fitted_rho
            bw = np.abs(res.x) / np.abs(res.x).sum()

    # Out-of-fold rho per weighting; an undefined correlation is stored as 0.
    for label, w in (('equal', eq_w), ('literature', lit_w), ('optimized', bw)):
        held_out_rho, _ = spearmanr(X_test @ w, y_test)
        cv[label].append(0 if np.isnan(held_out_rho) else held_out_rho)

print(f"\n  5-fold CV (out-of-fold rho):")
print(f"  {'Method':<15s}  {'Mean':>7s}  {'Std':>7s}  Folds")
for label in ('equal', 'literature', 'optimized'):
    rhos = cv[label]
    print(f"  {label:<15s}  {np.mean(rhos):>7.3f}  {np.std(rhos):>7.3f}  {[round(v, 3) for v in rhos]}")

# Side-by-side table of the three weight vectors, one row per dimension.
print(f"\n  Weight comparison:")
print(f"  {'Dimension':<25s}  {'Equal':>8s}  {'Lit':>8s}  {'Opt':>8s}")
for dim_name, w_equal, w_lit, w_opt in zip(dims, eq_w, lit_w, best_w):
    print(f"  {dim_name:<25s}  {w_equal:>8.3f}  {w_lit:>8.3f}  {w_opt:>8.3f}")

# Recommendation
# Mean out-of-fold rho per weighting. A method must beat the next simpler one
# by more than 0.03 to be recommended.
ce, cl, co = (np.mean(cv[key]) for key in ('equal', 'literature', 'optimized'))
print(f"\n  Equal={ce:.3f}  Literature={cl:.3f}  Optimized={co:.3f}")
if co > cl + 0.03:
    verdict = f"  → USE OPTIMIZED (+{co - cl:.3f})"
elif cl > ce + 0.03:
    verdict = f"  → KEEP LITERATURE (+{cl - ce:.3f})"
else:
    # NOTE(review): this fallback is also reached when equal weights beat the
    # literature weights by more than 0.03, in which case the wording
    # "Differences < 0.03" does not hold — confirm the intended message.
    verdict = f"  → Differences < 0.03 — keep literature for interpretability"
print(verdict)

# Compare with 4-dim Colab
# Optional comparison against composite scores stored in a separate CSV. The
# file is looked up first so that its absence does not crash the script at
# the very end of a long run; the comparison is skipped with a message instead.
colab_path = BASE / "EDA/data/sandi_validation_full.csv"
if colab_path.exists():
    colab = pd.read_csv(colab_path)
    # merge() defaults to an inner join, so only files present in both result
    # sets are compared. The suffixes tell the two 'composite_raw' columns apart.
    merged = sf.merge(colab[['file_id', 'composite_raw']], on='file_id', suffixes=('_6dim', '_4dim'))
    if len(merged) > 0:
        rho_6, _ = spearmanr(merged['expert_score'], merged['composite_raw_6dim'])
        rho_4, _ = spearmanr(merged['expert_score'], merged['composite_raw_4dim'])
        print(f"\n  4-dim (Colab) vs 6-dim (local) on {len(merged)} files:")
        print(f"    4-dim rho: {rho_4:.3f}")
        print(f"    6-dim rho: {rho_6:.3f}")
        print(f"    Gain: {rho_6 - rho_4:+.3f}")
    else:
        # A correlation over zero shared files would only print NaN.
        print(f"\n  4-dim comparison skipped: no file_id in common with {colab_path}")
else:
    print(f"\n  4-dim comparison skipped: {colab_path} not found")

print(f"\nTotal processing time: {total_time / 60:.1f} minutes")