# fluency-benchmark / run_hybrid_v2_test.py
# (Removed HuggingFace page artifacts that were pasted into the source:
#  uploader "syt20", verified commit 63fae5b,
#  "Replace with fluency_app_v3: updated models, new pipeline modules, experiments")
"""
Hybrid V2 Validation: Test the fixed hybrid VAD against silero baseline.
Uses cached transcriptions from prior runs. Only swaps the VAD component.
Compares silero vs hybrid_v2 on SANDI dev-438 with full band analysis.
"""
import sys, os, time, warnings, pickle
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
# Make sibling project packages (pipeline/, models/) importable when this
# file is run directly as a script.
sys.path.insert(0, os.path.dirname(__file__))
from pathlib import Path
# NOTE(review): kendalltau and mannwhitneyu are imported but never used in
# the code visible in this file — confirm before removing.
from scipy.stats import spearmanr, kendalltau, mannwhitneyu
BASE = Path(__file__).parent.parent  # repo root (parent of this script's directory)
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"  # holds cached "<file_id>_tx.pkl" transcriptions
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"SANDI files: {len(filelist)}")
# Report how many files already have a cached transcription on disk.
cached = sum(1 for _, row in filelist.iterrows()
             if (CACHE_DIR / f"{row['file_id']}_tx.pkl").exists())
print(f"Cached transcriptions: {cached}/{len(filelist)}")
print("Loading models...", flush=True)
# Heavy project imports are deliberately placed after the cheap CSV
# bookkeeping so the cache summary prints before model loading starts.
from pipeline.hybrid_vad import run_hybrid_vad
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.\n")
MODES = ["silero", "hybrid_v2"]       # the two VAD variants re-run in this experiment
all_results = {m: [] for m in MODES}  # per-mode list of per-file result rows
errors = {m: 0 for m in MODES}        # per-mode failure counters
start_time = time.time()
# Main pass: for every SANDI file with both a cached transcription and its
# audio on disk, re-run only the VAD step per mode and feed the cached
# transcription through the rest of the pipeline.
for idx, row in filelist.iterrows():
    file_id = row['file_id']
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    if not cache_path.exists():
        continue  # no cached transcription -> skip this file entirely
    audio_path = row['audio_path']
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)  # resolve relative paths against repo root
    if not os.path.exists(audio_path):
        continue  # audio missing -> cannot re-run VAD
    # Load the cached transcription dict. pickle.load is acceptable here
    # because the cache files are produced by our own prior runs (trusted).
    with open(cache_path, 'rb') as f:
        tx = pickle.load(f)
    words = tx['words']
    word_count = len(words)
    # Progress/ETA bookkeeping. NOTE(review): n = idx + 1 assumes filelist
    # keeps its default RangeIndex — confirm the CSV is never re-indexed.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)
    eta = (len(filelist) - n) / max(rate, 0.01)
    if n % 50 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {file_id} "
              f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)
    # Run the full downstream pipeline once per VAD mode.
    for mode in MODES:
        try:
            vad = run_hybrid_vad(audio_path, mode=mode)
            # MLU proxy: words per detected speech segment (max() guards /0).
            vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
            placement = classify_pauses(words, vad)
            fa = compute_fa_features(words, vad['total_duration_sec'])
            syn = compute_syntactic_features(words, tx['transcript'])
            # Later dicts win on key collisions in this merge order.
            all_features = {**vad, **placement, **fa, **syn}
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)
            all_results[mode].append({
                'file_id': file_id,
                'expert_score': row['expert_score'],
                'composite_raw': composite['composite_raw'],
                'composite_percentile': composite['composite_percentile'],
                'fluency_band': composite['fluency_band'],
                'speech_ratio': vad['speech_ratio'],
                'mlu': vad['mlu'],
                'word_count': word_count,
                'pause_count': vad['pause_count'],
                'mean_pause_dur': vad['mean_pause_duration_sec'],
                'long_pause_ratio': vad['long_pause_ratio'],
                'short_pause_share': vad.get('short_pause_share', 0),
                'speech_segments': vad['speech_segments'],
                'speech_duration_sec': vad['speech_duration_sec'],
                'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
                'boundary_pause_ratio': placement['boundary_pause_ratio'],
                'dim_continuity': composite['dim_continuity'],
                'dim_pause_quality': composite['dim_pause_quality'],
                'dim_articulation': composite['dim_articulation'],
                'dim_dominance': composite['dim_dominance'],
                'dim_placement': composite['dim_placement'],
                'dim_word_precision': composite['dim_word_precision'],
            })
        except Exception as e:
            # Best-effort per file/mode: count every failure but print only
            # the first 5 per mode to keep the log readable.
            errors[mode] += 1
            if errors[mode] <= 5:
                print(f" ERROR [{mode}] {file_id}: {e}", flush=True)
total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for m in MODES:
    print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors")
# Collect the per-mode result rows into DataFrames, then attach the archived
# hybrid_v1 run from disk so all three VAD variants can be compared.
dfs = {mode: pd.DataFrame(all_results[mode]) for mode in MODES}
# Also load old hybrid for comparison
old_hybrid = pd.read_csv(BASE / "EDA/data/sandi_438_vad_hybrid.csv")
dfs["hybrid_v1"] = old_hybrid
COMPARE_MODES = ["silero", "hybrid_v2", "hybrid_v1"]
def expert_band(score):
    """Map a continuous expert fluency score onto a LOW/MEDIUM/HIGH band.

    Cut points: score < 3.0 -> 'LOW'; 3.0 <= score < 4.5 -> 'MEDIUM';
    score >= 4.5 -> 'HIGH'.
    """
    for cutoff, band in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < cutoff:
            return band
    return 'HIGH'
# Attach the expert-derived band label to every comparison frame so the
# band-agreement metrics below can compare expert_band against fluency_band.
# NOTE: this mutates old_hybrid in place for the "hybrid_v1" entry, since
# dfs['hybrid_v1'] is the same DataFrame object.
for mode in COMPARE_MODES:
    dfs[mode]['expert_band'] = dfs[mode]['expert_score'].apply(expert_band)
# BUGFIX(dead code): the original computed
#   N = min(len(dfs[m]) for m in ["silero", "hybrid_v2"])
# but N was never read anywhere else in this script; the assignment is removed.
def section(title):
    """Print a section banner: blank-line-prefixed 80-char rule, title, rule."""
    rule = '=' * 80
    for text in (f"\n{rule}", f" {title}", rule):
        print(text)
BANDS = ["LOW", "MEDIUM", "HIGH"]
# ── 1. OVERALL ──
section("1. OVERALL METRICS")
for mode in COMPARE_MODES:
    frame = dfs[mode]
    # Rank correlation between expert scores and the pipeline composite.
    rho, _p = spearmanr(frame['expert_score'], frame['composite_raw'])
    n_agree = (frame['expert_band'] == frame['fluency_band']).sum()
    acc = n_agree / len(frame)
    # Macro-averaged F1 over the three bands (expert band = reference label).
    per_band_f1 = []
    for band in BANDS:
        is_ref = frame['expert_band'] == band
        is_pred = frame['fluency_band'] == band
        tp = (is_ref & is_pred).sum()
        fp = (~is_ref & is_pred).sum()
        fn = (is_ref & ~is_pred).sum()
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0
        per_band_f1.append(2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0)
    macro_f1 = np.mean(per_band_f1)
    marker = " ◄ NEW" if mode == "hybrid_v2" else (" (old)" if mode == "hybrid_v1" else "")
    print(f" {mode:<12s} ρ={rho:+.4f} acc={acc:.1%} macro-F1={macro_f1:.3f}{marker}")
# ── 2. CONFUSION MATRICES ──
section("2. CONFUSION MATRICES")
for mode in COMPARE_MODES:
    frame = dfs[mode]
    n_agree = (frame['expert_band'] == frame['fluency_band']).sum()
    print(f"\n [{mode}] Accuracy: {n_agree}/{len(frame)} ({n_agree/len(frame):.1%})")
    print(f" {'':>16} Pipeline→ {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    # One printed row per expert band; columns are the pipeline's bands.
    for expert_label in BANDS:
        counts = [((frame['expert_band'] == expert_label)
                   & (frame['fluency_band'] == pred_label)).sum()
                  for pred_label in BANDS]
        print(f" Expert {expert_label:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")
# ── 3. KEY METRICS ──
# Mean VAD-derived features per mode: speech segments, MLU (words per
# segment), speech ratio, mean pause duration, long-pause ratio.
section("3. SEGMENT COUNT & MLU COMPARISON")
print(f"\n {'Mode':<12s} {'Segments':>10s} {'MLU':>10s} {'SR':>10s} {'PauseDur':>10s} {'LPR':>10s}")
print(f" {'-'*65}")
for mode in COMPARE_MODES:
    df = dfs[mode]
    print(f" {mode:<12s} {df['speech_segments'].mean():>10.2f} {df['mlu'].mean():>10.2f} "
          f"{df['speech_ratio'].mean():>10.4f} {df['mean_pause_dur'].mean():>10.4f} "
          f"{df['long_pause_ratio'].mean():>10.4f}")
# ── 4. PER-BAND ANALYSIS ──
# Feature means restricted to each expert band.
section("4. PER-BAND FEATURE COMPARISON")
for band in BANDS:
    print(f"\n [{band}]")
    print(f" {'Mode':<12s} {'N':>4s} {'Segments':>10s} {'MLU':>10s} {'MED→HIGH':>10s} {'Composite':>10s}")
    print(f" {'-'*60}")
    for mode in COMPARE_MODES:
        df = dfs[mode]
        sub = df[df['expert_band'] == band]  # files the expert placed in this band
        # The "MED→HIGH" column is overloaded: for the MEDIUM band it counts
        # MEDIUM files the pipeline promoted to HIGH; for the HIGH band it
        # instead shows HIGH files demoted to MEDIUM (prefixed "H→M:"); for
        # LOW it stays "N/A".  Both counts are over the full df, not sub.
        med_as_high = "N/A"
        if band == "MEDIUM":
            n_mh = ((df['expert_band'] == 'MEDIUM') & (df['fluency_band'] == 'HIGH')).sum()
            med_as_high = str(n_mh)
        elif band == "HIGH":
            n_hm = ((df['expert_band'] == 'HIGH') & (df['fluency_band'] == 'MEDIUM')).sum()
            med_as_high = f"H→M:{n_hm}"
        print(f" {mode:<12s} {len(sub):>4d} {sub['speech_segments'].mean():>10.2f} "
              f"{sub['mlu'].mean():>10.2f} {med_as_high:>10s} {sub['composite_raw'].mean():>+10.4f}")
# ── 5. THRESHOLD CROSSINGS ──
# How many expert-MEDIUM files simultaneously clear all three HIGH-band
# feature thresholds, versus how many actually got labeled HIGH.
section("5. THRESHOLD CROSSINGS (HIGH: MLU≥7 + SR≥0.75 + LPR≤0.15)")
for mode in COMPARE_MODES:
    frame = dfs[mode]
    medium = frame[frame['expert_band'] == 'MEDIUM']
    crosses_all = (
        (medium['mlu'] >= 7)
        & (medium['speech_ratio'] >= 0.75)
        & (medium['long_pause_ratio'] <= 0.15)
    ).sum()
    promoted = ((frame['expert_band'] == 'MEDIUM') & (frame['fluency_band'] == 'HIGH')).sum()
    print(f" {mode:<12s} MEDIUM meeting all 3: {crosses_all}/{len(medium)} MEDIUM→HIGH: {promoted}")
# ── 6. BOOTSTRAP ──
# Paired bootstrap over files: is hybrid_v2's Spearman ρ with the expert
# scores reliably higher than silero's?
section("6. BOOTSTRAP (hybrid_v2 vs silero)")
if "hybrid_v2" in dfs and "silero" in dfs:
    # Pair the two runs on file_id so each resample keeps the silero and
    # hybrid_v2 composites for the same file together.
    common = dfs['silero'][['file_id','expert_score','composite_raw']].merge(
        dfs['hybrid_v2'][['file_id','composite_raw']],
        on='file_id', suffixes=('_sil','_v2'))
    expert = common['expert_score'].values
    comp_s = common['composite_raw_sil'].values
    comp_v = common['composite_raw_v2'].values
    NC = len(common)
    rho_s, _ = spearmanr(expert, comp_s)
    rho_v, _ = spearmanr(expert, comp_v)
    n_boot = 10000
    rng = np.random.default_rng(42)  # fixed seed -> reproducible CI / p-value
    deltas = np.zeros(n_boot)
    for i in range(n_boot):
        # Resample files with replacement; same index set for both modes.
        idx = rng.choice(NC, NC, replace=True)
        rs, _ = spearmanr(expert[idx], comp_s[idx])
        rv, _ = spearmanr(expert[idx], comp_v[idx])
        deltas[i] = rv - rs
    # One-sided p: fraction of resamples where hybrid_v2 does NOT beat silero.
    p_val = (deltas <= 0).mean()
    ci = np.percentile(deltas, [2.5, 97.5])  # percentile 95% CI for Δρ
    print(f"\n N={NC} paired files")
    print(f" Silero: ρ = {rho_s:.4f}")
    print(f" Hybrid V2: ρ = {rho_v:.4f}")
    print(f" Δρ = {rho_v - rho_s:+.4f} 95% CI [{ci[0]:+.4f}, {ci[1]:+.4f}] p={p_val:.4f}")
# ── 7. VERDICT ──
# Final head-to-head table plus pass/fail style conclusions.
section("VERDICT")
df_sil = dfs['silero']
df_v2 = dfs['hybrid_v2']
# Headline metrics for the two freshly-run modes.
rho_s, _ = spearmanr(df_sil['expert_score'], df_sil['composite_raw'])
rho_v, _ = spearmanr(df_v2['expert_score'], df_v2['composite_raw'])
acc_s = (df_sil['expert_band'] == df_sil['fluency_band']).sum() / len(df_sil)
acc_v = (df_v2['expert_band'] == df_v2['fluency_band']).sum() / len(df_v2)
# MEDIUM files over-promoted to HIGH (the failure mode hybrid_v2 targets).
mh_s = ((df_sil['expert_band'] == 'MEDIUM') & (df_sil['fluency_band'] == 'HIGH')).sum()
mh_v = ((df_v2['expert_band'] == 'MEDIUM') & (df_v2['fluency_band'] == 'HIGH')).sum()
# hybrid_v1 comes from an archived CSV that may lack the band columns.
# BUGFIX: the original guarded only 'expert_band' and then formatted the "?"
# fallback with :>10d, which raises ValueError for a str.  Guard both columns
# the expression needs, and format the value as a string (right-aligned, same
# visual width as :>10d for ints) so the fallback prints cleanly.
if {'expert_band', 'fluency_band'}.issubset(old_hybrid.columns):
    mh_v1 = ((old_hybrid['expert_band'] == 'MEDIUM') & (old_hybrid['fluency_band'] == 'HIGH')).sum()
else:
    mh_v1 = "?"
seg_s = df_sil['speech_segments'].mean()
seg_v = df_v2['speech_segments'].mean()
mlu_s = df_sil['mlu'].mean()
mlu_v = df_v2['mlu'].mean()
print(f"\n METRIC COMPARISON:")
print(f" {'Metric':<25s} {'Silero':>10s} {'Hybrid V2':>10s} {'Hybrid V1':>10s} {'V2 vs Sil':>10s}")
print(f" {'-'*70}")
print(f" {'Spearman ρ':<25s} {rho_s:>+10.4f} {rho_v:>+10.4f} {'--':>10s} {rho_v-rho_s:>+10.4f}")
print(f" {'Accuracy':<25s} {acc_s:>10.1%} {acc_v:>10.1%} {'--':>10s} {(acc_v-acc_s)*100:>+10.1f}pp")
print(f" {'MEDIUM→HIGH FP':<25s} {mh_s:>10d} {mh_v:>10d} {str(mh_v1):>10s} {mh_v-mh_s:>+10d}")
print(f" {'Mean segments':<25s} {seg_s:>10.2f} {seg_v:>10.2f} {'--':>10s} {seg_v-seg_s:>+10.2f}")
print(f" {'Mean MLU':<25s} {mlu_s:>10.2f} {mlu_v:>10.2f} {'--':>10s} {mlu_v-mlu_s:>+10.2f}")
if mh_v < mh_s:
    print(f"\n ✅ MEDIUM→HIGH false positives REDUCED from {mh_s} to {mh_v} ({mh_s-mh_v} fewer)")
elif mh_v == mh_s:
    print(f"\n → MEDIUM→HIGH false positives UNCHANGED at {mh_v}")
else:
    print(f"\n ⚠ MEDIUM→HIGH false positives INCREASED from {mh_s} to {mh_v}")
# Small tolerance: treat a drop of under 0.01 in ρ as "maintained".
if rho_v >= rho_s - 0.01:
    print(f" ✅ Spearman ρ maintained ({rho_v:+.4f} vs {rho_s:+.4f})")
else:
    print(f" ⚠ Spearman ρ degraded ({rho_v:+.4f} vs {rho_s:+.4f})")
print(f"\n Total time: {total_time/60:.1f} minutes")