# fluency-benchmark / run_hybrid_v2_test.py
# (Removed HuggingFace page artifacts that were pasted into the source:
#  uploader "syt20", verified commit 63fae5b,
#  "Replace with fluency_app_v3: updated models, new pipeline modules, experiments")
"""
Hybrid V2 Validation: Test the fixed hybrid VAD against silero baseline.
Uses cached transcriptions from prior runs. Only swaps the VAD component.
Compares silero vs hybrid_v2 on SANDI dev-438 with full band analysis.
"""
import sys, os, time, warnings, pickle
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
# Make sibling project packages (pipeline/, models/) importable when this
# file is run directly as a script.
sys.path.insert(0, os.path.dirname(__file__))
from pathlib import Path
# NOTE(review): kendalltau and mannwhitneyu are imported but never used in
# the code visible in this file — confirm before removing.
from scipy.stats import spearmanr, kendalltau, mannwhitneyu
BASE = Path(__file__).parent.parent  # repo root (parent of this script's directory)
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"  # holds cached "<file_id>_tx.pkl" transcriptions
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"SANDI files: {len(filelist)}")
# Report how many files already have a cached transcription on disk.
cached = sum(1 for _, row in filelist.iterrows()
             if (CACHE_DIR / f"{row['file_id']}_tx.pkl").exists())
print(f"Cached transcriptions: {cached}/{len(filelist)}")
print("Loading models...", flush=True)
# Heavy project imports are deliberately placed after the cheap CSV
# bookkeeping so the cache summary prints before model loading starts.
from pipeline.hybrid_vad import run_hybrid_vad
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.\n")
MODES = ["silero", "hybrid_v2"]       # the two VAD variants re-run in this experiment
all_results = {m: [] for m in MODES}  # per-mode list of per-file result rows
errors = {m: 0 for m in MODES}        # per-mode failure counters
start_time = time.time()
# Main pass: for every SANDI file with both a cached transcription and its
# audio on disk, re-run only the VAD step per mode and feed the cached
# transcription through the rest of the pipeline.
for idx, row in filelist.iterrows():
    file_id = row['file_id']
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    if not cache_path.exists():
        continue  # no cached transcription -> skip this file entirely
    audio_path = row['audio_path']
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)  # resolve relative paths against repo root
    if not os.path.exists(audio_path):
        continue  # audio missing -> cannot re-run VAD
    # Load the cached transcription dict. pickle.load is acceptable here
    # because the cache files are produced by our own prior runs (trusted).
    with open(cache_path, 'rb') as f:
        tx = pickle.load(f)
    words = tx['words']
    word_count = len(words)
    # Progress/ETA bookkeeping. NOTE(review): n = idx + 1 assumes filelist
    # keeps its default RangeIndex — confirm the CSV is never re-indexed.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)
    eta = (len(filelist) - n) / max(rate, 0.01)
    if n % 50 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {file_id} "
              f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)
    # Run the full downstream pipeline once per VAD mode.
    for mode in MODES:
        try:
            vad = run_hybrid_vad(audio_path, mode=mode)
            # MLU proxy: words per detected speech segment (max() guards /0).
            vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
            placement = classify_pauses(words, vad)
            fa = compute_fa_features(words, vad['total_duration_sec'])
            syn = compute_syntactic_features(words, tx['transcript'])
            # Later dicts win on key collisions in this merge order.
            all_features = {**vad, **placement, **fa, **syn}
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)
            all_results[mode].append({
                'file_id': file_id,
                'expert_score': row['expert_score'],
                'composite_raw': composite['composite_raw'],
                'composite_percentile': composite['composite_percentile'],
                'fluency_band': composite['fluency_band'],
                'speech_ratio': vad['speech_ratio'],
                'mlu': vad['mlu'],
                'word_count': word_count,
                'pause_count': vad['pause_count'],
                'mean_pause_dur': vad['mean_pause_duration_sec'],
                'long_pause_ratio': vad['long_pause_ratio'],
                'short_pause_share': vad.get('short_pause_share', 0),
                'speech_segments': vad['speech_segments'],
                'speech_duration_sec': vad['speech_duration_sec'],
                'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
                'boundary_pause_ratio': placement['boundary_pause_ratio'],
                'dim_continuity': composite['dim_continuity'],
                'dim_pause_quality': composite['dim_pause_quality'],
                'dim_articulation': composite['dim_articulation'],
                'dim_dominance': composite['dim_dominance'],
                'dim_placement': composite['dim_placement'],
                'dim_word_precision': composite['dim_word_precision'],
            })
        except Exception as e:
            # Best-effort per file/mode: count every failure but print only
            # the first 5 per mode to keep the log readable.
            errors[mode] += 1
            if errors[mode] <= 5:
                print(f" ERROR [{mode}] {file_id}: {e}", flush=True)
total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for m in MODES:
    print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors")
# Collect the per-mode result rows into DataFrames, then attach the archived
# hybrid_v1 run from disk so all three VAD variants can be compared.
dfs = {mode: pd.DataFrame(all_results[mode]) for mode in MODES}
# Also load old hybrid for comparison
old_hybrid = pd.read_csv(BASE / "EDA/data/sandi_438_vad_hybrid.csv")
dfs["hybrid_v1"] = old_hybrid
COMPARE_MODES = ["silero", "hybrid_v2", "hybrid_v1"]
def expert_band(score):
    """Map a continuous expert fluency score onto a LOW/MEDIUM/HIGH band.

    Cut points: score < 3.0 -> 'LOW'; 3.0 <= score < 4.5 -> 'MEDIUM';
    score >= 4.5 -> 'HIGH'.
    """
    for cutoff, band in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < cutoff:
            return band
    return 'HIGH'
# Attach the expert-derived band label to every comparison frame so the
# band-agreement metrics below can compare expert_band against fluency_band.
# NOTE: this mutates old_hybrid in place for the "hybrid_v1" entry, since
# dfs['hybrid_v1'] is the same DataFrame object.
for mode in COMPARE_MODES:
    dfs[mode]['expert_band'] = dfs[mode]['expert_score'].apply(expert_band)
# BUGFIX(dead code): the original computed
#   N = min(len(dfs[m]) for m in ["silero", "hybrid_v2"])
# but N was never read anywhere else in this script; the assignment is removed.
def section(title):
    """Print a section banner: blank-line-prefixed 80-char rule, title, rule."""
    rule = '=' * 80
    for text in (f"\n{rule}", f" {title}", rule):
        print(text)
BANDS = ["LOW", "MEDIUM", "HIGH"]
# ── 1. OVERALL ──
section("1. OVERALL METRICS")
for mode in COMPARE_MODES:
    frame = dfs[mode]
    # Rank correlation between expert scores and the pipeline composite.
    rho, _p = spearmanr(frame['expert_score'], frame['composite_raw'])
    n_agree = (frame['expert_band'] == frame['fluency_band']).sum()
    acc = n_agree / len(frame)
    # Macro-averaged F1 over the three bands (expert band = reference label).
    per_band_f1 = []
    for band in BANDS:
        is_ref = frame['expert_band'] == band
        is_pred = frame['fluency_band'] == band
        tp = (is_ref & is_pred).sum()
        fp = (~is_ref & is_pred).sum()
        fn = (is_ref & ~is_pred).sum()
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0
        per_band_f1.append(2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0)
    macro_f1 = np.mean(per_band_f1)
    marker = " ◄ NEW" if mode == "hybrid_v2" else (" (old)" if mode == "hybrid_v1" else "")
    print(f" {mode:<12s} ρ={rho:+.4f} acc={acc:.1%} macro-F1={macro_f1:.3f}{marker}")
# ── 2. CONFUSION MATRICES ──
section("2. CONFUSION MATRICES")
for mode in COMPARE_MODES:
    frame = dfs[mode]
    n_agree = (frame['expert_band'] == frame['fluency_band']).sum()
    print(f"\n [{mode}] Accuracy: {n_agree}/{len(frame)} ({n_agree/len(frame):.1%})")
    print(f" {'':>16} Pipeline→ {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    # One printed row per expert band; columns are the pipeline's bands.
    for expert_label in BANDS:
        counts = [((frame['expert_band'] == expert_label)
                   & (frame['fluency_band'] == pred_label)).sum()
                  for pred_label in BANDS]
        print(f" Expert {expert_label:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")
# ── 3. KEY METRICS ──
# Mean VAD-derived features per mode: speech segments, MLU (words per
# segment), speech ratio, mean pause duration, long-pause ratio.
section("3. SEGMENT COUNT & MLU COMPARISON")
print(f"\n {'Mode':<12s} {'Segments':>10s} {'MLU':>10s} {'SR':>10s} {'PauseDur':>10s} {'LPR':>10s}")
print(f" {'-'*65}")
for mode in COMPARE_MODES:
    df = dfs[mode]
    print(f" {mode:<12s} {df['speech_segments'].mean():>10.2f} {df['mlu'].mean():>10.2f} "
          f"{df['speech_ratio'].mean():>10.4f} {df['mean_pause_dur'].mean():>10.4f} "
          f"{df['long_pause_ratio'].mean():>10.4f}")
# ── 4. PER-BAND ANALYSIS ──
# Feature means restricted to each expert band.
section("4. PER-BAND FEATURE COMPARISON")
for band in BANDS:
    print(f"\n [{band}]")
    print(f" {'Mode':<12s} {'N':>4s} {'Segments':>10s} {'MLU':>10s} {'MED→HIGH':>10s} {'Composite':>10s}")
    print(f" {'-'*60}")
    for mode in COMPARE_MODES:
        df = dfs[mode]
        sub = df[df['expert_band'] == band]  # files the expert placed in this band
        # The "MED→HIGH" column is overloaded: for the MEDIUM band it counts
        # MEDIUM files the pipeline promoted to HIGH; for the HIGH band it
        # instead shows HIGH files demoted to MEDIUM (prefixed "H→M:"); for
        # LOW it stays "N/A".  Both counts are over the full df, not sub.
        med_as_high = "N/A"
        if band == "MEDIUM":
            n_mh = ((df['expert_band'] == 'MEDIUM') & (df['fluency_band'] == 'HIGH')).sum()
            med_as_high = str(n_mh)
        elif band == "HIGH":
            n_hm = ((df['expert_band'] == 'HIGH') & (df['fluency_band'] == 'MEDIUM')).sum()
            med_as_high = f"H→M:{n_hm}"
        print(f" {mode:<12s} {len(sub):>4d} {sub['speech_segments'].mean():>10.2f} "
              f"{sub['mlu'].mean():>10.2f} {med_as_high:>10s} {sub['composite_raw'].mean():>+10.4f}")
# ── 5. THRESHOLD CROSSINGS ──
# How many expert-MEDIUM files simultaneously clear all three HIGH-band
# feature thresholds, versus how many actually got labeled HIGH.
section("5. THRESHOLD CROSSINGS (HIGH: MLU≥7 + SR≥0.75 + LPR≤0.15)")
for mode in COMPARE_MODES:
    frame = dfs[mode]
    medium = frame[frame['expert_band'] == 'MEDIUM']
    crosses_all = (
        (medium['mlu'] >= 7)
        & (medium['speech_ratio'] >= 0.75)
        & (medium['long_pause_ratio'] <= 0.15)
    ).sum()
    promoted = ((frame['expert_band'] == 'MEDIUM') & (frame['fluency_band'] == 'HIGH')).sum()
    print(f" {mode:<12s} MEDIUM meeting all 3: {crosses_all}/{len(medium)} MEDIUM→HIGH: {promoted}")
# ── 6. BOOTSTRAP ──
# Paired bootstrap over files: is hybrid_v2's Spearman ρ with the expert
# scores reliably higher than silero's?
section("6. BOOTSTRAP (hybrid_v2 vs silero)")
if "hybrid_v2" in dfs and "silero" in dfs:
    # Pair the two runs on file_id so each resample keeps the silero and
    # hybrid_v2 composites for the same file together.
    common = dfs['silero'][['file_id','expert_score','composite_raw']].merge(
        dfs['hybrid_v2'][['file_id','composite_raw']],
        on='file_id', suffixes=('_sil','_v2'))
    expert = common['expert_score'].values
    comp_s = common['composite_raw_sil'].values
    comp_v = common['composite_raw_v2'].values
    NC = len(common)
    rho_s, _ = spearmanr(expert, comp_s)
    rho_v, _ = spearmanr(expert, comp_v)
    n_boot = 10000
    rng = np.random.default_rng(42)  # fixed seed -> reproducible CI / p-value
    deltas = np.zeros(n_boot)
    for i in range(n_boot):
        # Resample files with replacement; same index set for both modes.
        idx = rng.choice(NC, NC, replace=True)
        rs, _ = spearmanr(expert[idx], comp_s[idx])
        rv, _ = spearmanr(expert[idx], comp_v[idx])
        deltas[i] = rv - rs
    # One-sided p: fraction of resamples where hybrid_v2 does NOT beat silero.
    p_val = (deltas <= 0).mean()
    ci = np.percentile(deltas, [2.5, 97.5])  # percentile 95% CI for Δρ
    print(f"\n N={NC} paired files")
    print(f" Silero: ρ = {rho_s:.4f}")
    print(f" Hybrid V2: ρ = {rho_v:.4f}")
    print(f" Δρ = {rho_v - rho_s:+.4f} 95% CI [{ci[0]:+.4f}, {ci[1]:+.4f}] p={p_val:.4f}")
# ── 7. VERDICT ──
# Final head-to-head table plus pass/fail style conclusions.
section("VERDICT")
df_sil = dfs['silero']
df_v2 = dfs['hybrid_v2']
# Headline metrics for the two freshly-run modes.
rho_s, _ = spearmanr(df_sil['expert_score'], df_sil['composite_raw'])
rho_v, _ = spearmanr(df_v2['expert_score'], df_v2['composite_raw'])
acc_s = (df_sil['expert_band'] == df_sil['fluency_band']).sum() / len(df_sil)
acc_v = (df_v2['expert_band'] == df_v2['fluency_band']).sum() / len(df_v2)
# MEDIUM files over-promoted to HIGH (the failure mode hybrid_v2 targets).
mh_s = ((df_sil['expert_band'] == 'MEDIUM') & (df_sil['fluency_band'] == 'HIGH')).sum()
mh_v = ((df_v2['expert_band'] == 'MEDIUM') & (df_v2['fluency_band'] == 'HIGH')).sum()
# hybrid_v1 comes from an archived CSV that may lack the band columns.
# BUGFIX: the original guarded only 'expert_band' and then formatted the "?"
# fallback with :>10d, which raises ValueError for a str.  Guard both columns
# the expression needs, and format the value as a string (right-aligned, same
# visual width as :>10d for ints) so the fallback prints cleanly.
if {'expert_band', 'fluency_band'}.issubset(old_hybrid.columns):
    mh_v1 = ((old_hybrid['expert_band'] == 'MEDIUM') & (old_hybrid['fluency_band'] == 'HIGH')).sum()
else:
    mh_v1 = "?"
seg_s = df_sil['speech_segments'].mean()
seg_v = df_v2['speech_segments'].mean()
mlu_s = df_sil['mlu'].mean()
mlu_v = df_v2['mlu'].mean()
print(f"\n METRIC COMPARISON:")
print(f" {'Metric':<25s} {'Silero':>10s} {'Hybrid V2':>10s} {'Hybrid V1':>10s} {'V2 vs Sil':>10s}")
print(f" {'-'*70}")
print(f" {'Spearman ρ':<25s} {rho_s:>+10.4f} {rho_v:>+10.4f} {'--':>10s} {rho_v-rho_s:>+10.4f}")
print(f" {'Accuracy':<25s} {acc_s:>10.1%} {acc_v:>10.1%} {'--':>10s} {(acc_v-acc_s)*100:>+10.1f}pp")
print(f" {'MEDIUM→HIGH FP':<25s} {mh_s:>10d} {mh_v:>10d} {str(mh_v1):>10s} {mh_v-mh_s:>+10d}")
print(f" {'Mean segments':<25s} {seg_s:>10.2f} {seg_v:>10.2f} {'--':>10s} {seg_v-seg_s:>+10.2f}")
print(f" {'Mean MLU':<25s} {mlu_s:>10.2f} {mlu_v:>10.2f} {'--':>10s} {mlu_v-mlu_s:>+10.2f}")
if mh_v < mh_s:
    print(f"\n ✅ MEDIUM→HIGH false positives REDUCED from {mh_s} to {mh_v} ({mh_s-mh_v} fewer)")
elif mh_v == mh_s:
    print(f"\n → MEDIUM→HIGH false positives UNCHANGED at {mh_v}")
else:
    print(f"\n ⚠ MEDIUM→HIGH false positives INCREASED from {mh_s} to {mh_v}")
# Small tolerance: treat a drop of under 0.01 in ρ as "maintained".
if rho_v >= rho_s - 0.01:
    print(f" ✅ Spearman ρ maintained ({rho_v:+.4f} vs {rho_s:+.4f})")
else:
    print(f" ⚠ Spearman ρ degraded ({rho_v:+.4f} vs {rho_s:+.4f})")
print(f"\n Total time: {total_time/60:.1f} minutes")