# fluency-benchmark / run_hybrid_v2_simulation.py
# (uploaded by syt20 — commit 63fae5b: "Replace with fluency_app_v3:
#  updated models, new pipeline modules, experiments")
"""
Hybrid V2 Simulation: Model the effect of the anti-merging fix analytically.
METHODOLOGY:
The counterfactual analysis proved that MLU inflation is 100% caused by
segment merging, not by boundary refinement:
- With Silero segments, hybrid MLU = Silero MLU exactly
- The boundary refinement extends speech by only 0.58s on average
This script simulates what hybrid_v2 would produce by:
1. Using hybrid's speech boundaries (speech_ratio, speech_duration_sec)
2. Using Silero's segment count (no merging)
3. Recomputing ALL downstream features through the full pipeline
This is mathematically equivalent to hybrid_v2's constrained refinement +
evidence-gated merge, because V2 preserves Silero's segment count while
keeping MarbleNet's boundary precision.
"""
import sys, os
import numpy as np
import pandas as pd
# Make sibling modules in this directory importable when run as a script.
sys.path.insert(0, os.path.dirname(__file__))
from pathlib import Path
from scipy.stats import spearmanr, mannwhitneyu

# Project root: this file lives one directory below it.
BASE = Path(__file__).parent.parent

# Load the cached per-file VAD feature tables from both pipelines.
sil = pd.read_csv(BASE / "EDA/data/sandi_438_vad_silero.csv")
hyb = pd.read_csv(BASE / "EDA/data/sandi_438_vad_hybrid.csv")

# Verify row alignment with an explicit check: every comparison below
# relies on positional alignment, and a bare `assert` would be silently
# stripped under `python -O`. AssertionError keeps the original failure type.
if list(sil['file_id']) != list(hyb['file_id']):
    raise AssertionError("File order mismatch")

N = len(sil)
print(f"SANDI files: {N}")
def expert_band(score):
    """Map a continuous expert fluency score onto a discrete band.

    Cut points: score < 3.0 -> 'LOW'; 3.0 <= score < 4.5 -> 'MEDIUM';
    score >= 4.5 -> 'HIGH'.
    """
    if score >= 4.5:
        return 'HIGH'
    if score >= 3.0:
        return 'MEDIUM'
    return 'LOW'
# ══════════════════════════════════════════════════════════════════
# CONSTRUCT HYBRID_V2 DATA
# ══════════════════════════════════════════════════════════════════
#
# V2 = hybrid boundaries + Silero segmentation
#
# Features affected by VAD:
# - speech_ratio: from hybrid (slightly improved boundary precision)
# - speech_duration_sec: from hybrid
# - mlu: word_count / Silero_segments (same segments, same words)
#
# Features PRESERVED from Silero (segmentation-dependent):
# - speech_segments, pause_count, mean_pause_dur, long_pause_ratio, short_pause_share
# - mid_clause_pause_ratio, boundary_pause_ratio, dim_placement
#
# Features NOT affected (from transcription/FA — cached):
# - dim_articulation, dim_dominance, dim_word_precision
# - All FA features

# Start from the Silero table so every segmentation-dependent column is
# already correct; only the boundary-dependent columns are overwritten below.
v2 = sil.copy()
v2_label = "hybrid_v2"  # NOTE(review): never referenced below — candidate for removal

# Use hybrid's speech boundaries (slightly better precision)
v2['speech_ratio'] = hyb['speech_ratio']
v2['speech_duration_sec'] = hyb['speech_duration_sec']

# KEEP Silero's segmentation — this is the core V2 fix
# speech_segments, pause_count, long_pause_ratio, short_pause_share stay as Silero
# MLU: same word_count / same Silero segments = same as Silero
# (word_count comes from cached transcription, segments from Silero)
v2['mlu'] = sil['mlu'] # identical since segments are preserved

# ALL pause features stay from Silero (segmentation preserved)
# mean_pause_dur, long_pause_ratio, short_pause_share = Silero values
# These are already correct since v2 = sil.copy()

# Now recompute the 6 dimensions and composite using the V2 features
# Load the population stats
import json
SAVED_DIR = Path(__file__).parent / "saved_models"
with open(SAVED_DIR / "population_stats.json") as f:
    stats = json.load(f)
# NOTE(review): loaded but never used anywhere in this script — confirm
# before removing (may be kept for interactive inspection).
benchmark_dist = np.load(SAVED_DIR / "benchmark_distribution.npy")
def zscore(value, mean, std):
    """Standard z-score; degenerate spreads (zero or NaN std) map to 0.0."""
    if std == 0 or np.isnan(std):
        return 0.0
    return (value - mean) / std


# Recompute the six dimensions + composite + band for each v2 row.
def recompute_composite(row, stats):
    """Recompute dimension scores, composite, and fluency band for one file.

    Parameters
    ----------
    row : dict
        One row of the v2 feature table (feature name -> value).
    stats : dict
        Population statistics with 'means' and 'stds' sub-dicts keyed
        by feature name.

    Returns
    -------
    dict
        The six ``dim_*`` scores (rounded to 4 dp), ``composite_raw``,
        and the rule-based ``fluency_band`` ('LOW'/'MEDIUM'/'HIGH').
    """
    means = stats['means']
    stds = stats['stds']

    def z(feature):
        # Missing population stats fall back to mean 0 / std 1.
        return zscore(row.get(feature, 0), means.get(feature, 0), stds.get(feature, 1))

    # Reconstruct total audio duration from speech duration and speech
    # ratio; the 0.01 floors guard against division by zero.
    total_dur = row.get('speech_duration_sec', 1) / max(row.get('speech_ratio', 0.01), 0.01)
    word_count = row.get('word_count', 0)
    if word_count == 0:
        # Fall back to MLU * segment count when word_count is absent.
        word_count = row.get('mlu', 0) * row.get('speech_segments', 1)
    speech_rate = word_count / max(total_dur, 0.01)
    speech_rate_z = zscore(speech_rate,
                           means.get('speech_rate', 1.0),
                           stds.get('speech_rate', 0.5))

    # Continuity: speech ratio + MLU + half-weighted speech rate (z-scores).
    dim_continuity = z('speech_ratio') + z('mlu') + 0.5 * speech_rate_z

    # The remaining dimensions pass through unchanged: pause dimensions come
    # from Silero's preserved segmentation, the others from cached FA/models.
    dims = {
        'dim_continuity': round(dim_continuity, 4),
        'dim_pause_quality': round(row.get('dim_pause_quality', 0), 4),
        'dim_articulation': round(row.get('dim_articulation', 0), 4),
        'dim_dominance': round(row.get('dim_dominance', 0), 4),
        'dim_placement': round(row.get('dim_placement', 0), 4),
        'dim_word_precision': round(row.get('dim_word_precision', 0), 4),
    }

    weights = {
        'dim_continuity': 3.0,
        'dim_pause_quality': 3.0,
        'dim_placement': 2.0,
        'dim_articulation': 2.0,
        'dim_dominance': 2.0,
        'dim_word_precision': 1.0,
    }
    total_w = sum(weights.values())
    # Per-term division (rather than one final divide) keeps the floating
    # point accumulation identical to the original implementation.
    composite_raw = sum(dims[name] * weights[name] / total_w for name in dims)

    # Rule-based band: HIGH requires all three thresholds; LOW is any of
    # three failure conditions; everything else is MEDIUM.
    sr = row.get('speech_ratio', 0)
    mlu = row.get('mlu', 0)
    lpr = row.get('long_pause_ratio', 0)
    if mlu >= 7 and sr >= 0.75 and lpr <= 0.15:
        band = 'HIGH'
    elif (mlu < 3 and sr < 0.55) or mlu < 2 or sr < 0.35:
        band = 'LOW'
    else:
        band = 'MEDIUM'

    result = dict(dims)
    result['composite_raw'] = round(composite_raw, 4)
    result['fluency_band'] = band
    return result
# Recompute dimensions/composite/band for every V2 row and splice the
# results back into the v2 frame column by column.
recomputed = pd.DataFrame(
    [recompute_composite(dict(r), stats) for _, r in v2.iterrows()]
)
for col in recomputed.columns:
    v2[col] = recomputed[col].values

# Attach expert bands to all three frames for the comparison sections below.
for frame in (v2, sil, hyb):
    frame['expert_band'] = frame['expert_score'].apply(expert_band)

BANDS = ['LOW', 'MEDIUM', 'HIGH']
dfs = {'silero': sil, 'hybrid_v1': hyb, 'hybrid_v2': v2}
def section(title):
    """Print a banner (80 '=' chars above and below) separating report sections."""
    bar = '=' * 80
    print(f"\n{bar}")
    print(f" {title}")
    print(bar)
# ══════════════════════════════════════════════════════════════════
# 1. OVERALL METRICS
# ══════════════════════════════════════════════════════════════════
section("1. OVERALL METRICS")
print(f"\n {'Mode':<12s} {'ρ':>8s} {'Accuracy':>10s} {'Macro F1':>10s}")
print(f" {'-'*45}")
# Spearman rho per mode, kept for reuse in the final verdict (section 10).
rhos = {}
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    rho, _ = spearmanr(df['expert_score'], df['composite_raw'])
    rhos[mode] = rho
    # Band agreement between pipeline output and expert labels.
    agree = (df['expert_band'] == df['fluency_band']).sum()
    acc = agree / len(df)
    # One-vs-rest precision/recall/F1 per band, averaged to macro F1.
    f1s = []
    for band in BANDS:
        tp = ((df['expert_band'] == band) & (df['fluency_band'] == band)).sum()
        fp = ((df['expert_band'] != band) & (df['fluency_band'] == band)).sum()
        fn = ((df['expert_band'] == band) & (df['fluency_band'] != band)).sum()
        prec = tp/(tp+fp) if (tp+fp) > 0 else 0
        rec = tp/(tp+fn) if (tp+fn) > 0 else 0
        f1 = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0
        f1s.append(f1)
    macro_f1 = np.mean(f1s)
    # Row suffix identifying the role of each mode in the comparison.
    marker = " ◄ NEW" if mode == "hybrid_v2" else (" (old)" if mode == "hybrid_v1" else " (baseline)")
    print(f" {mode:<12s} {rho:>+8.4f} {acc:>10.1%} {macro_f1:>10.3f}{marker}")
# ══════════════════════════════════════════════════════════════════
# 2. CONFUSION MATRICES
# ══════════════════════════════════════════════════════════════════
section("2. CONFUSION MATRICES")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    agree = (df['expert_band'] == df['fluency_band']).sum()
    print(f"\n [{mode}] Accuracy: {agree}/{len(df)} ({agree/len(df):.1%})")
    print(f" {'':>16} Pipeline→ {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    # 3x3 confusion counts: expert band (rows) vs pipeline band (columns).
    # (A `marker` arrow variable was previously computed per-row but never
    # printed — dead code, removed.)
    for eb in BANDS:
        counts = [((df['expert_band'] == eb) & (df['fluency_band'] == pb)).sum()
                  for pb in BANDS]
        print(f" Expert {eb:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")
# ══════════════════════════════════════════════════════════════════
# 3. BAND-LEVEL P/R/F1
# ══════════════════════════════════════════════════════════════════
section("3. PER-BAND PRECISION / RECALL / F1")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    print(f"\n [{mode}]")
    print(f" {'Band':<8s} {'Prec':>8s} {'Recall':>8s} {'F1':>8s} {'Support':>8s}")
    print(f" {'-'*45}")
    # One-vs-rest counts per band (same math as the macro-F1 loop in
    # section 1, but printed per band with support).
    for band in BANDS:
        tp = ((df['expert_band'] == band) & (df['fluency_band'] == band)).sum()
        fp = ((df['expert_band'] != band) & (df['fluency_band'] == band)).sum()
        fn = ((df['expert_band'] == band) & (df['fluency_band'] != band)).sum()
        prec = tp/(tp+fp) if (tp+fp) > 0 else 0
        rec = tp/(tp+fn) if (tp+fn) > 0 else 0
        f1 = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0
        # Support = number of files the experts placed in this band.
        sup = (df['expert_band'] == band).sum()
        print(f" {band:<8s} {prec:>8.3f} {rec:>8.3f} {f1:>8.3f} {sup:>8d}")
# ══════════════════════════════════════════════════════════════════
# 4. KEY METRICS COMPARISON
# ══════════════════════════════════════════════════════════════════
section("4. SEGMENT COUNT & MLU COMPARISON")
print(f"\n {'Mode':<12s} {'Segments':>10s} {'MLU':>10s} {'SR':>10s} {'PauseDur':>10s} {'LPR':>10s}")
print(f" {'-'*65}")
# Population means of the VAD-derived features. By construction, v2 should
# match silero on segments/MLU/pause features and hybrid on speech_ratio.
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    print(f" {mode:<12s} {df['speech_segments'].mean():>10.2f} {df['mlu'].mean():>10.2f} "
          f"{df['speech_ratio'].mean():>10.4f} {df['mean_pause_dur'].mean():>10.4f} "
          f"{df['long_pause_ratio'].mean():>10.4f}")
# ══════════════════════════════════════════════════════════════════
# 5. MEDIUM→HIGH FALSE POSITIVE REDUCTION
# ══════════════════════════════════════════════════════════════════
section("5. MEDIUM→HIGH FALSE POSITIVE ANALYSIS")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    # mh: expert said MEDIUM, pipeline said HIGH (the inflation V2 targets).
    mh = ((df['expert_band'] == 'MEDIUM') & (df['fluency_band'] == 'HIGH')).sum()
    # hm: the opposite direction — pipeline under-rates true HIGH speakers.
    hm = ((df['expert_band'] == 'HIGH') & (df['fluency_band'] == 'MEDIUM')).sum()
    # lr: any expert-LOW file not classified LOW by the pipeline.
    lr = ((df['expert_band'] == 'LOW') & (df['fluency_band'] != 'LOW')).sum()
    print(f" {mode:<12s} MED→HIGH: {mh:>4d} HIGH→MED: {hm:>4d} LOW misclassified: {lr:>4d}")
# ══════════════════════════════════════════════════════════════════
# 6. THRESHOLD CROSSINGS
# ══════════════════════════════════════════════════════════════════
section("6. THRESHOLD CROSSINGS (HIGH rule: MLU≥7 + SR≥0.75 + LPR≤0.15)")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    med = df[df['expert_band'] == 'MEDIUM']
    # How many expert-MEDIUM files satisfy all three HIGH-band thresholds
    # (mirrors the HIGH rule inside recompute_composite).
    n_all3 = ((med['mlu'] >= 7) & (med['speech_ratio'] >= 0.75) & (med['long_pause_ratio'] <= 0.15)).sum()
    # Actual MEDIUM→HIGH misclassifications for comparison with n_all3.
    n_mh = ((df['expert_band'] == 'MEDIUM') & (df['fluency_band'] == 'HIGH')).sum()
    print(f" {mode:<12s} MEDIUM meeting all 3: {n_all3:>4d}/{len(med)} MEDIUM→HIGH: {n_mh:>4d}")
# ══════════════════════════════════════════════════════════════════
# 7. BAND SEPARATION (Cohen's d)
# ══════════════════════════════════════════════════════════════════
section("7. COMPOSITE SCORE SEPARATION (Cohen's d)")
print(f"\n {'Mode':<12s} {'LOW→MED':>10s} {'MED→HIGH':>10s} {'LOW→HIGH':>10s}")
print(f" {'-'*50}")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    parts = []
    # Effect size of composite_raw between each ordered pair of expert bands.
    for lo, hi in [('LOW','MEDIUM'), ('MEDIUM','HIGH'), ('LOW','HIGH')]:
        a = df[df['expert_band'] == lo]['composite_raw']
        b = df[df['expert_band'] == hi]['composite_raw']
        # Pooled SD here is the equal-weight average of the two variances
        # (not weighted by group size — the simpler Cohen's d variant).
        pooled = np.sqrt((a.std()**2 + b.std()**2) / 2)
        d = (b.mean() - a.mean()) / pooled if pooled > 0 else 0
        parts.append(f"{d:>+10.3f}")
    print(f" {mode:<12s} {' '.join(parts)}")
# ══════════════════════════════════════════════════════════════════
# 8. PER-DIMENSION CORRELATION
# ══════════════════════════════════════════════════════════════════
section("8. PER-DIMENSION CORRELATIONS")
dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']
print(f"\n {'Dimension':<25s} {'silero':>10s} {'hybrid_v2':>10s} {'hybrid_v1':>10s}")
print(f" {'-'*60}")
for d in dims:
    parts = []
    for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
        df = dfs[mode]
        # Drop NaNs pairwise so spearmanr only sees complete cases.
        v = df[['expert_score', d]].dropna()
        r, p = spearmanr(v['expert_score'], v[d])
        # Conventional significance stars; ' ns' = not significant.
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ' ns'
        parts.append(f"{r:>+7.4f}{sig}")
    print(f" {d:<25s} {' '.join(parts)}")
# ══════════════════════════════════════════════════════════════════
# 9. BOOTSTRAP SIGNIFICANCE
# ══════════════════════════════════════════════════════════════════
# Number of bootstrap resamples. Hoisted above the banner so the printed
# title always reports the true count (the old hard-coded banner claimed
# "10,000 iterations" while the code ran 2,000).
n_boot = 2000  # reduced for speed; 2K is sufficient for CI estimation
section(f"9. BOOTSTRAP: hybrid_v2 vs silero ({n_boot:,} iterations)")
expert = sil['expert_score'].values
comp_s = sil['composite_raw'].values
comp_v = v2['composite_raw'].values
# Point estimates of Spearman rho on the full sample.
rho_s, _ = spearmanr(expert, comp_s)
rho_v, _ = spearmanr(expert, comp_v)
# Paired bootstrap over files: resample indices with replacement, recompute
# both correlations on the same resample, and record the difference.
rng = np.random.default_rng(42)
deltas = np.zeros(n_boot)
for i in range(n_boot):
    idx = rng.choice(N, N, replace=True)
    rs, _ = spearmanr(expert[idx], comp_s[idx])
    rv, _ = spearmanr(expert[idx], comp_v[idx])
    deltas[i] = rv - rs
# One-sided p-value: fraction of resamples where v2 does NOT beat silero.
p_val = (deltas <= 0).mean()
ci = np.percentile(deltas, [2.5, 97.5])
print(f"\n Silero: ρ = {rho_s:.4f}")
print(f" Hybrid V2: ρ = {rho_v:.4f}")
print(f" Δρ = {rho_v - rho_s:+.4f} 95% CI [{ci[0]:+.4f}, {ci[1]:+.4f}] p={p_val:.4f}")
if p_val < 0.05:
    print(f" → SIGNIFICANT improvement")
elif rho_v >= rho_s:
    print(f" → Improved but not significant")
else:
    print(f" → No improvement detected")
# ══════════════════════════════════════════════════════════════════
# 10. FINAL VERDICT
# ══════════════════════════════════════════════════════════════════
section("FINAL VERDICT")
df_sil = dfs['silero']
df_v2 = dfs['hybrid_v2']
df_v1 = dfs['hybrid_v1']
# Band-agreement accuracy for each mode.
acc_s = (df_sil['expert_band'] == df_sil['fluency_band']).sum() / len(df_sil)
acc_v = (df_v2['expert_band'] == df_v2['fluency_band']).sum() / len(df_v2)
acc_v1 = (df_v1['expert_band'] == df_v1['fluency_band']).sum() / len(df_v1)
# MEDIUM→HIGH false positives — the headline metric the V2 fix targets.
mh_s = ((df_sil['expert_band'] == 'MEDIUM') & (df_sil['fluency_band'] == 'HIGH')).sum()
mh_v = ((df_v2['expert_band'] == 'MEDIUM') & (df_v2['fluency_band'] == 'HIGH')).sum()
mh_v1 = ((df_v1['expert_band'] == 'MEDIUM') & (df_v1['fluency_band'] == 'HIGH')).sum()
# Summary table (box-drawing characters; values computed above and in section 1).
print(f"""
┌──────────────────────────┬───────────┬───────────┬───────────┐
│ Metric │ Silero │ Hybrid V2 │ Hybrid V1 │
├──────────────────────────┼───────────┼───────────┼───────────
│ Spearman ρ │ {rhos['silero']:+.4f} │ {rhos['hybrid_v2']:+.4f} │ {rhos['hybrid_v1']:+.4f} │
│ Accuracy │ {acc_s:.1%} │ {acc_v:.1%} │ {acc_v1:.1%} │
│ MEDIUM→HIGH FP │ {mh_s:>5d} │ {mh_v:>5d} │ {mh_v1:>5d} │
│ Mean segments │ {df_sil['speech_segments'].mean():>5.2f} │ {df_v2['speech_segments'].mean():>5.2f} │ {df_v1['speech_segments'].mean():>5.2f} │
│ Mean MLU │ {df_sil['mlu'].mean():>5.2f} │ {df_v2['mlu'].mean():>5.2f} │ {df_v1['mlu'].mean():>5.2f} │
└──────────────────────────┴───────────┴───────────┴───────────┘
""")
print(" CHANGES FROM V1 TO V2:")
# The mh_v < mh_v1 guard also guarantees mh_v1 > 0 for the % division below.
if mh_v < mh_v1:
    print(f" ✅ MEDIUM→HIGH false positives REDUCED: {mh_v1} → {mh_v} ({mh_v1-mh_v} fewer, {(mh_v1-mh_v)/mh_v1*100:.0f}% reduction)")
if rhos['hybrid_v2'] >= rhos['hybrid_v1']:
    print(f" ✅ Spearman ρ IMPROVED: {rhos['hybrid_v1']:+.4f} → {rhos['hybrid_v2']:+.4f}")
if acc_v > acc_v1:
    print(f" ✅ Accuracy IMPROVED: {acc_v1:.1%} → {acc_v:.1%}")
print("\n COMPARISON TO SILERO BASELINE:")
# Three-way verdict on rho: clearly better / within 0.005 / clearly below.
if rhos['hybrid_v2'] > rhos['silero']:
    print(f" ✅ ρ improvement over Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f} (Δ={rhos['hybrid_v2']-rhos['silero']:+.4f})")
elif abs(rhos['hybrid_v2'] - rhos['silero']) < 0.005:
    print(f" → ρ equivalent to Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f}")
else:
    print(f" ⚠ ρ slightly below Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f}")
if mh_v <= mh_s:
    print(f" ✅ MEDIUM→HIGH FP at or below Silero: {mh_v} vs {mh_s}")
else:
    print(f" → MEDIUM→HIGH FP: {mh_v} (Silero: {mh_s})")
if acc_v >= acc_s:
    print(f" ✅ Accuracy at or above Silero: {acc_v:.1%} vs {acc_s:.1%}")
else:
    print(f" → Accuracy: {acc_v:.1%} (Silero: {acc_s:.1%})")
print(f"\n RECOMMENDATION:")
# Decision rule: rho and accuracy within 0.01 of Silero AND no FP regression
# → safe upgrade; else if it at least beats V1 on both → secondary option;
# otherwise keep the Silero baseline.
if rhos['hybrid_v2'] >= rhos['silero'] - 0.01 and mh_v <= mh_s and acc_v >= acc_s - 0.01:
    print(f" → hybrid_v2 is a safe upgrade: fixes V1 inflation without degrading baseline")
elif mh_v < mh_v1 and rhos['hybrid_v2'] > rhos['hybrid_v1']:
    print(f" → hybrid_v2 fixes V1's problems but doesn't beat Silero — safe as secondary option")
else:
    print(f" → Keep Silero as primary baseline")