# fluency-benchmark / run_selective_fusion_test.py
# Uploaded by syt20 as part of fluency_app_v3 (commit 63fae5b): updated
# models, new pipeline modules, experiments.
"""
Selective Feature Fusion VAD Test on SANDI 438.
4 VAD modes: silero (baseline), ten, marblenet, fusion (selective)
Uses cached transcriptions from prior run.
Reports:
1. Spearman rho/Kendall tau vs expert scores
2. Rank-order accuracy (Spearman + pairwise ordering)
3. Quintile segregation with Cohen's d and U-tests
4. Classification accuracy (band agreement + confusion matrix)
5. Per-dimension correlation breakdown
6. Bootstrap significance tests (fusion vs baseline)
7. Ordering analysis: how many pairs correctly ordered
Usage: python run_selective_fusion_test.py
"""
import sys, os, time, warnings, pickle
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(__file__))
from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal
# ── Paths and input manifest ──
BASE = Path(__file__).parent.parent
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"SANDI files: {len(filelist)}")

# ── Verify transcription cache ──
# Count how many manifest entries already have a cached transcription pickle.
n_cached = 0
for _, entry in filelist.iterrows():
    if (CACHE_DIR / f"{entry['file_id']}_tx.pkl").exists():
        n_cached += 1
cached = n_cached
print(f"Cached transcriptions: {cached}/{len(filelist)}")
# Refuse to run unless at least half the files have been transcribed already.
if cached * 2 < len(filelist):
    print("ERROR: Run run_hybrid_vad_test.py --transcribe first")
    sys.exit(1)
# ── Load pipeline modules ──
# Deferred until after the cache check so a missing cache fails fast without
# paying the model-loading cost.
print("Loading models...", flush=True)
from pipeline.selective_fusion_vad import run_selective_vad   # mode-switchable VAD front-end
from pipeline.placement import classify_pauses                # pause placement classification
from pipeline.fa_features import compute_fa_features          # "fa" features (forced alignment? — confirm)
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict                          # model predictions over the feature dict
from pipeline.composite import compute_composite              # composite score, percentile, band, dims
print("Models loaded.\n")
# The four VAD configurations under test; 'silero' is the baseline.
MODES = ["silero", "ten", "marblenet", "fusion"]
all_results = {m: [] for m in MODES}   # per-mode list of per-file result dicts
errors = {m: 0 for m in MODES}         # per-mode failure counts
start_time = time.time()
# ── Per-file processing loop ──
# For each SANDI file: load the cached transcription, then run the pipeline
# once per VAD mode and collect features + composite scores.
for idx, row in filelist.iterrows():
    file_id = row['file_id']
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    # Skip files without a cached transcription (the earlier check only
    # required 50% coverage).
    if not cache_path.exists():
        continue
    audio_path = row['audio_path']
    # Manifest paths may be relative to the repo root.
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        continue
    # Cached transcription: a dict with at least 'words' and 'transcript'
    # keys (see usage below) — written by run_hybrid_vad_test.py.
    with open(cache_path, 'rb') as f:
        tx = pickle.load(f)
    words = tx['words']
    word_count = len(words)
    # Progress/ETA bookkeeping. Assumes filelist has a default RangeIndex so
    # idx + 1 is the 1-based position — TODO confirm if the CSV ever carries
    # a custom index.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)
    eta = (len(filelist) - n) / max(rate, 0.01)
    if n % 50 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {file_id} "
              f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)
    for mode in MODES:
        try:
            # VAD is the only stage that differs between modes; the cached
            # transcription is shared across all four.
            vad = run_selective_vad(audio_path, mode=mode)
            # Words per speech segment, stored as 'mlu' (presumably mean
            # length of utterance — confirm with pipeline docs).
            vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
            placement = classify_pauses(words, vad)
            fa = compute_fa_features(words, vad['total_duration_sec'])
            syn = compute_syntactic_features(words, tx['transcript'])
            # Later dicts win on key collisions (vad < placement < fa < syn).
            all_features = {**vad, **placement, **fa, **syn}
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)
            all_results[mode].append({
                'file_id': file_id,
                'expert_score': row['expert_score'],
                'composite_raw': composite['composite_raw'],
                'composite_percentile': composite['composite_percentile'],
                'fluency_band': composite['fluency_band'],
                'speech_ratio': vad['speech_ratio'],
                'mlu': vad['mlu'],
                'word_count': word_count,
                'pause_count': vad['pause_count'],
                'mean_pause_dur': vad['mean_pause_duration_sec'],
                'long_pause_ratio': vad['long_pause_ratio'],
                # Not every VAD mode emits short_pause_share — default to 0.
                'short_pause_share': vad.get('short_pause_share', 0),
                'speech_segments': vad['speech_segments'],
                'speech_duration_sec': vad['speech_duration_sec'],
                'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
                'boundary_pause_ratio': placement['boundary_pause_ratio'],
                'dim_continuity': composite['dim_continuity'],
                'dim_pause_quality': composite['dim_pause_quality'],
                'dim_articulation': composite['dim_articulation'],
                'dim_dominance': composite['dim_dominance'],
                'dim_placement': composite['dim_placement'],
                'dim_word_precision': composite['dim_word_precision'],
            })
        except Exception as e:
            # Best-effort per mode: count the failure, report only the first
            # few to keep the log readable, then keep processing.
            errors[mode] += 1
            if errors[mode] <= 3:
                print(f" ERROR [{mode}] {file_id}: {e}", flush=True)
# ── Timing summary and per-mode CSV export ──
total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for m in MODES:
    print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors")

# Materialise each mode's results as a DataFrame, persist it, and keep it
# around in `dfs` for the analysis sections below.
dfs = {}
for mode in MODES:
    frame = pd.DataFrame(all_results[mode])
    out_path = BASE / f"EDA/data/sandi_438_vad_{mode}.csv"
    frame.to_csv(out_path, index=False)
    dfs[mode] = frame
    print(f"Saved: {out_path}")
# ══════════════════════════════════════════════════════════════════
# ANALYSIS
# ══════════════════════════════════════════════════════════════════
def section(title):
    """Print *title* as a banner between two 80-character '=' rules."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(f" {title}")
    print(f"{rule}")
# Composite sub-dimension columns analysed throughout the report.
dims = ['dim_' + suffix for suffix in (
    'continuity', 'pause_quality', 'articulation',
    'dominance', 'placement', 'word_precision')]
# ── 1. Overall correlation ──
section("1. OVERALL CORRELATION WITH EXPERT SCORES")
print(f"\n {'Mode':<12s} {'Spearman':>10s} {'p':>10s} {'Kendall':>10s} {'p':>10s} {'N':>5s}")
print(f" {'-'*55}")
# Spearman rho per mode, kept for the bootstrap and final-summary sections.
rhos = {}
stat_rows = []
for mode in MODES:
    df = dfs[mode]
    rho, p_rho = spearmanr(df['expert_score'], df['composite_raw'])
    tau, p_tau = kendalltau(df['expert_score'], df['composite_raw'])
    rhos[mode] = rho
    stat_rows.append((mode, rho, p_rho, tau, p_tau, len(df)))
# BUG FIX: the '<-- BEST' marker used to be evaluated inside the loop above
# against max(rhos.values()) *so far*, so the first mode was always marked
# and several rows could be marked. Compute all rhos first, then flag the
# true best in a second pass.
best_rho = max(rhos.values())
for mode, rho, p_rho, tau, p_tau, n_files in stat_rows:
    marker = ' <-- BEST' if rho == best_rho else ''
    print(f" {mode:<12s} {rho:>+10.4f} {p_rho:>10.2e} {tau:>+10.4f} {p_tau:>10.2e} {n_files:>5d}{marker}")
# ── 2. Pairwise ordering accuracy ──
section("2. PAIRWISE ORDERING ACCURACY")
print(" (What % of speaker pairs are ordered correctly by the pipeline?)")
for mode in MODES:
    df = dfs[mode]
    e_vals = df['expert_score'].values
    c_vals = df['composite_raw'].values
    # All unordered pairs (i < j), vectorised instead of a nested Python loop.
    iu, ju = np.triu_indices(len(e_vals), k=1)
    e_diff = e_vals[iu] - e_vals[ju]
    c_diff = c_vals[iu] - c_vals[ju]
    # Pairs with equal expert scores are excluded from the accuracy basis.
    # A pair is concordant when both differences share a sign; a composite
    # tie on unequal expert scores counts as discordant.
    tied = int((e_diff == 0).sum())
    concordant = int((e_diff * c_diff > 0).sum())
    discordant = int(((e_diff != 0) & (e_diff * c_diff <= 0)).sum())
    total_pairs = concordant + discordant
    accuracy = concordant / total_pairs * 100 if total_pairs > 0 else 0
    print(f" {mode:<12s}: {concordant}/{total_pairs} pairs correct ({accuracy:.1f}%)"
          f" tied={tied}")
# ── 3. Quintile segregation ──
section("3. QUINTILE SEGREGATION")
for mode in MODES:
    df = dfs[mode].copy()
    # BUG FIX: pd.qcut(..., labels=[5 names], duplicates='drop') raises a
    # ValueError whenever a duplicate quantile edge actually gets dropped
    # (label count no longer matches bin count). Bin by integer code first,
    # then map codes to Q-labels, so tied expert scores degrade gracefully
    # to fewer quintiles instead of crashing.
    codes = pd.qcut(df['expert_score'], 5, labels=False, duplicates='drop')
    df['quintile'] = codes.map(lambda c: f'Q{int(c) + 1}' if pd.notna(c) else None)
    print(f"\n [{mode}]")
    print(f" {'Q':>4s} {'N':>4s} {'Expert':>8s} {'Composite':>10s} {'SR':>7s} "
          f"{'MLU':>7s} {'PauseDur':>9s} {'LPR':>7s}")
    print(f" {'-'*65}")
    # Per-quintile means of the composite and key VAD features.
    q_means = {}
    for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']:
        sub = df[df['quintile'] == q]
        if len(sub) == 0:
            continue
        q_means[q] = sub['composite_raw'].mean()
        print(f" {q:>4s} {len(sub):>4d} {sub['expert_score'].mean():>8.2f} "
              f"{sub['composite_raw'].mean():>+10.4f} {sub['speech_ratio'].mean():>7.3f} "
              f"{sub['mlu'].mean():>7.2f} {sub['mean_pause_dur'].mean():>9.4f} "
              f"{sub['long_pause_ratio'].mean():>7.4f}")
    # Adjacent-quintile effect sizes: Cohen's d plus a one-sided
    # Mann-Whitney U (H1: the higher quintile scores higher on the composite).
    print(f"\n Cohen's d (adjacent quintiles):")
    pairs = [('Q1', 'Q2'), ('Q2', 'Q3'), ('Q3', 'Q4'), ('Q4', 'Q5')]
    for q_lo, q_hi in pairs:
        a = df[df['quintile'] == q_lo]['composite_raw']
        b = df[df['quintile'] == q_hi]['composite_raw']
        if len(a) == 0 or len(b) == 0:
            continue
        pooled_std = np.sqrt((a.std()**2 + b.std()**2) / 2)
        d = (b.mean() - a.mean()) / pooled_std if pooled_std > 0 else 0
        u, p = mannwhitneyu(a, b, alternative='less')
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
        print(f" {q_lo}->{q_hi}: d={d:+.3f} p={p:.4f} {sig}")
    # Extreme-quintile spread and an omnibus Kruskal-Wallis test.
    if len(q_means) >= 2:
        qs = sorted(q_means.keys())
        spread = q_means[qs[-1]] - q_means[qs[0]]
        print(f" Q5-Q1 spread: {spread:+.4f}")
    groups = [df[df['quintile'] == q]['composite_raw'].values
              for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
              if len(df[df['quintile'] == q]) > 0]
    if len(groups) >= 2:
        H, p_kw = kruskal(*groups)
        print(f" Kruskal-Wallis H={H:.2f}, p={p_kw:.2e}")
# ── 4. Band classification accuracy ──
# Compares the pipeline's discrete fluency band against a banding of the
# expert score (thresholds defined in expert_band below).
section("4. FLUENCY BAND CLASSIFICATION")
def expert_band(score):
    """Map a numeric expert score onto the LOW/MEDIUM/HIGH band scheme.

    Thresholds: score < 3.0 -> 'LOW'; 3.0 <= score < 4.5 -> 'MEDIUM';
    score >= 4.5 -> 'HIGH'.
    """
    for cutoff, band in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < cutoff:
            return band
    return 'HIGH'
# Per-mode band accuracy and a 3x3 expert-vs-pipeline confusion matrix.
for mode in MODES:
    scored = dfs[mode].copy()
    scored['expert_band'] = scored['expert_score'].apply(expert_band)
    hits = (scored['expert_band'] == scored['fluency_band']).sum()
    total = len(scored)
    print(f"\n [{mode}] Accuracy: {hits}/{total} ({hits/total:.1%})")
    print(f" {'':>15} Pipeline-> {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    for eb in ('LOW', 'MEDIUM', 'HIGH'):
        counts = [((scored['expert_band'] == eb) & (scored['fluency_band'] == pb)).sum()
                  for pb in ('LOW', 'MEDIUM', 'HIGH')]
        print(f" Expert {eb:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")
# ── 5. Per-dimension correlations ──
# One row per composite sub-dimension, one Spearman rho column per VAD mode.
section("5. PER-DIMENSION CORRELATIONS")
header = f"\n {'Dimension':<25s}" + "".join(f" {mode:>10s}" for mode in MODES)
print(header)
print(f" {'-'*70}")
for dim in dims:
    cells = [f" {dim:<25s}"]
    for mode in MODES:
        pair = dfs[mode][['expert_score', dim]].dropna()
        r, p = spearmanr(pair['expert_score'], pair[dim])
        if p < 0.001:
            sig = '***'
        elif p < 0.01:
            sig = '**'
        elif p < 0.05:
            sig = '*'
        else:
            sig = ' ns'
        cells.append(f" {r:>+7.4f}{sig}")
    print("".join(cells))
# ── 6. Bootstrap significance ──
section("6. BOOTSTRAP SIGNIFICANCE TEST")
# Restrict to files every mode processed so the comparison is paired.
common_ids = set(dfs[MODES[0]]['file_id'])
for mode in MODES[1:]:
    common_ids &= set(dfs[mode]['file_id'])
common_ids = sorted(common_ids)
base_df = dfs['silero'].set_index('file_id').loc[common_ids]
expert = base_df['expert_score'].values
N = len(expert)
comp = {}
for mode in MODES:
    comp[mode] = dfs[mode].set_index('file_id').loc[common_ids]['composite_raw'].values
# Paired bootstrap: resample files with replacement, recompute rho per mode,
# and track each mode's rho delta vs the silero baseline on the SAME resample.
n_boot = 10000
rng = np.random.default_rng(42)   # fixed seed for reproducibility
boot_rho = {m: [] for m in MODES}
boot_delta = {m: [] for m in MODES if m != 'silero'}
for _ in range(n_boot):
    idx = rng.choice(N, N, replace=True)
    for mode in MODES:
        r, _ = spearmanr(expert[idx], comp[mode][idx])
        boot_rho[mode].append(r)
    for mode in [m for m in MODES if m != 'silero']:
        boot_delta[mode].append(boot_rho[mode][-1] - boot_rho['silero'][-1])
for mode in MODES:
    boot_rho[mode] = np.array(boot_rho[mode])
for mode in boot_delta:
    boot_delta[mode] = np.array(boot_delta[mode])
print(f"\n Paired comparison on N={N} common files:")
print(f"\n {'Mode':<12s} {'rho':>8s} {'95% CI':>20s} {'delta':>8s} {'p vs silero':>12s}")
print(f" {'-'*65}")
# BUG FIX: delta previously subtracted rhos['silero'] (computed on each
# mode's FULL file set in section 1) from r_obs (computed on the common
# subset), mixing two populations. Use the silero rho on the same common
# subset — which is also the population the bootstrap deltas are built from.
r_base, _ = spearmanr(expert, comp['silero'])
for mode in MODES:
    r_obs, _ = spearmanr(expert, comp[mode])
    ci = np.percentile(boot_rho[mode], [2.5, 97.5])
    if mode == 'silero':
        print(f" {mode:<12s} {r_obs:>+8.4f} [{ci[0]:.4f}, {ci[1]:.4f}] {'baseline':>8s} {'--':>12s}")
    else:
        delta = r_obs - r_base
        # One-sided bootstrap p: fraction of resamples where this mode
        # failed to beat the baseline.
        p_val = (boot_delta[mode] <= 0).mean()
        print(f" {mode:<12s} {r_obs:>+8.4f} [{ci[0]:.4f}, {ci[1]:.4f}] {delta:>+8.4f} p={p_val:.4f}")
# ── 7. Score ordering analysis ──
section("7. SCORE ORDERING ANALYSIS")
# BUG FIX: the banner contained mojibake ("β€”") where an em dash was
# intended — repaired.
print(" (Files sorted by expert score — how well does pipeline preserve order?)")
for mode in MODES:
    df = dfs[mode].sort_values('expert_score').reset_index(drop=True)
    # Rank correlation on the sorted frame (ranking is order-invariant, so
    # this equals the Spearman rho of the raw columns).
    exp_rank = df['expert_score'].rank(method='average')
    comp_rank = df['composite_raw'].rank(method='average')
    rho_rank, _ = spearmanr(exp_rank, comp_rank)
    # Monotonicity: among consecutive pairs with strictly increasing expert
    # scores, how many does the composite keep in (weakly) increasing order?
    # Hoisted to plain arrays — per-element .iloc in the original was O(n)
    # column lookups per access.
    exp_vals = df['expert_score'].to_numpy()
    comp_vals = df['composite_raw'].to_numpy()
    correct_order = 0
    total_consecutive = 0
    for i in range(len(df) - 1):
        if exp_vals[i + 1] > exp_vals[i]:
            total_consecutive += 1
            if comp_vals[i + 1] >= comp_vals[i]:
                correct_order += 1
    mono_pct = correct_order / total_consecutive * 100 if total_consecutive > 0 else 0
    print(f" {mode:<12s}: rank rho={rho_rank:.4f} "
          f"monotonic pairs: {correct_order}/{total_consecutive} ({mono_pct:.1f}%)")
# ── 8. Summary ──
section("FINAL SUMMARY")
# Best mode by the full-dataset Spearman rho from section 1.
best_mode = max(rhos, key=rhos.get)
# NOTE(review): N here is the common-file count from section 6, while rhos
# were computed on each mode's full file set — the header slightly mislabels
# the population; confirm whether the summary should use common-set rhos.
print(f"\n Results on SANDI dev N={N}:")
for mode in MODES:
    marker = ' <-- BEST' if mode == best_mode else ''
    print(f" {mode:<12s}: rho={rhos[mode]:+.4f}{marker}")
print(f"\n Best mode: {best_mode} (rho={rhos[best_mode]:.4f})")
if best_mode != 'silero':
    # Improvement over the silero baseline, with the one-sided bootstrap
    # p-value from section 6 (fraction of resamples where best_mode did
    # not beat silero).
    delta = rhos[best_mode] - rhos['silero']
    p_val = (boot_delta[best_mode] <= 0).mean()
    print(f" Improvement over baseline: +{delta:.4f} (bootstrap p={p_val:.4f})")
    if p_val < 0.05:
        print(f" -> SIGNIFICANT improvement. Adopt {best_mode}.")
    else:
        print(f" -> Not significant. More data needed or recalibrate pipeline.")
else:
    print(f" -> Silero baseline remains the best. No VAD swap needed.")
print(f"\nTotal time: {total_time/60:.1f} minutes")