# fluency-benchmark / run_selective_fusion_test.py
# Uploaded by syt20 as part of fluency_app_v3 (commit 63fae5b): updated
# models, new pipeline modules, experiments.
"""
Selective Feature Fusion VAD Test on SANDI 438.
4 VAD modes: silero (baseline), ten, marblenet, fusion (selective)
Uses cached transcriptions from prior run.
Reports:
1. Spearman rho/Kendall tau vs expert scores
2. Rank-order accuracy (Spearman + pairwise ordering)
3. Quintile segregation with Cohen's d and U-tests
4. Classification accuracy (band agreement + confusion matrix)
5. Per-dimension correlation breakdown
6. Bootstrap significance tests (fusion vs baseline)
7. Ordering analysis: how many pairs correctly ordered
Usage: python run_selective_fusion_test.py
"""
import sys, os, time, warnings, pickle
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(__file__))
from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu, kruskal
# ── Paths and input manifest ──
BASE = Path(__file__).parent.parent
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"SANDI files: {len(filelist)}")

# ── Verify transcription cache ──
# Count how many manifest entries already have a cached transcription pickle.
n_cached = 0
for _, entry in filelist.iterrows():
    if (CACHE_DIR / f"{entry['file_id']}_tx.pkl").exists():
        n_cached += 1
cached = n_cached
print(f"Cached transcriptions: {cached}/{len(filelist)}")
# Refuse to run unless at least half the files have been transcribed already.
if cached * 2 < len(filelist):
    print("ERROR: Run run_hybrid_vad_test.py --transcribe first")
    sys.exit(1)
# ── Load pipeline modules ──
# Deferred until after the cache check so a missing cache fails fast without
# paying the model-loading cost.
print("Loading models...", flush=True)
from pipeline.selective_fusion_vad import run_selective_vad   # mode-switchable VAD front-end
from pipeline.placement import classify_pauses                # pause placement classification
from pipeline.fa_features import compute_fa_features          # "fa" features (forced alignment? — confirm)
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict                          # model predictions over the feature dict
from pipeline.composite import compute_composite              # composite score, percentile, band, dims
print("Models loaded.\n")
# The four VAD configurations under test; 'silero' is the baseline.
MODES = ["silero", "ten", "marblenet", "fusion"]
all_results = {m: [] for m in MODES}   # per-mode list of per-file result dicts
errors = {m: 0 for m in MODES}         # per-mode failure counts
start_time = time.time()
# ── Per-file processing loop ──
# For each SANDI file: load the cached transcription, then run the pipeline
# once per VAD mode and collect features + composite scores.
for idx, row in filelist.iterrows():
    file_id = row['file_id']
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    # Skip files without a cached transcription (the earlier check only
    # required 50% coverage).
    if not cache_path.exists():
        continue
    audio_path = row['audio_path']
    # Manifest paths may be relative to the repo root.
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        continue
    # Cached transcription: a dict with at least 'words' and 'transcript'
    # keys (see usage below) — written by run_hybrid_vad_test.py.
    with open(cache_path, 'rb') as f:
        tx = pickle.load(f)
    words = tx['words']
    word_count = len(words)
    # Progress/ETA bookkeeping. Assumes filelist has a default RangeIndex so
    # idx + 1 is the 1-based position — TODO confirm if the CSV ever carries
    # a custom index.
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)
    eta = (len(filelist) - n) / max(rate, 0.01)
    if n % 50 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {file_id} "
              f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)
    for mode in MODES:
        try:
            # VAD is the only stage that differs between modes; the cached
            # transcription is shared across all four.
            vad = run_selective_vad(audio_path, mode=mode)
            # Words per speech segment, stored as 'mlu' (presumably mean
            # length of utterance — confirm with pipeline docs).
            vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
            placement = classify_pauses(words, vad)
            fa = compute_fa_features(words, vad['total_duration_sec'])
            syn = compute_syntactic_features(words, tx['transcript'])
            # Later dicts win on key collisions (vad < placement < fa < syn).
            all_features = {**vad, **placement, **fa, **syn}
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)
            all_results[mode].append({
                'file_id': file_id,
                'expert_score': row['expert_score'],
                'composite_raw': composite['composite_raw'],
                'composite_percentile': composite['composite_percentile'],
                'fluency_band': composite['fluency_band'],
                'speech_ratio': vad['speech_ratio'],
                'mlu': vad['mlu'],
                'word_count': word_count,
                'pause_count': vad['pause_count'],
                'mean_pause_dur': vad['mean_pause_duration_sec'],
                'long_pause_ratio': vad['long_pause_ratio'],
                # Not every VAD mode emits short_pause_share — default to 0.
                'short_pause_share': vad.get('short_pause_share', 0),
                'speech_segments': vad['speech_segments'],
                'speech_duration_sec': vad['speech_duration_sec'],
                'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
                'boundary_pause_ratio': placement['boundary_pause_ratio'],
                'dim_continuity': composite['dim_continuity'],
                'dim_pause_quality': composite['dim_pause_quality'],
                'dim_articulation': composite['dim_articulation'],
                'dim_dominance': composite['dim_dominance'],
                'dim_placement': composite['dim_placement'],
                'dim_word_precision': composite['dim_word_precision'],
            })
        except Exception as e:
            # Best-effort per mode: count the failure, report only the first
            # few to keep the log readable, then keep processing.
            errors[mode] += 1
            if errors[mode] <= 3:
                print(f" ERROR [{mode}] {file_id}: {e}", flush=True)
# ── Timing summary and per-mode CSV export ──
total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for m in MODES:
    print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors")

# Materialise each mode's results as a DataFrame, persist it, and keep it
# around in `dfs` for the analysis sections below.
dfs = {}
for mode in MODES:
    frame = pd.DataFrame(all_results[mode])
    out_path = BASE / f"EDA/data/sandi_438_vad_{mode}.csv"
    frame.to_csv(out_path, index=False)
    dfs[mode] = frame
    print(f"Saved: {out_path}")
# ══════════════════════════════════════════════════════════════════
# ANALYSIS
# ══════════════════════════════════════════════════════════════════
def section(title):
    """Print *title* as a banner between two 80-character '=' rules."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(f" {title}")
    print(f"{rule}")
# Composite sub-dimension columns analysed throughout the report.
dims = ['dim_' + suffix for suffix in (
    'continuity', 'pause_quality', 'articulation',
    'dominance', 'placement', 'word_precision')]
# ── 1. Overall correlation ──
section("1. OVERALL CORRELATION WITH EXPERT SCORES")
print(f"\n {'Mode':<12s} {'Spearman':>10s} {'p':>10s} {'Kendall':>10s} {'p':>10s} {'N':>5s}")
print(f" {'-'*55}")
# Spearman rho per mode, kept for the bootstrap and final-summary sections.
rhos = {}
stat_rows = []
for mode in MODES:
    df = dfs[mode]
    rho, p_rho = spearmanr(df['expert_score'], df['composite_raw'])
    tau, p_tau = kendalltau(df['expert_score'], df['composite_raw'])
    rhos[mode] = rho
    stat_rows.append((mode, rho, p_rho, tau, p_tau, len(df)))
# BUG FIX: the '<-- BEST' marker used to be evaluated inside the loop above
# against max(rhos.values()) *so far*, so the first mode was always marked
# and several rows could be marked. Compute all rhos first, then flag the
# true best in a second pass.
best_rho = max(rhos.values())
for mode, rho, p_rho, tau, p_tau, n_files in stat_rows:
    marker = ' <-- BEST' if rho == best_rho else ''
    print(f" {mode:<12s} {rho:>+10.4f} {p_rho:>10.2e} {tau:>+10.4f} {p_tau:>10.2e} {n_files:>5d}{marker}")
# ── 2. Pairwise ordering accuracy ──
section("2. PAIRWISE ORDERING ACCURACY")
print(" (What % of speaker pairs are ordered correctly by the pipeline?)")
for mode in MODES:
    df = dfs[mode]
    e_vals = df['expert_score'].values
    c_vals = df['composite_raw'].values
    # All unordered pairs (i < j), vectorised instead of a nested Python loop.
    iu, ju = np.triu_indices(len(e_vals), k=1)
    e_diff = e_vals[iu] - e_vals[ju]
    c_diff = c_vals[iu] - c_vals[ju]
    # Pairs with equal expert scores are excluded from the accuracy basis.
    # A pair is concordant when both differences share a sign; a composite
    # tie on unequal expert scores counts as discordant.
    tied = int((e_diff == 0).sum())
    concordant = int((e_diff * c_diff > 0).sum())
    discordant = int(((e_diff != 0) & (e_diff * c_diff <= 0)).sum())
    total_pairs = concordant + discordant
    accuracy = concordant / total_pairs * 100 if total_pairs > 0 else 0
    print(f" {mode:<12s}: {concordant}/{total_pairs} pairs correct ({accuracy:.1f}%)"
          f" tied={tied}")
# ── 3. Quintile segregation ──
section("3. QUINTILE SEGREGATION")
for mode in MODES:
    df = dfs[mode].copy()
    # BUG FIX: pd.qcut(..., labels=[5 names], duplicates='drop') raises a
    # ValueError whenever a duplicate quantile edge actually gets dropped
    # (label count no longer matches bin count). Bin by integer code first,
    # then map codes to Q-labels, so tied expert scores degrade gracefully
    # to fewer quintiles instead of crashing.
    codes = pd.qcut(df['expert_score'], 5, labels=False, duplicates='drop')
    df['quintile'] = codes.map(lambda c: f'Q{int(c) + 1}' if pd.notna(c) else None)
    print(f"\n [{mode}]")
    print(f" {'Q':>4s} {'N':>4s} {'Expert':>8s} {'Composite':>10s} {'SR':>7s} "
          f"{'MLU':>7s} {'PauseDur':>9s} {'LPR':>7s}")
    print(f" {'-'*65}")
    # Per-quintile means of the composite and key VAD features.
    q_means = {}
    for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']:
        sub = df[df['quintile'] == q]
        if len(sub) == 0:
            continue
        q_means[q] = sub['composite_raw'].mean()
        print(f" {q:>4s} {len(sub):>4d} {sub['expert_score'].mean():>8.2f} "
              f"{sub['composite_raw'].mean():>+10.4f} {sub['speech_ratio'].mean():>7.3f} "
              f"{sub['mlu'].mean():>7.2f} {sub['mean_pause_dur'].mean():>9.4f} "
              f"{sub['long_pause_ratio'].mean():>7.4f}")
    # Adjacent-quintile effect sizes: Cohen's d plus a one-sided
    # Mann-Whitney U (H1: the higher quintile scores higher on the composite).
    print(f"\n Cohen's d (adjacent quintiles):")
    pairs = [('Q1', 'Q2'), ('Q2', 'Q3'), ('Q3', 'Q4'), ('Q4', 'Q5')]
    for q_lo, q_hi in pairs:
        a = df[df['quintile'] == q_lo]['composite_raw']
        b = df[df['quintile'] == q_hi]['composite_raw']
        if len(a) == 0 or len(b) == 0:
            continue
        pooled_std = np.sqrt((a.std()**2 + b.std()**2) / 2)
        d = (b.mean() - a.mean()) / pooled_std if pooled_std > 0 else 0
        u, p = mannwhitneyu(a, b, alternative='less')
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
        print(f" {q_lo}->{q_hi}: d={d:+.3f} p={p:.4f} {sig}")
    # Extreme-quintile spread and an omnibus Kruskal-Wallis test.
    if len(q_means) >= 2:
        qs = sorted(q_means.keys())
        spread = q_means[qs[-1]] - q_means[qs[0]]
        print(f" Q5-Q1 spread: {spread:+.4f}")
    groups = [df[df['quintile'] == q]['composite_raw'].values
              for q in ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
              if len(df[df['quintile'] == q]) > 0]
    if len(groups) >= 2:
        H, p_kw = kruskal(*groups)
        print(f" Kruskal-Wallis H={H:.2f}, p={p_kw:.2e}")
# ── 4. Band classification accuracy ──
# Compares the pipeline's discrete fluency band against a banding of the
# expert score (thresholds defined in expert_band below).
section("4. FLUENCY BAND CLASSIFICATION")
def expert_band(score):
    """Map a numeric expert score onto the LOW/MEDIUM/HIGH band scheme.

    Thresholds: score < 3.0 -> 'LOW'; 3.0 <= score < 4.5 -> 'MEDIUM';
    score >= 4.5 -> 'HIGH'.
    """
    for cutoff, band in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < cutoff:
            return band
    return 'HIGH'
# Per-mode band accuracy and a 3x3 expert-vs-pipeline confusion matrix.
for mode in MODES:
    scored = dfs[mode].copy()
    scored['expert_band'] = scored['expert_score'].apply(expert_band)
    hits = (scored['expert_band'] == scored['fluency_band']).sum()
    total = len(scored)
    print(f"\n [{mode}] Accuracy: {hits}/{total} ({hits/total:.1%})")
    print(f" {'':>15} Pipeline-> {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    for eb in ('LOW', 'MEDIUM', 'HIGH'):
        counts = [((scored['expert_band'] == eb) & (scored['fluency_band'] == pb)).sum()
                  for pb in ('LOW', 'MEDIUM', 'HIGH')]
        print(f" Expert {eb:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")
# ── 5. Per-dimension correlations ──
# One row per composite sub-dimension, one Spearman rho column per VAD mode.
section("5. PER-DIMENSION CORRELATIONS")
header = f"\n {'Dimension':<25s}" + "".join(f" {mode:>10s}" for mode in MODES)
print(header)
print(f" {'-'*70}")
for dim in dims:
    cells = [f" {dim:<25s}"]
    for mode in MODES:
        pair = dfs[mode][['expert_score', dim]].dropna()
        r, p = spearmanr(pair['expert_score'], pair[dim])
        if p < 0.001:
            sig = '***'
        elif p < 0.01:
            sig = '**'
        elif p < 0.05:
            sig = '*'
        else:
            sig = ' ns'
        cells.append(f" {r:>+7.4f}{sig}")
    print("".join(cells))
# ── 6. Bootstrap significance ──
section("6. BOOTSTRAP SIGNIFICANCE TEST")
# Restrict to files every mode processed so the comparison is paired.
common_ids = set(dfs[MODES[0]]['file_id'])
for mode in MODES[1:]:
    common_ids &= set(dfs[mode]['file_id'])
common_ids = sorted(common_ids)
base_df = dfs['silero'].set_index('file_id').loc[common_ids]
expert = base_df['expert_score'].values
N = len(expert)
comp = {}
for mode in MODES:
    comp[mode] = dfs[mode].set_index('file_id').loc[common_ids]['composite_raw'].values
# Paired bootstrap: resample files with replacement, recompute rho per mode,
# and track each mode's rho delta vs the silero baseline on the SAME resample.
n_boot = 10000
rng = np.random.default_rng(42)   # fixed seed for reproducibility
boot_rho = {m: [] for m in MODES}
boot_delta = {m: [] for m in MODES if m != 'silero'}
for _ in range(n_boot):
    idx = rng.choice(N, N, replace=True)
    for mode in MODES:
        r, _ = spearmanr(expert[idx], comp[mode][idx])
        boot_rho[mode].append(r)
    for mode in [m for m in MODES if m != 'silero']:
        boot_delta[mode].append(boot_rho[mode][-1] - boot_rho['silero'][-1])
for mode in MODES:
    boot_rho[mode] = np.array(boot_rho[mode])
for mode in boot_delta:
    boot_delta[mode] = np.array(boot_delta[mode])
print(f"\n Paired comparison on N={N} common files:")
print(f"\n {'Mode':<12s} {'rho':>8s} {'95% CI':>20s} {'delta':>8s} {'p vs silero':>12s}")
print(f" {'-'*65}")
# BUG FIX: delta previously subtracted rhos['silero'] (computed on each
# mode's FULL file set in section 1) from r_obs (computed on the common
# subset), mixing two populations. Use the silero rho on the same common
# subset — which is also the population the bootstrap deltas are built from.
r_base, _ = spearmanr(expert, comp['silero'])
for mode in MODES:
    r_obs, _ = spearmanr(expert, comp[mode])
    ci = np.percentile(boot_rho[mode], [2.5, 97.5])
    if mode == 'silero':
        print(f" {mode:<12s} {r_obs:>+8.4f} [{ci[0]:.4f}, {ci[1]:.4f}] {'baseline':>8s} {'--':>12s}")
    else:
        delta = r_obs - r_base
        # One-sided bootstrap p: fraction of resamples where this mode
        # failed to beat the baseline.
        p_val = (boot_delta[mode] <= 0).mean()
        print(f" {mode:<12s} {r_obs:>+8.4f} [{ci[0]:.4f}, {ci[1]:.4f}] {delta:>+8.4f} p={p_val:.4f}")
# ── 7. Score ordering analysis ──
section("7. SCORE ORDERING ANALYSIS")
# BUG FIX: the banner contained mojibake ("β€”") where an em dash was
# intended — repaired.
print(" (Files sorted by expert score — how well does pipeline preserve order?)")
for mode in MODES:
    df = dfs[mode].sort_values('expert_score').reset_index(drop=True)
    # Rank correlation on the sorted frame (ranking is order-invariant, so
    # this equals the Spearman rho of the raw columns).
    exp_rank = df['expert_score'].rank(method='average')
    comp_rank = df['composite_raw'].rank(method='average')
    rho_rank, _ = spearmanr(exp_rank, comp_rank)
    # Monotonicity: among consecutive pairs with strictly increasing expert
    # scores, how many does the composite keep in (weakly) increasing order?
    # Hoisted to plain arrays — per-element .iloc in the original was O(n)
    # column lookups per access.
    exp_vals = df['expert_score'].to_numpy()
    comp_vals = df['composite_raw'].to_numpy()
    correct_order = 0
    total_consecutive = 0
    for i in range(len(df) - 1):
        if exp_vals[i + 1] > exp_vals[i]:
            total_consecutive += 1
            if comp_vals[i + 1] >= comp_vals[i]:
                correct_order += 1
    mono_pct = correct_order / total_consecutive * 100 if total_consecutive > 0 else 0
    print(f" {mode:<12s}: rank rho={rho_rank:.4f} "
          f"monotonic pairs: {correct_order}/{total_consecutive} ({mono_pct:.1f}%)")
# ── 8. Summary ──
section("FINAL SUMMARY")
# Best mode by the full-dataset Spearman rho from section 1.
best_mode = max(rhos, key=rhos.get)
# NOTE(review): N here is the common-file count from section 6, while rhos
# were computed on each mode's full file set — the header slightly mislabels
# the population; confirm whether the summary should use common-set rhos.
print(f"\n Results on SANDI dev N={N}:")
for mode in MODES:
    marker = ' <-- BEST' if mode == best_mode else ''
    print(f" {mode:<12s}: rho={rhos[mode]:+.4f}{marker}")
print(f"\n Best mode: {best_mode} (rho={rhos[best_mode]:.4f})")
if best_mode != 'silero':
    # Improvement over the silero baseline, with the one-sided bootstrap
    # p-value from section 6 (fraction of resamples where best_mode did
    # not beat silero).
    delta = rhos[best_mode] - rhos['silero']
    p_val = (boot_delta[best_mode] <= 0).mean()
    print(f" Improvement over baseline: +{delta:.4f} (bootstrap p={p_val:.4f})")
    if p_val < 0.05:
        print(f" -> SIGNIFICANT improvement. Adopt {best_mode}.")
    else:
        print(f" -> Not significant. More data needed or recalibrate pipeline.")
else:
    print(f" -> Silero baseline remains the best. No VAD swap needed.")
print(f"\nTotal time: {total_time/60:.1f} minutes")