Spaces:
Sleeping
Sleeping
| """ | |
| Hybrid V2 Validation: Test the fixed hybrid VAD against silero baseline. | |
| Uses cached transcriptions from prior runs. Only swaps the VAD component. | |
| Compares silero vs hybrid_v2 on SANDI dev-438 with full band analysis. | |
| """ | |
# --- Environment, data, and model setup ---------------------------------
# Standard library
import os
import pickle
import sys
import time
import warnings

# Third-party
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
# Make sibling `pipeline`/`models` packages importable from this script's dir.
sys.path.insert(0, os.path.dirname(__file__))

from pathlib import Path
from scipy.stats import spearmanr, kendalltau, mannwhitneyu

BASE = Path(__file__).parent.parent
CACHE_DIR = BASE / "EDA/data/hybrid_vad_cache"

# Evaluation set: the SANDI dev split (438 files).
filelist = pd.read_csv(BASE / "EDA/data/sandi_dev_438_filelist.csv")
print(f"SANDI files: {len(filelist)}")

# How many files already have a cached transcription pickle from prior runs.
cached = sum((CACHE_DIR / f"{fid}_tx.pkl").exists()
             for fid in filelist['file_id'])
print(f"Cached transcriptions: {cached}/{len(filelist)}")

# Heavy pipeline imports are deliberately deferred until after the banner,
# so the user sees progress before model loading starts.
print("Loading models...", flush=True)
from pipeline.hybrid_vad import run_hybrid_vad
from pipeline.placement import classify_pauses
from pipeline.fa_features import compute_fa_features
from pipeline.syntactic_features import compute_syntactic_features
from models.inference import predict
from pipeline.composite import compute_composite
print("Models loaded.\n")

# The two VAD modes freshly evaluated in this run.
MODES = ["silero", "hybrid_v2"]
all_results = {m: [] for m in MODES}
errors = {m: 0 for m in MODES}
start_time = time.time()
# Main scoring loop: for each SANDI file with a cached transcription, run
# BOTH VAD modes and push the resulting features through the shared
# downstream pipeline (placement -> FA -> syntax -> model -> composite).
# Transcriptions are reused from cache, so VAD is the only variable.
for idx, row in filelist.iterrows():
    file_id = row['file_id']
    cache_path = CACHE_DIR / f"{file_id}_tx.pkl"
    # Skip files without a cached transcription (nothing to re-score).
    if not cache_path.exists():
        continue
    audio_path = row['audio_path']
    # Filelist paths may be relative to the repo root; normalize them.
    if not os.path.isabs(audio_path):
        audio_path = str(BASE / audio_path)
    if not os.path.exists(audio_path):
        continue
    # NOTE(review): pickle is only safe here because the cache is a
    # trusted local artifact produced by prior runs of this project.
    with open(cache_path, 'rb') as f:
        tx = pickle.load(f)
    words = tx['words']
    word_count = len(words)
    # Progress/ETA bookkeeping; `idx` is the filelist row index, so `n`
    # counts rows seen (including skipped ones).
    n = idx + 1
    elapsed = time.time() - start_time
    rate = n / max(elapsed, 1)  # files/sec, guarded against division by ~0
    eta = (len(filelist) - n) / max(rate, 0.01)
    # Print at the start (first 3), every 50th file, and at the end.
    if n % 50 == 0 or n <= 3 or n == len(filelist):
        print(f" [{n}/{len(filelist)}] {file_id} "
              f"[{elapsed/60:.1f}m, ~{eta/60:.0f}m left]", flush=True)
    for mode in MODES:
        try:
            # VAD is the experimental variable; all later stages are shared.
            vad = run_hybrid_vad(audio_path, mode=mode)
            # Mean length of utterance: words per detected speech segment
            # (denominator clamped to 1 to avoid division by zero).
            vad['mlu'] = round(word_count / max(vad['speech_segments'], 1), 2)
            placement = classify_pauses(words, vad)
            fa = compute_fa_features(words, vad['total_duration_sec'])
            syn = compute_syntactic_features(words, tx['transcript'])
            all_features = {**vad, **placement, **fa, **syn}
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)
            # Flatten the per-file record used by all downstream analyses.
            all_results[mode].append({
                'file_id': file_id,
                'expert_score': row['expert_score'],
                'composite_raw': composite['composite_raw'],
                'composite_percentile': composite['composite_percentile'],
                'fluency_band': composite['fluency_band'],
                'speech_ratio': vad['speech_ratio'],
                'mlu': vad['mlu'],
                'word_count': word_count,
                'pause_count': vad['pause_count'],
                'mean_pause_dur': vad['mean_pause_duration_sec'],
                'long_pause_ratio': vad['long_pause_ratio'],
                'short_pause_share': vad.get('short_pause_share', 0),
                'speech_segments': vad['speech_segments'],
                'speech_duration_sec': vad['speech_duration_sec'],
                'mid_clause_pause_ratio': placement['mid_clause_pause_ratio'],
                'boundary_pause_ratio': placement['boundary_pause_ratio'],
                'dim_continuity': composite['dim_continuity'],
                'dim_pause_quality': composite['dim_pause_quality'],
                'dim_articulation': composite['dim_articulation'],
                'dim_dominance': composite['dim_dominance'],
                'dim_placement': composite['dim_placement'],
                'dim_word_precision': composite['dim_word_precision'],
            })
        except Exception as e:
            # Best-effort batch run: count the failure per mode, but only
            # print the first five to keep the log readable.
            errors[mode] += 1
            if errors[mode] <= 5:
                print(f" ERROR [{mode}] {file_id}: {e}", flush=True)
# --- Run summary and DataFrame assembly ---------------------------------
total_time = time.time() - start_time
print(f"\nDone in {total_time/60:.1f} minutes")
for m in MODES:
    print(f" {m}: {len(all_results[m])} processed, {errors[m]} errors")

# One DataFrame per freshly-run VAD mode.
dfs = {mode: pd.DataFrame(all_results[mode]) for mode in MODES}

# The previous hybrid run (v1) comes from a cached CSV so all three
# variants can be compared side by side.
old_hybrid = pd.read_csv(BASE / "EDA/data/sandi_438_vad_hybrid.csv")
dfs["hybrid_v1"] = old_hybrid
COMPARE_MODES = ["silero", "hybrid_v2", "hybrid_v1"]
def expert_band(score):
    """Map a numeric expert score to a band: <3.0 LOW, <4.5 MEDIUM, else HIGH."""
    # Check cutoffs in ascending order; anything not below a cutoff
    # (including NaN, which fails every `<` comparison) falls through to HIGH.
    for cutoff, band in ((3.0, 'LOW'), (4.5, 'MEDIUM')):
        if score < cutoff:
            return band
    return 'HIGH'
# Attach the expert-derived band label to every comparison frame.
for mode in COMPARE_MODES:
    dfs[mode]['expert_band'] = dfs[mode]['expert_score'].map(expert_band)
# Paired-sample size across the two fresh runs (not read again in this
# script; kept for interactive inspection).
N = min(len(dfs["silero"]), len(dfs["hybrid_v2"]))
def section(title):
    """Print a banner for *title*: blank line, ruler, title, ruler."""
    ruler = '=' * 80
    print(f"\n{ruler}")
    print(f" {title}")
    print(ruler)
| BANDS = ["LOW", "MEDIUM", "HIGH"] | |
| # ── 1. OVERALL ── | |
| section("1. OVERALL METRICS") | |
| for mode in COMPARE_MODES: | |
| df = dfs[mode] | |
| rho, p = spearmanr(df['expert_score'], df['composite_raw']) | |
| agree = (df['expert_band'] == df['fluency_band']).sum() | |
| acc = agree / len(df) | |
| # Macro F1 | |
| f1s = [] | |
| for band in BANDS: | |
| tp = ((df['expert_band'] == band) & (df['fluency_band'] == band)).sum() | |
| fp = ((df['expert_band'] != band) & (df['fluency_band'] == band)).sum() | |
| fn = ((df['expert_band'] == band) & (df['fluency_band'] != band)).sum() | |
| prec = tp/(tp+fp) if (tp+fp) > 0 else 0 | |
| rec = tp/(tp+fn) if (tp+fn) > 0 else 0 | |
| f1 = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0 | |
| f1s.append(f1) | |
| macro_f1 = np.mean(f1s) | |
| marker = "" | |
| if mode == "hybrid_v2": | |
| marker = " ◄ NEW" | |
| elif mode == "hybrid_v1": | |
| marker = " (old)" | |
| print(f" {mode:<12s} ρ={rho:+.4f} acc={acc:.1%} macro-F1={macro_f1:.3f}{marker}") | |
# ── 2. CONFUSION MATRICES ──
section("2. CONFUSION MATRICES")
for mode in COMPARE_MODES:
    df = dfs[mode]
    agree = (df['expert_band'] == df['fluency_band']).sum()
    print(f"\n [{mode}] Accuracy: {agree}/{len(df)} ({agree/len(df):.1%})")
    print(f" {'':>16} Pipeline→ {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    # One printed row per expert band; columns are the pipeline's bands.
    for eb in BANDS:
        counts = [((df['expert_band'] == eb) & (df['fluency_band'] == pb)).sum()
                  for pb in BANDS]
        print(f" Expert {eb:>6}: {counts[0]:>5} {counts[1]:>5} {counts[2]:>5}")
# ── 3. KEY METRICS ──
section("3. SEGMENT COUNT & MLU COMPARISON")
print(f"\n {'Mode':<12s} {'Segments':>10s} {'MLU':>10s} {'SR':>10s} {'PauseDur':>10s} {'LPR':>10s}")
print(f" {'-'*65}")
# Dataset-wide means of the VAD-sensitive features, one row per mode.
_COLS = ('speech_segments', 'mlu', 'speech_ratio',
         'mean_pause_dur', 'long_pause_ratio')
for mode in COMPARE_MODES:
    seg, mlu, sr, pdur, lpr = (dfs[mode][c].mean() for c in _COLS)
    print(f" {mode:<12s} {seg:>10.2f} {mlu:>10.2f} "
          f"{sr:>10.4f} {pdur:>10.4f} "
          f"{lpr:>10.4f}")
# ── 4. PER-BAND ANALYSIS ──
section("4. PER-BAND FEATURE COMPARISON")
for band in BANDS:
    print(f"\n [{band}]")
    print(f" {'Mode':<12s} {'N':>4s} {'Segments':>10s} {'MLU':>10s} {'MED→HIGH':>10s} {'Composite':>10s}")
    print(f" {'-'*60}")
    for mode in COMPARE_MODES:
        df = dfs[mode]
        sub = df[df['expert_band'] == band]
        # Fifth column shows the dominant misclassification for this band:
        # MEDIUM files scored HIGH, or (for HIGH) files demoted to MEDIUM.
        if band == "MEDIUM":
            med_as_high = str(((df['expert_band'] == 'MEDIUM') & (df['fluency_band'] == 'HIGH')).sum())
        elif band == "HIGH":
            med_as_high = f"H→M:{((df['expert_band'] == 'HIGH') & (df['fluency_band'] == 'MEDIUM')).sum()}"
        else:
            med_as_high = "N/A"
        print(f" {mode:<12s} {len(sub):>4d} {sub['speech_segments'].mean():>10.2f} "
              f"{sub['mlu'].mean():>10.2f} {med_as_high:>10s} {sub['composite_raw'].mean():>+10.4f}")
# ── 5. THRESHOLD CROSSINGS ──
section("5. THRESHOLD CROSSINGS (HIGH: MLU≥7 + SR≥0.75 + LPR≤0.15)")
for mode in COMPARE_MODES:
    df = dfs[mode]
    med = df[df['expert_band'] == 'MEDIUM']
    # MEDIUM files that clear all three HIGH-band gates simultaneously.
    meets_all = ((med['mlu'] >= 7)
                 & (med['speech_ratio'] >= 0.75)
                 & (med['long_pause_ratio'] <= 0.15))
    n_all3 = meets_all.sum()
    n_mh = ((df['expert_band'] == 'MEDIUM') & (df['fluency_band'] == 'HIGH')).sum()
    print(f" {mode:<12s} MEDIUM meeting all 3: {n_all3}/{len(med)} MEDIUM→HIGH: {n_mh}")
# ── 6. BOOTSTRAP ──
section("6. BOOTSTRAP (hybrid_v2 vs silero)")
if "hybrid_v2" in dfs and "silero" in dfs:
    # Pair the two runs on file_id so the bootstrap resamples files, not rows.
    common = dfs['silero'][['file_id', 'expert_score', 'composite_raw']].merge(
        dfs['hybrid_v2'][['file_id', 'composite_raw']],
        on='file_id', suffixes=('_sil', '_v2'))
    expert = common['expert_score'].values
    comp_s = common['composite_raw_sil'].values
    comp_v = common['composite_raw_v2'].values
    NC = len(common)
    rho_s, _ = spearmanr(expert, comp_s)
    rho_v, _ = spearmanr(expert, comp_v)
    # Paired bootstrap: resample files with replacement, recompute both
    # correlations on the same resample, record the difference.
    # Fixed seed keeps the run reproducible.
    n_boot = 10000
    rng = np.random.default_rng(42)
    deltas = np.zeros(n_boot)
    for i in range(n_boot):
        sample = rng.choice(NC, NC, replace=True)
        rho_sil_b, _ = spearmanr(expert[sample], comp_s[sample])
        rho_v2_b, _ = spearmanr(expert[sample], comp_v[sample])
        deltas[i] = rho_v2_b - rho_sil_b
    # One-sided p: fraction of resamples where hybrid_v2 is not better.
    p_val = (deltas <= 0).mean()
    ci = np.percentile(deltas, [2.5, 97.5])
    print(f"\n N={NC} paired files")
    print(f" Silero: ρ = {rho_s:.4f}")
    print(f" Hybrid V2: ρ = {rho_v:.4f}")
    print(f" Δρ = {rho_v - rho_s:+.4f} 95% CI [{ci[0]:+.4f}, {ci[1]:+.4f}] p={p_val:.4f}")
# ── 7. VERDICT ──
section("VERDICT")
df_sil = dfs['silero']
df_v2 = dfs['hybrid_v2']
# Headline stats for each fresh run: rank correlation and band accuracy.
rho_s, _ = spearmanr(df_sil['expert_score'], df_sil['composite_raw'])
rho_v, _ = spearmanr(df_v2['expert_score'], df_v2['composite_raw'])
acc_s = (df_sil['expert_band'] == df_sil['fluency_band']).sum() / len(df_sil)
acc_v = (df_v2['expert_band'] == df_v2['fluency_band']).sum() / len(df_v2)
# MEDIUM files mislabelled HIGH — the false-positive mode V2 targets.
mh_s = ((df_sil['expert_band'] == 'MEDIUM') & (df_sil['fluency_band'] == 'HIGH')).sum()
mh_v = ((df_v2['expert_band'] == 'MEDIUM') & (df_v2['fluency_band'] == 'HIGH')).sum()
# The v1 CSV may predate the expert_band column, in which case "?" is shown.
mh_v1 = ((old_hybrid['expert_band'] == 'MEDIUM') & (old_hybrid['fluency_band'] == 'HIGH')).sum() if 'expert_band' in old_hybrid.columns else "?"
seg_s = df_sil['speech_segments'].mean()
seg_v = df_v2['speech_segments'].mean()
mlu_s = df_sil['mlu'].mean()
mlu_v = df_v2['mlu'].mean()
print(f"\n METRIC COMPARISON:")
print(f" {'Metric':<25s} {'Silero':>10s} {'Hybrid V2':>10s} {'Hybrid V1':>10s} {'V2 vs Sil':>10s}")
print(f" {'-'*70}")
print(f" {'Spearman ρ':<25s} {rho_s:>+10.4f} {rho_v:>+10.4f} {'--':>10s} {rho_v-rho_s:>+10.4f}")
print(f" {'Accuracy':<25s} {acc_s:>10.1%} {acc_v:>10.1%} {'--':>10s} {(acc_v-acc_s)*100:>+10.1f}pp")
# BUG FIX: mh_v1 may be the string "?" (fallback above); the previous
# "{mh_v1:>10d}" int format raised ValueError in that case. Right-aligned
# string formatting prints identically for integers and tolerates "?".
print(f" {'MEDIUM→HIGH FP':<25s} {mh_s:>10d} {mh_v:>10d} {str(mh_v1):>10s} {mh_v-mh_s:>+10d}")
print(f" {'Mean segments':<25s} {seg_s:>10.2f} {seg_v:>10.2f} {'--':>10s} {seg_v-seg_s:>+10.2f}")
print(f" {'Mean MLU':<25s} {mlu_s:>10.2f} {mlu_v:>10.2f} {'--':>10s} {mlu_v-mlu_s:>+10.2f}")
# Qualitative verdict lines on the two headline criteria.
if mh_v < mh_s:
    print(f"\n ✅ MEDIUM→HIGH false positives REDUCED from {mh_s} to {mh_v} ({mh_s-mh_v} fewer)")
elif mh_v == mh_s:
    print(f"\n → MEDIUM→HIGH false positives UNCHANGED at {mh_v}")
else:
    print(f"\n ⚠ MEDIUM→HIGH false positives INCREASED from {mh_s} to {mh_v}")
# Small tolerance (0.01) so trivial correlation noise isn't flagged.
if rho_v >= rho_s - 0.01:
    print(f" ✅ Spearman ρ maintained ({rho_v:+.4f} vs {rho_s:+.4f})")
else:
    print(f" ⚠ Spearman ρ degraded ({rho_v:+.4f} vs {rho_s:+.4f})")
print(f"\n Total time: {total_time/60:.1f} minutes")