Spaces:
Sleeping
Sleeping
"""
Hybrid V2 Simulation: Model the effect of the anti-merging fix analytically.

METHODOLOGY:
The counterfactual analysis proved that MLU inflation is 100% caused by
segment merging, not by boundary refinement:
- With Silero segments, hybrid MLU = Silero MLU exactly
- The boundary refinement extends speech by only 0.58s on average

This script simulates what hybrid_v2 would produce by:
1. Using hybrid's speech boundaries (speech_ratio, speech_duration_sec)
2. Using Silero's segment count (no merging)
3. Recomputing ALL downstream features through the full pipeline

This is mathematically equivalent to hybrid_v2's constrained refinement +
evidence-gated merge, because V2 preserves Silero's segment count while
keeping MarbleNet's boundary precision.
"""
import sys, os
import numpy as np
import pandas as pd
# Make sibling modules importable when the script is run directly.
sys.path.insert(0, os.path.dirname(__file__))
from pathlib import Path
# NOTE(review): mannwhitneyu is imported but never used in this file — confirm before removing.
from scipy.stats import spearmanr, mannwhitneyu
# Repo root: this script is assumed to live one directory below it.
BASE = Path(__file__).parent.parent
# Load existing data: one row per SANDI recording, one table per VAD pipeline.
sil = pd.read_csv(BASE / "EDA/data/sandi_438_vad_silero.csv")
hyb = pd.read_csv(BASE / "EDA/data/sandi_438_vad_hybrid.csv")
# Verify alignment: both tables must describe the same files in the same order,
# since all later comparisons are positional.
assert list(sil['file_id']) == list(hyb['file_id']), "File order mismatch"
N = len(sil)
print(f"SANDI files: {N}")
def expert_band(score):
    """Map a continuous expert score onto a discrete fluency band.

    Thresholds: score < 3.0 -> 'LOW'; 3.0 <= score < 4.5 -> 'MEDIUM';
    otherwise 'HIGH'. Comparison order is deliberate: a non-comparable
    value (e.g. NaN) falls through both checks and lands in 'HIGH'.
    """
    if score < 3.0:
        return 'LOW'
    if score < 4.5:
        return 'MEDIUM'
    return 'HIGH'
# ------------------------------------------------------------------
# CONSTRUCT HYBRID_V2 DATA
# ------------------------------------------------------------------
#
# V2 = hybrid boundaries + Silero segmentation
#
# Features affected by VAD:
#   - speech_ratio: from hybrid (slightly improved boundary precision)
#   - speech_duration_sec: from hybrid
#   - mlu: word_count / Silero_segments (same segments, same words)
#
# Features PRESERVED from Silero (segmentation-dependent):
#   - speech_segments, pause_count, mean_pause_dur, long_pause_ratio, short_pause_share
#   - mid_clause_pause_ratio, boundary_pause_ratio, dim_placement
#
# Features NOT affected (from transcription/FA — cached):
#   - dim_articulation, dim_dominance, dim_word_precision
#   - All FA features
v2 = sil.copy()  # start from the full Silero feature table; only boundary features are swapped in
v2_label = "hybrid_v2"  # NOTE(review): defined but never used below — possibly vestigial
# Use hybrid's speech boundaries (slightly better precision)
v2['speech_ratio'] = hyb['speech_ratio']
v2['speech_duration_sec'] = hyb['speech_duration_sec']
# KEEP Silero's segmentation — this is the core V2 fix
# speech_segments, pause_count, long_pause_ratio, short_pause_share stay as Silero
# MLU: same word_count / same Silero segments = same as Silero
# (word_count comes from cached transcription, segments from Silero)
v2['mlu'] = sil['mlu']  # identical since segments are preserved
# ALL pause features stay from Silero (segmentation preserved)
# mean_pause_dur, long_pause_ratio, short_pause_share = Silero values
# These are already correct since v2 = sil.copy()
# Now recompute the 6 dimensions and composite using the V2 features
# Load the population stats (means/stds used for z-scoring)
import json  # NOTE(review): mid-file import; conventionally belongs with the top-of-file imports
SAVED_DIR = Path(__file__).parent / "saved_models"
with open(SAVED_DIR / "population_stats.json") as f:
    stats = json.load(f)
# NOTE(review): benchmark_dist is loaded but never referenced later in this script — confirm before removing.
benchmark_dist = np.load(SAVED_DIR / "benchmark_distribution.npy")
def zscore(value, mean, std):
    """Standardize *value* against a population mean/std.

    A degenerate spread (std of 0 or NaN) would make the z-score
    undefined, so it maps to 0.0 instead of raising/propagating.
    """
    degenerate = (std == 0) or np.isnan(std)
    return 0.0 if degenerate else (value - mean) / std
def recompute_composite(row, stats):
    """Re-derive the six fluency dimensions, composite score, and band for one row.

    Only dim_continuity is actually recomputed (it depends on the V2 boundary
    features); the other five dimensions are carried over from the row, which
    holds Silero / cached-transcription values.

    Args:
        row: dict of per-file features (speech_ratio, mlu, word_count, ...).
        stats: population stats with 'means' and 'stds' sub-dicts for z-scoring.

    Returns:
        dict with the six rounded dimensions, 'composite_raw', and 'fluency_band'.
    """
    means = stats['means']
    stds = stats['stds']

    def z(key):
        # Missing feature -> 0; missing population stats -> mean 0 / std 1.
        return zscore(row.get(key, 0), means.get(key, 0), stds.get(key, 1))

    # Reconstruct total audio duration from speech duration and speech ratio;
    # both denominators are clamped to avoid division by zero.
    total_dur = row.get('speech_duration_sec', 1) / max(row.get('speech_ratio', 0.01), 0.01)
    word_count = row.get('word_count', 0)
    if word_count == 0:
        # Fall back to mlu * segments when the word count column is absent/zero.
        word_count = row.get('mlu', 0) * row.get('speech_segments', 1)
    speech_rate = word_count / max(total_dur, 0.01)
    rate_z = zscore(speech_rate, means.get('speech_rate', 1.0), stds.get('speech_rate', 0.5))

    # Continuity mixes boundary precision (speech_ratio), utterance length (mlu)
    # and half-weighted speech rate.
    dims = {
        'dim_continuity': round(z('speech_ratio') + z('mlu') + 0.5 * rate_z, 4),
        # Pause-dependent dimensions: Silero's values, since segmentation is preserved.
        'dim_pause_quality': round(row.get('dim_pause_quality', 0), 4),
        # FA/model-derived dimensions: unchanged (cached transcription/inference).
        'dim_articulation': round(row.get('dim_articulation', 0), 4),
        'dim_dominance': round(row.get('dim_dominance', 0), 4),
        'dim_placement': round(row.get('dim_placement', 0), 4),
        'dim_word_precision': round(row.get('dim_word_precision', 0), 4),
    }

    weights = {
        'dim_continuity': 3.0,
        'dim_pause_quality': 3.0,
        'dim_placement': 2.0,
        'dim_articulation': 2.0,
        'dim_dominance': 2.0,
        'dim_word_precision': 1.0,
    }
    total_w = sum(weights.values())
    composite_raw = sum(dims[name] * weights[name] / total_w for name in dims)

    # Rule-based band classification on the raw features.
    sr = row.get('speech_ratio', 0)
    mlu = row.get('mlu', 0)
    lpr = row.get('long_pause_ratio', 0)
    if mlu >= 7 and sr >= 0.75 and lpr <= 0.15:
        band = 'HIGH'
    elif (mlu < 3 and sr < 0.55) or mlu < 2 or sr < 0.35:
        band = 'LOW'
    else:
        band = 'MEDIUM'

    return {
        **dims,
        'composite_raw': round(composite_raw, 4),
        'fluency_band': band,
    }
# Recompute dimensions/composite/band for every file and splice the results
# into the v2 frame column-by-column.
recomputed = pd.DataFrame([recompute_composite(dict(row), stats) for _, row in v2.iterrows()])
for col in recomputed.columns:
    v2[col] = recomputed[col].values
# Attach expert bands to every frame so band agreement can be measured.
for frame in (v2, sil, hyb):
    frame['expert_band'] = frame['expert_score'].apply(expert_band)
BANDS = ['LOW', 'MEDIUM', 'HIGH']
dfs = {'silero': sil, 'hybrid_v1': hyb, 'hybrid_v2': v2}
def section(title):
    """Print a section header framed by 80-char '=' rules (leading blank line)."""
    rule = '=' * 80
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)
# ------------------------------------------------------------------
# 1. OVERALL METRICS
# ------------------------------------------------------------------
section("1. OVERALL METRICS")
# NOTE(review): 'Ο' and 'β' in the literals below look like mojibake
# (presumably 'ρ' and an arrow) — confirm the file's encoding before editing.
print(f"\n {'Mode':<12s} {'Ο':>8s} {'Accuracy':>10s} {'Macro F1':>10s}")
print(f" {'-'*45}")
rhos = {}  # mode -> Spearman rho; reused later by the FINAL VERDICT section
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    # Rank correlation between expert score and the pipeline's composite.
    rho, _ = spearmanr(df['expert_score'], df['composite_raw'])
    rhos[mode] = rho
    agree = (df['expert_band'] == df['fluency_band']).sum()
    acc = agree / len(df)
    # One-vs-rest F1 per band; macro F1 is their unweighted mean.
    # NOTE(review): this duplicates the P/R/F1 logic in section 3 — a shared
    # helper would keep the two in sync.
    f1s = []
    for band in BANDS:
        tp = ((df['expert_band'] == band) & (df['fluency_band'] == band)).sum()
        fp = ((df['expert_band'] != band) & (df['fluency_band'] == band)).sum()
        fn = ((df['expert_band'] == band) & (df['fluency_band'] != band)).sum()
        prec = tp/(tp+fp) if (tp+fp) > 0 else 0   # guard: band never predicted
        rec = tp/(tp+fn) if (tp+fn) > 0 else 0    # guard: band never labeled
        f1 = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0
        f1s.append(f1)
    macro_f1 = np.mean(f1s)
    marker = " β NEW" if mode == "hybrid_v2" else (" (old)" if mode == "hybrid_v1" else " (baseline)")
    print(f" {mode:<12s} {rho:>+8.4f} {acc:>10.1%} {macro_f1:>10.3f}{marker}")
# ------------------------------------------------------------------
# 2. CONFUSION MATRICES
# ------------------------------------------------------------------
# Fix: the original loop computed a per-mode `marker` string inside the
# expert-band loop but never used it — dead code, removed. (If a row marker
# was intended, it belongs appended to the row print below.)
section("2. CONFUSION MATRICES")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    agree = (df['expert_band'] == df['fluency_band']).sum()
    print(f"\n [{mode}] Accuracy: {agree}/{len(df)} ({agree/len(df):.1%})")
    # NOTE(review): 'β' below looks like a mojibake'd arrow (likely '→') — confirm encoding.
    print(f" {'':>16} Pipelineβ {'LOW':>5} {'MED':>5} {'HIGH':>5}")
    # Rows = expert band, columns = pipeline-predicted band.
    for eb in BANDS:
        row = []
        for pb in BANDS:
            n = ((df['expert_band'] == eb) & (df['fluency_band'] == pb)).sum()
            row.append(n)
        print(f" Expert {eb:>6}: {row[0]:>5} {row[1]:>5} {row[2]:>5}")
# ------------------------------------------------------------------
# 3. BAND-LEVEL P/R/F1
# ------------------------------------------------------------------
section("3. PER-BAND PRECISION / RECALL / F1")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    df = dfs[mode]
    print(f"\n [{mode}]")
    print(f" {'Band':<8s} {'Prec':>8s} {'Recall':>8s} {'F1':>8s} {'Support':>8s}")
    print(f" {'-'*45}")
    # One-vs-rest precision/recall/F1 per band.
    # NOTE(review): duplicates the F1 computation in section 1 — a shared
    # helper would keep the two consistent.
    for band in BANDS:
        tp = ((df['expert_band'] == band) & (df['fluency_band'] == band)).sum()
        fp = ((df['expert_band'] != band) & (df['fluency_band'] == band)).sum()
        fn = ((df['expert_band'] == band) & (df['fluency_band'] != band)).sum()
        prec = tp/(tp+fp) if (tp+fp) > 0 else 0   # guard: band never predicted
        rec = tp/(tp+fn) if (tp+fn) > 0 else 0    # guard: band never labeled
        f1 = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0
        sup = (df['expert_band'] == band).sum()   # expert-labeled support
        print(f" {band:<8s} {prec:>8.3f} {rec:>8.3f} {f1:>8.3f} {sup:>8d}")
# ------------------------------------------------------------------
# 4. KEY METRICS COMPARISON
# ------------------------------------------------------------------
# Mean segment count / MLU / speech ratio / pause stats per pipeline.
section("4. SEGMENT COUNT & MLU COMPARISON")
print(f"\n {'Mode':<12s} {'Segments':>10s} {'MLU':>10s} {'SR':>10s} {'PauseDur':>10s} {'LPR':>10s}")
print(f" {'-'*65}")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    frame = dfs[mode]
    seg_m, mlu_m, sr_m, pause_m, lpr_m = (
        frame[c].mean()
        for c in ('speech_segments', 'mlu', 'speech_ratio', 'mean_pause_dur', 'long_pause_ratio')
    )
    print(f" {mode:<12s} {seg_m:>10.2f} {mlu_m:>10.2f} {sr_m:>10.4f} {pause_m:>10.4f} {lpr_m:>10.4f}")
# ------------------------------------------------------------------
# 5. MEDIUM->HIGH FALSE POSITIVE REDUCTION
# ------------------------------------------------------------------
# Count the inflation-driven misclassifications each pipeline produces.
section("5. MEDIUMβHIGH FALSE POSITIVE ANALYSIS")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    frame = dfs[mode]
    expert, pred = frame['expert_band'], frame['fluency_band']
    med_as_high = ((expert == 'MEDIUM') & (pred == 'HIGH')).sum()
    high_as_med = ((expert == 'HIGH') & (pred == 'MEDIUM')).sum()
    low_missed = ((expert == 'LOW') & (pred != 'LOW')).sum()
    print(f" {mode:<12s} MEDβHIGH: {med_as_high:>4d} HIGHβMED: {high_as_med:>4d} LOW misclassified: {low_missed:>4d}")
# ------------------------------------------------------------------
# 6. THRESHOLD CROSSINGS
# ------------------------------------------------------------------
# For expert-MEDIUM files: how many satisfy all three HIGH-rule thresholds,
# versus how many were actually promoted to HIGH by the pipeline.
section("6. THRESHOLD CROSSINGS (HIGH rule: MLUβ₯7 + SRβ₯0.75 + LPRβ€0.15)")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    frame = dfs[mode]
    med_rows = frame[frame['expert_band'] == 'MEDIUM']
    meets_all = (
        (med_rows['mlu'] >= 7)
        & (med_rows['speech_ratio'] >= 0.75)
        & (med_rows['long_pause_ratio'] <= 0.15)
    ).sum()
    promoted = ((frame['expert_band'] == 'MEDIUM') & (frame['fluency_band'] == 'HIGH')).sum()
    print(f" {mode:<12s} MEDIUM meeting all 3: {meets_all:>4d}/{len(med_rows)} MEDIUMβHIGH: {promoted:>4d}")
# ------------------------------------------------------------------
# 7. BAND SEPARATION (Cohen's d)
# ------------------------------------------------------------------
# Effect size of the composite score between adjacent (and extreme) expert bands.
section("7. COMPOSITE SCORE SEPARATION (Cohen's d)")
print(f"\n {'Mode':<12s} {'LOWβMED':>10s} {'MEDβHIGH':>10s} {'LOWβHIGH':>10s}")
print(f" {'-'*50}")
for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
    frame = dfs[mode]
    cells = []
    for lo_band, hi_band in (('LOW', 'MEDIUM'), ('MEDIUM', 'HIGH'), ('LOW', 'HIGH')):
        lo_scores = frame[frame['expert_band'] == lo_band]['composite_raw']
        hi_scores = frame[frame['expert_band'] == hi_band]['composite_raw']
        # Pooled SD of the two groups (equal-weight variant of Cohen's d).
        pooled_sd = np.sqrt((lo_scores.std() ** 2 + hi_scores.std() ** 2) / 2)
        effect = (hi_scores.mean() - lo_scores.mean()) / pooled_sd if pooled_sd > 0 else 0
        cells.append(f"{effect:>+10.3f}")
    print(f" {mode:<12s} {' '.join(cells)}")
# ------------------------------------------------------------------
# 8. PER-DIMENSION CORRELATION
# ------------------------------------------------------------------
# Spearman correlation of each dimension against the expert score, per pipeline,
# with conventional significance stars.
section("8. PER-DIMENSION CORRELATIONS")
dims = ['dim_continuity', 'dim_pause_quality', 'dim_articulation',
        'dim_dominance', 'dim_placement', 'dim_word_precision']
print(f"\n {'Dimension':<25s} {'silero':>10s} {'hybrid_v2':>10s} {'hybrid_v1':>10s}")
print(f" {'-'*60}")
for d in dims:
    cells = []
    for mode in ['silero', 'hybrid_v2', 'hybrid_v1']:
        pair = dfs[mode][['expert_score', d]].dropna()
        r, p = spearmanr(pair['expert_score'], pair[d])
        if p < 0.001:
            sig = '***'
        elif p < 0.01:
            sig = '**'
        elif p < 0.05:
            sig = '*'
        else:
            sig = ' ns'
        cells.append(f"{r:>+7.4f}{sig}")
    print(f" {d:<25s} {' '.join(cells)}")
# ------------------------------------------------------------------
# 9. BOOTSTRAP SIGNIFICANCE
# ------------------------------------------------------------------
# Fix: the section header hard-coded "10,000 iterations" while the code runs
# n_boot = 2000 — the printed claim now interpolates the actual count.
n_boot = 2000  # reduced for speed; 2K is sufficient for CI estimation
section(f"9. BOOTSTRAP: hybrid_v2 vs silero ({n_boot:,} iterations)")
expert = sil['expert_score'].values
comp_s = sil['composite_raw'].values
comp_v = v2['composite_raw'].values
rho_s, _ = spearmanr(expert, comp_s)
rho_v, _ = spearmanr(expert, comp_v)
# Paired bootstrap over files: resample indices, recompute both rhos on the
# same resample, and keep the difference (v2 minus silero).
rng = np.random.default_rng(42)  # fixed seed for reproducibility
deltas = np.zeros(n_boot)
for i in range(n_boot):
    idx = rng.choice(N, N, replace=True)
    rs, _ = spearmanr(expert[idx], comp_s[idx])
    rv, _ = spearmanr(expert[idx], comp_v[idx])
    deltas[i] = rv - rs
# One-sided bootstrap p-value: fraction of resamples where v2 did not improve.
p_val = (deltas <= 0).mean()
ci = np.percentile(deltas, [2.5, 97.5])
# NOTE(review): 'Ο'/'ΞΟ'/'β' below look mojibake'd (ρ/Δρ/status glyphs) — confirm encoding.
print(f"\n Silero: Ο = {rho_s:.4f}")
print(f" Hybrid V2: Ο = {rho_v:.4f}")
print(f" ΞΟ = {rho_v - rho_s:+.4f} 95% CI [{ci[0]:+.4f}, {ci[1]:+.4f}] p={p_val:.4f}")
if p_val < 0.05:
    print(f" β SIGNIFICANT improvement")
elif rho_v >= rho_s:
    print(f" β Improved but not significant")
else:
    print(f" β No improvement detected")
# ------------------------------------------------------------------
# 10. FINAL VERDICT
# ------------------------------------------------------------------
section("FINAL VERDICT")
df_sil = dfs['silero']
df_v2 = dfs['hybrid_v2']
df_v1 = dfs['hybrid_v1']
# Band-agreement accuracy per pipeline.
acc_s = (df_sil['expert_band'] == df_sil['fluency_band']).sum() / len(df_sil)
acc_v = (df_v2['expert_band'] == df_v2['fluency_band']).sum() / len(df_v2)
acc_v1 = (df_v1['expert_band'] == df_v1['fluency_band']).sum() / len(df_v1)
# MEDIUM->HIGH false-positive counts: the failure mode V2 is meant to fix.
mh_s = ((df_sil['expert_band'] == 'MEDIUM') & (df_sil['fluency_band'] == 'HIGH')).sum()
mh_v = ((df_v2['expert_band'] == 'MEDIUM') & (df_v2['fluency_band'] == 'HIGH')).sum()
mh_v1 = ((df_v1['expert_band'] == 'MEDIUM') & (df_v1['fluency_band'] == 'HIGH')).sum()
# NOTE(review): the 'β'/'Ο'/'Ξ' characters in the table below appear to be
# mojibake'd box-drawing/Greek glyphs — confirm the file's encoding before
# editing these literals.
print(f"""
ββββββββββββββββββββββββββββ¬ββββββββββββ¬ββββββββββββ¬ββββββββββββ
β Metric β Silero β Hybrid V2 β Hybrid V1 β
ββββββββββββββββββββββββββββΌββββββββββββΌββββββββββββΌββββββββββββ€
β Spearman Ο β {rhos['silero']:+.4f} β {rhos['hybrid_v2']:+.4f} β {rhos['hybrid_v1']:+.4f} β
β Accuracy β {acc_s:.1%} β {acc_v:.1%} β {acc_v1:.1%} β
β MEDIUMβHIGH FP β {mh_s:>5d} β {mh_v:>5d} β {mh_v1:>5d} β
β Mean segments β {df_sil['speech_segments'].mean():>5.2f} β {df_v2['speech_segments'].mean():>5.2f} β {df_v1['speech_segments'].mean():>5.2f} β
β Mean MLU β {df_sil['mlu'].mean():>5.2f} β {df_v2['mlu'].mean():>5.2f} β {df_v1['mlu'].mean():>5.2f} β
ββββββββββββββββββββββββββββ΄ββββββββββββ΄ββββββββββββ΄ββββββββββββ
""")
print(" CHANGES FROM V1 TO V2:")
if mh_v < mh_v1:
    # mh_v1 > mh_v >= 0 here, so the percentage division cannot be by zero.
    print(f" β MEDIUMβHIGH false positives REDUCED: {mh_v1} β {mh_v} ({mh_v1-mh_v} fewer, {(mh_v1-mh_v)/mh_v1*100:.0f}% reduction)")
if rhos['hybrid_v2'] >= rhos['hybrid_v1']:
    print(f" β Spearman Ο IMPROVED: {rhos['hybrid_v1']:+.4f} β {rhos['hybrid_v2']:+.4f}")
if acc_v > acc_v1:
    print(f" β Accuracy IMPROVED: {acc_v1:.1%} β {acc_v:.1%}")
print("\n COMPARISON TO SILERO BASELINE:")
if rhos['hybrid_v2'] > rhos['silero']:
    print(f" β Ο improvement over Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f} (Ξ={rhos['hybrid_v2']-rhos['silero']:+.4f})")
elif abs(rhos['hybrid_v2'] - rhos['silero']) < 0.005:
    # Within half a point of rho at 2 decimals: treat as equivalent.
    print(f" β Ο equivalent to Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f}")
else:
    print(f" β Ο slightly below Silero: {rhos['hybrid_v2']:+.4f} vs {rhos['silero']:+.4f}")
if mh_v <= mh_s:
    print(f" β MEDIUMβHIGH FP at or below Silero: {mh_v} vs {mh_s}")
else:
    print(f" β MEDIUMβHIGH FP: {mh_v} (Silero: {mh_s})")
if acc_v >= acc_s:
    print(f" β Accuracy at or above Silero: {acc_v:.1%} vs {acc_s:.1%}")
else:
    print(f" β Accuracy: {acc_v:.1%} (Silero: {acc_s:.1%})")
print(f"\n RECOMMENDATION:")
# Decision rule: V2 must roughly match Silero's rho/accuracy (within 0.01)
# AND not exceed its MEDIUM->HIGH false positives to count as a safe upgrade.
if rhos['hybrid_v2'] >= rhos['silero'] - 0.01 and mh_v <= mh_s and acc_v >= acc_s - 0.01:
    print(f" β hybrid_v2 is a safe upgrade: fixes V1 inflation without degrading baseline")
elif mh_v < mh_v1 and rhos['hybrid_v2'] > rhos['hybrid_v1']:
    print(f" β hybrid_v2 fixes V1's problems but doesn't beat Silero β safe as secondary option")
else:
    print(f" β Keep Silero as primary baseline")