import os import math import numpy as np import gradio as gr import librosa import matplotlib.pyplot as plt from dataclasses import dataclass from typing import Dict, Any, Tuple, List # ----------------------------- # Config # ----------------------------- TARGET_SR = 16000 # ----------------------------- # Helpers # ----------------------------- def human_seconds(sec: float) -> str: if sec is None or not math.isfinite(sec): return "—" if sec < 60: return f"{sec:.1f}s" m = int(sec // 60) return f"{m}m {sec - 60*m:.1f}s" def safe_pct(x: float) -> str: if x is None or not math.isfinite(x): return "—" return f"{x*100:.1f}%" # ----------------------------- # Features # ----------------------------- @dataclass class Features: duration_s: float rms_mean: float rms_std: float zcr_mean: float pitch_median_hz: float pitch_iqr_hz: float voiced_ratio: float n_pauses: int pause_total_s: float active_ratio: float def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]: """ Explainable acoustic features + artifacts for plotting. (No medical claims; only measurable signals.) """ if y is None or len(y) == 0: f = Features( duration_s=float("nan"), rms_mean=float("nan"), rms_std=float("nan"), zcr_mean=float("nan"), pitch_median_hz=float("nan"), pitch_iqr_hz=float("nan"), voiced_ratio=float("nan"), n_pauses=0, pause_total_s=0.0, active_ratio=float("nan"), ) return f, {"y": np.array([]), "sr": sr} # Resample to stable SR if sr != TARGET_SR: y = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=TARGET_SR) sr = TARGET_SR else: y = y.astype(np.float32) # Normalize [-1, 1] for stable plots mx = float(np.max(np.abs(y))) + 1e-9 y = y / mx duration = float(len(y) / sr) hop = 160 # 10ms @ 16k frame = 400 # 25ms @ 16k rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0] zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame, hop_length=hop)[0] rms_mean = float(np.mean(rms)) if rms.size else float("nan") rms_std = float(np.std(rms)) if rms.size else float("nan") zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan") # Pitch via pyin (can fail on noise/short clips) try: f0, _, _ = librosa.pyin( y, fmin=librosa.note_to_hz("C2"), fmax=librosa.note_to_hz("C7"), sr=sr, frame_length=frame, hop_length=hop, ) except Exception: f0 = None if f0 is None: pitch = np.array([]) times = np.array([]) pitch_median = float("nan") pitch_iqr = float("nan") voiced_ratio = float("nan") else: pitch = np.asarray(f0, dtype=np.float32) times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop) voiced = np.isfinite(pitch) voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan") if np.any(voiced): pv = pitch[voiced] pitch_median = float(np.median(pv)) q75, q25 = np.percentile(pv, [75, 25]) pitch_iqr = float(q75 - q25) else: pitch_median = float("nan") pitch_iqr = float("nan") # Pause detection: low-RMS frames as silence if rms.size: thr = float(np.percentile(rms, 20)) * 0.8 silent = rms < thr # pauses >= 0.2s min_pause_frames = int(0.2 / (hop / sr)) pauses = [] start = None for i, s in enumerate(silent): if s and start is None: start = i if (not s) and start is not None: end = i if (end - start) >= min_pause_frames: pauses.append((start, end)) start = None if start is not None: end = len(silent) if (end - start) >= min_pause_frames: pauses.append((start, end)) n_pauses = int(len(pauses)) pause_total_s = float(sum((e - s) * (hop / sr) for s, e in pauses)) active_ratio = float(1.0 - np.mean(silent)) else: thr = None pauses = [] n_pauses = 0 pause_total_s = 0.0 active_ratio = float("nan") feats = Features( duration_s=duration, rms_mean=rms_mean, rms_std=rms_std, zcr_mean=zcr_mean, pitch_median_hz=pitch_median, pitch_iqr_hz=pitch_iqr, voiced_ratio=voiced_ratio, n_pauses=n_pauses, pause_total_s=pause_total_s, active_ratio=active_ratio, ) artifacts = { "y": y, "sr": sr, "hop": hop, "frame": frame, "rms": rms, "zcr": zcr, "times": times, "pitch": pitch, "pauses": pauses, "rms_thr": thr, } return feats, artifacts # ----------------------------- # Plotting # ----------------------------- def plot_waveform_with_pauses(art: Dict[str, Any]) -> plt.Figure: y = art["y"] sr = art["sr"] hop = art["hop"] pauses = art.get("pauses", []) fig = plt.figure(figsize=(10, 3.2)) ax = fig.add_subplot(111) if y.size: t = np.arange(len(y)) / sr ax.plot(t, y, linewidth=0.8) for (s, e) in pauses: ts = s * (hop / sr) te = e * (hop / sr) ax.axvspan(ts, te, alpha=0.2) ax.set_title("Waveform (met gedetecteerde pauzes)") ax.set_xlabel("Tijd (s)") ax.set_ylabel("Amplitude") else: ax.text(0.5, 0.5, "Geen audio", ha="center", va="center") ax.set_axis_off() fig.tight_layout() return fig def plot_pitch(art: Dict[str, Any]) -> plt.Figure: pitch = art.get("pitch", np.array([])) times = art.get("times", np.array([])) fig = plt.figure(figsize=(10, 3.2)) ax = fig.add_subplot(111) if pitch.size and times.size: ax.plot(times, pitch, linewidth=1.0) ax.set_title("Pitch contour (NaN = onvoiced)") ax.set_xlabel("Tijd (s)") ax.set_ylabel("Pitch (Hz)") else: ax.text(0.5, 0.5, "Pitch niet beschikbaar (te kort/ruis)", ha="center", va="center") ax.set_axis_off() fig.tight_layout() return fig # ----------------------------- # UI formatting # ----------------------------- def features_table(feats: Features) -> List[List[str]]: def f3(x): return "—" if (x is None or not math.isfinite(x)) else f"{float(x):.3f}" return [ ["Duur", human_seconds(feats.duration_s)], ["Volume (RMS) gemiddeld", f3(feats.rms_mean)], ["Volume (RMS) variatie", f3(feats.rms_std)], ["ZCR (ruis/‘scherpte’) gemiddeld", f3(feats.zcr_mean)], ["Pitch mediaan", "—" if not math.isfinite(feats.pitch_median_hz) else f"{feats.pitch_median_hz:.1f} Hz"], ["Pitch spreiding (IQR)", "—" if not math.isfinite(feats.pitch_iqr_hz) else f"{feats.pitch_iqr_hz:.1f} Hz"], ["Voiced ratio", safe_pct(feats.voiced_ratio)], ["Aantal pauzes (≥ 0.2s)", str(int(feats.n_pauses))], ["Totale pauzeduur", human_seconds(feats.pause_total_s)], ["Actieve-spraak ratio", safe_pct(feats.active_ratio)], ] def explain_text(feats: Features) -> str: bullets = [] bullets.append(f"- **Pauzes**: {feats.n_pauses} pauzes (≥0.2s), totaal {human_seconds(feats.pause_total_s)}.") if math.isfinite(feats.pitch_median_hz): bullets.append(f"- **Pitch**: mediaan ~ {feats.pitch_median_hz:.1f} Hz, spreiding {feats.pitch_iqr_hz:.1f} Hz (IQR).") if math.isfinite(feats.rms_mean): bullets.append(f"- **Volume**: RMS gemiddeld {feats.rms_mean:.3f} (relatief; vooral binnen dezelfde setup vergelijken).") bullets.append(f"- **Actieve spraak**: {safe_pct(feats.active_ratio)} van de tijd boven drempel.") return ( "### Wat ‘ziet’ de AI hier?\n" "Dit is een **uitleg-demo**: we tonen *meetbare spraaksignalen* (niet ‘waarom’ ze veranderen).\n\n" + "\n".join(bullets) + "\n\n" "**Belangrijk:** dit is **geen diagnose** en **geen medisch hulpmiddel**. " "Gebruik dit als **educatieve visualisatie** of gespreksstarter." ) # ----------------------------- # Callback # ----------------------------- def analyze_one(audio: Tuple[int, np.ndarray]): if audio is None: return ( gr.Dataframe(value=[["—", "Upload of neem audio op om te starten."]], headers=["Kenmerk", "Waarde"]), None, None, "### Upload of neem audio op", ) sr, y = audio feats, art = compute_features(y, sr) table = features_table(feats) wf = plot_waveform_with_pauses(art) pc = plot_pitch(art) expl = explain_text(feats) return gr.Dataframe(value=table, headers=["Kenmerk", "Waarde"]), wf, pc, expl # ----------------------------- # Polished UI # ----------------------------- CSS = """ :root{ --bg: #0b0f19; --panel: rgba(255,255,255,0.06); --text: rgba(255,255,255,0.92); --muted: rgba(255,255,255,0.72); --border: rgba(255,255,255,0.14); --shadow: 0 12px 30px rgba(0,0,0,0.35); } .gradio-container{ background: radial-gradient(1200px 700px at 10% 10%, rgba(124,58,237,0.25), transparent 55%), radial-gradient(900px 600px at 90% 20%, rgba(34,197,94,0.18), transparent 55%), radial-gradient(1100px 800px at 40% 100%, rgba(59,130,246,0.15), transparent 60%), var(--bg) !important; color: var(--text) !important; } #header{ background: linear-gradient(135deg, rgba(124,58,237,0.22), rgba(34,197,94,0.14)); border: 1px solid var(--border); border-radius: 18px; padding: 18px 18px 14px 18px; box-shadow: var(--shadow); } #title{ font-size: 28px; font-weight: 780; letter-spacing: -0.02em; margin: 0; } #subtitle{ margin-top: 8px; color: var(--muted); font-size: 14px; line-height: 1.45; } .badge{ display: inline-flex; align-items: center; gap: 8px; padding: 6px 10px; border-radius: 999px; border: 1px solid var(--border); background: rgba(255,255,255,0.05); color: var(--muted); font-size: 12px; margin-right: 10px; margin-bottom: 8px; } .badge b{ color: var(--text); font-weight: 720; } .card{ background: var(--panel); border: 1px solid var(--border); border-radius: 18px; padding: 14px; box-shadow: var(--shadow); } """ def build_ui(): with gr.Blocks( css=CSS, theme=gr.themes.Soft(primary_hue="violet", secondary_hue="emerald"), title="Explainable Speech Analytics (Demo)", ) as demo: gr.HTML( """
Explainable Speech Analytics
Upload of neem een kort fragment op. Je ziet daarna pauzes, pitch en volume-energie in grafieken en tabellen — bedoeld als uitleg en dialoog, niet als oordeel.