| import os |
| import math |
| import numpy as np |
| import gradio as gr |
| import librosa |
| import matplotlib.pyplot as plt |
|
|
| from dataclasses import dataclass |
| from typing import Dict, Any, Tuple, List |
|
|
| |
| |
| |
# All audio is resampled to this rate (16 kHz) before feature extraction,
# so the 160/400-sample hop/frame sizes below correspond to 10 ms / 25 ms.
TARGET_SR = 16000
|
|
| |
| |
| |
def human_seconds(sec: float) -> str:
    """Render a duration in seconds as a short human-readable string.

    Returns "—" for None/NaN/inf, "12.3s" for anything under a minute,
    and "2m 5.0s" otherwise.
    """
    if sec is None or not math.isfinite(sec):
        return "—"
    if not sec < 60:
        whole_minutes, leftover = divmod(sec, 60)
        return f"{int(whole_minutes)}m {leftover:.1f}s"
    return f"{sec:.1f}s"
|
|
|
|
def safe_pct(x: float) -> str:
    """Format a 0..1 ratio as a percentage string; '—' when missing/non-finite."""
    try:
        if math.isfinite(x):
            return f"{x*100:.1f}%"
    except TypeError:
        # x is None (math.isfinite rejects it) — fall through to the dash.
        pass
    return "—"
|
|
|
|
| |
| |
| |
@dataclass
class Features:
    """Explainable acoustic features extracted from one audio clip.

    All values are plain measurements (no clinical interpretation);
    quantities that cannot be computed are NaN.
    """

    duration_s: float       # clip length in seconds (after resampling)
    rms_mean: float         # mean per-frame RMS energy of the peak-normalized signal
    rms_std: float          # std-dev of per-frame RMS energy
    zcr_mean: float         # mean per-frame zero-crossing rate
    pitch_median_hz: float  # median pyin f0 over voiced frames (NaN if none voiced)
    pitch_iqr_hz: float     # interquartile range of voiced f0 in Hz (NaN if none)
    voiced_ratio: float     # fraction of frames with a finite pyin f0 estimate
    n_pauses: int           # number of silent stretches lasting >= 0.2 s
    pause_total_s: float    # summed duration of those pauses, in seconds
    active_ratio: float     # fraction of frames above the silence threshold
|
|
|
|
def _silent_runs(silent, min_frames: int) -> List[Tuple[int, int]]:
    """Return (start, end) frame-index pairs of silent stretches >= min_frames.

    `silent` is a boolean sequence (one flag per analysis frame); `end` is
    exclusive. Runs shorter than `min_frames` are dropped.
    """
    runs: List[Tuple[int, int]] = []
    start = None
    for i, is_silent in enumerate(silent):
        if is_silent and start is None:
            start = i
        if (not is_silent) and start is not None:
            if (i - start) >= min_frames:
                runs.append((start, i))
            start = None
    # Close a run that extends to the end of the signal.
    if start is not None and (len(silent) - start) >= min_frames:
        runs.append((start, len(silent)))
    return runs


def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
    """
    Explainable acoustic features + artifacts for plotting.
    (No medical claims; only measurable signals.)

    Parameters
    ----------
    y : raw samples; mono 1-D, or (n_samples, n_channels) as delivered by
        Gradio for stereo recordings (downmixed to mono here)
    sr : sample rate of `y`; resampled to TARGET_SR when different

    Returns
    -------
    (Features, artifacts) — artifacts always contains every key the plot
    helpers read ("y", "sr", "hop", "frame", "rms", "zcr", "times",
    "pitch", "pauses", "rms_thr"), including for empty input, so that
    downstream `art["hop"]`-style access never raises KeyError.
    """
    hop = 160    # 10 ms at 16 kHz
    frame = 400  # 25 ms at 16 kHz

    if y is None or len(y) == 0:
        f = Features(
            duration_s=float("nan"),
            rms_mean=float("nan"),
            rms_std=float("nan"),
            zcr_mean=float("nan"),
            pitch_median_hz=float("nan"),
            pitch_iqr_hz=float("nan"),
            voiced_ratio=float("nan"),
            n_pauses=0,
            pause_total_s=0.0,
            active_ratio=float("nan"),
        )
        empty = np.array([])
        # Fix: previously only {"y", "sr"} were returned here, which made
        # plot_waveform_with_pauses() KeyError on art["hop"].
        return f, {
            "y": empty,
            "sr": sr,
            "hop": hop,
            "frame": frame,
            "rms": empty,
            "zcr": empty,
            "times": empty,
            "pitch": empty,
            "pauses": [],
            "rms_thr": None,
        }

    y = np.asarray(y)
    # Gradio delivers stereo as (n_samples, n_channels); librosa expects
    # mono here, so downmix by averaging channels.
    if y.ndim > 1:
        y = y.astype(np.float32).mean(axis=1)

    if sr != TARGET_SR:
        y = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR
    else:
        y = y.astype(np.float32)

    # Peak-normalize; epsilon avoids division by zero on an all-zero clip.
    mx = float(np.max(np.abs(y))) + 1e-9
    y = y / mx

    duration = float(len(y) / sr)

    rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
    zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame, hop_length=hop)[0]

    rms_mean = float(np.mean(rms)) if rms.size else float("nan")
    rms_std = float(np.std(rms)) if rms.size else float("nan")
    zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan")

    # Pitch via pyin; it can fail on very short or degenerate clips, so
    # degrade gracefully to "no pitch" instead of crashing the request.
    try:
        f0, _, _ = librosa.pyin(
            y,
            fmin=librosa.note_to_hz("C2"),
            fmax=librosa.note_to_hz("C7"),
            sr=sr,
            frame_length=frame,
            hop_length=hop,
        )
    except Exception:
        f0 = None

    if f0 is None:
        pitch = np.array([])
        times = np.array([])
        pitch_median = float("nan")
        pitch_iqr = float("nan")
        voiced_ratio = float("nan")
    else:
        pitch = np.asarray(f0, dtype=np.float32)
        times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop)
        voiced = np.isfinite(pitch)  # pyin reports unvoiced frames as NaN
        voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan")
        if np.any(voiced):
            pv = pitch[voiced]
            pitch_median = float(np.median(pv))
            q75, q25 = np.percentile(pv, [75, 25])
            pitch_iqr = float(q75 - q25)
        else:
            pitch_median = float("nan")
            pitch_iqr = float("nan")

    # Pause detection: adaptive energy threshold (80% of the 20th RMS
    # percentile); only silent stretches of >= 0.2 s count as pauses.
    if rms.size:
        thr = float(np.percentile(rms, 20)) * 0.8
        silent = rms < thr
        min_pause_frames = int(0.2 / (hop / sr))
        pauses = _silent_runs(silent, min_pause_frames)
        n_pauses = int(len(pauses))
        pause_total_s = float(sum((e - s) * (hop / sr) for s, e in pauses))
        active_ratio = float(1.0 - np.mean(silent))
    else:
        thr = None
        pauses = []
        n_pauses = 0
        pause_total_s = 0.0
        active_ratio = float("nan")

    feats = Features(
        duration_s=duration,
        rms_mean=rms_mean,
        rms_std=rms_std,
        zcr_mean=zcr_mean,
        pitch_median_hz=pitch_median,
        pitch_iqr_hz=pitch_iqr,
        voiced_ratio=voiced_ratio,
        n_pauses=n_pauses,
        pause_total_s=pause_total_s,
        active_ratio=active_ratio,
    )

    artifacts = {
        "y": y,
        "sr": sr,
        "hop": hop,
        "frame": frame,
        "rms": rms,
        "zcr": zcr,
        "times": times,
        "pitch": pitch,
        "pauses": pauses,
        "rms_thr": thr,
    }
    return feats, artifacts
|
|
|
|
| |
| |
| |
def plot_waveform_with_pauses(art: Dict[str, Any]) -> plt.Figure:
    """Plot the waveform with detected pauses shaded.

    Fix: reads artifacts via .get() with defaults — the empty-audio
    artifacts dict may lack "hop"/"pauses", and the old art["hop"]
    indexing raised KeyError instead of rendering the placeholder.
    """
    y = art.get("y", np.array([]))
    sr = art.get("sr", TARGET_SR)
    hop = art.get("hop", 160)
    pauses = art.get("pauses", [])

    fig = plt.figure(figsize=(10, 3.2))
    ax = fig.add_subplot(111)

    if y.size:
        t = np.arange(len(y)) / sr
        ax.plot(t, y, linewidth=0.8)
        # Shade each pause; frame indices -> seconds via hop/sr.
        for (s, e) in pauses:
            ts = s * (hop / sr)
            te = e * (hop / sr)
            ax.axvspan(ts, te, alpha=0.2)
        ax.set_title("Waveform (met gedetecteerde pauzes)")
        ax.set_xlabel("Tijd (s)")
        ax.set_ylabel("Amplitude")
    else:
        ax.text(0.5, 0.5, "Geen audio", ha="center", va="center")
        ax.set_axis_off()

    fig.tight_layout()
    return fig
|
|
|
|
def plot_pitch(art: Dict[str, Any]) -> plt.Figure:
    """Plot the estimated pitch contour, or a placeholder when unavailable."""
    f0_track = art.get("pitch", np.array([]))
    f0_times = art.get("times", np.array([]))

    fig, ax = plt.subplots(figsize=(10, 3.2))

    have_pitch = bool(f0_track.size) and bool(f0_times.size)
    if not have_pitch:
        # No usable f0 estimate (clip too short or pyin failed).
        ax.text(0.5, 0.5, "Pitch niet beschikbaar (te kort/ruis)", ha="center", va="center")
        ax.set_axis_off()
    else:
        ax.plot(f0_times, f0_track, linewidth=1.0)
        ax.set_title("Pitch contour (NaN = onvoiced)")
        ax.set_xlabel("Tijd (s)")
        ax.set_ylabel("Pitch (Hz)")

    fig.tight_layout()
    return fig
|
|
|
|
| |
| |
| |
def features_table(feats: Features) -> List[List[str]]:
    """Build [label, formatted value] rows for the features dataframe."""

    def fmt3(value) -> str:
        # Three-decimal float, dash for missing/non-finite.
        if value is None or not math.isfinite(value):
            return "—"
        return f"{float(value):.3f}"

    def fmt_hz(value) -> str:
        # One-decimal Hz, dash for non-finite.
        return "—" if not math.isfinite(value) else f"{value:.1f} Hz"

    return [
        ["Duur", human_seconds(feats.duration_s)],
        ["Volume (RMS) gemiddeld", fmt3(feats.rms_mean)],
        ["Volume (RMS) variatie", fmt3(feats.rms_std)],
        ["ZCR (ruis/‘scherpte’) gemiddeld", fmt3(feats.zcr_mean)],
        ["Pitch mediaan", fmt_hz(feats.pitch_median_hz)],
        ["Pitch spreiding (IQR)", fmt_hz(feats.pitch_iqr_hz)],
        ["Voiced ratio", safe_pct(feats.voiced_ratio)],
        ["Aantal pauzes (≥ 0.2s)", str(int(feats.n_pauses))],
        ["Totale pauzeduur", human_seconds(feats.pause_total_s)],
        ["Actieve-spraak ratio", safe_pct(feats.active_ratio)],
    ]
|
|
|
|
def explain_text(feats: Features) -> str:
    """Compose the markdown explanation shown beneath the plots."""
    lines: List[str] = [
        f"- **Pauzes**: {feats.n_pauses} pauzes (≥0.2s), totaal {human_seconds(feats.pause_total_s)}."
    ]
    if math.isfinite(feats.pitch_median_hz):
        lines.append(
            f"- **Pitch**: mediaan ~ {feats.pitch_median_hz:.1f} Hz, spreiding {feats.pitch_iqr_hz:.1f} Hz (IQR)."
        )
    if math.isfinite(feats.rms_mean):
        lines.append(
            f"- **Volume**: RMS gemiddeld {feats.rms_mean:.3f} (relatief; vooral binnen dezelfde setup vergelijken)."
        )
    lines.append(
        f"- **Actieve spraak**: {safe_pct(feats.active_ratio)} van de tijd boven drempel."
    )

    header = (
        "### Wat ‘ziet’ de AI hier?\n"
        "Dit is een **uitleg-demo**: we tonen *meetbare spraaksignalen* (niet ‘waarom’ ze veranderen).\n\n"
    )
    footer = (
        "\n\n"
        "**Belangrijk:** dit is **geen diagnose** en **geen medisch hulpmiddel**. "
        "Gebruik dit als **educatieve visualisatie** of gespreksstarter."
    )
    return header + "\n".join(lines) + footer
|
|
|
|
| |
| |
| |
def analyze_one(audio: Tuple[int, np.ndarray]):
    """Run the full pipeline for one Gradio (sample_rate, samples) input.

    Returns (dataframe update, waveform figure, pitch figure, markdown);
    with no audio, returns placeholder values for all four outputs.
    """
    if audio is None:
        placeholder_rows = [["—", "Upload of neem audio op om te starten."]]
        return (
            gr.Dataframe(value=placeholder_rows, headers=["Kenmerk", "Waarde"]),
            None,
            None,
            "### Upload of neem audio op",
        )

    sample_rate, samples = audio
    feats, artifacts = compute_features(samples, sample_rate)
    rows = features_table(feats)
    waveform_fig = plot_waveform_with_pauses(artifacts)
    pitch_fig = plot_pitch(artifacts)
    markdown = explain_text(feats)

    return (
        gr.Dataframe(value=rows, headers=["Kenmerk", "Waarde"]),
        waveform_fig,
        pitch_fig,
        markdown,
    )
|
|
|
|
| |
| |
| |
| CSS = """ |
| :root{ |
| --bg: #0b0f19; |
| --panel: rgba(255,255,255,0.06); |
| --text: rgba(255,255,255,0.92); |
| --muted: rgba(255,255,255,0.72); |
| --border: rgba(255,255,255,0.14); |
| --shadow: 0 12px 30px rgba(0,0,0,0.35); |
| } |
| |
| .gradio-container{ |
| background: |
| radial-gradient(1200px 700px at 10% 10%, rgba(124,58,237,0.25), transparent 55%), |
| radial-gradient(900px 600px at 90% 20%, rgba(34,197,94,0.18), transparent 55%), |
| radial-gradient(1100px 800px at 40% 100%, rgba(59,130,246,0.15), transparent 60%), |
| var(--bg) !important; |
| color: var(--text) !important; |
| } |
| |
| #header{ |
| background: linear-gradient(135deg, rgba(124,58,237,0.22), rgba(34,197,94,0.14)); |
| border: 1px solid var(--border); |
| border-radius: 18px; |
| padding: 18px 18px 14px 18px; |
| box-shadow: var(--shadow); |
| } |
| |
| #title{ |
| font-size: 28px; |
| font-weight: 780; |
| letter-spacing: -0.02em; |
| margin: 0; |
| } |
| |
| #subtitle{ |
| margin-top: 8px; |
| color: var(--muted); |
| font-size: 14px; |
| line-height: 1.45; |
| } |
| |
| .badge{ |
| display: inline-flex; |
| align-items: center; |
| gap: 8px; |
| padding: 6px 10px; |
| border-radius: 999px; |
| border: 1px solid var(--border); |
| background: rgba(255,255,255,0.05); |
| color: var(--muted); |
| font-size: 12px; |
| margin-right: 10px; |
| margin-bottom: 8px; |
| } |
| .badge b{ color: var(--text); font-weight: 720; } |
| |
| .card{ |
| background: var(--panel); |
| border: 1px solid var(--border); |
| border-radius: 18px; |
| padding: 14px; |
| box-shadow: var(--shadow); |
| } |
| """ |
|
|
def build_ui():
    """Assemble the Gradio Blocks app.

    Layout: header banner, input column (audio + button + "how it works"
    accordion), results column (feature table, two plots, markdown
    explanation), and an ethics accordion. Returns the unlaunched Blocks
    object; the caller queues and launches it.
    """
    with gr.Blocks(
        css=CSS,
        theme=gr.themes.Soft(primary_hue="violet", secondary_hue="emerald"),
        title="Explainable Speech Analytics (Demo)",
    ) as demo:

        # Static header banner; ids/classes here are styled by the CSS constant.
        gr.HTML(
            """
            <div id="header">
              <p id="title">Explainable Speech Analytics</p>
              <div id="subtitle">
                <span class="badge"><b>Doel</b> inzicht in spraaksignalen</span>
                <span class="badge"><b>Geen diagnose</b> geen medisch hulpmiddel</span>
                <span class="badge"><b>Anti–black box</b> we tonen signalen, niet alleen scores</span>
                <p style="margin-top:10px">
                  Upload of neem een kort fragment op. Je ziet daarna <b>pauzes</b>, <b>pitch</b> en <b>volume-energie</b>
                  in grafieken en tabellen — bedoeld als uitleg en dialoog, niet als oordeel.
                </p>
              </div>
            </div>
            """
        )

        with gr.Row():
            # Left column: input controls.
            with gr.Column(scale=5):
                audio = gr.Audio(label="Audio", sources=["upload", "microphone"], type="numpy")
                run = gr.Button("Analyseer", variant="primary")
                with gr.Accordion("Wat gebeurt er technisch?", open=False):
                    gr.Markdown(
                        """
                        - We extraheren **akoestische kenmerken** (RMS, ZCR), schatten **pitch** met *pyin*,
                          en detecteren **pauzes** via een adaptieve energiedrempel.
                        - We tonen de gemeten signalen als grafieken zodat het **uitlegbaar** blijft.
                        """
                    )

            # Right column: analysis results.
            with gr.Column(scale=7):
                feats_df = gr.Dataframe(
                    headers=["Kenmerk", "Waarde"],
                    datatype=["str", "str"],
                    interactive=False,
                    wrap=True,
                    label="Meetbare kenmerken",
                )
                wf_plot = gr.Plot(label="Waveform + pauzes")
                pitch_plot = gr.Plot(label="Pitch")
                explanation = gr.Markdown("### Upload of neem audio op", elem_classes=["card"])

        # Single callback: analyze_one fills the table, both plots and the text.
        run.click(analyze_one, inputs=[audio], outputs=[feats_df, wf_plot, pitch_plot, explanation])

        with gr.Accordion("Ethiek & transparantie", open=False):
            gr.Markdown(
                """
                - Deze demo geeft **geen diagnose** en maakt **geen klinische claim**.
                - Output is bedoeld als **observatie** (meetbare signalen) om gesprekken te ondersteunen.
                - In zorgcontext: interpretatie hoort altijd samen met **context + gesprek + klinisch oordeel**.
                """
            )

    return demo
|
|
|
|
if __name__ == "__main__":
    app = build_ui()
    app.queue(max_size=32)

    # Port precedence: PORT, then GRADIO_SERVER_PORT, then the Gradio default.
    env = os.environ
    port = int(env.get("PORT", env.get("GRADIO_SERVER_PORT", "7860")))
    app.launch(server_name="0.0.0.0", server_port=port)
|
|