# app.py — Explainable Speech Analytics demo (Hugging Face Space, commit 5216a05).
import os
import math
import numpy as np
import gradio as gr
import librosa
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Dict, Any, Tuple, List
# -----------------------------
# Config
# -----------------------------
TARGET_SR = 16000  # all analysis runs at 16 kHz; other rates are resampled to this
# -----------------------------
# Helpers
# -----------------------------
def human_seconds(sec: float) -> str:
    """Render a duration in seconds as a short human-readable string.

    Returns an em-dash for None/NaN/inf, "X.Xs" below one minute,
    and "Nm X.Xs" otherwise.
    """
    if sec is None or not math.isfinite(sec):
        return "—"
    if sec >= 60:
        minutes = int(sec // 60)
        return f"{minutes}m {sec - 60*minutes:.1f}s"
    return f"{sec:.1f}s"
def safe_pct(x: float) -> str:
    """Format a 0..1 ratio as a one-decimal percent string; em-dash when missing."""
    if x is not None and math.isfinite(x):
        return f"{x*100:.1f}%"
    return "—"
# -----------------------------
# Features
# -----------------------------
@dataclass
class Features:
    """Explainable acoustic summary of one audio clip (times in seconds, pitch in Hz)."""

    duration_s: float       # clip length in seconds
    rms_mean: float         # mean frame energy (relative; clip is peak-normalized)
    rms_std: float          # energy variation across frames
    zcr_mean: float         # mean zero-crossing rate ("sharpness"/noise proxy)
    pitch_median_hz: float  # median voiced f0 (NaN when pitch is unavailable)
    pitch_iqr_hz: float     # f0 interquartile range (NaN when pitch is unavailable)
    voiced_ratio: float     # fraction of frames judged voiced
    n_pauses: int           # count of silences >= 0.2 s
    pause_total_s: float    # total silence time in seconds
    active_ratio: float     # fraction of frames above the silence threshold
def _detect_pauses(silent: np.ndarray, min_frames: int) -> List[Tuple[int, int]]:
    """Return (start, end) frame-index pairs for silent runs of >= min_frames frames."""
    pauses: List[Tuple[int, int]] = []
    start = None
    for i, is_silent in enumerate(silent):
        if is_silent and start is None:
            start = i
        if (not is_silent) and start is not None:
            if (i - start) >= min_frames:
                pauses.append((start, i))
            start = None
    # A silence running to the end of the clip still counts as a pause.
    if start is not None and (len(silent) - start) >= min_frames:
        pauses.append((start, len(silent)))
    return pauses


def compute_features(y: np.ndarray, sr: int) -> Tuple[Features, Dict[str, Any]]:
    """
    Explainable acoustic features + artifacts for plotting.
    (No medical claims; only measurable signals.)

    Parameters
    ----------
    y : raw samples as delivered by Gradio; mono ``(n,)`` or multi-channel ``(n, ch)``.
    sr : sample rate of ``y`` in Hz.

    Returns
    -------
    (Features, artifacts) where artifacts always contains every key the plot
    helpers read ("y", "sr", "hop", "frame", "rms", "zcr", "times", "pitch",
    "pauses", "rms_thr"), including for empty input.
    """
    hop = 160    # 10ms @ 16k
    frame = 400  # 25ms @ 16k
    if y is None or len(y) == 0:
        nan = float("nan")
        f = Features(
            duration_s=nan,
            rms_mean=nan,
            rms_std=nan,
            zcr_mean=nan,
            pitch_median_hz=nan,
            pitch_iqr_hz=nan,
            voiced_ratio=nan,
            n_pauses=0,
            pause_total_s=0.0,
            active_ratio=nan,
        )
        empty = np.array([])
        # Fix: previously only "y"/"sr" were returned here, which made the
        # plot helpers KeyError on art["hop"] before their y.size guard ran.
        return f, {
            "y": empty,
            "sr": sr,
            "hop": hop,
            "frame": frame,
            "rms": empty,
            "zcr": empty,
            "times": empty,
            "pitch": empty,
            "pauses": [],
            "rms_thr": None,
        }
    y = np.asarray(y)
    # Gradio's numpy audio can be stereo with shape (n_samples, n_channels);
    # librosa expects mono, so downmix before any analysis/resampling.
    if y.ndim > 1:
        y = y.astype(np.float32).mean(axis=1)
    # Resample to a stable SR so frame/hop counts map to fixed durations.
    if sr != TARGET_SR:
        y = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR
    else:
        y = y.astype(np.float32)
    # Peak-normalize to [-1, 1] for stable plots (epsilon avoids div-by-zero).
    mx = float(np.max(np.abs(y))) + 1e-9
    y = y / mx
    duration = float(len(y) / sr)
    rms = librosa.feature.rms(y=y, frame_length=frame, hop_length=hop)[0]
    zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame, hop_length=hop)[0]
    rms_mean = float(np.mean(rms)) if rms.size else float("nan")
    rms_std = float(np.std(rms)) if rms.size else float("nan")
    zcr_mean = float(np.mean(zcr)) if zcr.size else float("nan")
    # Pitch via pyin (can fail on noise/short clips). Use a longer analysis
    # frame than the energy features: at 16 kHz a C2 (~65 Hz) period spans
    # ~245 samples, which does not fit the search range of a 400-sample frame,
    # so low (male) pitches would be lost. With center=True the frame count
    # depends only on hop, so "times" stays aligned with the RMS/ZCR tracks.
    try:
        f0, _, _ = librosa.pyin(
            y,
            fmin=librosa.note_to_hz("C2"),
            fmax=librosa.note_to_hz("C7"),
            sr=sr,
            frame_length=1024,
            hop_length=hop,
        )
    except Exception:
        f0 = None
    if f0 is None:
        pitch = np.array([])
        times = np.array([])
        pitch_median = float("nan")
        pitch_iqr = float("nan")
        voiced_ratio = float("nan")
    else:
        pitch = np.asarray(f0, dtype=np.float32)
        times = librosa.frames_to_time(np.arange(len(pitch)), sr=sr, hop_length=hop)
        voiced = np.isfinite(pitch)  # pyin marks unvoiced frames as NaN
        voiced_ratio = float(np.mean(voiced)) if voiced.size else float("nan")
        if np.any(voiced):
            pv = pitch[voiced]
            pitch_median = float(np.median(pv))
            q75, q25 = np.percentile(pv, [75, 25])
            pitch_iqr = float(q75 - q25)
        else:
            pitch_median = float("nan")
            pitch_iqr = float("nan")
    # Pause detection: low-RMS frames count as silence.
    if rms.size:
        # Adaptive threshold: 80% of the 20th-percentile frame energy.
        thr = float(np.percentile(rms, 20)) * 0.8
        silent = rms < thr
        min_pause_frames = int(0.2 / (hop / sr))  # only pauses >= 0.2s
        pauses = _detect_pauses(silent, min_pause_frames)
        n_pauses = int(len(pauses))
        pause_total_s = float(sum((e - s) * (hop / sr) for s, e in pauses))
        active_ratio = float(1.0 - np.mean(silent))
    else:
        thr = None
        pauses = []
        n_pauses = 0
        pause_total_s = 0.0
        active_ratio = float("nan")
    feats = Features(
        duration_s=duration,
        rms_mean=rms_mean,
        rms_std=rms_std,
        zcr_mean=zcr_mean,
        pitch_median_hz=pitch_median,
        pitch_iqr_hz=pitch_iqr,
        voiced_ratio=voiced_ratio,
        n_pauses=n_pauses,
        pause_total_s=pause_total_s,
        active_ratio=active_ratio,
    )
    artifacts = {
        "y": y,
        "sr": sr,
        "hop": hop,
        "frame": frame,
        "rms": rms,
        "zcr": zcr,
        "times": times,
        "pitch": pitch,
        "pauses": pauses,
        "rms_thr": thr,
    }
    return feats, artifacts
# -----------------------------
# Plotting
# -----------------------------
def plot_waveform_with_pauses(art: Dict[str, Any]) -> plt.Figure:
    """
    Plot the normalized waveform with detected pauses shaded; when there is
    no audio, render a "Geen audio" placeholder instead.

    Fix: read artifacts with .get() and analysis defaults — the empty-input
    artifacts from compute_features could omit "hop"/"pauses", which raised
    KeyError here before the y.size guard was ever reached.
    """
    y = art.get("y", np.array([]))
    sr = art.get("sr", TARGET_SR)
    hop = art.get("hop", 160)
    pauses = art.get("pauses", [])
    fig = plt.figure(figsize=(10, 3.2))
    ax = fig.add_subplot(111)
    if y.size:
        t = np.arange(len(y)) / sr
        ax.plot(t, y, linewidth=0.8)
        # Shade each detected pause (frame indices converted to seconds).
        for (s, e) in pauses:
            ts = s * (hop / sr)
            te = e * (hop / sr)
            ax.axvspan(ts, te, alpha=0.2)
        ax.set_title("Waveform (met gedetecteerde pauzes)")
        ax.set_xlabel("Tijd (s)")
        ax.set_ylabel("Amplitude")
    else:
        ax.text(0.5, 0.5, "Geen audio", ha="center", va="center")
        ax.set_axis_off()
    fig.tight_layout()
    return fig
def plot_pitch(art: Dict[str, Any]) -> plt.Figure:
    """Plot the f0 contour over time, or a placeholder when pitch is unavailable."""
    contour = art.get("pitch", np.array([]))
    t_axis = art.get("times", np.array([]))
    fig = plt.figure(figsize=(10, 3.2))
    ax = fig.add_subplot(111)
    if not (contour.size and t_axis.size):
        # No usable pitch track (clip too short or too noisy).
        ax.text(0.5, 0.5, "Pitch niet beschikbaar (te kort/ruis)", ha="center", va="center")
        ax.set_axis_off()
    else:
        ax.plot(t_axis, contour, linewidth=1.0)
        ax.set_title("Pitch contour (NaN = onvoiced)")
        ax.set_xlabel("Tijd (s)")
        ax.set_ylabel("Pitch (Hz)")
    fig.tight_layout()
    return fig
# -----------------------------
# UI formatting
# -----------------------------
def features_table(feats: Features) -> List[List[str]]:
    """Build the [kenmerk, waarde] string rows shown in the UI dataframe."""

    def fmt3(value):
        # Three-decimal format; em-dash for missing/non-finite values.
        if value is None or not math.isfinite(value):
            return "—"
        return f"{float(value):.3f}"

    def fmt_hz(value):
        # One-decimal Hz format; em-dash when not finite.
        return "—" if not math.isfinite(value) else f"{value:.1f} Hz"

    rows: List[List[str]] = []
    rows.append(["Duur", human_seconds(feats.duration_s)])
    rows.append(["Volume (RMS) gemiddeld", fmt3(feats.rms_mean)])
    rows.append(["Volume (RMS) variatie", fmt3(feats.rms_std)])
    rows.append(["ZCR (ruis/‘scherpte’) gemiddeld", fmt3(feats.zcr_mean)])
    rows.append(["Pitch mediaan", fmt_hz(feats.pitch_median_hz)])
    rows.append(["Pitch spreiding (IQR)", fmt_hz(feats.pitch_iqr_hz)])
    rows.append(["Voiced ratio", safe_pct(feats.voiced_ratio)])
    rows.append(["Aantal pauzes (≥ 0.2s)", str(int(feats.n_pauses))])
    rows.append(["Totale pauzeduur", human_seconds(feats.pause_total_s)])
    rows.append(["Actieve-spraak ratio", safe_pct(feats.active_ratio)])
    return rows
def explain_text(feats: Features) -> str:
    """Compose the markdown explanation shown under the plots."""
    lines = [
        f"- **Pauzes**: {feats.n_pauses} pauzes (≥0.2s), totaal {human_seconds(feats.pause_total_s)}."
    ]
    # Pitch/volume bullets only when the underlying measurement succeeded.
    if math.isfinite(feats.pitch_median_hz):
        lines.append(
            f"- **Pitch**: mediaan ~ {feats.pitch_median_hz:.1f} Hz, spreiding {feats.pitch_iqr_hz:.1f} Hz (IQR)."
        )
    if math.isfinite(feats.rms_mean):
        lines.append(
            f"- **Volume**: RMS gemiddeld {feats.rms_mean:.3f} (relatief; vooral binnen dezelfde setup vergelijken)."
        )
    lines.append(f"- **Actieve spraak**: {safe_pct(feats.active_ratio)} van de tijd boven drempel.")
    header = (
        "### Wat ‘ziet’ de AI hier?\n"
        "Dit is een **uitleg-demo**: we tonen *meetbare spraaksignalen* (niet ‘waarom’ ze veranderen).\n\n"
    )
    footer = (
        "\n\n"
        "**Belangrijk:** dit is **geen diagnose** en **geen medisch hulpmiddel**. "
        "Gebruik dit als **educatieve visualisatie** of gespreksstarter."
    )
    return header + "\n".join(lines) + footer
# -----------------------------
# Callback
# -----------------------------
def analyze_one(audio: Tuple[int, np.ndarray]):
    """Gradio callback: turn an (sr, samples) tuple into a table, two plots and text."""
    if audio is None:
        # Nothing recorded/uploaded yet: show a hint in every output slot.
        placeholder = gr.Dataframe(
            value=[["—", "Upload of neem audio op om te starten."]],
            headers=["Kenmerk", "Waarde"],
        )
        return placeholder, None, None, "### Upload of neem audio op"
    sr, samples = audio
    feats, art = compute_features(samples, sr)
    return (
        gr.Dataframe(value=features_table(feats), headers=["Kenmerk", "Waarde"]),
        plot_waveform_with_pauses(art),
        plot_pitch(art),
        explain_text(feats),
    )
# -----------------------------
# Polished UI
# -----------------------------
# Custom CSS: dark gradient background plus the "card"/"badge" chrome used by
# the header HTML in build_ui(). Passed to gr.Blocks(css=...).
CSS = """
:root{
--bg: #0b0f19;
--panel: rgba(255,255,255,0.06);
--text: rgba(255,255,255,0.92);
--muted: rgba(255,255,255,0.72);
--border: rgba(255,255,255,0.14);
--shadow: 0 12px 30px rgba(0,0,0,0.35);
}
.gradio-container{
background:
radial-gradient(1200px 700px at 10% 10%, rgba(124,58,237,0.25), transparent 55%),
radial-gradient(900px 600px at 90% 20%, rgba(34,197,94,0.18), transparent 55%),
radial-gradient(1100px 800px at 40% 100%, rgba(59,130,246,0.15), transparent 60%),
var(--bg) !important;
color: var(--text) !important;
}
#header{
background: linear-gradient(135deg, rgba(124,58,237,0.22), rgba(34,197,94,0.14));
border: 1px solid var(--border);
border-radius: 18px;
padding: 18px 18px 14px 18px;
box-shadow: var(--shadow);
}
#title{
font-size: 28px;
font-weight: 780;
letter-spacing: -0.02em;
margin: 0;
}
#subtitle{
margin-top: 8px;
color: var(--muted);
font-size: 14px;
line-height: 1.45;
}
.badge{
display: inline-flex;
align-items: center;
gap: 8px;
padding: 6px 10px;
border-radius: 999px;
border: 1px solid var(--border);
background: rgba(255,255,255,0.05);
color: var(--muted);
font-size: 12px;
margin-right: 10px;
margin-bottom: 8px;
}
.badge b{ color: var(--text); font-weight: 720; }
.card{
background: var(--panel);
border: 1px solid var(--border);
border-radius: 18px;
padding: 14px;
box-shadow: var(--shadow);
}
"""
def build_ui():
    """Assemble the Gradio Blocks UI and return the (not yet launched) demo."""
    with gr.Blocks(
        css=CSS,
        theme=gr.themes.Soft(primary_hue="violet", secondary_hue="emerald"),
        title="Explainable Speech Analytics (Demo)",
    ) as demo:
        # Static header card; ids/classes here are styled by the CSS constant.
        gr.HTML(
            """
<div id="header">
<p id="title">Explainable Speech Analytics</p>
<div id="subtitle">
<span class="badge"><b>Doel</b> inzicht in spraaksignalen</span>
<span class="badge"><b>Geen diagnose</b> geen medisch hulpmiddel</span>
<span class="badge"><b>Anti–black box</b> we tonen signalen, niet alleen scores</span>
<p style="margin-top:10px">
Upload of neem een kort fragment op. Je ziet daarna <b>pauzes</b>, <b>pitch</b> en <b>volume-energie</b>
in grafieken en tabellen — bedoeld als uitleg en dialoog, niet als oordeel.
</p>
</div>
</div>
"""
        )
        with gr.Row():
            # Left column: input widgets plus a "how it works" accordion.
            with gr.Column(scale=5):
                audio = gr.Audio(label="Audio", sources=["upload", "microphone"], type="numpy")
                run = gr.Button("Analyseer", variant="primary")
                with gr.Accordion("Wat gebeurt er technisch?", open=False):
                    gr.Markdown(
                        """
- We extraheren **akoestische kenmerken** (RMS, ZCR), schatten **pitch** met *pyin*,
en detecteren **pauzes** via een adaptieve energiedrempel.
- We tonen de gemeten signalen als grafieken zodat het **uitlegbaar** blijft.
"""
                    )
            # Right column: all four result outputs (table, two plots, markdown).
            with gr.Column(scale=7):
                feats_df = gr.Dataframe(
                    headers=["Kenmerk", "Waarde"],
                    datatype=["str", "str"],
                    interactive=False,
                    wrap=True,
                    label="Meetbare kenmerken",
                )
                wf_plot = gr.Plot(label="Waveform + pauzes")
                pitch_plot = gr.Plot(label="Pitch")
                explanation = gr.Markdown("### Upload of neem audio op", elem_classes=["card"])
        # One callback drives all four outputs.
        run.click(analyze_one, inputs=[audio], outputs=[feats_df, wf_plot, pitch_plot, explanation])
        with gr.Accordion("Ethiek & transparantie", open=False):
            gr.Markdown(
                """
- Deze demo geeft **geen diagnose** en maakt **geen klinische claim**.
- Output is bedoeld als **observatie** (meetbare signalen) om gesprekken te ondersteunen.
- In zorgcontext: interpretatie hoort altijd samen met **context + gesprek + klinisch oordeel**.
"""
            )
    return demo
if __name__ == "__main__":
    demo = build_ui()
    demo.queue(max_size=32)
    # HF Spaces-proof: prefer PORT, then GRADIO_SERVER_PORT, then Gradio's default.
    fallback_port = os.environ.get("GRADIO_SERVER_PORT", "7860")
    serve_port = int(os.environ.get("PORT", fallback_port))
    demo.launch(server_name="0.0.0.0", server_port=serve_port)