# subtext-arena / server / audio_tools.py
# (uploaded via huggingface_hub, revision 225e725)
"""Audio-tool wrappers for the Subtext Arena environment.
These functions render the cached prosody features and ground-truth transcript
into the human-readable text that the agent sees when it calls a tool.
Why text-formatted: Subtext Arena trains a TEXT policy that orchestrates audio
tools (mirrors the AudioToolAgent / Path A architecture). The trained model
never sees raw audio β€” it reads these descriptions and reasons over them.
Inputs:
- sarcasm_data: dict[clip_id -> {utterance, speaker, context, sarcasm}]
(loaded from MUStARD/data/sarcasm_data.json)
- prosody_cache: dict[clip_id -> prosody json] keyed by clip id
(loaded from data/prosody_cache/utterances/<id>.json)
The "span" argument on prosody / pitch tools is optional. Clips are short
(~3-5s), so for the hackathon we always render features over the whole clip;
span is accepted for forward compatibility but currently ignored.
"""
from __future__ import annotations
from typing import Any, Dict, Optional
def _bucket(value: float, low: float, high: float) -> str:
if value < low:
return "LOW"
if value > high:
return "HIGH"
return "MID"
def render_transcript(
    clip_id: str,
    sarcasm_data: Dict[str, dict],
) -> str:
    """Return the literal transcript + preceding conversational context.

    Format:
        Conversational context (lines spoken just before):
         [LEONARD] I never would have identified the fingerprints of string theory ...
         [SHELDON] My apologies. What's your plan?
        Target utterance (the line you must classify):
         [SHELDON] It's just a privilege to watch your mind at work.

    Args:
        clip_id: MUStARD clip identifier.
        sarcasm_data: mapping clip_id -> entry dict with "utterance",
            "speaker", "context" and "context_speakers" keys (keys may be
            absent; sensible fallbacks are used).

    Returns:
        Human-readable transcript text, or an "[error] ..." string when the
        clip id is unknown.
    """
    entry = sarcasm_data.get(clip_id)
    if entry is None:
        return f"[error] clip {clip_id} not in sarcasm_data"
    lines = []
    ctx_turns = entry.get("context", [])
    ctx_speakers = entry.get("context_speakers", [])
    if ctx_turns:
        lines.append("Conversational context (lines spoken just before):")
        # BUG FIX: the old zip(ctx_speakers, ctx_turns) silently dropped
        # context turns whenever "context_speakers" was missing or shorter
        # than "context" (a missing key dropped ALL context). Pad the speaker
        # side with "?" so every context line is always rendered.
        for idx, line in enumerate(ctx_turns):
            spk = ctx_speakers[idx] if idx < len(ctx_speakers) else "?"
            lines.append(f" [{spk}] {line}")
    spk = entry.get("speaker", "?")
    lines.append("Target utterance (the line you must classify):")
    lines.append(f" [{spk}] {entry.get('utterance', '')}")
    return "\n".join(lines)
def render_prosody_features(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Render pitch_var, energy, pauses, voiced_ratio as a text summary.

    Buckets pitch_var with thresholds tuned on MUStARD: LOW < 25Hz, HIGH > 45Hz
    (sarcastic delivery typically shows higher pitch variability).

    When the source audio is dominated by music or laugh tracks (~13% of
    MUStARD clips), pyin produces unreliable f0 estimates. We detect this via
    voiced_ratio < 0.1 and tell the agent the features are unreliable rather
    than feeding it noise.

    ``span`` is accepted for forward compatibility and currently ignored.
    """
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"

    dur = prosody.get("duration_s", 0.0)
    header = f"Prosody features for clip {clip_id} (duration {dur:.2f}s):\n"

    voiced = prosody.get("voiced_ratio", 0.0)
    # Barely any voiced frames -> f0 estimates are noise; warn instead of
    # reporting junk numbers.
    if voiced < 0.1:
        return header + (
            f" [WARNING] only {voiced:.0%} of frames have detected pitch β€” "
            "the audio may be dominated by music, laughter, or background noise. "
            "Prosody features for this clip are unreliable; rely primarily on "
            "the transcript and conversational context."
        )

    p_mean = prosody.get("pitch_mean_hz", 0.0)
    p_var = prosody.get("pitch_var_hz", 0.0)
    e_mean = prosody.get("energy_mean", 0.0)
    e_var = prosody.get("energy_var", 0.0)
    lead_ms = prosody.get("pre_pause_ms", 0)
    gaps = prosody.get("pauses", []) or []

    # Show at most the first five internal pauses.
    gap_lines = [
        f" {start:.2f}s -> {stop:.2f}s ({(stop - start) * 1000:.0f}ms)"
        for start, stop in gaps[:5]
    ]
    if gap_lines:
        gap_block = " Internal pauses >150ms:\n" + "\n".join(gap_lines)
    else:
        gap_block = " Internal pauses >150ms: none"

    return header + (
        f" Pitch: mean={p_mean:.0f} Hz, variability={p_var:.0f} Hz [{_bucket(p_var, 25.0, 45.0)}]\n"
        f" Energy: mean={e_mean:.4f}, variability={e_var:.4f} [{_bucket(e_var, 0.02, 0.05)}]\n"
        f" Voiced: {voiced:.2f} of frames have detected pitch [{_bucket(voiced, 0.45, 0.75)}]\n"
        f" Pre-utterance silence: {lead_ms} ms\n"
        f"{gap_block}"
    )
def render_pitch_contour(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Render the coarse pitch contour as a sparkline-like string.

    Reads ``pitch_contour_hz`` from the cached prosody dict and draws each
    sample as one of eight bar glyphs, plus the Hz range and an overall
    rising / falling / flat trend (first vs. last sample, 5 Hz dead band).
    ``span`` is accepted for forward compatibility and currently ignored.
    """
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"
    samples = prosody.get("pitch_contour_hz", []) or []
    if not samples:
        return f"Pitch contour for clip {clip_id}: <unvoiced or silent>"

    # Eight-level bar glyphs, lowest to highest.
    glyphs = "β–β–‚β–ƒβ–„β–…β–†β–‡β–ˆ"
    floor = min(samples)
    ceiling = max(samples)
    # Avoid division by zero on a perfectly flat contour.
    spread = max(ceiling - floor, 1.0)
    bars = []
    for hz in samples:
        bars.append(glyphs[min(7, int((hz - floor) / spread * 7))])

    first, last = samples[0], samples[-1]
    if last > first + 5:
        trend = "rising"
    elif last < first - 5:
        trend = "falling"
    else:
        trend = "flat"

    return (
        f"Pitch contour for clip {clip_id} ({len(samples)} samples):\n"
        f" range: {floor:.0f} Hz -> {ceiling:.0f} Hz, overall trend: {trend}\n"
        f" shape: {''.join(bars)}"
    )