"""Audio-tool wrappers for the Subtext Arena environment.
These functions render the cached prosody features and ground-truth transcript
into the human-readable text that the agent sees when it calls a tool.
Why text-formatted: Subtext Arena trains a TEXT policy that orchestrates audio
tools (mirroring the AudioToolAgent / Path A architecture). The trained model
never sees raw audio; it reads these descriptions and reasons over them.
Inputs:
- sarcasm_data: dict[clip_id -> {utterance, speaker, context, sarcasm}]
(loaded from MUStARD/data/sarcasm_data.json)
- prosody_cache: dict[clip_id -> prosody json] keyed by clip id
(loaded from data/prosody_cache/utterances/<id>.json)
The "span" argument on prosody / pitch tools is optional. Clips are short
(~3-5s), so for the hackathon we always render features over the whole clip;
span is accepted for forward compatibility but currently ignored.
"""
from __future__ import annotations
from typing import Any, Dict, Optional
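
# Illustrative shape of one cached prosody entry (values are made up; the key
# names match what the renderers below read from the cache):
# {
#     "duration_s": 3.42,
#     "pitch_mean_hz": 210.0, "pitch_var_hz": 38.5,
#     "energy_mean": 0.0312, "energy_var": 0.0221,
#     "voiced_ratio": 0.62,
#     "pre_pause_ms": 240,
#     "pauses": [[1.10, 1.31]],  # [start_s, end_s] pairs for pauses >150ms
#     "pitch_contour_hz": [190.0, 205.0, 221.0],
# }
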
def _bucket(value: float, low: float, high: float) -> str:
if value < low:
return "LOW"
if value > high:
return "HIGH"
return "MID"
def render_transcript(
clip_id: str,
sarcasm_data: Dict[str, dict],
) -> str:
"""Return the literal transcript + preceding conversational context.
    Format:
        Conversational context (lines spoken just before):
          [LEONARD] I never would have identified the fingerprints of string theory ...
          [SHELDON] My apologies. What's your plan?
        Target utterance (the line you must classify):
          [SHELDON] It's just a privilege to watch your mind at work.
"""
entry = sarcasm_data.get(clip_id)
if entry is None:
return f"[error] clip {clip_id} not in sarcasm_data"
lines = []
ctx_turns = entry.get("context", [])
ctx_speakers = entry.get("context_speakers", [])
if ctx_turns:
lines.append("Conversational context (lines spoken just before):")
for spk, line in zip(ctx_speakers, ctx_turns):
lines.append(f" [{spk}] {line}")
spk = entry.get("speaker", "?")
lines.append("Target utterance (the line you must classify):")
lines.append(f" [{spk}] {entry.get('utterance', '')}")
return "\n".join(lines)
def render_prosody_features(
clip_id: str,
prosody: Dict[str, Any],
span: Optional[Dict[str, float]] = None,
) -> str:
"""Render pitch_var, energy, pauses, voiced_ratio as a text summary.
Buckets pitch_var with thresholds tuned on MUStARD: LOW < 25Hz, HIGH > 45Hz
(sarcastic delivery typically shows higher pitch variability).
When the source audio is dominated by music or laugh tracks (~13% of MUStARD
clips), pyin produces unreliable f0 estimates. We detect this via
voiced_ratio < 0.1 and tell the agent the features are unreliable rather
than feeding it noise.
"""
if not prosody:
return f"[error] no prosody features cached for clip {clip_id}"
duration = prosody.get("duration_s", 0.0)
pitch_var = prosody.get("pitch_var_hz", 0.0)
pitch_mean = prosody.get("pitch_mean_hz", 0.0)
energy_var = prosody.get("energy_var", 0.0)
energy_mean = prosody.get("energy_mean", 0.0)
voiced_ratio = prosody.get("voiced_ratio", 0.0)
pre_pause_ms = prosody.get("pre_pause_ms", 0)
pauses = prosody.get("pauses", []) or []
if voiced_ratio < 0.1:
return (
f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n"
f" [WARNING] only {voiced_ratio:.0%} of frames have detected pitch β "
f"the audio may be dominated by music, laughter, or background noise. "
f"Prosody features for this clip are unreliable; rely primarily on "
f"the transcript and conversational context."
)
pitch_var_b = _bucket(pitch_var, 25.0, 45.0)
energy_var_b = _bucket(energy_var, 0.02, 0.05)
voiced_b = _bucket(voiced_ratio, 0.45, 0.75)
pause_lines = []
for (s, e) in pauses[:5]:
pause_lines.append(f" {s:.2f}s -> {e:.2f}s ({(e - s) * 1000:.0f}ms)")
pause_block = (
" Internal pauses >150ms:\n" + "\n".join(pause_lines)
if pause_lines
else " Internal pauses >150ms: none"
)
return (
f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n"
f" Pitch: mean={pitch_mean:.0f} Hz, variability={pitch_var:.0f} Hz [{pitch_var_b}]\n"
f" Energy: mean={energy_mean:.4f}, variability={energy_var:.4f} [{energy_var_b}]\n"
f" Voiced: {voiced_ratio:.2f} of frames have detected pitch [{voiced_b}]\n"
f" Pre-utterance silence: {pre_pause_ms} ms\n"
f"{pause_block}"
)
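
# Example of the rendered summary (illustrative clip id and values):
#   Prosody features for clip 1_60 (duration 3.42s):
#     Pitch: mean=210 Hz, variability=52 Hz [HIGH]
#     Energy: mean=0.0312, variability=0.0221 [MID]
#     Voiced: 0.62 of frames have detected pitch [MID]
#     Pre-utterance silence: 240 ms
#     Internal pauses >150ms: none
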
def render_pitch_contour(
clip_id: str,
prosody: Dict[str, Any],
span: Optional[Dict[str, float]] = None,
) -> str:
"""Render the coarse pitch contour as a sparkline-like string."""
if not prosody:
return f"[error] no prosody features cached for clip {clip_id}"
contour = prosody.get("pitch_contour_hz", []) or []
if not contour:
return f"Pitch contour for clip {clip_id}: <unvoiced or silent>"
    # Unicode block-character sparkline with 8 levels
    levels = "▁▂▃▄▅▆▇█"
lo, hi = min(contour), max(contour)
rng = max(hi - lo, 1.0)
spark = "".join(levels[min(7, int((v - lo) / rng * 7))] for v in contour)
direction = "rising" if contour[-1] > contour[0] + 5 else (
"falling" if contour[-1] < contour[0] - 5 else "flat"
)
return (
f"Pitch contour for clip {clip_id} ({len(contour)} samples):\n"
f" range: {lo:.0f} Hz -> {hi:.0f} Hz, overall trend: {direction}\n"
f" shape: {spark}"
)
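

if __name__ == "__main__":  # pragma: no cover
    # Smoke-test sketch (not part of the training pipeline): file paths follow
    # the module docstring, and the first clip id in sarcasm_data is used as
    # an example; adjust both to your checkout.
    import json
    from pathlib import Path

    sarcasm_data = json.loads(Path("MUStARD/data/sarcasm_data.json").read_text())
    clip_id = next(iter(sarcasm_data))
    prosody_path = Path("data/prosody_cache/utterances") / f"{clip_id}.json"
    prosody = json.loads(prosody_path.read_text()) if prosody_path.exists() else {}
    print(render_transcript(clip_id, sarcasm_data))
    print(render_prosody_features(clip_id, prosody))
    print(render_pitch_contour(clip_id, prosody))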