Spaces:
Sleeping
Sleeping
| """Audio-tool wrappers for the Subtext Arena environment. | |
| These functions render the cached prosody features and ground-truth transcript | |
| into the human-readable text that the agent sees when it calls a tool. | |
| Why text-formatted: Subtext Arena trains a TEXT policy that orchestrates audio | |
| tools (mirrors the AudioToolAgent / Path A architecture). The trained model | |
| never sees raw audio β it reads these descriptions and reasons over them. | |
| Inputs: | |
| - sarcasm_data: dict[clip_id -> {utterance, speaker, context, sarcasm}] | |
| (loaded from MUStARD/data/sarcasm_data.json) | |
| - prosody_cache: dict[clip_id -> prosody json] keyed by clip id | |
| (loaded from data/prosody_cache/utterances/<id>.json) | |
| The "span" argument on prosody / pitch tools is optional. Clips are short | |
| (~3-5s), so for the hackathon we always render features over the whole clip; | |
| span is accepted for forward compatibility but currently ignored. | |
| """ | |
| from __future__ import annotations | |
| from typing import Any, Dict, Optional | |
| def _bucket(value: float, low: float, high: float) -> str: | |
| if value < low: | |
| return "LOW" | |
| if value > high: | |
| return "HIGH" | |
| return "MID" | |
def render_transcript(
    clip_id: str,
    sarcasm_data: Dict[str, dict],
) -> str:
    """Return the literal transcript + preceding conversational context.

    Format:
        Conversational context (lines spoken just before):
         [LEONARD] I never would have identified the fingerprints of string theory ...
         [SHELDON] My apologies. What's your plan?
        Target utterance (the line you must classify):
         [SHELDON] It's just a privilege to watch your mind at work.

    Args:
        clip_id: Key into ``sarcasm_data``.
        sarcasm_data: dict[clip_id -> entry] with "utterance", "speaker",
            "context" and "context_speakers" keys (all optional here).

    Returns:
        The formatted transcript, or an "[error] ..." string for an
        unknown clip id.
    """
    entry = sarcasm_data.get(clip_id)
    if entry is None:
        return f"[error] clip {clip_id} not in sarcasm_data"
    lines = []
    ctx_turns = entry.get("context", []) or []
    ctx_speakers = entry.get("context_speakers", []) or []
    if ctx_turns:
        lines.append("Conversational context (lines spoken just before):")
        # BUGFIX: the original zip(ctx_speakers, ctx_turns) silently dropped
        # context turns whenever the speaker list was shorter than (or
        # missing relative to) the turn list. Pad unknown speakers with "?"
        # so every context line is shown.
        for i, line in enumerate(ctx_turns):
            spk = ctx_speakers[i] if i < len(ctx_speakers) else "?"
            lines.append(f" [{spk}] {line}")
    spk = entry.get("speaker", "?")
    lines.append("Target utterance (the line you must classify):")
    lines.append(f" [{spk}] {entry.get('utterance', '')}")
    return "\n".join(lines)
def render_prosody_features(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Render pitch_var, energy, pauses, voiced_ratio as a text summary.

    Buckets pitch_var with thresholds tuned on MUStARD: LOW < 25Hz, HIGH > 45Hz
    (sarcastic delivery typically shows higher pitch variability).

    When the source audio is dominated by music or laugh tracks (~13% of MUStARD
    clips), pyin produces unreliable f0 estimates. We detect this via
    voiced_ratio < 0.1 and tell the agent the features are unreliable rather
    than feeding it noise.

    Args:
        clip_id: Clip identifier, used only to label the output.
        prosody: Cached prosody-feature dict; missing keys fall back to
            0 / empty defaults.
        span: Accepted for forward compatibility; currently ignored
            (features are always rendered over the whole clip).

    Returns:
        A multi-line human-readable summary, or an "[error] ..." string
        when no features are cached.
    """
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"
    duration = prosody.get("duration_s", 0.0)
    pitch_var = prosody.get("pitch_var_hz", 0.0)
    pitch_mean = prosody.get("pitch_mean_hz", 0.0)
    energy_var = prosody.get("energy_var", 0.0)
    energy_mean = prosody.get("energy_mean", 0.0)
    voiced_ratio = prosody.get("voiced_ratio", 0.0)
    pre_pause_ms = prosody.get("pre_pause_ms", 0)
    pauses = prosody.get("pauses", []) or []
    # Music/laugh-track guard: pyin f0 is unreliable below 10% voiced frames.
    if voiced_ratio < 0.1:
        return (
            f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n"
            # BUGFIX: this user-facing dash was mojibake-corrupted ("β");
            # restored to an em dash.
            f" [WARNING] only {voiced_ratio:.0%} of frames have detected pitch — "
            f"the audio may be dominated by music, laughter, or background noise. "
            f"Prosody features for this clip are unreliable; rely primarily on "
            f"the transcript and conversational context."
        )
    pitch_var_b = _bucket(pitch_var, 25.0, 45.0)
    energy_var_b = _bucket(energy_var, 0.02, 0.05)
    voiced_b = _bucket(voiced_ratio, 0.45, 0.75)
    # Show at most 5 internal pauses to keep the tool output compact.
    pause_lines = []
    for (s, e) in pauses[:5]:
        pause_lines.append(f" {s:.2f}s -> {e:.2f}s ({(e - s) * 1000:.0f}ms)")
    pause_block = (
        " Internal pauses >150ms:\n" + "\n".join(pause_lines)
        if pause_lines
        else " Internal pauses >150ms: none"
    )
    return (
        f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n"
        f" Pitch: mean={pitch_mean:.0f} Hz, variability={pitch_var:.0f} Hz [{pitch_var_b}]\n"
        f" Energy: mean={energy_mean:.4f}, variability={energy_var:.4f} [{energy_var_b}]\n"
        f" Voiced: {voiced_ratio:.2f} of frames have detected pitch [{voiced_b}]\n"
        f" Pre-utterance silence: {pre_pause_ms} ms\n"
        f"{pause_block}"
    )
def render_pitch_contour(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Render the coarse pitch contour as a sparkline-like string.

    Args:
        clip_id: Clip identifier, used only to label the output.
        prosody: Cached prosody-feature dict; reads "pitch_contour_hz".
        span: Accepted for forward compatibility; currently ignored.

    Returns:
        A sparkline rendering with Hz range and overall trend, a
        "<unvoiced or silent>" line for an empty contour, or an
        "[error] ..." string when no features are cached.
    """
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"
    contour = prosody.get("pitch_contour_hz", []) or []
    if not contour:
        return f"Pitch contour for clip {clip_id}: <unvoiced or silent>"
    # Sparkline in 8 levels. BUGFIX: the level ramp was mojibake-corrupted
    # ("βββββ βββ", 9 junk chars); restored to the standard Unicode block
    # ramp U+2581..U+2588, matching the min(7, ...) index below.
    levels = "▁▂▃▄▅▆▇█"
    lo, hi = min(contour), max(contour)
    rng = max(hi - lo, 1.0)  # avoid division by zero on a flat contour
    spark = "".join(levels[min(7, int((v - lo) / rng * 7))] for v in contour)
    # +/- 5 Hz dead band so tiny start/end differences still read as "flat".
    direction = "rising" if contour[-1] > contour[0] + 5 else (
        "falling" if contour[-1] < contour[0] - 5 else "flat"
    )
    return (
        f"Pitch contour for clip {clip_id} ({len(contour)} samples):\n"
        f" range: {lo:.0f} Hz -> {hi:.0f} Hz, overall trend: {direction}\n"
        f" shape: {spark}"
    )