"""Audio-tool wrappers for the Subtext Arena environment. These functions render the cached prosody features and ground-truth transcript into the human-readable text that the agent sees when it calls a tool. Why text-formatted: Subtext Arena trains a TEXT policy that orchestrates audio tools (mirrors the AudioToolAgent / Path A architecture). The trained model never sees raw audio — it reads these descriptions and reasons over them. Inputs: - sarcasm_data: dict[clip_id -> {utterance, speaker, context, sarcasm}] (loaded from MUStARD/data/sarcasm_data.json) - prosody_cache: dict[clip_id -> prosody json] keyed by clip id (loaded from data/prosody_cache/utterances/.json) The "span" argument on prosody / pitch tools is optional. Clips are short (~3-5s), so for the hackathon we always render features over the whole clip; span is accepted for forward compatibility but currently ignored. """ from __future__ import annotations from typing import Any, Dict, Optional def _bucket(value: float, low: float, high: float) -> str: if value < low: return "LOW" if value > high: return "HIGH" return "MID" def render_transcript( clip_id: str, sarcasm_data: Dict[str, dict], ) -> str: """Return the literal transcript + preceding conversational context. Format: Speaker context turns: [LEONARD] I never would have identified the fingerprints of string theory ... [SHELDON] My apologies. What's your plan? Target utterance: [SHELDON] It's just a privilege to watch your mind at work. """ entry = sarcasm_data.get(clip_id) if entry is None: return f"[error] clip {clip_id} not in sarcasm_data" lines = [] ctx_turns = entry.get("context", []) ctx_speakers = entry.get("context_speakers", []) if ctx_turns: lines.append("Conversational context (lines spoken just before):") for spk, line in zip(ctx_speakers, ctx_turns): lines.append(f" [{spk}] {line}") spk = entry.get("speaker", "?") lines.append("Target utterance (the line you must classify):") lines.append(f" [{spk}] {entry.get('utterance', '')}") return "\n".join(lines) def render_prosody_features( clip_id: str, prosody: Dict[str, Any], span: Optional[Dict[str, float]] = None, ) -> str: """Render pitch_var, energy, pauses, voiced_ratio as a text summary. Buckets pitch_var with thresholds tuned on MUStARD: LOW < 25Hz, HIGH > 45Hz (sarcastic delivery typically shows higher pitch variability). When the source audio is dominated by music or laugh tracks (~13% of MUStARD clips), pyin produces unreliable f0 estimates. We detect this via voiced_ratio < 0.1 and tell the agent the features are unreliable rather than feeding it noise. """ if not prosody: return f"[error] no prosody features cached for clip {clip_id}" duration = prosody.get("duration_s", 0.0) pitch_var = prosody.get("pitch_var_hz", 0.0) pitch_mean = prosody.get("pitch_mean_hz", 0.0) energy_var = prosody.get("energy_var", 0.0) energy_mean = prosody.get("energy_mean", 0.0) voiced_ratio = prosody.get("voiced_ratio", 0.0) pre_pause_ms = prosody.get("pre_pause_ms", 0) pauses = prosody.get("pauses", []) or [] if voiced_ratio < 0.1: return ( f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n" f" [WARNING] only {voiced_ratio:.0%} of frames have detected pitch — " f"the audio may be dominated by music, laughter, or background noise. " f"Prosody features for this clip are unreliable; rely primarily on " f"the transcript and conversational context." 


def render_pitch_contour(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Render the coarse pitch contour as a sparkline-like string."""
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"
    contour = prosody.get("pitch_contour_hz", []) or []
    if not contour:
        return f"Pitch contour for clip {clip_id}: no contour cached"
    # Unicode sparkline in 8 levels, scaled to the clip's own pitch range.
    levels = "▁▂▃▄▅▆▇█"
    lo, hi = min(contour), max(contour)
    rng = max(hi - lo, 1.0)  # avoid division by zero on flat contours
    spark = "".join(levels[min(7, int((v - lo) / rng * 7))] for v in contour)
    # 5 Hz dead zone so tiny f0 jitter doesn't register as a trend.
    direction = (
        "rising" if contour[-1] > contour[0] + 5
        else "falling" if contour[-1] < contour[0] - 5
        else "flat"
    )
    return (
        f"Pitch contour for clip {clip_id} ({len(contour)} samples):\n"
        f"  range: {lo:.0f} Hz -> {hi:.0f} Hz, overall trend: {direction}\n"
        f"  shape: {spark}"
    )
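

if __name__ == "__main__":
    # Quick manual smoke test. These dicts are hand-written stand-ins that
    # mirror the shapes of sarcasm_data.json and the prosody cache;
    # "demo_clip" is not a real MUStARD clip id and the numbers are invented.
    demo_sarcasm = {
        "demo_clip": {
            "utterance": "It's just a privilege to watch your mind at work.",
            "speaker": "SHELDON",
            "context": ["My apologies. What's your plan?"],
            "context_speakers": ["SHELDON"],
            "sarcasm": True,
        }
    }
    demo_prosody = {
        "duration_s": 3.2,
        "pitch_mean_hz": 210.0,
        "pitch_var_hz": 52.0,
        "energy_mean": 0.031,
        "energy_var": 0.061,
        "voiced_ratio": 0.68,
        "pre_pause_ms": 240,
        "pauses": [[1.10, 1.34]],
        "pitch_contour_hz": [180.0, 195.0, 240.0, 230.0, 205.0, 185.0],
    }
    for block in (
        render_transcript("demo_clip", demo_sarcasm),
        render_prosody_features("demo_clip", demo_prosody),
        render_pitch_contour("demo_clip", demo_prosody),
    ):
        print(block)
        print()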