"""Audio-tool wrappers for the Subtext Arena environment.
These functions render the cached prosody features and ground-truth transcript
into the human-readable text that the agent sees when it calls a tool.
Why text-formatted: Subtext Arena trains a TEXT policy that orchestrates audio
tools (mirroring the AudioToolAgent / Path A architecture). The trained model
never sees raw audio; it reads these descriptions and reasons over them.
Inputs:
- sarcasm_data: dict[clip_id -> {utterance, speaker, context, sarcasm}]
(loaded from MUStARD/data/sarcasm_data.json)
- prosody_cache: dict[clip_id -> prosody json] keyed by clip id
(loaded from data/prosody_cache/utterances/<id>.json)
The "span" argument on prosody / pitch tools is optional. Clips are short
(~3-5s), so for the hackathon we always render features over the whole clip;
span is accepted for forward compatibility but currently ignored.
"""
from __future__ import annotations
from typing import Any, Dict, Optional
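
# Illustrative shape of one cached prosody entry (values are made up; the key
# names match what the renderers below read from the cache):
# {
#     "duration_s": 3.42,
#     "pitch_mean_hz": 210.0, "pitch_var_hz": 38.5,
#     "energy_mean": 0.0312, "energy_var": 0.0221,
#     "voiced_ratio": 0.62,
#     "pre_pause_ms": 240,
#     "pauses": [[1.10, 1.31]],  # [start_s, end_s] pairs for pauses >150ms
#     "pitch_contour_hz": [190.0, 205.0, 221.0],
# }
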
def _bucket(value: float, low: float, high: float) -> str:
if value < low:
return "LOW"
if value > high:
return "HIGH"
return "MID"
def render_transcript(
clip_id: str,
sarcasm_data: Dict[str, dict],
) -> str:
"""Return the literal transcript + preceding conversational context.
    Format:
        Conversational context (lines spoken just before):
          [LEONARD] I never would have identified the fingerprints of string theory ...
          [SHELDON] My apologies. What's your plan?
        Target utterance (the line you must classify):
          [SHELDON] It's just a privilege to watch your mind at work.
"""
entry = sarcasm_data.get(clip_id)
if entry is None:
return f"[error] clip {clip_id} not in sarcasm_data"
lines = []
ctx_turns = entry.get("context", [])
ctx_speakers = entry.get("context_speakers", [])
if ctx_turns:
lines.append("Conversational context (lines spoken just before):")
for spk, line in zip(ctx_speakers, ctx_turns):
lines.append(f" [{spk}] {line}")
spk = entry.get("speaker", "?")
lines.append("Target utterance (the line you must classify):")
lines.append(f" [{spk}] {entry.get('utterance', '')}")
return "\n".join(lines)
def render_prosody_features(
clip_id: str,
prosody: Dict[str, Any],
span: Optional[Dict[str, float]] = None,
) -> str:
"""Render pitch_var, energy, pauses, voiced_ratio as a text summary.
Buckets pitch_var with thresholds tuned on MUStARD: LOW < 25Hz, HIGH > 45Hz
(sarcastic delivery typically shows higher pitch variability).
When the source audio is dominated by music or laugh tracks (~13% of MUStARD
clips), pyin produces unreliable f0 estimates. We detect this via
voiced_ratio < 0.1 and tell the agent the features are unreliable rather
than feeding it noise.
"""
if not prosody:
return f"[error] no prosody features cached for clip {clip_id}"
duration = prosody.get("duration_s", 0.0)
pitch_var = prosody.get("pitch_var_hz", 0.0)
pitch_mean = prosody.get("pitch_mean_hz", 0.0)
energy_var = prosody.get("energy_var", 0.0)
energy_mean = prosody.get("energy_mean", 0.0)
voiced_ratio = prosody.get("voiced_ratio", 0.0)
pre_pause_ms = prosody.get("pre_pause_ms", 0)
pauses = prosody.get("pauses", []) or []
if voiced_ratio < 0.1:
return (
f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n"
f" [WARNING] only {voiced_ratio:.0%} of frames have detected pitch β "
f"the audio may be dominated by music, laughter, or background noise. "
f"Prosody features for this clip are unreliable; rely primarily on "
f"the transcript and conversational context."
)
pitch_var_b = _bucket(pitch_var, 25.0, 45.0)
energy_var_b = _bucket(energy_var, 0.02, 0.05)
voiced_b = _bucket(voiced_ratio, 0.45, 0.75)
pause_lines = []
for (s, e) in pauses[:5]:
pause_lines.append(f" {s:.2f}s -> {e:.2f}s ({(e - s) * 1000:.0f}ms)")
pause_block = (
" Internal pauses >150ms:\n" + "\n".join(pause_lines)
if pause_lines
else " Internal pauses >150ms: none"
)
return (
f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n"
f" Pitch: mean={pitch_mean:.0f} Hz, variability={pitch_var:.0f} Hz [{pitch_var_b}]\n"
f" Energy: mean={energy_mean:.4f}, variability={energy_var:.4f} [{energy_var_b}]\n"
f" Voiced: {voiced_ratio:.2f} of frames have detected pitch [{voiced_b}]\n"
f" Pre-utterance silence: {pre_pause_ms} ms\n"
f"{pause_block}"
)
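
# Example of the rendered summary (illustrative clip id and values):
#   Prosody features for clip 1_60 (duration 3.42s):
#     Pitch: mean=210 Hz, variability=52 Hz [HIGH]
#     Energy: mean=0.0312, variability=0.0221 [MID]
#     Voiced: 0.62 of frames have detected pitch [MID]
#     Pre-utterance silence: 240 ms
#     Internal pauses >150ms: none
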
def render_pitch_contour(
clip_id: str,
prosody: Dict[str, Any],
span: Optional[Dict[str, float]] = None,
) -> str:
"""Render the coarse pitch contour as a sparkline-like string."""
if not prosody:
return f"[error] no prosody features cached for clip {clip_id}"
contour = prosody.get("pitch_contour_hz", []) or []
if not contour:
return f"Pitch contour for clip {clip_id}: <unvoiced or silent>"
    # Unicode block-character sparkline with 8 levels
    levels = "▁▂▃▄▅▆▇█"
lo, hi = min(contour), max(contour)
rng = max(hi - lo, 1.0)
spark = "".join(levels[min(7, int((v - lo) / rng * 7))] for v in contour)
direction = "rising" if contour[-1] > contour[0] + 5 else (
"falling" if contour[-1] < contour[0] - 5 else "flat"
)
return (
f"Pitch contour for clip {clip_id} ({len(contour)} samples):\n"
f" range: {lo:.0f} Hz -> {hi:.0f} Hz, overall trend: {direction}\n"
f" shape: {spark}"
)
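

if __name__ == "__main__":  # pragma: no cover
    # Smoke-test sketch (not part of the training pipeline): file paths follow
    # the module docstring, and the first clip id in sarcasm_data is used as
    # an example; adjust both to your checkout.
    import json
    from pathlib import Path

    sarcasm_data = json.loads(Path("MUStARD/data/sarcasm_data.json").read_text())
    clip_id = next(iter(sarcasm_data))
    prosody_path = Path("data/prosody_cache/utterances") / f"{clip_id}.json"
    prosody = json.loads(prosody_path.read_text()) if prosody_path.exists() else {}
    print(render_transcript(clip_id, sarcasm_data))
    print(render_prosody_features(clip_id, prosody))
    print(render_pitch_contour(clip_id, prosody))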