File size: 5,676 Bytes
225e725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""Audio-tool wrappers for the Subtext Arena environment.

These functions render the cached prosody features and ground-truth transcript
into the human-readable text that the agent sees when it calls a tool.

Why text-formatted: Subtext Arena trains a TEXT policy that orchestrates audio
tools (mirrors the AudioToolAgent / Path A architecture). The trained model
never sees raw audio β€” it reads these descriptions and reasons over them.

Inputs:
  - sarcasm_data: dict[clip_id -> {utterance, speaker, context, sarcasm}]
                  (loaded from MUStARD/data/sarcasm_data.json)
  - prosody_cache: dict[clip_id -> prosody json] keyed by clip id
                  (loaded from data/prosody_cache/utterances/<id>.json)

The "span" argument on prosody / pitch tools is optional. Clips are short
(~3-5s), so for the hackathon we always render features over the whole clip;
span is accepted for forward compatibility but currently ignored.
"""
from __future__ import annotations

from typing import Any, Dict, Optional


def _bucket(value: float, low: float, high: float) -> str:
    if value < low:
        return "LOW"
    if value > high:
        return "HIGH"
    return "MID"


def render_transcript(
    clip_id: str,
    sarcasm_data: Dict[str, dict],
) -> str:
    """Return the literal transcript + preceding conversational context.

    Format:
      Speaker context turns:
        [LEONARD] I never would have identified the fingerprints of string theory ...
        [SHELDON] My apologies. What's your plan?
      Target utterance:
        [SHELDON] It's just a privilege to watch your mind at work.

    Args:
      clip_id: MUStARD clip identifier.
      sarcasm_data: mapping of clip id -> annotation dict with keys
        "utterance", "speaker", "context", "context_speakers".

    Returns:
      Human-readable transcript text, or an "[error] ..." string when the
      clip id is unknown (tool wrappers report errors as text, not raises).
    """
    from itertools import zip_longest  # local: tiny, only used on this path

    entry = sarcasm_data.get(clip_id)
    if entry is None:
        return f"[error] clip {clip_id} not in sarcasm_data"

    lines = []
    ctx_turns = entry.get("context", []) or []
    ctx_speakers = entry.get("context_speakers", []) or []
    if ctx_turns:
        lines.append("Conversational context (lines spoken just before):")
        # BUG FIX: plain zip() silently dropped context turns whenever
        # context_speakers was missing or shorter than context. Pad unknown
        # speakers with "?" so every turn is rendered.
        for spk, line in zip_longest(ctx_speakers, ctx_turns):
            if line is None:
                continue  # more speakers than turns: nothing to render
            lines.append(f"  [{spk if spk is not None else '?'}] {line}")
    spk = entry.get("speaker", "?")
    lines.append("Target utterance (the line you must classify):")
    lines.append(f"  [{spk}] {entry.get('utterance', '')}")
    return "\n".join(lines)


def render_prosody_features(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Summarise cached prosody features as agent-readable text.

    Reports pitch mean/variability, energy mean/variability, voiced-frame
    ratio, pre-utterance silence, and up to five internal pauses. Values are
    bucketed LOW/MID/HIGH; pitch-variability thresholds (25/45 Hz) were tuned
    on MUStARD, where sarcastic delivery typically shows higher variability.

    When voiced_ratio < 0.1 the pitch tracker found almost no voiced frames
    (music / laugh-track dominated audio, ~13% of MUStARD clips), so instead
    of noisy numbers we emit a warning telling the agent to rely on the
    transcript and context. *span* is accepted for forward compatibility but
    currently ignored (clips are only ~3-5s long).
    """
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"

    def bucket(value: float, low: float, high: float) -> str:
        # Inlined LOW/MID/HIGH classifier (same thresholds/semantics as the
        # module-level _bucket helper).
        if value < low:
            return "LOW"
        return "HIGH" if value > high else "MID"

    duration = prosody.get("duration_s", 0.0)
    voiced_ratio = prosody.get("voiced_ratio", 0.0)

    # Unreliable pitch tracking: bail out with a warning instead of noise.
    if voiced_ratio < 0.1:
        return (
            f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n"
            f"  [WARNING] only {voiced_ratio:.0%} of frames have detected pitch β€” "
            f"the audio may be dominated by music, laughter, or background noise. "
            f"Prosody features for this clip are unreliable; rely primarily on "
            f"the transcript and conversational context."
        )

    pitch_mean = prosody.get("pitch_mean_hz", 0.0)
    pitch_var = prosody.get("pitch_var_hz", 0.0)
    energy_mean = prosody.get("energy_mean", 0.0)
    energy_var = prosody.get("energy_var", 0.0)
    pre_pause_ms = prosody.get("pre_pause_ms", 0)
    pauses = prosody.get("pauses", []) or []

    # At most five pauses, rendered as "start -> end (duration ms)" rows.
    rendered = [
        f"    {start:.2f}s -> {end:.2f}s  ({(end - start) * 1000:.0f}ms)"
        for start, end in pauses[:5]
    ]
    if rendered:
        pause_block = "  Internal pauses >150ms:\n" + "\n".join(rendered)
    else:
        pause_block = "  Internal pauses >150ms: none"

    return (
        f"Prosody features for clip {clip_id} (duration {duration:.2f}s):\n"
        f"  Pitch:    mean={pitch_mean:.0f} Hz, variability={pitch_var:.0f} Hz "
        f"[{bucket(pitch_var, 25.0, 45.0)}]\n"
        f"  Energy:   mean={energy_mean:.4f}, variability={energy_var:.4f} "
        f"[{bucket(energy_var, 0.02, 0.05)}]\n"
        f"  Voiced:   {voiced_ratio:.2f} of frames have detected pitch "
        f"[{bucket(voiced_ratio, 0.45, 0.75)}]\n"
        f"  Pre-utterance silence: {pre_pause_ms} ms\n"
        f"{pause_block}"
    )


def render_pitch_contour(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Summarise the cached coarse pitch contour as a sparkline string.

    Reports the Hz range, an overall rising/falling/flat trend (with a 5 Hz
    dead-band between the first and last sample), and an 8-level sparkline
    of the contour shape. *span* is accepted for forward compatibility but
    currently ignored.
    """
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"
    samples = prosody.get("pitch_contour_hz", []) or []
    if not samples:
        return f"Pitch contour for clip {clip_id}: <unvoiced or silent>"

    glyphs = "β–β–‚β–ƒβ–„β–…β–†β–‡β–ˆ"  # 8 sparkline levels, lowest to highest
    floor = min(samples)
    ceiling = max(samples)
    span_hz = max(ceiling - floor, 1.0)  # guard against a perfectly flat contour

    cells = []
    for hz in samples:
        level = int((hz - floor) / span_hz * 7)
        cells.append(glyphs[level if level < 7 else 7])
    spark = "".join(cells)

    delta = samples[-1] - samples[0]
    if delta > 5:
        trend = "rising"
    elif delta < -5:
        trend = "falling"
    else:
        trend = "flat"

    return (
        f"Pitch contour for clip {clip_id} ({len(samples)} samples):\n"
        f"  range: {floor:.0f} Hz -> {ceiling:.0f} Hz, overall trend: {trend}\n"
        f"  shape: {spark}"
    )