# subtext-arena / server / audio_tools.py
# (uploaded via huggingface_hub, revision 225e725)
"""Audio-tool wrappers for the Subtext Arena environment.
These functions render the cached prosody features and ground-truth transcript
into the human-readable text that the agent sees when it calls a tool.
Why text-formatted: Subtext Arena trains a TEXT policy that orchestrates audio
tools (mirrors the AudioToolAgent / Path A architecture). The trained model
never sees raw audio β€” it reads these descriptions and reasons over them.
Inputs:
- sarcasm_data: dict[clip_id -> {utterance, speaker, context, sarcasm}]
(loaded from MUStARD/data/sarcasm_data.json)
- prosody_cache: dict[clip_id -> prosody json] keyed by clip id
(loaded from data/prosody_cache/utterances/<id>.json)
The "span" argument on prosody / pitch tools is optional. Clips are short
(~3-5s), so for the hackathon we always render features over the whole clip;
span is accepted for forward compatibility but currently ignored.
"""
from __future__ import annotations
from typing import Any, Dict, Optional
def _bucket(value: float, low: float, high: float) -> str:
if value < low:
return "LOW"
if value > high:
return "HIGH"
return "MID"
def render_transcript(
    clip_id: str,
    sarcasm_data: Dict[str, dict],
) -> str:
    """Return the literal transcript + preceding conversational context.

    Format:
        Conversational context (lines spoken just before):
         [LEONARD] I never would have identified the fingerprints of string theory ...
         [SHELDON] My apologies. What's your plan?
        Target utterance (the line you must classify):
         [SHELDON] It's just a privilege to watch your mind at work.

    Args:
        clip_id: MUStARD clip identifier.
        sarcasm_data: mapping clip_id -> entry dict with "utterance",
            "speaker", "context" and "context_speakers" keys (keys may be
            absent; sensible fallbacks are used).

    Returns:
        Human-readable transcript text, or an "[error] ..." string when the
        clip id is unknown.
    """
    entry = sarcasm_data.get(clip_id)
    if entry is None:
        return f"[error] clip {clip_id} not in sarcasm_data"
    lines = []
    ctx_turns = entry.get("context", [])
    ctx_speakers = entry.get("context_speakers", [])
    if ctx_turns:
        lines.append("Conversational context (lines spoken just before):")
        # BUG FIX: the old zip(ctx_speakers, ctx_turns) silently dropped
        # context turns whenever "context_speakers" was missing or shorter
        # than "context" (a missing key dropped ALL context). Pad the speaker
        # side with "?" so every context line is always rendered.
        for idx, line in enumerate(ctx_turns):
            spk = ctx_speakers[idx] if idx < len(ctx_speakers) else "?"
            lines.append(f" [{spk}] {line}")
    spk = entry.get("speaker", "?")
    lines.append("Target utterance (the line you must classify):")
    lines.append(f" [{spk}] {entry.get('utterance', '')}")
    return "\n".join(lines)
def render_prosody_features(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Render pitch_var, energy, pauses, voiced_ratio as a text summary.

    Buckets pitch_var with thresholds tuned on MUStARD: LOW < 25Hz, HIGH > 45Hz
    (sarcastic delivery typically shows higher pitch variability).

    When the source audio is dominated by music or laugh tracks (~13% of
    MUStARD clips), pyin produces unreliable f0 estimates. We detect this via
    voiced_ratio < 0.1 and tell the agent the features are unreliable rather
    than feeding it noise.

    ``span`` is accepted for forward compatibility and currently ignored.
    """
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"

    dur = prosody.get("duration_s", 0.0)
    header = f"Prosody features for clip {clip_id} (duration {dur:.2f}s):\n"

    voiced = prosody.get("voiced_ratio", 0.0)
    # Barely any voiced frames -> f0 estimates are noise; warn instead of
    # reporting junk numbers.
    if voiced < 0.1:
        return header + (
            f" [WARNING] only {voiced:.0%} of frames have detected pitch β€” "
            "the audio may be dominated by music, laughter, or background noise. "
            "Prosody features for this clip are unreliable; rely primarily on "
            "the transcript and conversational context."
        )

    p_mean = prosody.get("pitch_mean_hz", 0.0)
    p_var = prosody.get("pitch_var_hz", 0.0)
    e_mean = prosody.get("energy_mean", 0.0)
    e_var = prosody.get("energy_var", 0.0)
    lead_ms = prosody.get("pre_pause_ms", 0)
    gaps = prosody.get("pauses", []) or []

    # Show at most the first five internal pauses.
    gap_lines = [
        f" {start:.2f}s -> {stop:.2f}s ({(stop - start) * 1000:.0f}ms)"
        for start, stop in gaps[:5]
    ]
    if gap_lines:
        gap_block = " Internal pauses >150ms:\n" + "\n".join(gap_lines)
    else:
        gap_block = " Internal pauses >150ms: none"

    return header + (
        f" Pitch: mean={p_mean:.0f} Hz, variability={p_var:.0f} Hz [{_bucket(p_var, 25.0, 45.0)}]\n"
        f" Energy: mean={e_mean:.4f}, variability={e_var:.4f} [{_bucket(e_var, 0.02, 0.05)}]\n"
        f" Voiced: {voiced:.2f} of frames have detected pitch [{_bucket(voiced, 0.45, 0.75)}]\n"
        f" Pre-utterance silence: {lead_ms} ms\n"
        f"{gap_block}"
    )
def render_pitch_contour(
    clip_id: str,
    prosody: Dict[str, Any],
    span: Optional[Dict[str, float]] = None,
) -> str:
    """Render the coarse pitch contour as a sparkline-like string.

    Reads ``pitch_contour_hz`` from the cached prosody dict and draws each
    sample as one of eight bar glyphs, plus the Hz range and an overall
    rising / falling / flat trend (first vs. last sample, 5 Hz dead band).
    ``span`` is accepted for forward compatibility and currently ignored.
    """
    if not prosody:
        return f"[error] no prosody features cached for clip {clip_id}"
    samples = prosody.get("pitch_contour_hz", []) or []
    if not samples:
        return f"Pitch contour for clip {clip_id}: <unvoiced or silent>"

    # Eight-level bar glyphs, lowest to highest.
    glyphs = "β–β–‚β–ƒβ–„β–…β–†β–‡β–ˆ"
    floor = min(samples)
    ceiling = max(samples)
    # Avoid division by zero on a perfectly flat contour.
    spread = max(ceiling - floor, 1.0)
    bars = []
    for hz in samples:
        bars.append(glyphs[min(7, int((hz - floor) / spread * 7))])

    first, last = samples[0], samples[-1]
    if last > first + 5:
        trend = "rising"
    elif last < first - 5:
        trend = "falling"
    else:
        trend = "flat"

    return (
        f"Pitch contour for clip {clip_id} ({len(samples)} samples):\n"
        f" range: {floor:.0f} Hz -> {ceiling:.0f} Hz, overall trend: {trend}\n"
        f" shape: {''.join(bars)}"
    )