Spaces:

aamrinder
/

subtext-arena

Sleeping

App Files Files Community

subtext-arena / train /curate_pivot_set.py

aamrinder

Upload folder using huggingface_hub

a008aa6 verified 12 days ago

raw

history blame contribute delete

14.1 kB

	"""Curate the Prosody-Pivot Set — 50 MUStARD clips where text-only Qwen
	fails AND the audio cues flip the answer.

	Why this set is critical:
	- Without it, the project is text RL with audio-flavored tool names: a
	skeptical judge can fairly say "you didn't prove the audio matters."
	- With it, the headline number "24/50 -> 41/50" on a held-out set isolates
	exactly what audio buys you. That delta is the project's defensibility.

	Pipeline (5 steps, ~2.5 hours wall clock):
	1. baseline — run text-only Qwen2.5-3B on all 690 utterances; record
	per-clip predicted label + confidence. (~40 min on T4.)
	2. filter — keep clips where (a) text-only is wrong AND (b) confidence
	is high (>0.6 wrong, i.e. confidently incorrect — these are
	the clips where prosody is THE deciding signal).
	3. ui — open a tiny Gradio UI; play each candidate clip; rate
	"is the audio cue audible? sarcasm/sincerity clear from
	prosody alone?" 1-5. Keep clips with score >=4 from at
	least 2 of 3 listeners. (~60 min for 3 listeners.)
	4. finalize — balance classes (25 sarcastic + 25 sincere), pick top 50,
	write to data/pivot_set.json.
	5. (optional) augment — synthetic TTS-augmented clips with XTTS-v2 to
	expand the train side; eval set stays REAL only.

	Usage:
	# On HF Jobs T4 (or any 16GB+ GPU with HF token in env)
	python train/curate_pivot_set.py baseline --out data/baseline.json
	python train/curate_pivot_set.py filter --baseline data/baseline.json --out data/candidates.json
	python train/curate_pivot_set.py ui --candidates data/candidates.json --out data/listener_ratings.json
	python train/curate_pivot_set.py finalize --candidates data/candidates.json \\
	--ratings data/listener_ratings.json --out data/pivot_set.json
	"""
	from __future__ import annotations

	import argparse
	import json
	import random
	import sys
	from pathlib import Path
	from typing import Dict, List


	DATA_ROOT = Path(__file__).resolve().parent.parent / "data"


	# ---------------------------------------------------------------------------
	# Step 1: baseline — run text-only Qwen2.5-3B on all 690 utterances
	# ---------------------------------------------------------------------------

	def cmd_baseline(args: argparse.Namespace) -> None:
	"""Run text-only baseline: ask Qwen2.5-3B-Instruct (no audio tools) to
	classify each utterance from transcript+context only. Save per-clip
	predicted label + confidence."""
	sarcasm_data = json.loads((DATA_ROOT / "sarcasm_data.json").read_text())

	print(f"[baseline] loading {args.model}")
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch

	tok = AutoTokenizer.from_pretrained(args.model)
	model = AutoModelForCausalLM.from_pretrained(
	args.model, torch_dtype=torch.bfloat16, device_map="auto",
	)
	model.eval()

	# Neutral prompt: avoid the word "sarcasm" before the model has answered,
	# list "sincere" before "sarcastic" to fight the prefix bias of Qwen2.5-3B.
	PROMPT = (
	"You will read a line of TV dialogue with its conversational context.\n"
	"Decide whether the speaker is being sincere (means what they say) "
	"or sarcastic (means the opposite of what they say).\n\n"
	"{ctx}\n\n"
	"Target line:\n[{spk}] {utt}\n\n"
	"Output exactly two lines, in this format:\n"
	"Label: sincere\n"
	"Confidence: 0.7\n\n"
	"Now classify the target line above.\n"
	"Output:\n"
	)

	out: Dict[str, Dict] = {}
	keys = list(sarcasm_data.keys())
	for i, clip_id in enumerate(keys):
	entry = sarcasm_data[clip_id]
	ctx_lines = [
	f"[{s}] {l}" for s, l in zip(
	entry.get("context_speakers", []) or [], entry.get("context", []) or []
	)
	]
	prompt = PROMPT.format(
	ctx="Context:\n" + ("\n".join(ctx_lines) if ctx_lines else "(no context)"),
	spk=entry.get("speaker", "?"),
	utt=entry.get("utterance", ""),
	)
	ids = tok(prompt, return_tensors="pt").to(model.device)
	with torch.no_grad():
	o = model.generate(
	**ids,
	max_new_tokens=24,
	do_sample=False,
	pad_token_id=tok.eos_token_id,
	)
	text = tok.decode(o[0][ids.input_ids.shape[1]:], skip_special_tokens=True).strip()

	# Parse "Label: X\nConfidence: Y" format
	pred_label = "sincere" # default to sincere if parsing fails (less biased)
	conf = 0.5
	for line in text.splitlines():
	stripped = line.strip().lower()
	if stripped.startswith("label:"):
	value = stripped[len("label:"):].strip()
	if "sarc" in value:
	pred_label = "sarcastic"
	elif "sinc" in value:
	pred_label = "sincere"
	elif stripped.startswith("confidence:"):
	value = stripped[len("confidence:"):].strip()
	try:
	v = float(value)
	if 0.0 <= v <= 1.0:
	conf = v
	except ValueError:
	pass
	out[clip_id] = {
	"predicted": pred_label,
	"confidence": conf,
	"gold": "sarcastic" if entry.get("sarcasm") else "sincere",
	"raw_output": text,
	}
	if (i + 1) % 50 == 0:
	print(f" [{i+1}/{len(keys)}] last: {clip_id} -> {pred_label} (conf={conf:.2f})", flush=True)

	Path(args.out).parent.mkdir(parents=True, exist_ok=True)
	Path(args.out).write_text(json.dumps(out, indent=2))
	correct = sum(1 for v in out.values() if v["predicted"] == v["gold"])
	print(f"[baseline] saved to {args.out}. Text-only accuracy: {correct}/{len(out)} = {correct/len(out):.2%}")

	# Auto-upload to HF Space so the next step (filter) can pull it.
	# Disabled if NO_HF_UPLOAD=1 or no token available.
	import os
	if not os.environ.get("NO_HF_UPLOAD") and os.environ.get("HF_TOKEN"):
	try:
	from huggingface_hub import HfApi
	repo_id = os.environ.get("HF_REPO_ID", "aamrinder/subtext-arena")
	HfApi().upload_file(
	path_or_fileobj=str(args.out),
	path_in_repo="data/baseline.json",
	repo_id=repo_id,
	repo_type="space",
	token=os.environ["HF_TOKEN"],
	commit_message="add text-only Qwen baseline (curate_pivot_set baseline)",
	)
	print(f"[baseline] uploaded to {repo_id}/data/baseline.json")
	except Exception as e:
	print(f"[baseline] upload failed (saved locally only): {e}")


	# ---------------------------------------------------------------------------
	# Step 2: filter — confidently-wrong clips are the Pivot candidates
	# ---------------------------------------------------------------------------

	def cmd_filter(args: argparse.Namespace) -> None:
	"""Keep clips where text-only is WRONG AND confident (>=0.6).

	Confident-wrong is the strict version of "audio matters here." A correct
	text-only call doesn't tell us anything; a confidently-wrong text call
	means the prosody almost certainly carries the deciding signal.
	"""
	baseline = json.loads(Path(args.baseline).read_text())
	candidates: List[Dict] = []
	for clip_id, row in baseline.items():
	if row["predicted"] != row["gold"] and float(row["confidence"]) >= args.min_conf:
	candidates.append({"clip_id": clip_id, **row})

	n_sarc = sum(1 for c in candidates if c["gold"] == "sarcastic")
	n_sinc = sum(1 for c in candidates if c["gold"] == "sincere")
	print(f"[filter] {len(candidates)} confidently-wrong clips (sarc={n_sarc}, sinc={n_sinc})")

	Path(args.out).parent.mkdir(parents=True, exist_ok=True)
	Path(args.out).write_text(json.dumps({"candidates": candidates, "min_conf": args.min_conf}, indent=2))
	print(f"[filter] candidates -> {args.out}")


	# ---------------------------------------------------------------------------
	# Step 3: ui — Gradio listener UI (run on a machine with audio playback)
	# ---------------------------------------------------------------------------

	GRADIO_UI_SOURCE = '''
	"""Run: pip install gradio && python <this script> ui --candidates ... --out ..."""
	import gradio as gr, json, sys, random
	from pathlib import Path

	CAND_PATH = sys.argv[sys.argv.index("--candidates")+1]
	OUT_PATH = sys.argv[sys.argv.index("--out")+1]
	WAV_DIR = Path(__file__).resolve().parent.parent / "data" / "audio_cache" / "utterances"

	cands = json.loads(Path(CAND_PATH).read_text())["candidates"]
	random.shuffle(cands)
	ratings = {}
	if Path(OUT_PATH).exists():
	ratings = json.loads(Path(OUT_PATH).read_text())

	def render(idx):
	if idx >= len(cands):
	return None, "All done!", "", str(idx)
	c = cands[idx]
	wav = str(WAV_DIR / f"{c['clip_id']}.wav")
	if not Path(wav).exists():
	wav = None
	info = f"Clip {c['clip_id']} \| gold={c['gold']} \| text-only said {c['predicted']} (conf={c['confidence']:.2f})"
	return wav, info, ratings.get(c["clip_id"], ""), str(idx)

	def save(idx_str, score, listener):
	idx = int(idx_str)
	if idx >= len(cands): return "saved (end)"
	cid = cands[idx]["clip_id"]
	ratings.setdefault(cid, {})[listener] = int(score)
	Path(OUT_PATH).write_text(json.dumps(ratings, indent=2))
	return f"saved {cid} <- {listener}={score}"

	with gr.Blocks() as app:
	idx_state = gr.State(0)
	listener = gr.Textbox(label="Listener name (a/b/c)", value="a")
	audio = gr.Audio(label="clip", type="filepath")
	info = gr.Markdown()
	rating_in = gr.Slider(1, 5, step=1, label="Audio cue clearly indicates the gold label? (1=no 5=yes)")
	save_btn = gr.Button("save + next")
	status = gr.Markdown()

	def go(i, l, s):
	save(str(i), s, l)
	i = int(i) + 1
	wav, info_t, prev, _ = render(i)
	return wav, info_t, i, f"saved, next idx={i}"
	save_btn.click(go, [idx_state, listener, rating_in], [audio, info, idx_state, status])

	audio_init, info_init, _, _ = render(0)
	audio.value = audio_init; info.value = info_init

	app.launch(server_name="0.0.0.0", server_port=7860, share=False)
	'''


	def cmd_ui(args: argparse.Namespace) -> None:
	"""Write the Gradio listener UI to /tmp and exec it."""
	ui_path = Path("/tmp/_pivot_ui.py")
	ui_path.write_text(GRADIO_UI_SOURCE)
	import subprocess
	subprocess.run(
	[sys.executable, str(ui_path),
	"ui", "--candidates", args.candidates, "--out", args.out],
	check=False,
	)


	# ---------------------------------------------------------------------------
	# Step 4: finalize — pick balanced top 50
	# ---------------------------------------------------------------------------

	def cmd_finalize(args: argparse.Namespace) -> None:
	cands = json.loads(Path(args.candidates).read_text())["candidates"]
	if Path(args.ratings).exists():
	ratings = json.loads(Path(args.ratings).read_text())
	else:
	ratings = {}

	def score(clip_id: str) -> float:
	listener_scores = ratings.get(clip_id, {})
	if not listener_scores:
	return 0.0
	return sum(int(v) for v in listener_scores.values()) / max(1, len(listener_scores))

	# Filter candidates to ones with >=2 listener-yes (score >=4)
	if ratings:
	cands = [c for c in cands if score(c["clip_id"]) >= 4.0]

	sarc = sorted(
	[c for c in cands if c["gold"] == "sarcastic"],
	key=lambda c: -score(c["clip_id"]) if ratings else -float(c["confidence"]),
	)[: args.per_class]
	sinc = sorted(
	[c for c in cands if c["gold"] == "sincere"],
	key=lambda c: -score(c["clip_id"]) if ratings else -float(c["confidence"]),
	)[: args.per_class]

	pivot_clip_ids = [c["clip_id"] for c in sarc + sinc]
	random.Random(0).shuffle(pivot_clip_ids)

	payload = {
	"clip_ids": pivot_clip_ids,
	"n_sarcastic": len(sarc),
	"n_sincere": len(sinc),
	"from_ratings": bool(ratings),
	"method": (
	"confident-text-wrong + listener-rated >=4 from >=2 listeners"
	if ratings else
	"confident-text-wrong (no listener pass yet)"
	),
	}
	Path(args.out).write_text(json.dumps(payload, indent=2))
	print(f"[finalize] wrote {len(pivot_clip_ids)} pivot clips to {args.out} "
	f"({len(sarc)} sarcastic, {len(sinc)} sincere)")


	# ---------------------------------------------------------------------------
	# Wiring
	# ---------------------------------------------------------------------------

	def main():
	parser = argparse.ArgumentParser()
	sub = parser.add_subparsers(dest="cmd", required=True)

	p = sub.add_parser("baseline")
	p.add_argument("--model", default="Qwen/Qwen2.5-3B-Instruct")
	p.add_argument("--out", default=str(DATA_ROOT / "baseline.json"))
	p.set_defaults(func=cmd_baseline)

	p = sub.add_parser("filter")
	p.add_argument("--baseline", default=str(DATA_ROOT / "baseline.json"))
	p.add_argument("--out", default=str(DATA_ROOT / "candidates.json"))
	p.add_argument("--min-conf", type=float, default=0.6)
	p.set_defaults(func=cmd_filter)

	p = sub.add_parser("ui")
	p.add_argument("--candidates", default=str(DATA_ROOT / "candidates.json"))
	p.add_argument("--out", default=str(DATA_ROOT / "listener_ratings.json"))
	p.set_defaults(func=cmd_ui)

	p = sub.add_parser("finalize")
	p.add_argument("--candidates", default=str(DATA_ROOT / "candidates.json"))
	p.add_argument("--ratings", default=str(DATA_ROOT / "listener_ratings.json"))
	p.add_argument("--out", default=str(DATA_ROOT / "pivot_set.json"))
	p.add_argument("--per-class", type=int, default=25)
	p.set_defaults(func=cmd_finalize)

	args = parser.parse_args()
	args.func(args)


	if __name__ == "__main__":
	main()