Spaces:

aamrinder
/

subtext-arena

Sleeping

File size: 14,114 Bytes

"""Curate the Prosody-Pivot Set — 50 MUStARD clips where text-only Qwen
fails AND the audio cues flip the answer.

Why this set is critical:
  - Without it, the project is text RL with audio-flavored tool names: a
    skeptical judge can fairly say "you didn't prove the audio matters."
  - With it, the headline number "24/50 -> 41/50" on a held-out set isolates
    exactly what audio buys you. That delta is the project's defensibility.

Pipeline (5 steps, ~2.5 hours wall clock):
  1. baseline   — run text-only Qwen2.5-3B on all 690 utterances; record
                  per-clip predicted label + confidence. (~40 min on T4.)
  2. filter     — keep clips where (a) text-only is wrong AND (b) confidence
                  is high (>0.6 wrong, i.e. confidently incorrect — these are
                  the clips where prosody is THE deciding signal).
  3. ui         — open a tiny Gradio UI; play each candidate clip; rate
                  "is the audio cue audible? sarcasm/sincerity clear from
                  prosody alone?" 1-5. Keep clips with score >=4 from at
                  least 2 of 3 listeners. (~60 min for 3 listeners.)
  4. finalize   — balance classes (25 sarcastic + 25 sincere), pick top 50,
                  write to data/pivot_set.json.
  5. (optional) augment — synthetic TTS-augmented clips with XTTS-v2 to
                  expand the train side; eval set stays REAL only.

Usage:
    # On HF Jobs T4 (or any 16GB+ GPU with HF token in env)
    python train/curate_pivot_set.py baseline --out data/baseline.json
    python train/curate_pivot_set.py filter --baseline data/baseline.json --out data/candidates.json
    python train/curate_pivot_set.py ui --candidates data/candidates.json --out data/listener_ratings.json
    python train/curate_pivot_set.py finalize --candidates data/candidates.json \\
        --ratings data/listener_ratings.json --out data/pivot_set.json
"""
from __future__ import annotations

import argparse
import json
import random
import sys
from pathlib import Path
from typing import Dict, List


DATA_ROOT = Path(__file__).resolve().parent.parent / "data"


# ---------------------------------------------------------------------------
# Step 1: baseline — run text-only Qwen2.5-3B on all 690 utterances
# ---------------------------------------------------------------------------

def cmd_baseline(args: argparse.Namespace) -> None:
    """Run text-only baseline: ask Qwen2.5-3B-Instruct (no audio tools) to
    classify each utterance from transcript+context only. Save per-clip
    predicted label + confidence."""
    sarcasm_data = json.loads((DATA_ROOT / "sarcasm_data.json").read_text())

    print(f"[baseline] loading {args.model}")
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    tok = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForCausalLM.from_pretrained(
        args.model, torch_dtype=torch.bfloat16, device_map="auto",
    )
    model.eval()

    # Neutral prompt: avoid the word "sarcasm" before the model has answered,
    # list "sincere" before "sarcastic" to fight the prefix bias of Qwen2.5-3B.
    PROMPT = (
        "You will read a line of TV dialogue with its conversational context.\n"
        "Decide whether the speaker is being sincere (means what they say) "
        "or sarcastic (means the opposite of what they say).\n\n"
        "{ctx}\n\n"
        "Target line:\n[{spk}] {utt}\n\n"
        "Output exactly two lines, in this format:\n"
        "Label: sincere\n"
        "Confidence: 0.7\n\n"
        "Now classify the target line above.\n"
        "Output:\n"
    )

    out: Dict[str, Dict] = {}
    keys = list(sarcasm_data.keys())
    for i, clip_id in enumerate(keys):
        entry = sarcasm_data[clip_id]
        ctx_lines = [
            f"[{s}] {l}" for s, l in zip(
                entry.get("context_speakers", []) or [], entry.get("context", []) or []
            )
        ]
        prompt = PROMPT.format(
            ctx="Context:\n" + ("\n".join(ctx_lines) if ctx_lines else "(no context)"),
            spk=entry.get("speaker", "?"),
            utt=entry.get("utterance", ""),
        )
        ids = tok(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            o = model.generate(
                **ids,
                max_new_tokens=24,
                do_sample=False,
                pad_token_id=tok.eos_token_id,
            )
        text = tok.decode(o[0][ids.input_ids.shape[1]:], skip_special_tokens=True).strip()

        # Parse "Label: X\nConfidence: Y" format
        pred_label = "sincere"   # default to sincere if parsing fails (less biased)
        conf = 0.5
        for line in text.splitlines():
            stripped = line.strip().lower()
            if stripped.startswith("label:"):
                value = stripped[len("label:"):].strip()
                if "sarc" in value:
                    pred_label = "sarcastic"
                elif "sinc" in value:
                    pred_label = "sincere"
            elif stripped.startswith("confidence:"):
                value = stripped[len("confidence:"):].strip()
                try:
                    v = float(value)
                    if 0.0 <= v <= 1.0:
                        conf = v
                except ValueError:
                    pass
        out[clip_id] = {
            "predicted": pred_label,
            "confidence": conf,
            "gold": "sarcastic" if entry.get("sarcasm") else "sincere",
            "raw_output": text,
        }
        if (i + 1) % 50 == 0:
            print(f"  [{i+1}/{len(keys)}] last: {clip_id} -> {pred_label} (conf={conf:.2f})", flush=True)

    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    Path(args.out).write_text(json.dumps(out, indent=2))
    correct = sum(1 for v in out.values() if v["predicted"] == v["gold"])
    print(f"[baseline] saved to {args.out}. Text-only accuracy: {correct}/{len(out)} = {correct/len(out):.2%}")

    # Auto-upload to HF Space so the next step (filter) can pull it.
    # Disabled if NO_HF_UPLOAD=1 or no token available.
    import os
    if not os.environ.get("NO_HF_UPLOAD") and os.environ.get("HF_TOKEN"):
        try:
            from huggingface_hub import HfApi
            repo_id = os.environ.get("HF_REPO_ID", "aamrinder/subtext-arena")
            HfApi().upload_file(
                path_or_fileobj=str(args.out),
                path_in_repo="data/baseline.json",
                repo_id=repo_id,
                repo_type="space",
                token=os.environ["HF_TOKEN"],
                commit_message="add text-only Qwen baseline (curate_pivot_set baseline)",
            )
            print(f"[baseline] uploaded to {repo_id}/data/baseline.json")
        except Exception as e:
            print(f"[baseline] upload failed (saved locally only): {e}")


# ---------------------------------------------------------------------------
# Step 2: filter — confidently-wrong clips are the Pivot candidates
# ---------------------------------------------------------------------------

def cmd_filter(args: argparse.Namespace) -> None:
    """Keep clips where text-only is WRONG AND confident (>=0.6).

    Confident-wrong is the strict version of "audio matters here." A correct
    text-only call doesn't tell us anything; a confidently-wrong text call
    means the prosody almost certainly carries the deciding signal.
    """
    baseline = json.loads(Path(args.baseline).read_text())
    candidates: List[Dict] = []
    for clip_id, row in baseline.items():
        if row["predicted"] != row["gold"] and float(row["confidence"]) >= args.min_conf:
            candidates.append({"clip_id": clip_id, **row})

    n_sarc = sum(1 for c in candidates if c["gold"] == "sarcastic")
    n_sinc = sum(1 for c in candidates if c["gold"] == "sincere")
    print(f"[filter] {len(candidates)} confidently-wrong clips (sarc={n_sarc}, sinc={n_sinc})")

    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    Path(args.out).write_text(json.dumps({"candidates": candidates, "min_conf": args.min_conf}, indent=2))
    print(f"[filter] candidates -> {args.out}")


# ---------------------------------------------------------------------------
# Step 3: ui — Gradio listener UI (run on a machine with audio playback)
# ---------------------------------------------------------------------------

GRADIO_UI_SOURCE = '''
"""Run: pip install gradio && python <this script> ui --candidates ... --out ..."""
import gradio as gr, json, sys, random
from pathlib import Path

CAND_PATH = sys.argv[sys.argv.index("--candidates")+1]
OUT_PATH  = sys.argv[sys.argv.index("--out")+1]
WAV_DIR   = Path(__file__).resolve().parent.parent / "data" / "audio_cache" / "utterances"

cands = json.loads(Path(CAND_PATH).read_text())["candidates"]
random.shuffle(cands)
ratings = {}
if Path(OUT_PATH).exists():
    ratings = json.loads(Path(OUT_PATH).read_text())

def render(idx):
    if idx >= len(cands):
        return None, "All done!", "", str(idx)
    c = cands[idx]
    wav = str(WAV_DIR / f"{c['clip_id']}.wav")
    if not Path(wav).exists():
        wav = None
    info = f"Clip {c['clip_id']} | gold={c['gold']} | text-only said {c['predicted']} (conf={c['confidence']:.2f})"
    return wav, info, ratings.get(c["clip_id"], ""), str(idx)

def save(idx_str, score, listener):
    idx = int(idx_str)
    if idx >= len(cands): return "saved (end)"
    cid = cands[idx]["clip_id"]
    ratings.setdefault(cid, {})[listener] = int(score)
    Path(OUT_PATH).write_text(json.dumps(ratings, indent=2))
    return f"saved {cid} <- {listener}={score}"

with gr.Blocks() as app:
    idx_state = gr.State(0)
    listener  = gr.Textbox(label="Listener name (a/b/c)", value="a")
    audio = gr.Audio(label="clip", type="filepath")
    info = gr.Markdown()
    rating_in = gr.Slider(1, 5, step=1, label="Audio cue clearly indicates the gold label? (1=no 5=yes)")
    save_btn = gr.Button("save + next")
    status = gr.Markdown()

    def go(i, l, s):
        save(str(i), s, l)
        i = int(i) + 1
        wav, info_t, prev, _ = render(i)
        return wav, info_t, i, f"saved, next idx={i}"
    save_btn.click(go, [idx_state, listener, rating_in], [audio, info, idx_state, status])

    audio_init, info_init, _, _ = render(0)
    audio.value = audio_init; info.value = info_init

app.launch(server_name="0.0.0.0", server_port=7860, share=False)
'''


def cmd_ui(args: argparse.Namespace) -> None:
    """Write the Gradio listener UI to /tmp and exec it."""
    ui_path = Path("/tmp/_pivot_ui.py")
    ui_path.write_text(GRADIO_UI_SOURCE)
    import subprocess
    subprocess.run(
        [sys.executable, str(ui_path),
         "ui", "--candidates", args.candidates, "--out", args.out],
        check=False,
    )


# ---------------------------------------------------------------------------
# Step 4: finalize — pick balanced top 50
# ---------------------------------------------------------------------------

def cmd_finalize(args: argparse.Namespace) -> None:
    cands = json.loads(Path(args.candidates).read_text())["candidates"]
    if Path(args.ratings).exists():
        ratings = json.loads(Path(args.ratings).read_text())
    else:
        ratings = {}

    def score(clip_id: str) -> float:
        listener_scores = ratings.get(clip_id, {})
        if not listener_scores:
            return 0.0
        return sum(int(v) for v in listener_scores.values()) / max(1, len(listener_scores))

    # Filter candidates to ones with >=2 listener-yes (score >=4)
    if ratings:
        cands = [c for c in cands if score(c["clip_id"]) >= 4.0]

    sarc = sorted(
        [c for c in cands if c["gold"] == "sarcastic"],
        key=lambda c: -score(c["clip_id"]) if ratings else -float(c["confidence"]),
    )[: args.per_class]
    sinc = sorted(
        [c for c in cands if c["gold"] == "sincere"],
        key=lambda c: -score(c["clip_id"]) if ratings else -float(c["confidence"]),
    )[: args.per_class]

    pivot_clip_ids = [c["clip_id"] for c in sarc + sinc]
    random.Random(0).shuffle(pivot_clip_ids)

    payload = {
        "clip_ids": pivot_clip_ids,
        "n_sarcastic": len(sarc),
        "n_sincere": len(sinc),
        "from_ratings": bool(ratings),
        "method": (
            "confident-text-wrong + listener-rated >=4 from >=2 listeners"
            if ratings else
            "confident-text-wrong (no listener pass yet)"
        ),
    }
    Path(args.out).write_text(json.dumps(payload, indent=2))
    print(f"[finalize] wrote {len(pivot_clip_ids)} pivot clips to {args.out} "
          f"({len(sarc)} sarcastic, {len(sinc)} sincere)")


# ---------------------------------------------------------------------------
# Wiring
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)

    p = sub.add_parser("baseline")
    p.add_argument("--model", default="Qwen/Qwen2.5-3B-Instruct")
    p.add_argument("--out", default=str(DATA_ROOT / "baseline.json"))
    p.set_defaults(func=cmd_baseline)

    p = sub.add_parser("filter")
    p.add_argument("--baseline", default=str(DATA_ROOT / "baseline.json"))
    p.add_argument("--out", default=str(DATA_ROOT / "candidates.json"))
    p.add_argument("--min-conf", type=float, default=0.6)
    p.set_defaults(func=cmd_filter)

    p = sub.add_parser("ui")
    p.add_argument("--candidates", default=str(DATA_ROOT / "candidates.json"))
    p.add_argument("--out", default=str(DATA_ROOT / "listener_ratings.json"))
    p.set_defaults(func=cmd_ui)

    p = sub.add_parser("finalize")
    p.add_argument("--candidates", default=str(DATA_ROOT / "candidates.json"))
    p.add_argument("--ratings", default=str(DATA_ROOT / "listener_ratings.json"))
    p.add_argument("--out", default=str(DATA_ROOT / "pivot_set.json"))
    p.add_argument("--per-class", type=int, default=25)
    p.set_defaults(func=cmd_finalize)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()