Spaces:

aamrinder
/

subtext-arena

Sleeping

App Files Files Community

aamrinder committed on 29 days ago

Commit

37818f1

verified ·

1 Parent(s): 7f60dea

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +2 -0
pyproject.toml +10 -1
train/side_by_side.py +206 -0

README.md CHANGED Viewed

@@ -185,6 +185,8 @@ GRPOTrainer(
 **Plots** (`train/plot_reward_decomp.py`): generates the 3-line reward decomposition chart (correctness / reasoning_length / format) from the training log. Saves to `docs/plots/reward_decomposition.png`.
 ---
 ## Results

 **Plots** (`train/plot_reward_decomp.py`): generates the 3-line reward decomposition chart (correctness / reasoning_length / format) from the training log. Saves to `docs/plots/reward_decomposition.png`.
+**Side-by-side demo** (`train/side_by_side.py`): runs both the base Qwen2.5-3B and the trained checkpoint on hand-picked Pivot clips, dumps an HTML page with their reasoning traces side-by-side. This is the demo artifact judges read.
 ---
 ## Results

pyproject.toml CHANGED Viewed

@@ -36,4 +36,13 @@ server = "subtext_arena.server.app:main"
 [tool.setuptools]
 include-package-data = true
 packages = ["subtext_arena", "subtext_arena.server", "subtext_arena.train"]
-package-dir = { "subtext_arena" = ".", "subtext_arena.server" = "server", "subtext_arena.train" = "train" }

 [tool.setuptools]
 include-package-data = true
 packages = ["subtext_arena", "subtext_arena.server", "subtext_arena.train"]
+package-dir = { "subtext_arena" = ".", "subtext_arena.server" = "server", "subtext_arena.train" = "train" }
+[tool.setuptools.package-data]
+subtext_arena = [
+    "data/sarcasm_data.json",
+    "data/pivot_set.json",
+    "data/prosody_cache/utterances/*.json",
+    "data/prosody_cache/context/*.json",
+    "openenv.yaml",
+]

train/side_by_side.py ADDED Viewed

	@@ -0,0 +1,206 @@

+"""Generate side-by-side baseline-vs-trained reasoning for hand-picked clips.
+This is the demo artifact: judges look at it and read what the model learned.
+Output: an HTML table that can be embedded in the README + a JSON dump.
+For each clip:
+  - Run the BASE Qwen2.5-3B (no LoRA) and dump <think> + <final>
+  - Run the TRAINED checkpoint and dump <think> + <final>
+  - Show gold label, both predictions, and which (if either) was right
+Usage:
+    python -m subtext_arena.train.side_by_side \\
+        --trained-checkpoint ./checkpoints/run1 \\
+        --clip-ids 1_70 2_190 1_8826 2_236 2_300 \\
+        --out docs/plots/side_by_side.html
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import List
+# Prefer the installed package layout (subtext_arena.server / subtext_arena.train).
+# If that import fails, fall back to importing the top-level `server` and `train`
+# packages after putting the directory two levels above this file on sys.path.
+try:
+    from subtext_arena.server.scenarios import load_scenarios
+    from subtext_arena.train.train_grpo import (
+        SYSTEM_PROMPT, build_full_observation, parse_final, reward_decomposition,
+    )
+except ImportError:
+    # ROOT is the parent of the directory that contains this file.
+    ROOT = Path(__file__).resolve().parent.parent
+    if str(ROOT) not in sys.path:
+        sys.path.insert(0, str(ROOT))
+    from server.scenarios import load_scenarios  # type: ignore[no-redef]
+    from train.train_grpo import (  # type: ignore[no-redef]
+        SYSTEM_PROMPT, build_full_observation, parse_final, reward_decomposition,
+    )
+# Page skeleton for the report. It is filled with str.format() and has exactly one
+# replacement field, {clips_html}; every literal CSS brace is doubled ({{ }}) so
+# that format() emits it as a single brace instead of treating it as a field.
+HTML_TEMPLATE = """<!DOCTYPE html>
+<html><head>
+<meta charset="utf-8">
+<title>Subtext Arena — baseline vs trained, hand-picked clips</title>
+<style>
+  body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+         max-width: 1200px; margin: 40px auto; padding: 0 20px; color: #222; }}
+  h1 {{ font-size: 24px; }}
+  .clip {{ border: 1px solid #ddd; border-radius: 8px; padding: 16px;
+           margin-bottom: 24px; background: #fafafa; }}
+  .clip h2 {{ font-size: 18px; margin-top: 0; }}
+  .gold-sarcastic {{ color: #b3274d; }}
+  .gold-sincere   {{ color: #1d7a4a; }}
+  .columns {{ display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }}
+  .col {{ padding: 12px; border-radius: 6px; }}
+  .baseline {{ background: #fff5f5; border: 1px solid #f8c4c4; }}
+  .trained  {{ background: #effaf3; border: 1px solid #b6e2c1; }}
+  .col h3 {{ margin-top: 0; font-size: 14px; text-transform: uppercase;
+             letter-spacing: 0.05em; color: #666; }}
+  .verdict-correct {{ color: #1d7a4a; font-weight: bold; }}
+  .verdict-wrong   {{ color: #b3274d; font-weight: bold; }}
+  pre {{ white-space: pre-wrap; word-wrap: break-word; font-size: 13px;
+         line-height: 1.4; background: white; padding: 8px; border-radius: 4px;
+         border: 1px solid #eee; }}
+  .transcript {{ font-style: italic; color: #555; margin-bottom: 12px; }}
+</style>
+</head><body>
+<h1>Subtext Arena — baseline vs trained</h1>
+<p>Same prompt fed to the base Qwen2.5-3B-Instruct (left) and to the GRPO-trained
+checkpoint (right). Each shows the model's reasoning trace and final answer.</p>
+{clips_html}
+</body></html>
+"""
+# Per-clip HTML fragment, filled with str.format(). Fields: clip_id, speaker, gold
+# (also used as the suffix of the "gold-..." CSS class), utterance, and for each of
+# the baseline/trained columns a *_verdict (suffix of the "verdict-..." CSS class),
+# *_label, *_conf and *_text. The *_conf fields use the ":.2f" format spec, so the
+# values passed for them must be numbers. Values are inserted verbatim: any HTML
+# escaping has to be done by the caller.
+CLIP_BLOCK = """<div class="clip">
+  <h2>Clip {clip_id} — speaker: {speaker}, gold: <span class="gold-{gold}">{gold}</span></h2>
+  <div class="transcript">"{utterance}"</div>
+  <div class="columns">
+    <div class="col baseline">
+      <h3>Baseline (no training)</h3>
+      <p>predicted: <span class="verdict-{baseline_verdict}">{baseline_label}</span> (conf {baseline_conf:.2f})</p>
+      <pre>{baseline_text}</pre>
+    </div>
+    <div class="col trained">
+      <h3>Trained checkpoint</h3>
+      <p>predicted: <span class="verdict-{trained_verdict}">{trained_label}</span> (conf {trained_conf:.2f})</p>
+      <pre>{trained_text}</pre>
+    </div>
+  </div>
+</div>
+"""
+def generate_completion(model, tokenizer, prompt_user_msg, max_tokens=600, temperature=0.7):
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": prompt_user_msg},
+    ]
+    inputs = tokenizer.apply_chat_template(
+        messages, return_tensors="pt", add_generation_prompt=True
+    ).to(model.device)
+    out = model.generate(
+        inputs, max_new_tokens=max_tokens, do_sample=True,
+        temperature=temperature, pad_token_id=tokenizer.eos_token_id,
+    )
+    return tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--trained-checkpoint", required=True)
+    parser.add_argument("--base-model", default="unsloth/Qwen2.5-3B-Instruct")
+    parser.add_argument("--clip-ids", nargs="+", required=True,
+                        help="Hand-picked clip IDs for the side-by-side")
+    parser.add_argument("--out", required=True, help="Output HTML path")
+    parser.add_argument("--out-json", default=None, help="Optional JSON dump")
+    args = parser.parse_args()
+    scenarios = load_scenarios()
+    from unsloth import FastLanguageModel
+    rows = []
+    # Run baseline (no LoRA)
+    print(f"[load] base model: {args.base_model}")
+    base_model, base_tok = FastLanguageModel.from_pretrained(
+        model_name=args.base_model, max_seq_length=4096, load_in_4bit=True,
+    )
+    FastLanguageModel.for_inference(base_model)
+    for clip_id in args.clip_ids:
+        gold = "sarcastic" if scenarios[clip_id]["sarcasm"] else "sincere"
+        prompt_user = build_full_observation(clip_id, scenarios)
+        text = generate_completion(base_model, base_tok, prompt_user)
+        d = reward_decomposition(text, gold)
+        rows.append({
+            "clip_id": clip_id,
+            "speaker": scenarios[clip_id]["speaker"],
+            "utterance": scenarios[clip_id]["utterance"],
+            "gold": gold,
+            "baseline": {
+                "label": d["_predicted"], "confidence": d["_confidence"],
+                "correct": d["_correct"], "text": text[:1200],
+            },
+        })
+        print(f"  baseline {clip_id}: pred={d['_predicted']} (conf={d['_confidence']:.2f}) correct={d['_correct']}")
+    # Free the base model to make room for the trained one
+    del base_model
+    import torch
+    torch.cuda.empty_cache()
+    # Run trained checkpoint
+    print(f"[load] trained checkpoint: {args.trained_checkpoint}")
+    trained_model, trained_tok = FastLanguageModel.from_pretrained(
+        model_name=args.trained_checkpoint, max_seq_length=4096, load_in_4bit=True,
+    )
+    FastLanguageModel.for_inference(trained_model)
+    for row in rows:
+        clip_id = row["clip_id"]
+        prompt_user = build_full_observation(clip_id, scenarios)
+        text = generate_completion(trained_model, trained_tok, prompt_user)
+        d = reward_decomposition(text, row["gold"])
+        row["trained"] = {
+            "label": d["_predicted"], "confidence": d["_confidence"],
+            "correct": d["_correct"], "text": text[:1200],
+        }
+        print(f"  trained  {clip_id}: pred={d['_predicted']} (conf={d['_confidence']:.2f}) correct={d['_correct']}")
+    # Render HTML
+    clips_html_parts = []
+    for row in rows:
+        b = row["baseline"]; t = row["trained"]
+        clips_html_parts.append(CLIP_BLOCK.format(
+            clip_id=row["clip_id"], speaker=row["speaker"],
+            utterance=row["utterance"].replace('"', '&quot;'),
+            gold=row["gold"],
+            baseline_label=b["label"] or "—",
+            baseline_conf=b["confidence"],
+            baseline_verdict=("correct" if b["correct"] else "wrong"),
+            baseline_text=(b["text"] or "(no output)").replace("<", "&lt;").replace(">", "&gt;"),
+            trained_label=t["label"] or "—",
+            trained_conf=t["confidence"],
+            trained_verdict=("correct" if t["correct"] else "wrong"),
+            trained_text=(t["text"] or "(no output)").replace("<", "&lt;").replace(">", "&gt;"),
+        ))
+    html = HTML_TEMPLATE.format(clips_html="\n".join(clips_html_parts))
+    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
+    Path(args.out).write_text(html)
+    print(f"[done] wrote {args.out}")
+    if args.out_json:
+        Path(args.out_json).write_text(json.dumps(rows, indent=2))
+        print(f"[done] wrote {args.out_json}")
+    # Tally
+    n_baseline_correct = sum(1 for r in rows if r["baseline"]["correct"])
+    n_trained_correct = sum(1 for r in rows if r["trained"]["correct"])
+    print()
+    print(f"Tally on {len(rows)} hand-picked clips:")
+    print(f"  baseline: {n_baseline_correct}/{len(rows)} correct")
+    print(f"  trained:  {n_trained_correct}/{len(rows)} correct")
+if __name__ == "__main__":
+    main()