rtferraz committed on
Commit
71422f3
·
verified ·
1 Parent(s): 0fc9042

Fix V4.2 audit: show INPUT REVIEW alongside MODEL OUTPUT for proper human scoring

Browse files
Files changed (1) hide show
  1. notebooks/v4_2_instruct_grpo.ipynb +1 -1
notebooks/v4_2_instruct_grpo.ipynb CHANGED
@@ -99,7 +99,7 @@
99
  "execution_count": null,
100
  "metadata": {},
101
  "outputs": [],
102
- "source": "from scipy.stats import spearmanr\n\nAUDIT_PROMPTS_PER_TASK = 5\n\n# ── Collect audit prompts (5 per task) ───────────────────────────────────────\naudit_by_type = {\"extraction\": [], \"sql_qa\": [], \"insights\": [], \"push\": []}\nwith open(TRAIN_FILE) as f:\n for line in f:\n row = json.loads(line)\n convs = row[\"conversations\"]\n prompt_msgs = [m for m in convs if m[\"role\"] in (\"system\", \"user\")]\n if not prompt_msgs:\n continue\n user_text = \" \".join(m[\"content\"] for m in prompt_msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n if len(audit_by_type[task]) < AUDIT_PROMPTS_PER_TASK:\n audit_by_type[task].append(prompt_msgs)\n\nprint(f\"Audit prompts collected: {', '.join(f'{k}={len(v)}' for k, v in audit_by_type.items())}\")\n\n# ── Generate completions and score automatically ─────────────────────────────\nFastLanguageModel.for_inference(model)\n\naudit_auto_scores = []\naudit_tasks = []\naudit_completions = []\n\nfor task_type in [\"extraction\", \"sql_qa\", \"insights\", \"push\"]:\n for msgs in audit_by_type[task_type]:\n msgs = inject_task_system_prompt(msgs, task_type)\n text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n with torch.no_grad():\n out = model.generate(\n **inputs,\n max_new_tokens=MAX_COMPLETION_LENGTH,\n temperature=0.1, # near-deterministic for audit\n do_sample=True,\n repetition_penalty=1.0,\n )\n resp = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n r = commerce_reward_fn_raw([resp], [text])[0] # Raw rewards for audit (not GDPO-normalized)\n audit_auto_scores.append(r)\n audit_tasks.append(task_type)\n audit_completions.append(resp)\n\n# ══════════════════════════════════════════════════════════════════════════════\n# INTERACTIVE REWARD AUDIT\n# Shows each completion in FULL (no truncation), prompts for a 0-10 score.\n# 
══════════════════════════════════════════════════════════════════════════════\n\nprint(f\"\\n{'='*80}\")\nprint(\"REWARD FUNCTION AUDIT β€” 20 Completions (interactive scoring)\")\nprint(\"Score each completion 0-10: 0=garbage, 5=acceptable, 10=perfect\")\nprint(f\"{'='*80}\")\n\naudit_human_scores = []\n\nfor i, (task, auto_r, comp) in enumerate(zip(audit_tasks, audit_auto_scores, audit_completions)):\n answer = strip_think(comp) # full completion, no truncation\n print(f\"\\n{'─'*80}\")\n print(f\" Sample {i+1}/{len(audit_auto_scores)} [{task}] auto_reward={auto_r:.3f}\")\n print(f\"{'─'*80}\")\n print(answer)\n print()\n while True:\n try:\n score = float(input(f\" Your score (0-10): \"))\n if 0 <= score <= 10:\n break\n print(\" ⚠️ Score must be between 0 and 10\")\n except (ValueError, EOFError):\n print(\" ⚠️ Enter a number between 0 and 10\")\n audit_human_scores.append(score)\n print(f\" β†’ Recorded: human={score:.0f}, auto={auto_r:.3f}\")\n\n# ── Compute Spearman ρ ───────────────────────────────────────────────────────\nhuman_normalized = [s / 10.0 for s in audit_human_scores]\nrho, p_value = spearmanr(human_normalized, audit_auto_scores)\n\nprint(f\"\\n{'='*80}\")\nprint(f\"AUDIT RESULTS\")\nprint(f\"{'='*80}\")\nprint(f\" Spearman ρ = {rho:.3f} (p = {p_value:.4f})\")\nprint()\nprint(f\" {'#':>3s} {'Task':12s} {'Human':>6s} {'Auto':>6s} {'Ξ”':>6s}\")\nprint(f\" {'─'*40}\")\nfor i, (task, h, a) in enumerate(zip(audit_tasks, human_normalized, audit_auto_scores)):\n delta = abs(h - a)\n flag = \" ⚠️\" if delta > 0.3 else \"\"\n print(f\" {i+1:3d} {task:12s} {h:6.2f} {a:6.3f} {delta:6.3f}{flag}\")\n\nif rho > 0.70:\n print(f\"\\n βœ… PASS: ρ={rho:.3f} > 0.70 β€” reward function is calibrated\")\nelse:\n print(f\"\\n ❌ FAIL: ρ={rho:.3f} < 0.70 β€” reward function is miscalibrated\")\n print(\" β†’ Investigate samples marked ⚠️ before training. Check:\")\n print(\" 1. Is the JSON parser handling all output formats?\")\n print(\" 2. 
Are SQL reward tiers appropriate for this model's output style?\")\n print(\" 3. Are insights/push length penalties calibrated?\")\n\nassert rho > 0.70, f\"Reward function miscalibrated (ρ={rho:.3f} < 0.70). Fix before training.\""
103
  },
104
  {
105
  "cell_type": "markdown",
 
99
  "execution_count": null,
100
  "metadata": {},
101
  "outputs": [],
102
+ "source": "from scipy.stats import spearmanr\n\nAUDIT_PROMPTS_PER_TASK = 5\n\n# ── Collect audit prompts (5 per task) ───────────────────────────────────────\naudit_by_type = {\"extraction\": [], \"sql_qa\": [], \"insights\": [], \"push\": []}\nwith open(TRAIN_FILE) as f:\n for line in f:\n row = json.loads(line)\n convs = row[\"conversations\"]\n prompt_msgs = [m for m in convs if m[\"role\"] in (\"system\", \"user\")]\n if not prompt_msgs:\n continue\n user_text = \" \".join(m[\"content\"] for m in prompt_msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n if len(audit_by_type[task]) < AUDIT_PROMPTS_PER_TASK:\n audit_by_type[task].append(prompt_msgs)\n\nprint(f\"Audit prompts collected: {', '.join(f'{k}={len(v)}' for k, v in audit_by_type.items())}\")\n\n# ── Generate completions and score automatically ─────────────────────────────\nFastLanguageModel.for_inference(model)\n\naudit_auto_scores = []\naudit_tasks = []\naudit_completions = []\n\naudit_prompts_text = [] # store original user message for display\n\nfor task_type in [\"extraction\", \"sql_qa\", \"insights\", \"push\"]:\n for msgs in audit_by_type[task_type]:\n # Extract original user message BEFORE injecting system prompt\n user_content = \"\\n\".join(m[\"content\"] for m in msgs if m[\"role\"] == \"user\")\n audit_prompts_text.append(user_content)\n \n msgs = inject_task_system_prompt(msgs, task_type)\n text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n with torch.no_grad():\n out = model.generate(\n **inputs,\n max_new_tokens=MAX_COMPLETION_LENGTH,\n temperature=0.1, # near-deterministic for audit\n do_sample=True,\n repetition_penalty=1.0,\n )\n resp = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n r = commerce_reward_fn_raw([resp], [text])[0] # Raw rewards for audit (not GDPO-normalized)\n audit_auto_scores.append(r)\n 
audit_tasks.append(task_type)\n audit_completions.append(resp)\n\n# ══════════════════════════════════════════════════════════════════════════════\n# INTERACTIVE REWARD AUDIT\n# Shows each completion in FULL (no truncation), prompts for a 0-10 score.\n# ══════════════════════════════════════════════════════════════════════════════\n\nprint(f\"\\n{'='*80}\")\nprint(\"REWARD FUNCTION AUDIT β€” 20 Completions (interactive scoring)\")\nprint(\"Score each completion 0-10: 0=garbage, 5=acceptable, 10=perfect\")\nprint(f\"{'='*80}\")\n\naudit_human_scores = []\n\nfor i, (task, auto_r, comp, prompt_txt) in enumerate(zip(audit_tasks, audit_auto_scores, audit_completions, audit_prompts_text)):\n answer = strip_think(comp) # full completion, no truncation\n print(f\"\\n{'─'*80}\")\n print(f\" Sample {i+1}/{len(audit_auto_scores)} [{task}] auto_reward={auto_r:.3f}\")\n print(f\"{'─'*80}\")\n print(f\"\\nINPUT REVIEW:\\n{prompt_txt}\\n\")\n print(f\"MODEL OUTPUT:\\n{answer}\")\n print()\n while True:\n try:\n score = float(input(f\" Your score (0-10): \"))\n if 0 <= score <= 10:\n break\n print(\" ⚠️ Score must be between 0 and 10\")\n except (ValueError, EOFError):\n print(\" ⚠️ Enter a number between 0 and 10\")\n audit_human_scores.append(score)\n print(f\" β†’ Recorded: human={score:.0f}, auto={auto_r:.3f}\")\n\n# ── Compute Spearman ρ ───────────────────────────────────────────────────────\nhuman_normalized = [s / 10.0 for s in audit_human_scores]\nrho, p_value = spearmanr(human_normalized, audit_auto_scores)\n\nprint(f\"\\n{'='*80}\")\nprint(f\"AUDIT RESULTS\")\nprint(f\"{'='*80}\")\nprint(f\" Spearman ρ = {rho:.3f} (p = {p_value:.4f})\")\nprint()\nprint(f\" {'#':>3s} {'Task':12s} {'Human':>6s} {'Auto':>6s} {'Ξ”':>6s}\")\nprint(f\" {'─'*40}\")\nfor i, (task, h, a) in enumerate(zip(audit_tasks, human_normalized, audit_auto_scores)):\n delta = abs(h - a)\n flag = \" ⚠️\" if delta > 0.3 else \"\"\n print(f\" {i+1:3d} {task:12s} {h:6.2f} {a:6.3f} {delta:6.3f}{flag}\")\n\nif 
rho > 0.70:\n print(f\"\\n βœ… PASS: ρ={rho:.3f} > 0.70 β€” reward function is calibrated\")\nelse:\n print(f\"\\n ❌ FAIL: ρ={rho:.3f} < 0.70 β€” reward function is miscalibrated\")\n print(\" β†’ Investigate samples marked ⚠️ before training. Check:\")\n print(\" 1. Is the JSON parser handling all output formats?\")\n print(\" 2. Are SQL reward tiers appropriate for this model's output style?\")\n print(\" 3. Are insights/push length penalties calibrated?\")\n\nassert rho > 0.70, f\"Reward function miscalibrated (ρ={rho:.3f} < 0.70). Fix before training.\""
103
  },
104
  {
105
  "cell_type": "markdown",