rtferraz committed on
Commit
71422f3
·
verified ·
1 Parent(s): 0fc9042

Fix V4.2 audit: show INPUT REVIEW alongside MODEL OUTPUT for proper human scoring

Browse files
Files changed (1) hide show
  1. notebooks/v4_2_instruct_grpo.ipynb +1 -1
notebooks/v4_2_instruct_grpo.ipynb CHANGED
@@ -99,7 +99,7 @@
99
  "execution_count": null,
100
  "metadata": {},
101
  "outputs": [],
102
- "source": "from scipy.stats import spearmanr\n\nAUDIT_PROMPTS_PER_TASK = 5\n\n# ── Collect audit prompts (5 per task) ───────────────────────────────────────\naudit_by_type = {\"extraction\": [], \"sql_qa\": [], \"insights\": [], \"push\": []}\nwith open(TRAIN_FILE) as f:\n for line in f:\n row = json.loads(line)\n convs = row[\"conversations\"]\n prompt_msgs = [m for m in convs if m[\"role\"] in (\"system\", \"user\")]\n if not prompt_msgs:\n continue\n user_text = \" \".join(m[\"content\"] for m in prompt_msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n if len(audit_by_type[task]) < AUDIT_PROMPTS_PER_TASK:\n audit_by_type[task].append(prompt_msgs)\n\nprint(f\"Audit prompts collected: {', '.join(f'{k}={len(v)}' for k, v in audit_by_type.items())}\")\n\n# ── Generate completions and score automatically ─────────────────────────────\nFastLanguageModel.for_inference(model)\n\naudit_auto_scores = []\naudit_tasks = []\naudit_completions = []\n\nfor task_type in [\"extraction\", \"sql_qa\", \"insights\", \"push\"]:\n for msgs in audit_by_type[task_type]:\n msgs = inject_task_system_prompt(msgs, task_type)\n text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n with torch.no_grad():\n out = model.generate(\n **inputs,\n max_new_tokens=MAX_COMPLETION_LENGTH,\n temperature=0.1, # near-deterministic for audit\n do_sample=True,\n repetition_penalty=1.0,\n )\n resp = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n r = commerce_reward_fn_raw([resp], [text])[0] # Raw rewards for audit (not GDPO-normalized)\n audit_auto_scores.append(r)\n audit_tasks.append(task_type)\n audit_completions.append(resp)\n\n# ══════════════════════════════════════════════════════════════════════════════\n# INTERACTIVE REWARD AUDIT\n# Shows each completion in FULL (no truncation), prompts for a 0-10 score.\n# 
══════════════════════════════════════════════════════════════════════════════\n\nprint(f\"\\n{'='*80}\")\nprint(\"REWARD FUNCTION AUDIT β€” 20 Completions (interactive scoring)\")\nprint(\"Score each completion 0-10: 0=garbage, 5=acceptable, 10=perfect\")\nprint(f\"{'='*80}\")\n\naudit_human_scores = []\n\nfor i, (task, auto_r, comp) in enumerate(zip(audit_tasks, audit_auto_scores, audit_completions)):\n answer = strip_think(comp) # full completion, no truncation\n print(f\"\\n{'─'*80}\")\n print(f\" Sample {i+1}/{len(audit_auto_scores)} [{task}] auto_reward={auto_r:.3f}\")\n print(f\"{'─'*80}\")\n print(answer)\n print()\n while True:\n try:\n score = float(input(f\" Your score (0-10): \"))\n if 0 <= score <= 10:\n break\n print(\" ⚠️ Score must be between 0 and 10\")\n except (ValueError, EOFError):\n print(\" ⚠️ Enter a number between 0 and 10\")\n audit_human_scores.append(score)\n print(f\" β†’ Recorded: human={score:.0f}, auto={auto_r:.3f}\")\n\n# ── Compute Spearman ρ ───────────────────────────────────────────────────────\nhuman_normalized = [s / 10.0 for s in audit_human_scores]\nrho, p_value = spearmanr(human_normalized, audit_auto_scores)\n\nprint(f\"\\n{'='*80}\")\nprint(f\"AUDIT RESULTS\")\nprint(f\"{'='*80}\")\nprint(f\" Spearman ρ = {rho:.3f} (p = {p_value:.4f})\")\nprint()\nprint(f\" {'#':>3s} {'Task':12s} {'Human':>6s} {'Auto':>6s} {'Ξ”':>6s}\")\nprint(f\" {'─'*40}\")\nfor i, (task, h, a) in enumerate(zip(audit_tasks, human_normalized, audit_auto_scores)):\n delta = abs(h - a)\n flag = \" ⚠️\" if delta > 0.3 else \"\"\n print(f\" {i+1:3d} {task:12s} {h:6.2f} {a:6.3f} {delta:6.3f}{flag}\")\n\nif rho > 0.70:\n print(f\"\\n βœ… PASS: ρ={rho:.3f} > 0.70 β€” reward function is calibrated\")\nelse:\n print(f\"\\n ❌ FAIL: ρ={rho:.3f} < 0.70 β€” reward function is miscalibrated\")\n print(\" β†’ Investigate samples marked ⚠️ before training. Check:\")\n print(\" 1. Is the JSON parser handling all output formats?\")\n print(\" 2. 
Are SQL reward tiers appropriate for this model's output style?\")\n print(\" 3. Are insights/push length penalties calibrated?\")\n\nassert rho > 0.70, f\"Reward function miscalibrated (ρ={rho:.3f} < 0.70). Fix before training.\""
103
  },
104
  {
105
  "cell_type": "markdown",
 
99
  "execution_count": null,
100
  "metadata": {},
101
  "outputs": [],
102
+ "source": "from scipy.stats import spearmanr\n\nAUDIT_PROMPTS_PER_TASK = 5\n\n# ── Collect audit prompts (5 per task) ───────────────────────────────────────\naudit_by_type = {\"extraction\": [], \"sql_qa\": [], \"insights\": [], \"push\": []}\nwith open(TRAIN_FILE) as f:\n for line in f:\n row = json.loads(line)\n convs = row[\"conversations\"]\n prompt_msgs = [m for m in convs if m[\"role\"] in (\"system\", \"user\")]\n if not prompt_msgs:\n continue\n user_text = \" \".join(m[\"content\"] for m in prompt_msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n if len(audit_by_type[task]) < AUDIT_PROMPTS_PER_TASK:\n audit_by_type[task].append(prompt_msgs)\n\nprint(f\"Audit prompts collected: {', '.join(f'{k}={len(v)}' for k, v in audit_by_type.items())}\")\n\n# ── Generate completions and score automatically ─────────────────────────────\nFastLanguageModel.for_inference(model)\n\naudit_auto_scores = []\naudit_tasks = []\naudit_completions = []\n\naudit_prompts_text = [] # store original user message for display\n\nfor task_type in [\"extraction\", \"sql_qa\", \"insights\", \"push\"]:\n for msgs in audit_by_type[task_type]:\n # Extract original user message BEFORE injecting system prompt\n user_content = \"\\n\".join(m[\"content\"] for m in msgs if m[\"role\"] == \"user\")\n audit_prompts_text.append(user_content)\n \n msgs = inject_task_system_prompt(msgs, task_type)\n text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n with torch.no_grad():\n out = model.generate(\n **inputs,\n max_new_tokens=MAX_COMPLETION_LENGTH,\n temperature=0.1, # near-deterministic for audit\n do_sample=True,\n repetition_penalty=1.0,\n )\n resp = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n r = commerce_reward_fn_raw([resp], [text])[0] # Raw rewards for audit (not GDPO-normalized)\n audit_auto_scores.append(r)\n 
audit_tasks.append(task_type)\n audit_completions.append(resp)\n\n# ══════════════════════════════════════════════════════════════════════════════\n# INTERACTIVE REWARD AUDIT\n# Shows each completion in FULL (no truncation), prompts for a 0-10 score.\n# ══════════════════════════════════════════════════════════════════════════════\n\nprint(f\"\\n{'='*80}\")\nprint(\"REWARD FUNCTION AUDIT β€” 20 Completions (interactive scoring)\")\nprint(\"Score each completion 0-10: 0=garbage, 5=acceptable, 10=perfect\")\nprint(f\"{'='*80}\")\n\naudit_human_scores = []\n\nfor i, (task, auto_r, comp, prompt_txt) in enumerate(zip(audit_tasks, audit_auto_scores, audit_completions, audit_prompts_text)):\n answer = strip_think(comp) # full completion, no truncation\n print(f\"\\n{'─'*80}\")\n print(f\" Sample {i+1}/{len(audit_auto_scores)} [{task}] auto_reward={auto_r:.3f}\")\n print(f\"{'─'*80}\")\n print(f\"\\nINPUT REVIEW:\\n{prompt_txt}\\n\")\n print(f\"MODEL OUTPUT:\\n{answer}\")\n print()\n while True:\n try:\n score = float(input(f\" Your score (0-10): \"))\n if 0 <= score <= 10:\n break\n print(\" ⚠️ Score must be between 0 and 10\")\n except (ValueError, EOFError):\n print(\" ⚠️ Enter a number between 0 and 10\")\n audit_human_scores.append(score)\n print(f\" β†’ Recorded: human={score:.0f}, auto={auto_r:.3f}\")\n\n# ── Compute Spearman ρ ───────────────────────────────────────────────────────\nhuman_normalized = [s / 10.0 for s in audit_human_scores]\nrho, p_value = spearmanr(human_normalized, audit_auto_scores)\n\nprint(f\"\\n{'='*80}\")\nprint(f\"AUDIT RESULTS\")\nprint(f\"{'='*80}\")\nprint(f\" Spearman ρ = {rho:.3f} (p = {p_value:.4f})\")\nprint()\nprint(f\" {'#':>3s} {'Task':12s} {'Human':>6s} {'Auto':>6s} {'Ξ”':>6s}\")\nprint(f\" {'─'*40}\")\nfor i, (task, h, a) in enumerate(zip(audit_tasks, human_normalized, audit_auto_scores)):\n delta = abs(h - a)\n flag = \" ⚠️\" if delta > 0.3 else \"\"\n print(f\" {i+1:3d} {task:12s} {h:6.2f} {a:6.3f} {delta:6.3f}{flag}\")\n\nif 
rho > 0.70:\n print(f\"\\n βœ… PASS: ρ={rho:.3f} > 0.70 β€” reward function is calibrated\")\nelse:\n print(f\"\\n ❌ FAIL: ρ={rho:.3f} < 0.70 β€” reward function is miscalibrated\")\n print(\" β†’ Investigate samples marked ⚠️ before training. Check:\")\n print(\" 1. Is the JSON parser handling all output formats?\")\n print(\" 2. Are SQL reward tiers appropriate for this model's output style?\")\n print(\" 3. Are insights/push length penalties calibrated?\")\n\nassert rho > 0.70, f\"Reward function miscalibrated (ρ={rho:.3f} < 0.70). Fix before training.\""
103
  },
104
  {
105
  "cell_type": "markdown",