Spaces:

ycwhencpp
/

final-iteration

Paused

vaibhav12332112312 committed 12 days ago

Commit

30614d3

1 Parent(s): b1c1732

Inject peak hours + history + post-mandate, run SFT every round

Prompt explicitly tells the model to schedule >=2 `post` actions per day at
heatmap peak hours, plus a rolling 3-day Recent summary so it can react to
its own past results. Eval runs greedy (deterministic), training stays
sampled. QUALITY_FLOOR=0 so SFT runs on positive-advantage samples even
when grader scores are still low. Bumped to 2 training rounds.

Made-with: Cursor

Files changed (1) hide show

training/train_grpo.ipynb +194 -154

training/train_grpo.ipynb CHANGED Viewed

@@ -25,9 +25,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 1: Install dependencies (quote versions — zsh treats `>` as redirect otherwise)\n",
         "!pip install -q torch torchvision torchaudio\n",
@@ -36,13 +34,13 @@
         "!pip install -q \"typing_extensions>=4.13.0\" pydantic httpx\n",
         "!pip install -q \"openenv-core[core]>=0.2.2\"\n",
         "!pip install -q flash-attn --no-build-isolation || echo \"flash-attn install skipped; will use sdpa\""
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 2: Resolve repo path (Colab: fresh clone. Local: auto-detect project root)\n",
         "import os\n",
@@ -118,13 +116,13 @@
         "print(f\"Branch: {REPO_BRANCH}\")\n",
         "print(f\"Commit: {commit}\")\n",
         "print(f\"Plots dir: {PLOTS_DIR}\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 3: Imports (with runtime validation)\n",
         "import json, random, time, textwrap, copy, os, sys\n",
@@ -178,7 +176,9 @@
         "import ast\n",
         "ast.parse(\"def _t(x: int) -> str: return f'{x}'\")\n",
         "print(\"OK: ast.parse (syntax check)\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -191,9 +191,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 4: Define heuristic agents + episode runner\n",
         "_rng = random.Random(42)\n",
@@ -269,13 +267,13 @@
         "            \"rewards\": rewards, \"energies\": energies}\n",
         "\n",
         "print(\"Agents and episode runner defined.\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 5: Run baselines (safe)\n",
         "print(\"Running heuristic baselines (5 agents × 3 tasks)...\")\n",
@@ -310,13 +308,13 @@
         "for name in BASELINE_AGENTS:\n",
         "    scores = [baseline_results[name][t][\"grader_score\"] for t in TASKS]\n",
         "    print(f\"{name:<14s} {scores[0]:>10.4f} {scores[1]:>12.4f} {scores[2]:>14.4f} {sum(scores)/3:>8.4f}\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 6: Baseline plots\n",
         "fig, axes = plt.subplots(1, 3, figsize=(16, 5), sharey=True)\n",
@@ -334,7 +332,9 @@
         "fig.tight_layout()\n",
         "fig.savefig(f\"{PLOTS_DIR}/baseline_leaderboard.png\", dpi=150, bbox_inches='tight')\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -347,9 +347,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 7: Load model (Qwen2.5-3B bf16 on CUDA + flash-attn-2; fp16/fp32 fallback)\n",
         "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
@@ -393,13 +391,13 @@
         "print(f\"Model loaded. dtype={next(model.parameters()).dtype} device={next(model.parameters()).device}\")\n",
         "if torch.cuda.is_available():\n",
         "    print(f\"CUDA memory: {torch.cuda.memory_allocated()/1e9:.2f} GB\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 8: LLM agent functions\n",
         "_SYSTEM_BASE = textwrap.dedent(\"\"\"\\\n",
@@ -439,7 +437,19 @@
         "                  like_bait  -> likes from existing followers\n",
         "- tags:         up to 5 hashtags\n",
         "- topic:        free-form string\n",
-        "- empty scheduled_actions = full day rest\"\"\")\n",
         "\n",
         "SYSTEM_PROMPT = _SYSTEM_BASE + textwrap.dedent(\"\"\"\n",
         "\n",
@@ -458,9 +468,28 @@
         "SYSTEM_PROMPT_TRAIN = SYSTEM_PROMPT\n",
         "\n",
         "\n",
-        "def format_obs(obs):\n",
-        "    days = [\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\", \"Sun\"]\n",
-        "    day_name = days[obs.day_of_week] if 0 <= obs.day_of_week < 7 else \"?\"\n",
         "    signals_str = \"\"\n",
         "    signals = getattr(obs, \"engagement_signals\", None)\n",
         "    if signals:\n",
@@ -473,10 +502,11 @@
         "            tool_str += f\"  {tr.name}: {json.dumps(tr.data)}\\n\"\n",
         "    if not tool_str:\n",
         "        tool_str = \"  (none — call query_* tools to discover)\\n\"\n",
-        "    return (f\"Day: {day_name} | days_elapsed={obs.days_elapsed}\\n\"\n",
         "            f\"Energy: {obs.creator_energy:.2f} | Followers: {obs.follower_count}\\n\"\n",
         "            f\"Engagement: {obs.engagement_rate:.3f} | Queue: {obs.content_queue_size}\\n\"\n",
         "            f\"{signals_str}\"\n",
         "            f\"Tool results:\\n{tool_str}\"\n",
         "            f\"Plan today's actions (JSON only):\")\n",
         "\n",
@@ -554,11 +584,11 @@
         "\n",
         "def _batched_generate(mdl, tok, prompts, eval=False, max_new_tokens=512):\n",
         "    enc = tok(prompts, return_tensors=\"pt\", padding=True, truncation=False).to(_infer_model_device(mdl))\n",
-        "    gen_kwargs = dict(\n",
-        "        max_new_tokens=max_new_tokens,\n",
-        "        pad_token_id=tok.pad_token_id,\n",
-        "        do_sample=True, temperature=1.0, top_p=0.95,\n",
-        "    )\n",
         "    with torch.no_grad():\n",
         "        out = mdl.generate(**enc, **gen_kwargs)\n",
         "    resps = tok.batch_decode(out[:, enc[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
@@ -576,96 +606,104 @@
         "        f.write(json.dumps(rec) + \"\\n\")\n",
         "\n",
         "\n",
-    "DISCOVERY_SUFFIX = \"\\n\\nPHASE A (DISCOVERY): respond with JSON {\\\"tool_calls\\\": [...]} only.\"\n",
-    "PLANNING_SUFFIX = \"\\n\\nPHASE B (PLANNING): respond with JSON {\\\"scheduled_actions\\\": [...], \\\"notes\\\": \\\"...\\\"} using the fresh Tool results above.\"\n",
-    "\n",
-    "\n",
-    "def _parse_tool_calls_only(text):\n",
-    "    return parse_model_output(text).tool_calls\n",
-    "\n",
-    "\n",
-    "def _parse_actions_only(text):\n",
-    "    a = parse_model_output(text)\n",
-    "    return ViraltestAction(tool_calls=[], scheduled_actions=a.scheduled_actions, notes=a.notes)\n",
-    "\n",
-    "\n",
-    "def _format_fresh_results(fresh):\n",
-    "    if not fresh:\n",
-    "        return \"\"\n",
-    "    out = \"Fresh tool results (PHASE A):\\n\"\n",
-    "    for tr in fresh:\n",
-    "        if tr.success:\n",
-    "            out += f\"  {tr.name}: {json.dumps(tr.data)}\\n\"\n",
-    "        else:\n",
-    "            out += f\"  {tr.name}: ERROR {tr.error}\\n\"\n",
-    "    return out\n",
-    "\n",
-    "\n",
-    "def run_llm_episodes_batched(mdl, tok, tasks_seeds, verbose=True, eval=False, system=None, log_tag=None):\n",
-    "    \"\"\"Run N episodes in parallel. ReAct two-pass: discovery -> dispatch -> planning.\"\"\"\n",
-    "    sys_prompt = system or (SYSTEM_PROMPT_EVAL if eval else SYSTEM_PROMPT_TRAIN)\n",
-    "    n = len(tasks_seeds)\n",
-    "    envs = [ViraltestEnvironment() for _ in range(n)]\n",
-    "    obss = [envs[i].reset(task=t, seed=s) for i, (t, s) in enumerate(tasks_seeds)]\n",
-    "    rewards = [[] for _ in range(n)]\n",
-    "    energies = [[obs.creator_energy] for obs in obss]\n",
-    "    pairs = [[] for _ in range(n)]\n",
-    "    done_mask = [obs.done for obs in obss]\n",
-    "    rest_action = ViraltestAction(scheduled_actions=[])\n",
-    "\n",
-    "    def _gen(prompts):\n",
-    "        chats = [_build_chat(sys_prompt, p) for p in prompts]\n",
-    "        texts = [tok.apply_chat_template(c, tokenize=False, add_generation_prompt=True) for c in chats]\n",
-    "        return _batched_generate(mdl, tok, texts, eval=eval)\n",
-    "\n",
-    "    for day in range(1, TASK_HORIZON + 1):\n",
-    "        active = [i for i in range(n) if not done_mask[i] and obss[i].creator_energy > 0.25]\n",
-    "        rest = [i for i in range(n) if not done_mask[i] and obss[i].creator_energy <= 0.25]\n",
-    "        if not active and not rest:\n",
-    "            break\n",
-    "\n",
-    "        actions_by_idx = {i: rest_action for i in rest}\n",
-    "        if active:\n",
-    "            base_prompts = [format_obs(obss[i]) for i in active]\n",
-    "\n",
-    "            disc_prompts = [p + DISCOVERY_SUFFIX for p in base_prompts]\n",
-    "            disc_resps, ptok = _gen(disc_prompts)\n",
-    "            if verbose:\n",
-    "                print(f\"  D{day:2d}A: batch={len(active)} rest={len(rest)} prompt_tok={ptok}\")\n",
-    "\n",
-    "            fresh_per_active = []\n",
-    "            for j, i in enumerate(active):\n",
-    "                tcs = _parse_tool_calls_only(disc_resps[j])\n",
-    "                fresh_per_active.append([envs[i]._dispatch_tool(tc) for tc in tcs])\n",
-    "                pairs[i].append({\"prompt\": disc_prompts[j], \"response\": disc_resps[j],\n",
-    "                                 \"step\": len(rewards[i]), \"phase\": \"A\"})\n",
-    "                if log_tag is not None:\n",
-    "                    t, s = tasks_seeds[i]\n",
-    "                    _log_io(f\"{log_tag}/A\", i, day, t, s, disc_prompts[j], disc_resps[j])\n",
-    "\n",
-    "            plan_prompts = [base_prompts[j] + \"\\n\" + _format_fresh_results(fresh_per_active[j]) + PLANNING_SUFFIX\n",
-    "                            for j in range(len(active))]\n",
-    "            plan_resps, ptok2 = _gen(plan_prompts)\n",
-    "            if verbose:\n",
-    "                print(f\"  D{day:2d}B: batch={len(active)} prompt_tok={ptok2}\")\n",
-    "\n",
-    "            for j, i in enumerate(active):\n",
-    "                actions_by_idx[i] = _parse_actions_only(plan_resps[j])\n",
-    "                pairs[i].append({\"prompt\": plan_prompts[j], \"response\": plan_resps[j],\n",
-    "                                 \"step\": len(rewards[i]), \"phase\": \"B\"})\n",
-    "                if log_tag is not None:\n",
-    "                    t, s = tasks_seeds[i]\n",
-    "                    _log_io(f\"{log_tag}/B\", i, day, t, s, plan_prompts[j], plan_resps[j])\n",
-    "\n",
-    "        for i in range(n):\n",
-    "            if done_mask[i] or i not in actions_by_idx:\n",
-    "                continue\n",
-    "            obss[i] = envs[i].step(actions_by_idx[i])\n",
-    "            r = obss[i].reward or 0.0\n",
-    "            rewards[i].append(r)\n",
-    "            energies[i].append(obss[i].creator_energy)\n",
-    "            if obss[i].done:\n",
-    "                done_mask[i] = True\n",
         "\n",
         "    GAMMA, TERMINAL_W = 0.95, 5.0\n",
         "    results = []\n",
@@ -694,7 +732,9 @@
         "\n",
         "\n",
         "print(\"LLM agent functions defined (batched).\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -707,9 +747,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 9: Run untrained model (batched: all 3 tasks in parallel envs)\n",
         "print(\"Running UNTRAINED base model on all tasks (batched)...\")\n",
@@ -723,7 +761,9 @@
         "print(f\"BEFORE TRAINING (took {time.time()-t0:.1f}s):\")\n",
         "for t in TASKS:\n",
         "    print(f\"  {t}: grader={before_results[t]['grader_score']:.4f}\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -742,9 +782,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 10: Attach LoRA adapter\n",
         "from peft import LoraConfig, get_peft_model, TaskType\n",
@@ -758,21 +796,21 @@
         "model.enable_input_require_grads()\n",
         "peft_model = get_peft_model(model, lora_config)\n",
         "peft_model.print_trainable_parameters()"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 11: Training loop\n",
         "from trl import SFTTrainer, SFTConfig\n",
         "from datasets import Dataset\n",
         "\n",
-        "NUM_ROUNDS = 1\n",
         "EPISODES_PER_ROUND = 6\n",
-        "QUALITY_FLOOR = 0.40  # skip SFT for the round if no episode beats this grader score\n",
         "\n",
         "training_log = {\n",
         "    \"round\": [], \"avg_episode_reward\": [], \"max_episode_reward\": [],\n",
@@ -869,7 +907,9 @@
         "elapsed = time.time() - t_start\n",
         "print(f\"\\nTraining complete in {elapsed/60:.1f} min\")\n",
         "print(pd.DataFrame(training_log).to_string(index=False))"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -882,9 +922,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 12: Run trained model (batched)\n",
         "print(\"Running TRAINED model on all tasks (batched)...\")\n",
@@ -899,7 +937,9 @@
         "print(f\"AFTER TRAINING (took {time.time()-t0:.1f}s):\")\n",
         "for t in TASKS:\n",
         "    print(f\"  {t}: grader={after_results[t]['grader_score']:.4f}\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -910,9 +950,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 13: Training curves\n",
         "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
@@ -934,13 +972,13 @@
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/reward_curve.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 14: Before vs After\n",
         "task_labels = [t.replace('monthly_', '').title() for t in TASKS]\n",
@@ -970,13 +1008,13 @@
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/before_after.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 15: Trajectory comparison\n",
         "fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n",
@@ -1000,7 +1038,9 @@
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/training_trajectories.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -1011,9 +1051,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 16: Final summary\n",
         "print(\"=\" * 67)\n",
@@ -1050,13 +1088,13 @@
         "\n",
         "print(f\"\\nSaved to {PLOTS_DIR}/\")\n",
         "print(\"All results are from real LoRA weight updates on real environment runs.\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 17: Save adapter\n",
         "save_path = \"./viraltest_trained_adapter\"\n",
@@ -1064,7 +1102,9 @@
         "tokenizer.save_pretrained(save_path)\n",
         "print(f\"LoRA adapter saved to {save_path}\")\n",
         "print(\"Load with: PeftModel.from_pretrained(base_model, save_path)\")"
-      ]
     }
   ],
   "metadata": {
@@ -1090,4 +1130,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 4
-}

     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 1: Install dependencies (quote versions — zsh treats `>` as redirect otherwise)\n",
         "!pip install -q torch torchvision torchaudio\n",
         "!pip install -q \"typing_extensions>=4.13.0\" pydantic httpx\n",
         "!pip install -q \"openenv-core[core]>=0.2.2\"\n",
         "!pip install -q flash-attn --no-build-isolation || echo \"flash-attn install skipped; will use sdpa\""
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 2: Resolve repo path (Colab: fresh clone. Local: auto-detect project root)\n",
         "import os\n",
         "print(f\"Branch: {REPO_BRANCH}\")\n",
         "print(f\"Commit: {commit}\")\n",
         "print(f\"Plots dir: {PLOTS_DIR}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 3: Imports (with runtime validation)\n",
         "import json, random, time, textwrap, copy, os, sys\n",
         "import ast\n",
         "ast.parse(\"def _t(x: int) -> str: return f'{x}'\")\n",
         "print(\"OK: ast.parse (syntax check)\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 4: Define heuristic agents + episode runner\n",
         "_rng = random.Random(42)\n",
         "            \"rewards\": rewards, \"energies\": energies}\n",
         "\n",
         "print(\"Agents and episode runner defined.\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 5: Run baselines (safe)\n",
         "print(\"Running heuristic baselines (5 agents × 3 tasks)...\")\n",
         "for name in BASELINE_AGENTS:\n",
         "    scores = [baseline_results[name][t][\"grader_score\"] for t in TASKS]\n",
         "    print(f\"{name:<14s} {scores[0]:>10.4f} {scores[1]:>12.4f} {scores[2]:>14.4f} {sum(scores)/3:>8.4f}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 6: Baseline plots\n",
         "fig, axes = plt.subplots(1, 3, figsize=(16, 5), sharey=True)\n",
         "fig.tight_layout()\n",
         "fig.savefig(f\"{PLOTS_DIR}/baseline_leaderboard.png\", dpi=150, bbox_inches='tight')\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 7: Load model (Qwen2.5-3B bf16 on CUDA + flash-attn-2; fp16/fp32 fallback)\n",
         "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
         "print(f\"Model loaded. dtype={next(model.parameters()).dtype} device={next(model.parameters()).device}\")\n",
         "if torch.cuda.is_available():\n",
         "    print(f\"CUDA memory: {torch.cuda.memory_allocated()/1e9:.2f} GB\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 8: LLM agent functions\n",
         "_SYSTEM_BASE = textwrap.dedent(\"\"\"\\\n",
         "                  like_bait  -> likes from existing followers\n",
         "- tags:         up to 5 hashtags\n",
         "- topic:        free-form string\n",
+        "- empty scheduled_actions = full day rest\n",
+        "\n",
+        "POSTING RULES (critical — only `post` actions earn engagement reward):\n",
+        "- EVERY active day MUST schedule at least 2 `post` actions (max 3). `create_content`\n",
+        "  alone gives 0 reward — content stays in queue. Mix in 0-1 `create_content` only\n",
+        "  if the queue is empty.\n",
+        "- Schedule posts at HEATMAP PEAK HOURS (Buffer/Sprout-derived):\n",
+        "    Mon  peaks 14, 18, 19      Tue  peaks 14, 15, 19\n",
+        "    Wed  peaks 13, 14, 18      Thu  peaks 12, 13, 19\n",
+        "    Fri  peaks 12, 13, 22      Sat  peaks 21, 22, 13\n",
+        "    Sun  peaks 21, 22, 11\n",
+        "- Vary `intent` across the day; rotate `content_type` to avoid fatigue.\n",
+        "- Reuse strong tags from the Recent-days summary (those that earned reward).\"\"\")\n",
         "\n",
         "SYSTEM_PROMPT = _SYSTEM_BASE + textwrap.dedent(\"\"\"\n",
         "\n",
         "SYSTEM_PROMPT_TRAIN = SYSTEM_PROMPT\n",
         "\n",
         "\n",
+        "_DAY_NAMES = [\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\", \"Sun\"]\n",
+        "_PEAK_HOURS = {0:[14,18,19], 1:[14,15,19], 2:[13,14,18], 3:[12,13,19],\n",
+        "               4:[12,13,22], 5:[21,22,13], 6:[21,22,11]}\n",
+        "\n",
+        "\n",
+        "def _format_history(history, k=3):\n",
+        "    if not history:\n",
+        "        return \"Recent (last 3 days): (none — day 1)\\n\"\n",
+        "    out = \"Recent (last 3 days):\\n\"\n",
+        "    for h in history[-k:]:\n",
+        "        posts = h.get(\"posts\", [])\n",
+        "        if not posts:\n",
+        "            out += f\"  D-{h['ago']}: rest reward={h['reward']:.2f}\\n\"\n",
+        "        else:\n",
+        "            ph = \",\".join(f\"{p['hour']}h/{p['content_type'][:4]}/{p['intent'][:4]}\" for p in posts)\n",
+        "            out += f\"  D-{h['ago']}: posts=[{ph}] reward={h['reward']:.2f}\\n\"\n",
+        "    return out\n",
+        "\n",
+        "\n",
+        "def format_obs(obs, history=None):\n",
+        "    day_name = _DAY_NAMES[obs.day_of_week] if 0 <= obs.day_of_week < 7 else \"?\"\n",
+        "    peaks = _PEAK_HOURS.get(obs.day_of_week, [12, 18, 20])\n",
         "    signals_str = \"\"\n",
         "    signals = getattr(obs, \"engagement_signals\", None)\n",
         "    if signals:\n",
         "            tool_str += f\"  {tr.name}: {json.dumps(tr.data)}\\n\"\n",
         "    if not tool_str:\n",
         "        tool_str = \"  (none — call query_* tools to discover)\\n\"\n",
+        "    return (f\"Day: {day_name} | days_elapsed={obs.days_elapsed} | today's peak hours={peaks}\\n\"\n",
         "            f\"Energy: {obs.creator_energy:.2f} | Followers: {obs.follower_count}\\n\"\n",
         "            f\"Engagement: {obs.engagement_rate:.3f} | Queue: {obs.content_queue_size}\\n\"\n",
         "            f\"{signals_str}\"\n",
+        "            f\"{_format_history(history)}\"\n",
         "            f\"Tool results:\\n{tool_str}\"\n",
         "            f\"Plan today's actions (JSON only):\")\n",
         "\n",
         "\n",
         "def _batched_generate(mdl, tok, prompts, eval=False, max_new_tokens=512):\n",
         "    enc = tok(prompts, return_tensors=\"pt\", padding=True, truncation=False).to(_infer_model_device(mdl))\n",
+        "    if eval:\n",
+        "        gen_kwargs = dict(max_new_tokens=max_new_tokens, pad_token_id=tok.pad_token_id, do_sample=False)\n",
+        "    else:\n",
+        "        gen_kwargs = dict(max_new_tokens=max_new_tokens, pad_token_id=tok.pad_token_id,\n",
+        "                          do_sample=True, temperature=0.9, top_p=0.95)\n",
         "    with torch.no_grad():\n",
         "        out = mdl.generate(**enc, **gen_kwargs)\n",
         "    resps = tok.batch_decode(out[:, enc[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
         "        f.write(json.dumps(rec) + \"\\n\")\n",
         "\n",
         "\n",
+        "DISCOVERY_SUFFIX = \"\\n\\nPHASE A (DISCOVERY): respond with JSON {\\\"tool_calls\\\": [...]} only.\"\n",
+        "PLANNING_SUFFIX = \"\\n\\nPHASE B (PLANNING): respond with JSON {\\\"scheduled_actions\\\": [...], \\\"notes\\\": \\\"...\\\"} using the fresh Tool results above.\"\n",
+        "\n",
+        "\n",
+        "def _parse_tool_calls_only(text):\n",
+        "    return parse_model_output(text).tool_calls\n",
+        "\n",
+        "\n",
+        "def _parse_actions_only(text):\n",
+        "    a = parse_model_output(text)\n",
+        "    return ViraltestAction(tool_calls=[], scheduled_actions=a.scheduled_actions, notes=a.notes)\n",
+        "\n",
+        "\n",
+        "def _format_fresh_results(fresh):\n",
+        "    if not fresh:\n",
+        "        return \"\"\n",
+        "    out = \"Fresh tool results (PHASE A):\\n\"\n",
+        "    for tr in fresh:\n",
+        "        if tr.success:\n",
+        "            out += f\"  {tr.name}: {json.dumps(tr.data)}\\n\"\n",
+        "        else:\n",
+        "            out += f\"  {tr.name}: ERROR {tr.error}\\n\"\n",
+        "    return out\n",
+        "\n",
+        "\n",
+        "def run_llm_episodes_batched(mdl, tok, tasks_seeds, verbose=True, eval=False, system=None, log_tag=None):\n",
+        "    \"\"\"Run N episodes in parallel. ReAct two-pass: discovery -> dispatch -> planning.\"\"\"\n",
+        "    sys_prompt = system or (SYSTEM_PROMPT_EVAL if eval else SYSTEM_PROMPT_TRAIN)\n",
+        "    n = len(tasks_seeds)\n",
+        "    envs = [ViraltestEnvironment() for _ in range(n)]\n",
+        "    obss = [envs[i].reset(task=t, seed=s) for i, (t, s) in enumerate(tasks_seeds)]\n",
+        "    rewards = [[] for _ in range(n)]\n",
+        "    energies = [[obs.creator_energy] for obs in obss]\n",
+        "    pairs = [[] for _ in range(n)]\n",
+        "    histories = [[] for _ in range(n)]\n",
+        "    done_mask = [obs.done for obs in obss]\n",
+        "    rest_action = ViraltestAction(scheduled_actions=[])\n",
+        "\n",
+        "    def _gen(prompts):\n",
+        "        chats = [_build_chat(sys_prompt, p) for p in prompts]\n",
+        "        texts = [tok.apply_chat_template(c, tokenize=False, add_generation_prompt=True) for c in chats]\n",
+        "        return _batched_generate(mdl, tok, texts, eval=eval)\n",
+        "\n",
+        "    for day in range(1, TASK_HORIZON + 1):\n",
+        "        active = [i for i in range(n) if not done_mask[i] and obss[i].creator_energy > 0.25]\n",
+        "        rest = [i for i in range(n) if not done_mask[i] and obss[i].creator_energy <= 0.25]\n",
+        "        if not active and not rest:\n",
+        "            break\n",
+        "\n",
+        "        actions_by_idx = {i: rest_action for i in rest}\n",
+        "        if active:\n",
+        "            base_prompts = [format_obs(obss[i], histories[i]) for i in active]\n",
+        "\n",
+        "            disc_prompts = [p + DISCOVERY_SUFFIX for p in base_prompts]\n",
+        "            disc_resps, ptok = _gen(disc_prompts)\n",
+        "            if verbose:\n",
+        "                print(f\"  D{day:2d}A: batch={len(active)} rest={len(rest)} prompt_tok={ptok}\")\n",
+        "\n",
+        "            fresh_per_active = []\n",
+        "            for j, i in enumerate(active):\n",
+        "                tcs = _parse_tool_calls_only(disc_resps[j])\n",
+        "                fresh_per_active.append([envs[i]._dispatch_tool(tc) for tc in tcs])\n",
+        "                pairs[i].append({\"prompt\": disc_prompts[j], \"response\": disc_resps[j],\n",
+        "                                 \"step\": len(rewards[i]), \"phase\": \"A\"})\n",
+        "                if log_tag is not None:\n",
+        "                    t, s = tasks_seeds[i]\n",
+        "                    _log_io(f\"{log_tag}/A\", i, day, t, s, disc_prompts[j], disc_resps[j])\n",
+        "\n",
+        "            plan_prompts = [base_prompts[j] + \"\\n\" + _format_fresh_results(fresh_per_active[j]) + PLANNING_SUFFIX\n",
+        "                            for j in range(len(active))]\n",
+        "            plan_resps, ptok2 = _gen(plan_prompts)\n",
+        "            if verbose:\n",
+        "                print(f\"  D{day:2d}B: batch={len(active)} prompt_tok={ptok2}\")\n",
+        "\n",
+        "            for j, i in enumerate(active):\n",
+        "                actions_by_idx[i] = _parse_actions_only(plan_resps[j])\n",
+        "                pairs[i].append({\"prompt\": plan_prompts[j], \"response\": plan_resps[j],\n",
+        "                                 \"step\": len(rewards[i]), \"phase\": \"B\"})\n",
+        "                if log_tag is not None:\n",
+        "                    t, s = tasks_seeds[i]\n",
+        "                    _log_io(f\"{log_tag}/B\", i, day, t, s, plan_prompts[j], plan_resps[j])\n",
+        "\n",
+        "        for i in range(n):\n",
+        "            if done_mask[i] or i not in actions_by_idx:\n",
+        "                continue\n",
+        "            act = actions_by_idx[i]\n",
+        "            obss[i] = envs[i].step(act)\n",
+        "            r = obss[i].reward or 0.0\n",
+        "            rewards[i].append(r)\n",
+        "            energies[i].append(obss[i].creator_energy)\n",
+        "            posts = [{\"hour\": s.hour, \"content_type\": s.content_type or \"?\", \"intent\": s.intent or \"?\"}\n",
+        "                     for s in (act.scheduled_actions or []) if s.action_type == \"post\"]\n",
+        "            for h in histories[i]:\n",
+        "                h[\"ago\"] += 1\n",
+        "            histories[i].append({\"ago\": 1, \"posts\": posts, \"reward\": r})\n",
+        "            histories[i] = histories[i][-3:]\n",
+        "            if obss[i].done:\n",
+        "                done_mask[i] = True\n",
         "\n",
         "    GAMMA, TERMINAL_W = 0.95, 5.0\n",
         "    results = []\n",
         "\n",
         "\n",
         "print(\"LLM agent functions defined (batched).\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 9: Run untrained model (batched: all 3 tasks in parallel envs)\n",
         "print(\"Running UNTRAINED base model on all tasks (batched)...\")\n",
         "print(f\"BEFORE TRAINING (took {time.time()-t0:.1f}s):\")\n",
         "for t in TASKS:\n",
         "    print(f\"  {t}: grader={before_results[t]['grader_score']:.4f}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 10: Attach LoRA adapter\n",
         "from peft import LoraConfig, get_peft_model, TaskType\n",
         "model.enable_input_require_grads()\n",
         "peft_model = get_peft_model(model, lora_config)\n",
         "peft_model.print_trainable_parameters()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 11: Training loop\n",
         "from trl import SFTTrainer, SFTConfig\n",
         "from datasets import Dataset\n",
         "\n",
+        "NUM_ROUNDS = 2\n",
         "EPISODES_PER_ROUND = 6\n",
+        "QUALITY_FLOOR = 0.0  # 0 = always run SFT on positive-advantage samples\n",
         "\n",
         "training_log = {\n",
         "    \"round\": [], \"avg_episode_reward\": [], \"max_episode_reward\": [],\n",
         "elapsed = time.time() - t_start\n",
         "print(f\"\\nTraining complete in {elapsed/60:.1f} min\")\n",
         "print(pd.DataFrame(training_log).to_string(index=False))"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 12: Run trained model (batched)\n",
         "print(\"Running TRAINED model on all tasks (batched)...\")\n",
         "print(f\"AFTER TRAINING (took {time.time()-t0:.1f}s):\")\n",
         "for t in TASKS:\n",
         "    print(f\"  {t}: grader={after_results[t]['grader_score']:.4f}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 13: Training curves\n",
         "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/reward_curve.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 14: Before vs After\n",
         "task_labels = [t.replace('monthly_', '').title() for t in TASKS]\n",
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/before_after.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 15: Trajectory comparison\n",
         "fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n",
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/training_trajectories.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 16: Final summary\n",
         "print(\"=\" * 67)\n",
         "\n",
         "print(f\"\\nSaved to {PLOTS_DIR}/\")\n",
         "print(\"All results are from real LoRA weight updates on real environment runs.\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 17: Save adapter\n",
         "save_path = \"./viraltest_trained_adapter\"\n",
         "tokenizer.save_pretrained(save_path)\n",
         "print(f\"LoRA adapter saved to {save_path}\")\n",
         "print(\"Load with: PeftModel.from_pretrained(base_model, save_path)\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     }
   ],
   "metadata": {
   },
   "nbformat": 4,
   "nbformat_minor": 4
+}