Spaces:

ycwhencpp
/

final-iteration

Paused

vaibhav12332112312 committed 12 days ago

Commit

a6b8df0

1 Parent(s): 81cdb34

train: batched parallel rollouts on Qwen2.5-3B + parser hardening

- Qwen2.5-3B-Instruct in bf16 + flash-attn-2 (sdpa fallback)
- New run_llm_episodes_batched: N parallel envs, one batched generate per day
(~10x faster rollouts than sequential)
- parse_model_output: per-tool-call try/except so a malformed `arguments` no
longer wipes the whole action (root cause of post-train follower collapse)
- is_well_formed_response filter on SFT data
- SFT: max_length=4096, batch=4 x accum=2, bf16
- Per-step credit assignment for SFT sample weights

Made-with: Cursor

Files changed (1) hide show

training/train_grpo.ipynb +251 -203

training/train_grpo.ipynb CHANGED Viewed

@@ -25,23 +25,22 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 1: Install dependencies (quote versions — zsh treats `>` as redirect otherwise)\n",
         "!pip install -q torch torchvision torchaudio\n",
-        "!pip install -q \"transformers>=4.45.0\" \"accelerate\" \"peft>=0.10.0\" \"trl>=0.20.0\" \"datasets\" \"bitsandbytes\"\n",
         "!pip install -q matplotlib pandas\n",
         "!pip install -q \"typing_extensions>=4.13.0\" pydantic httpx\n",
-        "!pip install -q \"openenv-core[core]>=0.2.2\""
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 2: Resolve repo path (Colab: fresh clone. Local: auto-detect project root)\n",
         "import os\n",
@@ -117,13 +116,13 @@
         "print(f\"Branch: {REPO_BRANCH}\")\n",
         "print(f\"Commit: {commit}\")\n",
         "print(f\"Plots dir: {PLOTS_DIR}\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 3: Imports (with runtime validation)\n",
         "import json, random, time, textwrap, copy, os, sys\n",
@@ -177,7 +176,9 @@
         "import ast\n",
         "ast.parse(\"def _t(x: int) -> str: return f'{x}'\")\n",
         "print(\"OK: ast.parse (syntax check)\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -190,9 +191,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 4: Define heuristic agents + episode runner\n",
         "_rng = random.Random(42)\n",
@@ -269,13 +268,13 @@
         "            \"rewards\": rewards, \"energies\": energies}\n",
         "\n",
         "print(\"Agents and episode runner defined.\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 5: Run baselines (safe)\n",
         "print(\"Running heuristic baselines (5 agents × 3 tasks)...\")\n",
@@ -310,13 +309,13 @@
         "for name in BASELINE_AGENTS:\n",
         "    scores = [baseline_results[name][t][\"grader_score\"] for t in TASKS]\n",
         "    print(f\"{name:<14s} {scores[0]:>10.4f} {scores[1]:>12.4f} {scores[2]:>14.4f} {sum(scores)/3:>8.4f}\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 6: Baseline plots\n",
         "fig, axes = plt.subplots(1, 3, figsize=(16, 5), sharey=True)\n",
@@ -334,7 +333,9 @@
         "fig.tight_layout()\n",
         "fig.savefig(f\"{PLOTS_DIR}/baseline_leaderboard.png\", dpi=150, bbox_inches='tight')\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -347,80 +348,57 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
-        "# Cell 7: Load model (4-bit on CUDA Colab; fp16/fp32 fallback if bitsandbytes missing)\n",
         "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
         "\n",
-        "MODEL_NAME = \"Qwen/Qwen2.5-1.5B-Instruct\"\n",
         "\n",
         "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
         "\n",
-        "_use_4bit = False\n",
-        "try:\n",
-        "    from transformers.utils import is_bitsandbytes_available\n",
-        "except Exception:  # older transformers\n",
-        "    def is_bitsandbytes_available():\n",
-        "        try:\n",
-        "            import bitsandbytes  # noqa: F401\n",
-        "            return True\n",
-        "        except ImportError:\n",
-        "            return False\n",
-        "\n",
-        "if torch.cuda.is_available() and is_bitsandbytes_available():\n",
-        "    from transformers import BitsAndBytesConfig\n",
-        "    _use_4bit = True\n",
-        "\n",
-        "if _use_4bit:\n",
-        "    print(f\"Loading {MODEL_NAME} (4-bit quantized, CUDA)...\")\n",
-        "    bnb_config = BitsAndBytesConfig(\n",
-        "        load_in_4bit=True,\n",
-        "        bnb_4bit_quant_type=\"nf4\",\n",
-        "        bnb_4bit_compute_dtype=torch.float16,\n",
-        "        bnb_4bit_use_double_quant=True,\n",
-        "    )\n",
-        "    model = AutoModelForCausalLM.from_pretrained(\n",
-        "        MODEL_NAME,\n",
-        "        trust_remote_code=True,\n",
-        "        quantization_config=bnb_config,\n",
-        "        device_map=\"auto\",\n",
-        "    )\n",
         "else:\n",
-        "    print(\n",
-        "        f\"Loading {MODEL_NAME} without 4-bit (bitsandbytes/CUDA unavailable).\\n\"\n",
-        "        \"  On Colab: run `pip install -U bitsandbytes>=0.46.1` and use a GPU runtime.\\n\"\n",
-        "        \"  On Mac: use fp16 on MPS or fp32 on CPU.\"\n",
-        "    )\n",
-        "    dtype = torch.float16 if (torch.cuda.is_available() or getattr(torch.backends, \"mps\", None) and torch.backends.mps.is_available()) else torch.float32\n",
-        "    model = AutoModelForCausalLM.from_pretrained(\n",
-        "        MODEL_NAME,\n",
-        "        trust_remote_code=True,\n",
-        "        dtype=dtype,\n",
-        "        device_map=\"auto\" if torch.cuda.is_available() else None,\n",
-        "    )\n",
-        "    if not torch.cuda.is_available():\n",
-        "        if getattr(torch.backends, \"mps\", None) and torch.backends.mps.is_available():\n",
-        "            model = model.to(\"mps\")\n",
-        "        else:\n",
-        "            model = model.to(\"cpu\")\n",
         "\n",
         "model.eval()\n",
-        "print(f\"Model loaded. dtype={next(model.parameters()).dtype}\")\n",
-        "try:\n",
-        "    print(f\"Device: {model.device}\")\n",
-        "except Exception:\n",
-        "    print(\"Device: (see first parameter device)\")\n",
         "if torch.cuda.is_available():\n",
         "    print(f\"CUDA memory: {torch.cuda.memory_allocated()/1e9:.2f} GB\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 8: LLM agent functions\n",
         "SYSTEM_PROMPT = textwrap.dedent(\"\"\"\\\n",
@@ -468,6 +446,21 @@
         "            f\"Plan your actions (JSON only):\")\n",
         "\n",
         "\n",
         "def parse_model_output(text):\n",
         "    text = text.strip()\n",
         "    if \"```\" in text:\n",
@@ -478,24 +471,33 @@
         "        text = text[start:end]\n",
         "    try:\n",
         "        data = json.loads(text)\n",
-        "        tool_calls = [ToolCall(name=tc[\"name\"], arguments=tc.get(\"arguments\", {}))\n",
-        "                      for tc in data.get(\"tool_calls\", []) if isinstance(tc, dict) and \"name\" in tc]\n",
-        "        scheduled = []\n",
-        "        for a in data.get(\"scheduled_actions\", []):\n",
-        "            try:\n",
-        "                scheduled.append(ScheduledAction(**a))\n",
-        "            except Exception:\n",
-        "                # Same as original bare `except:`: skip invalid scheduled_actions entries\n",
-        "                pass\n",
-        "        return ViraltestAction(\n",
-        "            tool_calls=tool_calls,\n",
-        "            scheduled_actions=scheduled,\n",
-        "            replies=data.get(\"replies\", []),\n",
-        "            notes=data.get(\"notes\"),\n",
-        "        )\n",
         "    except Exception:\n",
-        "        # Same behavior as original bare `except:`: any parse/validation failure -> empty action\n",
         "        return ViraltestAction(scheduled_actions=[])\n",
         "\n",
         "\n",
         "def _infer_model_device(m):\n",
@@ -509,53 +511,101 @@
         "    return torch.device(\"cpu\")\n",
         "\n",
         "\n",
-        "def generate_action(mdl, tok, obs, history, temperature=0.7):\n",
-        "    prompt = format_obs(obs)\n",
-        "    messages = [{\"role\": \"system\", \"content\": SYSTEM_PROMPT}]\n",
-        "    messages.extend(history[-4:])\n",
-        "    messages.append({\"role\": \"user\", \"content\": prompt})\n",
-        "    text_input = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
-        "    inputs = tok(text_input, return_tensors=\"pt\").to(_infer_model_device(mdl))\n",
         "    with torch.no_grad():\n",
-        "        out = mdl.generate(**inputs, max_new_tokens=512, temperature=temperature,\n",
-        "                           do_sample=True, top_p=0.9, pad_token_id=tok.eos_token_id)\n",
-        "    resp = tok.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
-        "    return resp, parse_model_output(resp)\n",
         "\n",
         "\n",
         "def run_llm_episode(mdl, tok, task, seed=42, verbose=False):\n",
-        "    env = ViraltestEnvironment()\n",
-        "    obs = env.reset(task=task, seed=seed)\n",
-        "    rewards, energies = [], [obs.creator_energy]\n",
-        "    history, pairs = [], []\n",
-        "    for day in range(1, TASK_HORIZON + 1):\n",
-        "        if obs.done: break\n",
-        "        if obs.creator_energy <= 0.25:\n",
-        "            action = ViraltestAction(scheduled_actions=[])\n",
-        "            resp = '{\"scheduled_actions\": []}'\n",
-        "        else:\n",
-        "            resp, action = generate_action(mdl, tok, obs, history)\n",
-        "        prompt = format_obs(obs)\n",
-        "        pairs.append({\"prompt\": prompt, \"response\": resp})\n",
-        "        obs = env.step(action)\n",
-        "        r = obs.reward or 0.0\n",
-        "        rewards.append(r)\n",
-        "        energies.append(obs.creator_energy)\n",
-        "        history.extend([{\"role\": \"user\", \"content\": prompt},\n",
-        "                        {\"role\": \"assistant\", \"content\": resp}])\n",
-        "        if verbose:\n",
-        "            n_p = len([s for s in action.scheduled_actions if s.action_type==\"post\"])\n",
-        "            print(f\"    Day {day:2d}: r={r:.4f} e={obs.creator_energy:.2f} posts={n_p} tools={len(action.tool_calls)}\")\n",
-        "        if obs.done: break\n",
-        "    gs = (obs.metadata or {}).get(\"grader_score\", 0.0)\n",
-        "    return {\"task\": task, \"grader_score\": gs, \"total_reward\": sum(rewards),\n",
-        "            \"final_energy\": obs.creator_energy, \"rewards\": rewards,\n",
-        "            \"energies\": energies, \"pairs\": pairs,\n",
-        "            \"follower_delta\": obs.follower_count - 10000,\n",
-        "            \"burned_out\": obs.creator_energy <= 0}\n",
         "\n",
-        "print(\"LLM agent functions defined.\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -568,26 +618,23 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
-        "# Cell 9: Run untrained model\n",
-        "print(\"Running UNTRAINED base model on all tasks...\")\n",
         "print(\"=\" * 60)\n",
         "\n",
-        "before_results = {}\n",
-        "for task in TASKS:\n",
-        "    print(f\"\\n  Task: {task}\")\n",
-        "    result = run_llm_episode(model, tokenizer, task, seed=42, verbose=True)\n",
-        "    before_results[task] = result\n",
-        "    print(f\"  => grader={result['grader_score']:.4f} reward={result['total_reward']:.3f}\")\n",
         "\n",
         "print(\"\\n\" + \"=\" * 60)\n",
-        "print(\"BEFORE TRAINING:\")\n",
         "for t in TASKS:\n",
         "    print(f\"  {t}: grader={before_results[t]['grader_score']:.4f}\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -606,9 +653,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 10: Attach LoRA adapter\n",
         "from peft import LoraConfig, get_peft_model, TaskType\n",
@@ -623,13 +668,13 @@
         "model.enable_input_require_grads()\n",
         "peft_model = get_peft_model(model, lora_config)\n",
         "peft_model.print_trainable_parameters()"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 11: Training loop\n",
         "from trl import SFTTrainer, SFTConfig\n",
@@ -652,35 +697,39 @@
         "    print(f\"TRAINING ROUND {round_idx}/{NUM_ROUNDS}\")\n",
         "    print(f\"{'=' * 60}\")\n",
         "\n",
-        "    # Collect episodes\n",
         "    peft_model.eval()\n",
-        "    all_pairs, episode_rewards, episode_graders = [], [], []\n",
         "\n",
-        "    for ep in range(EPISODES_PER_ROUND):\n",
-        "        task = TASKS[ep % len(TASKS)]\n",
-        "        seed = 42 + (round_idx - 1) * 100 + ep\n",
-        "        result = run_llm_episode(peft_model, tokenizer, task, seed=seed)\n",
         "        ep_reward = result[\"total_reward\"] + 2.0 * result[\"grader_score\"]\n",
         "        episode_rewards.append(ep_reward)\n",
         "        episode_graders.append(result[\"grader_score\"])\n",
-        "\n",
         "        for pr in result[\"pairs\"]:\n",
         "            text = (f\"<|im_start|>system\\n{SYSTEM_PROMPT}<|im_end|>\\n\"\n",
         "                    f\"<|im_start|>user\\n{pr['prompt']}<|im_end|>\\n\"\n",
         "                    f\"<|im_start|>assistant\\n{pr['response']}<|im_end|>\")\n",
-        "            all_pairs.append({\"text\": text, \"reward\": ep_reward})\n",
-        "\n",
-        "        print(f\"  ep {ep+1}/{EPISODES_PER_ROUND}: {task.split('_')[-1]:>11s} \"\n",
-        "              f\"grader={result['grader_score']:.4f} reward={ep_reward:.3f}\")\n",
-        "\n",
-        "    avg_r = np.mean(episode_rewards)\n",
-        "    avg_g = np.mean(episode_graders)\n",
-        "    print(f\"  Avg reward={avg_r:.3f} Avg grader={avg_g:.4f}\")\n",
         "\n",
-        "    # Filter to top-K\n",
         "    threshold = np.percentile([p[\"reward\"] for p in all_pairs], (1 - TOP_K_FRACTION) * 100)\n",
         "    filtered = [p for p in all_pairs if p[\"reward\"] >= threshold] or all_pairs\n",
-        "    print(f\"  Filtered to {len(filtered)}/{len(all_pairs)} samples\")\n",
         "\n",
         "    dataset = Dataset.from_list([{\"text\": p[\"text\"]} for p in filtered])\n",
         "\n",
@@ -688,14 +737,14 @@
         "    sft_config = SFTConfig(\n",
         "        output_dir=f\"./checkpoints/round_{round_idx}\",\n",
         "        num_train_epochs=2,\n",
-        "        per_device_train_batch_size=1,\n",
-        "        gradient_accumulation_steps=4,\n",
         "        learning_rate=2e-5,\n",
         "        warmup_steps=5,\n",
         "        logging_steps=5,\n",
         "        save_strategy=\"no\",\n",
-        "        max_length=1024,\n",
-        "        fp16=True,\n",
         "        report_to=\"none\",\n",
         "    )\n",
         "\n",
@@ -720,7 +769,9 @@
         "elapsed = time.time() - t_start\n",
         "print(f\"\\nTraining complete in {elapsed/60:.1f} min\")\n",
         "print(pd.DataFrame(training_log).to_string(index=False))"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -733,27 +784,24 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
-        "# Cell 12: Run trained model\n",
-        "print(\"Running TRAINED model on all tasks...\")\n",
         "print(\"=\" * 60)\n",
         "\n",
         "peft_model.eval()\n",
-        "after_results = {}\n",
-        "for task in TASKS:\n",
-        "    print(f\"\\n  Task: {task}\")\n",
-        "    result = run_llm_episode(peft_model, tokenizer, task, seed=42, verbose=True)\n",
-        "    after_results[task] = result\n",
-        "    print(f\"  => grader={result['grader_score']:.4f} reward={result['total_reward']:.3f}\")\n",
         "\n",
         "print(\"\\n\" + \"=\" * 60)\n",
-        "print(\"AFTER TRAINING:\")\n",
         "for t in TASKS:\n",
         "    print(f\"  {t}: grader={after_results[t]['grader_score']:.4f}\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -764,9 +812,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 13: Training curves\n",
         "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
@@ -788,13 +834,13 @@
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/reward_curve.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 14: Before vs After\n",
         "task_labels = [t.replace('monthly_', '').title() for t in TASKS]\n",
@@ -824,13 +870,13 @@
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/before_after.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 15: Trajectory comparison\n",
         "fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n",
@@ -854,7 +900,9 @@
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/training_trajectories.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -865,9 +913,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 16: Final summary\n",
         "print(\"=\" * 67)\n",
@@ -904,13 +950,13 @@
         "\n",
         "print(f\"\\nSaved to {PLOTS_DIR}/\")\n",
         "print(\"All results are from real LoRA weight updates on real environment runs.\")"
-      ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "# Cell 17: Save adapter\n",
         "save_path = \"./viraltest_trained_adapter\"\n",
@@ -918,7 +964,9 @@
         "tokenizer.save_pretrained(save_path)\n",
         "print(f\"LoRA adapter saved to {save_path}\")\n",
         "print(\"Load with: PeftModel.from_pretrained(base_model, save_path)\")"
-      ]
     }
   ],
   "metadata": {
@@ -944,4 +992,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 4
-}

     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 1: Install dependencies (quote versions — zsh treats `>` as redirect otherwise)\n",
         "!pip install -q torch torchvision torchaudio\n",
+        "!pip install -q \"transformers>=4.45.0\" \"accelerate\" \"peft>=0.10.0\" \"trl>=0.20.0\" \"datasets\"\n",
         "!pip install -q matplotlib pandas\n",
         "!pip install -q \"typing_extensions>=4.13.0\" pydantic httpx\n",
+        "!pip install -q \"openenv-core[core]>=0.2.2\"\n",
+        "!pip install -q flash-attn --no-build-isolation || echo \"flash-attn install skipped; will use sdpa\""
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 2: Resolve repo path (Colab: fresh clone. Local: auto-detect project root)\n",
         "import os\n",
         "print(f\"Branch: {REPO_BRANCH}\")\n",
         "print(f\"Commit: {commit}\")\n",
         "print(f\"Plots dir: {PLOTS_DIR}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 3: Imports (with runtime validation)\n",
         "import json, random, time, textwrap, copy, os, sys\n",
         "import ast\n",
         "ast.parse(\"def _t(x: int) -> str: return f'{x}'\")\n",
         "print(\"OK: ast.parse (syntax check)\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 4: Define heuristic agents + episode runner\n",
         "_rng = random.Random(42)\n",
         "            \"rewards\": rewards, \"energies\": energies}\n",
         "\n",
         "print(\"Agents and episode runner defined.\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 5: Run baselines (safe)\n",
         "print(\"Running heuristic baselines (5 agents × 3 tasks)...\")\n",
         "for name in BASELINE_AGENTS:\n",
         "    scores = [baseline_results[name][t][\"grader_score\"] for t in TASKS]\n",
         "    print(f\"{name:<14s} {scores[0]:>10.4f} {scores[1]:>12.4f} {scores[2]:>14.4f} {sum(scores)/3:>8.4f}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 6: Baseline plots\n",
         "fig, axes = plt.subplots(1, 3, figsize=(16, 5), sharey=True)\n",
         "fig.tight_layout()\n",
         "fig.savefig(f\"{PLOTS_DIR}/baseline_leaderboard.png\", dpi=150, bbox_inches='tight')\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
+        "# Cell 7: Load model (Qwen2.5-3B bf16 on CUDA + flash-attn-2; fp16/fp32 fallback)\n",
         "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
         "\n",
+        "MODEL_NAME = \"Qwen/Qwen2.5-3B-Instruct\"\n",
         "\n",
         "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
+        "if tokenizer.pad_token is None:\n",
+        "    tokenizer.pad_token = tokenizer.eos_token\n",
+        "tokenizer.padding_side = \"left\"\n",
         "\n",
+        "\n",
+        "def _has_flash_attn():\n",
+        "    try:\n",
+        "        import flash_attn  # noqa: F401\n",
+        "        return torch.cuda.is_available()\n",
+        "    except Exception:\n",
+        "        return False\n",
+        "\n",
+        "\n",
+        "if torch.cuda.is_available():\n",
+        "    dtype = torch.bfloat16\n",
+        "    attn_impl = \"flash_attention_2\" if _has_flash_attn() else \"sdpa\"\n",
+        "elif getattr(torch.backends, \"mps\", None) and torch.backends.mps.is_available():\n",
+        "    dtype, attn_impl = torch.float16, \"sdpa\"\n",
         "else:\n",
+        "    dtype, attn_impl = torch.float32, \"eager\"\n",
+        "\n",
+        "print(f\"Loading {MODEL_NAME} (dtype={dtype}, attn={attn_impl})...\")\n",
+        "model = AutoModelForCausalLM.from_pretrained(\n",
+        "    MODEL_NAME,\n",
+        "    trust_remote_code=True,\n",
+        "    dtype=dtype,\n",
+        "    attn_implementation=attn_impl,\n",
+        "    device_map=\"cuda:0\" if torch.cuda.is_available() else None,\n",
+        ")\n",
+        "if not torch.cuda.is_available():\n",
+        "    model = model.to(\"mps\") if (getattr(torch.backends, \"mps\", None) and torch.backends.mps.is_available()) else model.to(\"cpu\")\n",
         "\n",
         "model.eval()\n",
+        "print(f\"Model loaded. dtype={next(model.parameters()).dtype} device={next(model.parameters()).device}\")\n",
         "if torch.cuda.is_available():\n",
         "    print(f\"CUDA memory: {torch.cuda.memory_allocated()/1e9:.2f} GB\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 8: LLM agent functions\n",
         "SYSTEM_PROMPT = textwrap.dedent(\"\"\"\\\n",
         "            f\"Plan your actions (JSON only):\")\n",
         "\n",
         "\n",
+        "def is_well_formed_response(text):\n",
+        "    try:\n",
+        "        t = text.strip()\n",
+        "        if \"```\" in t:\n",
+        "            t = \"\\n\".join(l for l in t.split(\"\\n\") if not l.strip().startswith(\"```\")).strip()\n",
+        "        s, e = t.find(\"{\"), t.rfind(\"}\") + 1\n",
+        "        d = json.loads(t[s:e])\n",
+        "        for tc in d.get(\"tool_calls\", []):\n",
+        "            if not isinstance(tc, dict) or not isinstance(tc.get(\"arguments\", {}), dict):\n",
+        "                return False\n",
+        "        return True\n",
+        "    except Exception:\n",
+        "        return False\n",
+        "\n",
+        "\n",
         "def parse_model_output(text):\n",
         "    text = text.strip()\n",
         "    if \"```\" in text:\n",
         "        text = text[start:end]\n",
         "    try:\n",
         "        data = json.loads(text)\n",
         "    except Exception:\n",
         "        return ViraltestAction(scheduled_actions=[])\n",
+        "    tool_calls = []\n",
+        "    for tc in data.get(\"tool_calls\", []):\n",
+        "        if not isinstance(tc, dict) or \"name\" not in tc:\n",
+        "            continue\n",
+        "        args = tc.get(\"arguments\", {})\n",
+        "        if isinstance(args, list) and args and isinstance(args[0], dict):\n",
+        "            args = args[0]\n",
+        "        if not isinstance(args, dict):\n",
+        "            continue\n",
+        "        try:\n",
+        "            tool_calls.append(ToolCall(name=tc[\"name\"], arguments=args))\n",
+        "        except Exception:\n",
+        "            pass\n",
+        "    scheduled = []\n",
+        "    for a in data.get(\"scheduled_actions\", []):\n",
+        "        try:\n",
+        "            scheduled.append(ScheduledAction(**a))\n",
+        "        except Exception:\n",
+        "            pass\n",
+        "    return ViraltestAction(\n",
+        "        tool_calls=tool_calls,\n",
+        "        scheduled_actions=scheduled,\n",
+        "        replies=data.get(\"replies\", []),\n",
+        "        notes=data.get(\"notes\"),\n",
+        "    )\n",
         "\n",
         "\n",
         "def _infer_model_device(m):\n",
         "    return torch.device(\"cpu\")\n",
         "\n",
         "\n",
+        "def _build_chat(history, prompt):\n",
+        "    msgs = [{\"role\": \"system\", \"content\": SYSTEM_PROMPT}]\n",
+        "    msgs.extend(history[-14:])\n",
+        "    msgs.append({\"role\": \"user\", \"content\": prompt})\n",
+        "    return msgs\n",
+        "\n",
+        "\n",
+        "def _batched_generate(mdl, tok, prompts, temperature=0.7, max_new_tokens=512):\n",
+        "    enc = tok(prompts, return_tensors=\"pt\", padding=True, truncation=False).to(_infer_model_device(mdl))\n",
         "    with torch.no_grad():\n",
+        "        out = mdl.generate(\n",
+        "            **enc, max_new_tokens=max_new_tokens, temperature=temperature,\n",
+        "            do_sample=True, top_p=0.9, pad_token_id=tok.pad_token_id,\n",
+        "        )\n",
+        "    resps = tok.batch_decode(out[:, enc[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
+        "    return resps, enc[\"input_ids\"].shape[1]\n",
+        "\n",
+        "\n",
+        "def run_llm_episodes_batched(mdl, tok, tasks_seeds, verbose=True):\n",
+        "    \"\"\"Run N episodes in parallel. tasks_seeds: list of (task, seed). One batched generate per day.\"\"\"\n",
+        "    n = len(tasks_seeds)\n",
+        "    envs = [ViraltestEnvironment() for _ in range(n)]\n",
+        "    obss = [envs[i].reset(task=t, seed=s) for i, (t, s) in enumerate(tasks_seeds)]\n",
+        "    histories = [[] for _ in range(n)]\n",
+        "    rewards = [[] for _ in range(n)]\n",
+        "    energies = [[obs.creator_energy] for obs in obss]\n",
+        "    pairs = [[] for _ in range(n)]\n",
+        "    done_mask = [obs.done for obs in obss]\n",
+        "    rest_resp = '{\"scheduled_actions\": []}'\n",
+        "\n",
+        "    for day in range(1, TASK_HORIZON + 1):\n",
+        "        active = [i for i in range(n) if not done_mask[i] and obss[i].creator_energy > 0.25]\n",
+        "        rest = [i for i in range(n) if not done_mask[i] and obss[i].creator_energy <= 0.25]\n",
+        "        if not active and not rest:\n",
+        "            break\n",
+        "\n",
+        "        resps_by_idx = {}\n",
+        "        if active:\n",
+        "            prompts = [format_obs(obss[i]) for i in active]\n",
+        "            chats = [_build_chat(histories[i], p) for i, p in zip(active, prompts)]\n",
+        "            texts = [tok.apply_chat_template(c, tokenize=False, add_generation_prompt=True) for c in chats]\n",
+        "            resps, ptok = _batched_generate(mdl, tok, texts)\n",
+        "            if verbose:\n",
+        "                print(f\"  D{day:2d}: batch={len(active)} rest={len(rest)} prompt_tok={ptok}\")\n",
+        "            for j, i in enumerate(active):\n",
+        "                resps_by_idx[i] = (resps[j], prompts[j])\n",
+        "        for i in rest:\n",
+        "            resps_by_idx[i] = (rest_resp, format_obs(obss[i]))\n",
+        "\n",
+        "        for i in range(n):\n",
+        "            if done_mask[i] or i not in resps_by_idx:\n",
+        "                continue\n",
+        "            resp, prompt = resps_by_idx[i]\n",
+        "            action = parse_model_output(resp)\n",
+        "            pairs[i].append({\"prompt\": prompt, \"response\": resp})\n",
+        "            obss[i] = envs[i].step(action)\n",
+        "            r = obss[i].reward or 0.0\n",
+        "            rewards[i].append(r)\n",
+        "            energies[i].append(obss[i].creator_energy)\n",
+        "            histories[i].extend([\n",
+        "                {\"role\": \"user\", \"content\": prompt},\n",
+        "                {\"role\": \"assistant\", \"content\": resp},\n",
+        "            ])\n",
+        "            if obss[i].done:\n",
+        "                done_mask[i] = True\n",
+        "\n",
+        "    GAMMA, TERMINAL_W = 0.95, 5.0\n",
+        "    results = []\n",
+        "    for i, (task, seed) in enumerate(tasks_seeds):\n",
+        "        gs = (obss[i].metadata or {}).get(\"grader_score\", 0.0)\n",
+        "        rets = [0.0] * len(rewards[i])\n",
+        "        G = gs * TERMINAL_W\n",
+        "        for t in reversed(range(len(rewards[i]))):\n",
+        "            G = rewards[i][t] + GAMMA * G\n",
+        "            rets[t] = G\n",
+        "        for k, pr in enumerate(pairs[i]):\n",
+        "            pr[\"return\"] = rets[k] if k < len(rets) else 0.0\n",
+        "        results.append({\n",
+        "            \"task\": task, \"seed\": seed, \"grader_score\": gs,\n",
+        "            \"total_reward\": sum(rewards[i]), \"final_energy\": obss[i].creator_energy,\n",
+        "            \"rewards\": rewards[i], \"returns\": rets, \"energies\": energies[i],\n",
+        "            \"pairs\": pairs[i], \"follower_delta\": obss[i].follower_count - 10000,\n",
+        "            \"burned_out\": obss[i].creator_energy <= 0,\n",
+        "        })\n",
+        "    return results\n",
         "\n",
         "\n",
         "def run_llm_episode(mdl, tok, task, seed=42, verbose=False):\n",
+        "    return run_llm_episodes_batched(mdl, tok, [(task, seed)], verbose=verbose)[0]\n",
         "\n",
+        "\n",
+        "print(\"LLM agent functions defined (batched).\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
+        "# Cell 9: Run untrained model (batched: all 3 tasks in parallel envs)\n",
+        "print(\"Running UNTRAINED base model on all tasks (batched)...\")\n",
         "print(\"=\" * 60)\n",
         "\n",
+        "t0 = time.time()\n",
+        "results = run_llm_episodes_batched(model, tokenizer, [(t, 42) for t in TASKS], verbose=True)\n",
+        "before_results = {r[\"task\"]: r for r in results}\n",
         "\n",
         "print(\"\\n\" + \"=\" * 60)\n",
+        "print(f\"BEFORE TRAINING (took {time.time()-t0:.1f}s):\")\n",
         "for t in TASKS:\n",
         "    print(f\"  {t}: grader={before_results[t]['grader_score']:.4f}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 10: Attach LoRA adapter\n",
         "from peft import LoraConfig, get_peft_model, TaskType\n",
         "model.enable_input_require_grads()\n",
         "peft_model = get_peft_model(model, lora_config)\n",
         "peft_model.print_trainable_parameters()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 11: Training loop\n",
         "from trl import SFTTrainer, SFTConfig\n",
         "    print(f\"TRAINING ROUND {round_idx}/{NUM_ROUNDS}\")\n",
         "    print(f\"{'=' * 60}\")\n",
         "\n",
         "    peft_model.eval()\n",
+        "    tasks_seeds = [(TASKS[ep % len(TASKS)], 42 + (round_idx - 1) * 100 + ep) for ep in range(EPISODES_PER_ROUND)]\n",
+        "    t_roll = time.time()\n",
+        "    results = run_llm_episodes_batched(peft_model, tokenizer, tasks_seeds, verbose=False)\n",
+        "    print(f\"  Rollouts: {len(results)} eps × {TASK_HORIZON} days in {time.time()-t_roll:.1f}s\")\n",
         "\n",
+        "    all_pairs, episode_rewards, episode_graders = [], [], []\n",
+        "    for ep, result in enumerate(results):\n",
         "        ep_reward = result[\"total_reward\"] + 2.0 * result[\"grader_score\"]\n",
         "        episode_rewards.append(ep_reward)\n",
         "        episode_graders.append(result[\"grader_score\"])\n",
+        "        kept = 0\n",
         "        for pr in result[\"pairs\"]:\n",
+        "            if not is_well_formed_response(pr[\"response\"]):\n",
+        "                continue\n",
         "            text = (f\"<|im_start|>system\\n{SYSTEM_PROMPT}<|im_end|>\\n\"\n",
         "                    f\"<|im_start|>user\\n{pr['prompt']}<|im_end|>\\n\"\n",
         "                    f\"<|im_start|>assistant\\n{pr['response']}<|im_end|>\")\n",
+        "            all_pairs.append({\"text\": text, \"reward\": pr[\"return\"]})\n",
+        "            kept += 1\n",
+        "        print(f\"  ep {ep+1}/{EPISODES_PER_ROUND}: {result['task'].split('_')[-1]:>11s} \"\n",
+        "              f\"grader={result['grader_score']:.4f} reward={ep_reward:.3f} kept={kept}/{len(result['pairs'])}\")\n",
+        "\n",
+        "    avg_r = float(np.mean(episode_rewards))\n",
+        "    avg_g = float(np.mean(episode_graders))\n",
+        "    print(f\"  Avg reward={avg_r:.3f} Avg grader={avg_g:.4f} | total pairs={len(all_pairs)}\")\n",
+        "    if not all_pairs:\n",
+        "        print(\"  WARNING: 0 well-formed pairs collected; skipping SFT.\")\n",
+        "        continue\n",
         "\n",
         "    threshold = np.percentile([p[\"reward\"] for p in all_pairs], (1 - TOP_K_FRACTION) * 100)\n",
         "    filtered = [p for p in all_pairs if p[\"reward\"] >= threshold] or all_pairs\n",
+        "    print(f\"  Filtered to {len(filtered)}/{len(all_pairs)} samples (return >= {threshold:.3f})\")\n",
         "\n",
         "    dataset = Dataset.from_list([{\"text\": p[\"text\"]} for p in filtered])\n",
         "\n",
         "    sft_config = SFTConfig(\n",
         "        output_dir=f\"./checkpoints/round_{round_idx}\",\n",
         "        num_train_epochs=2,\n",
+        "        per_device_train_batch_size=4,\n",
+        "        gradient_accumulation_steps=2,\n",
         "        learning_rate=2e-5,\n",
         "        warmup_steps=5,\n",
         "        logging_steps=5,\n",
         "        save_strategy=\"no\",\n",
+        "        max_length=4096,\n",
+        "        bf16=True,\n",
         "        report_to=\"none\",\n",
         "    )\n",
         "\n",
         "elapsed = time.time() - t_start\n",
         "print(f\"\\nTraining complete in {elapsed/60:.1f} min\")\n",
         "print(pd.DataFrame(training_log).to_string(index=False))"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
+        "# Cell 12: Run trained model (batched)\n",
+        "print(\"Running TRAINED model on all tasks (batched)...\")\n",
         "print(\"=\" * 60)\n",
         "\n",
         "peft_model.eval()\n",
+        "t0 = time.time()\n",
+        "results = run_llm_episodes_batched(peft_model, tokenizer, [(t, 42) for t in TASKS], verbose=True)\n",
+        "after_results = {r[\"task\"]: r for r in results}\n",
         "\n",
         "print(\"\\n\" + \"=\" * 60)\n",
+        "print(f\"AFTER TRAINING (took {time.time()-t0:.1f}s):\")\n",
         "for t in TASKS:\n",
         "    print(f\"  {t}: grader={after_results[t]['grader_score']:.4f}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 13: Training curves\n",
         "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/reward_curve.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 14: Before vs After\n",
         "task_labels = [t.replace('monthly_', '').title() for t in TASKS]\n",
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/before_after.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 15: Trajectory comparison\n",
         "fig, axes = plt.subplots(2, 3, figsize=(16, 8))\n",
         "fig.tight_layout()\n",
         "fig.savefig(f'{PLOTS_DIR}/training_trajectories.png', dpi=150, bbox_inches='tight')\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 16: Final summary\n",
         "print(\"=\" * 67)\n",
         "\n",
         "print(f\"\\nSaved to {PLOTS_DIR}/\")\n",
         "print(\"All results are from real LoRA weight updates on real environment runs.\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "# Cell 17: Save adapter\n",
         "save_path = \"./viraltest_trained_adapter\"\n",
         "tokenizer.save_pretrained(save_path)\n",
         "print(f\"LoRA adapter saved to {save_path}\")\n",
         "print(\"Load with: PeftModel.from_pretrained(base_model, save_path)\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     }
   ],
   "metadata": {
   },
   "nbformat": 4,
   "nbformat_minor": 4
+}