Spaces:
Paused
Paused
Commit ·
7db31d9
1
Parent(s): 1d82571
train_grpo: add TEST_ONLY mode to skip training and run eval+plots only
Browse files
When TEST_ONLY=1 (env var), Cell 11 short-circuits the rollout+SFT loop
so the rest of the notebook (AFTER eval, debug, plots, summary, adapter
save) runs end-to-end on a zero-init LoRA wrapper. Lets us validate the
eval+plot pipeline in ~5 min on a small GPU instead of waiting on a
multi-hour training run.
Made-with: Cursor
- training/train_grpo.ipynb +13 -3
training/train_grpo.ipynb
CHANGED
|
@@ -188,8 +188,12 @@
|
|
| 188 |
"print(\"OK: ast.parse (syntax check)\")\n",
|
| 189 |
"\n",
|
| 190 |
"SMOKE_MODE = bool(int(os.environ.get(\"SMOKE_MODE\", \"1\")))\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
"HINT_ALWAYS = True\n",
|
| 192 |
-
"print(f\"SMOKE_MODE={SMOKE_MODE} | HINT_ALWAYS={HINT_ALWAYS}\")"
|
| 193 |
],
|
| 194 |
"execution_count": null,
|
| 195 |
"outputs": []
|
|
@@ -837,8 +841,9 @@
|
|
| 837 |
"# Cell 11: Two-phase training loop (timing -> content)\n",
|
| 838 |
"# Each phase: 3 rounds (round 0 = hardcoded peak-hours hint, rounds 1-2 = normal prompt).\n",
|
| 839 |
"# Adapter persisted to ./checkpoints/phaseN_adapter/ between phases.\n",
|
| 840 |
-
"
|
| 841 |
-
"from
|
|
|
|
| 842 |
"\n",
|
| 843 |
"if SMOKE_MODE:\n",
|
| 844 |
" EPISODES_PER_ROUND = 4\n",
|
|
@@ -870,6 +875,11 @@
|
|
| 870 |
"t_start = time.time()\n",
|
| 871 |
"global_step = 0\n",
|
| 872 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 873 |
"for phase in PHASES:\n",
|
| 874 |
" phase_name = phase[\"name\"]\n",
|
| 875 |
" sys_prompt = phase[\"system\"]\n",
|
|
|
|
| 188 |
"print(\"OK: ast.parse (syntax check)\")\n",
|
| 189 |
"\n",
|
| 190 |
"SMOKE_MODE = bool(int(os.environ.get(\"SMOKE_MODE\", \"1\")))\n",
|
| 191 |
+
"# TEST_ONLY=1 skips the training loop entirely (load model -> eval -> plots).\n",
|
| 192 |
+
"# Use when you only want to verify the eval/plot pipeline on a fast small GPU.\n",
|
| 193 |
+
"# AFTER eval will then run on a zero-init LoRA wrapper (== base model behaviour).\n",
|
| 194 |
+
"TEST_ONLY = bool(int(os.environ.get(\"TEST_ONLY\", \"0\")))\n",
|
| 195 |
"HINT_ALWAYS = True\n",
|
| 196 |
+
"print(f\"SMOKE_MODE={SMOKE_MODE} | TEST_ONLY={TEST_ONLY} | HINT_ALWAYS={HINT_ALWAYS}\")"
|
| 197 |
],
|
| 198 |
"execution_count": null,
|
| 199 |
"outputs": []
|
|
|
|
| 841 |
"# Cell 11: Two-phase training loop (timing -> content)\n",
|
| 842 |
"# Each phase: 3 rounds (round 0 = hardcoded peak-hours hint, rounds 1-2 = normal prompt).\n",
|
| 843 |
"# Adapter persisted to ./checkpoints/phaseN_adapter/ between phases.\n",
|
| 844 |
+
"if not TEST_ONLY:\n",
|
| 845 |
+
" from trl import SFTTrainer, SFTConfig\n",
|
| 846 |
+
" from datasets import Dataset\n",
|
| 847 |
"\n",
|
| 848 |
"if SMOKE_MODE:\n",
|
| 849 |
" EPISODES_PER_ROUND = 4\n",
|
|
|
|
| 875 |
"t_start = time.time()\n",
|
| 876 |
"global_step = 0\n",
|
| 877 |
"\n",
|
| 878 |
+
"if TEST_ONLY:\n",
|
| 879 |
+
" print(\"TEST_ONLY=1 -> skipping training rollouts + SFT. AFTER eval will run on \"\n",
|
| 880 |
+
" \"zero-init LoRA (== base model behaviour). All plot/summary cells still execute.\")\n",
|
| 881 |
+
" PHASES = [] # empty so the for-loop below is a no-op\n",
|
| 882 |
+
"\n",
|
| 883 |
"for phase in PHASES:\n",
|
| 884 |
" phase_name = phase[\"name\"]\n",
|
| 885 |
" sys_prompt = phase[\"system\"]\n",
|