sh4shv4t committed on
Commit
213dee8
·
1 Parent(s): 64b4e71

feat: added images, new sft notebook, jobs to do grpo

Browse files
.cursorrules CHANGED
@@ -3,7 +3,7 @@
3
  ## Training Results Page
4
  - Route: `GET /train` → `dashboard/train_results.html` (served from `main.py`)
5
  - API: `GET /api/training-status` → reads `results/eval_results.json` when present; exposes plot file flags and `HF_MODEL_REPO` presence
6
- - Static: `results/` is served at `/results/` via `StaticFiles` in `main.py` (directory is created on startup if missing)
7
  - `results/` is gitignored for data; plot PNGs/HTML can be committed intentionally if the team chooses
8
  - The `/train` page uses placeholders when files are missing
9
  - Training runs in Colab / local GPU — **not** inside the Space process
 
3
  ## Training Results Page
4
  - Route: `GET /train` → `dashboard/train_results.html` (served from `main.py`)
5
  - API: `GET /api/training-status` → reads `results/eval_results.json` when present; exposes plot file flags and `HF_MODEL_REPO` presence
6
+ - Static: `results/` at `/results/`, `images/` at `/images/` (e.g. committed `images/sft_loss_curve.png`) via `StaticFiles` in `main.py`
7
  - `results/` is gitignored for data; plot PNGs/HTML can be committed intentionally if the team chooses
8
  - The `/train` page uses placeholders when files are missing
9
  - Training runs in Colab / local GPU — **not** inside the Space process
README.md CHANGED
@@ -165,6 +165,8 @@ GRPO Fine-tuning — 100 steps, G=4, static JSONL prompts
165
  sh4shv4t/parlay-grpo-1-5b
166
  ```
167
 
 
 
168
  ```text
169
  Gemini self-play (generate_data.py)
170
  → 80 quality-filtered episodes across 9 persona×scenario combos
@@ -184,6 +186,8 @@ The ω warmup is a practical detail worth flagging: at step 0, the base model oc
184
 
185
  ### Results
186
 
 
 
187
  ![Mean episode reward over GRPO training steps](results/grpo_reward_curve.png)
188
 
189
  ![GRPO training loss](results/grpo_loss_curve.png)
 
165
  sh4shv4t/parlay-grpo-1-5b
166
  ```
167
 
168
+ **Run GRPO on Hugging Face Jobs** (pre-paid credits, data + SFT on the Hub; `scripts/hf_grpo_entry.sh`; template uses **`--timeout 6h`** and **`a100-large`**): see [`training/GRPO_HF_RUNBOOK.md`](training/GRPO_HF_RUNBOOK.md).
169
+
170
  ```text
171
  Gemini self-play (generate_data.py)
172
  → 80 quality-filtered episodes across 9 persona×scenario combos
 
186
 
187
  ### Results
188
 
189
+ ![SFT training loss — Qwen2.5-1.5B + LoRA on Parlay episodes](images/sft_loss_curve.png)
190
+
191
  ![Mean episode reward over GRPO training steps](results/grpo_reward_curve.png)
192
 
193
  ![GRPO training loss](results/grpo_loss_curve.png)
dashboard/api.py CHANGED
@@ -48,6 +48,7 @@ _sessions: dict[str, dict[str, Any]] = {}
48
  # Opponent backend for /api/game/move: "gemini" (default) or "trained" (HF_MODEL_REPO + Qwen)
49
  OPPONENT_MODE: str = "gemini"
50
  _RESULTS_DIR = Path("results")
 
51
  _CP_COSTS: dict[TacticalMove, int] = {
52
  TacticalMove.ANCHOR_HIGH: 0,
53
  TacticalMove.BATNA_REVEAL: 20,
@@ -345,6 +346,13 @@ def _training_status_payload() -> dict[str, Any]:
345
  except Exception: # noqa: BLE001
346
  has_results = False
347
  repo = (os.environ.get("HF_MODEL_REPO") or "").strip() or None
 
 
 
 
 
 
 
348
  return {
349
  "has_results": has_results,
350
  "grpo_mean_reward": grpo,
@@ -352,10 +360,12 @@ def _training_status_payload() -> dict[str, Any]:
352
  "random_mean_reward": rnd,
353
  "model_on_hub": bool(repo),
354
  "model_repo": repo,
 
355
  "plots_available": {
356
  "reward_curve": (_RESULTS_DIR / "grpo_reward_curve.png").is_file(),
357
  "comparison": (_RESULTS_DIR / "training_curves.png").is_file(),
358
  "transcript": (_RESULTS_DIR / "before_after_transcript.html").is_file(),
 
359
  },
360
  }
361
 
 
48
  # Opponent backend for /api/game/move: "gemini" (default) or "trained" (HF_MODEL_REPO + Qwen)
49
  OPPONENT_MODE: str = "gemini"
50
  _RESULTS_DIR = Path("results")
51
+ _IMAGES_DIR = Path("images")
52
  _CP_COSTS: dict[TacticalMove, int] = {
53
  TacticalMove.ANCHOR_HIGH: 0,
54
  TacticalMove.BATNA_REVEAL: 20,
 
346
  except Exception: # noqa: BLE001
347
  has_results = False
348
  repo = (os.environ.get("HF_MODEL_REPO") or "").strip() or None
349
+
350
+ sft_loss_path: str | None = None
351
+ if (_IMAGES_DIR / "sft_loss_curve.png").is_file():
352
+ sft_loss_path = "/images/sft_loss_curve.png"
353
+ elif (_RESULTS_DIR / "sft_loss_curve.png").is_file():
354
+ sft_loss_path = "/results/sft_loss_curve.png"
355
+
356
  return {
357
  "has_results": has_results,
358
  "grpo_mean_reward": grpo,
 
360
  "random_mean_reward": rnd,
361
  "model_on_hub": bool(repo),
362
  "model_repo": repo,
363
+ "sft_loss_url": sft_loss_path,
364
  "plots_available": {
365
  "reward_curve": (_RESULTS_DIR / "grpo_reward_curve.png").is_file(),
366
  "comparison": (_RESULTS_DIR / "training_curves.png").is_file(),
367
  "transcript": (_RESULTS_DIR / "before_after_transcript.html").is_file(),
368
+ "sft_loss": sft_loss_path is not None,
369
  },
370
  }
371
 
dashboard/train_results.html CHANGED
@@ -136,6 +136,12 @@
136
  </div>
137
  </section>
138
 
 
 
 
 
 
 
139
  <section>
140
  <h2>GRPO Training Curve</h2>
141
  <div id="fig-reward"></div>
@@ -198,6 +204,19 @@
198
  d.style.color = x >= 0 ? "var(--emerald)" : "var(--scarlet)";
199
  }
200
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  function rewardSection(pa) {
202
  const el = document.getElementById("fig-reward");
203
  const cap = document.getElementById("cap-reward");
@@ -272,6 +291,7 @@
272
  if (data.has_results) {
273
  document.querySelector(".subtitle")?.setAttribute("data-has-eval", "1");
274
  }
 
275
  rewardSection(data.plots_available);
276
  compareSection(data.plots_available);
277
  await transcriptSection(data.plots_available);
 
136
  </div>
137
  </section>
138
 
139
+ <section>
140
+ <h2>SFT cold start — training loss</h2>
141
+ <div id="fig-sft"></div>
142
+ <p class="caption" id="cap-sft" hidden>Supervised fine-tuning loss (Qwen2.5-1.5B + LoRA on Parlay episodes from Colab).</p>
143
+ </section>
144
+
145
  <section>
146
  <h2>GRPO Training Curve</h2>
147
  <div id="fig-reward"></div>
 
204
  d.style.color = x >= 0 ? "var(--emerald)" : "var(--scarlet)";
205
  }
206
  }
207
+ function sftSection(url) {
208
+ const el = document.getElementById("fig-sft");
209
+ const cap = document.getElementById("cap-sft");
210
+ if (url) {
211
+ el.innerHTML = '<img src="' + url + '" alt="SFT training loss" style="width:100%;border:1px solid var(--gold);border-radius:2px" />';
212
+ cap.hidden = false;
213
+ } else {
214
+ el.innerHTML = '<div class="fig-placeholder">SFT loss curve will appear after running ' +
215
+ '<a href="https://colab.research.google.com/github/sh4shv4t/Parlay/blob/main/training/notebooks/sft_qwen_colab.ipynb" target="_blank" rel="noopener">sft_qwen_colab.ipynb</a> ' +
216
+ 'or by adding <code>images/sft_loss_curve.png</code> to the repo.</div>';
217
+ cap.hidden = true;
218
+ }
219
+ }
220
  function rewardSection(pa) {
221
  const el = document.getElementById("fig-reward");
222
  const cap = document.getElementById("cap-reward");
 
291
  if (data.has_results) {
292
  document.querySelector(".subtitle")?.setAttribute("data-has-eval", "1");
293
  }
294
+ sftSection(data.sft_loss_url);
295
  rewardSection(data.plots_available);
296
  compareSection(data.plots_available);
297
  await transcriptSection(data.plots_available);
images/sft_loss_curve.png ADDED

Git LFS Details

  • SHA256: fc677bac5b08d449ea73f1020315279a0edd168775a13a97f2ee99339babab57
  • Pointer size: 130 Bytes
  • Size of remote file: 20.2 kB
main.py CHANGED
@@ -78,6 +78,12 @@ try:
78
  except OSError as exc:
79
  logger.warning("Could not mount /results: %s", exc)
80
 
 
 
 
 
 
 
81
 
82
  @app.get("/", include_in_schema=False)
83
  async def serve_index() -> FileResponse:
 
78
  except OSError as exc:
79
  logger.warning("Could not mount /results: %s", exc)
80
 
81
+ os.makedirs("images", exist_ok=True)
82
+ try:
83
+ app.mount("/images", StaticFiles(directory="images"), name="images")
84
+ except OSError as exc:
85
+ logger.warning("Could not mount /images: %s", exc)
86
+
87
 
88
  @app.get("/", include_in_schema=False)
89
  async def serve_index() -> FileResponse:
scripts/hf_grpo_entry.sh ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Run from the Parlay repository root (the folder that contains training/ and parlay_env/).
3
+ # Intended for Linux GPU jobs (Hugging Face Jobs, RunPod, etc.).
4
+ #
5
+ # Usage (after: git clone ... && cd Parlay && pip install -r requirements-train.txt):
6
+ # export HF_TOKEN=... # read private assets + push (required if PUSH_TO_HF=1)
7
+ # export GRPO_STEPS=120 GRPO_G=4
8
+ # bash scripts/hf_grpo_entry.sh
9
+ #
10
+ # See training/GRPO_HF_RUNBOOK.md for a full walkthrough.
11
+ set -euo pipefail
12
+ export PYTHONUNBUFFERED=1
13
+
14
+ : "${DATASET_ID:=sh4shv4t/parlay-episodes}"
15
+ : "${EPISODE_FILE:=episodes_v2.jsonl}"
16
+ : "${SFT_MODEL:=sh4shv4t/parlay-sft-1-5b}"
17
+ : "${GRPO_STEPS:=120}"
18
+ : "${GRPO_G:=4}"
19
+ : "${MIN_REWARD:=-50.0}"
20
+ : "${OUTPUT_DIR:=outputs/grpo_run}"
21
+ # Set to 0 to skip push (e.g. smoke test)
22
+ : "${PUSH_TO_HF:=1}"
23
+ # Model repo to upload the GRPO output folder to
24
+ : "${HF_GRPO_REPO:=sh4shv4t/parlay-grpo-1-5b}"
25
+
26
+ if [[ ! -f "training/grpo_train.py" ]]; then
27
+ echo "Run this script from the Parlay repo root (training/grpo_train.py not found). pwd=$(pwd)" >&2
28
+ exit 1
29
+ fi
30
+
31
+ echo "==> Downloading ${EPISODE_FILE} from dataset ${DATASET_ID} ..."
32
+ export DATASET_ID EPISODE_FILE
33
+ JSONL_PATH=$(
34
+ python -c "import os
35
+ from huggingface_hub import hf_hub_download
36
+ print(hf_hub_download(
37
+ repo_id=os.environ['DATASET_ID'],
38
+ filename=os.environ['EPISODE_FILE'],
39
+ repo_type='dataset',
40
+ ))"
41
+ )
42
+ echo " JSONL: ${JSONL_PATH}"
43
+
44
+ mkdir -p "$(dirname "$OUTPUT_DIR")"
45
+ OUT_ABS="$(cd "$(dirname "$OUTPUT_DIR")" && pwd)/$(basename "$OUTPUT_DIR")"
46
+
47
+ echo "==> GRPO: SFT=${SFT_MODEL} steps=${GRPO_STEPS} G=${GRPO_G} out=${OUT_ABS}"
48
+ python -m training.grpo_train \
49
+ --model "${SFT_MODEL}" \
50
+ --data "${JSONL_PATH}" \
51
+ --output "${OUT_ABS}" \
52
+ --steps "${GRPO_STEPS}" \
53
+ --g "${GRPO_G}" \
54
+ --min-reward "${MIN_REWARD}"
55
+
56
+ # Bundle Matplotlib curves + TRL log JSON into the model folder so one Hub upload includes visualizations.
57
+ echo "==> Collecting training plots under ${OUT_ABS}/training_plots/ ..."
58
+ TP="${OUT_ABS}/training_plots"
59
+ mkdir -p "${TP}"
60
+ for f in results/grpo_reward_curve.png results/grpo_loss_curve.png; do
61
+ if [[ -f "$f" ]]; then
62
+ cp -f "$f" "${TP}/"
63
+ echo " + ${f}"
64
+ fi
65
+ done
66
+ if [[ -d "${OUT_ABS}/plots" ]]; then
67
+ shopt -s nullglob
68
+ for f in "${OUT_ABS}/plots/"*.png "${OUT_ABS}/plots/"*.json; do
69
+ [[ -e "$f" ]] || continue
70
+ cp -f "$f" "${TP}/"
71
+ echo " + ${f}"
72
+ done
73
+ shopt -u nullglob
74
+ fi
75
+ if [[ ! -f "${TP}/grpo_reward_curve.png" && ! -f "${TP}/grpo_reward.png" ]]; then
76
+ echo " (warning: no reward plot in training_plots — check logs for empty log_history or plot errors)"
77
+ fi
78
+
79
+ if [[ "${PUSH_TO_HF}" == "1" || "${PUSH_TO_HF}" == "true" ]]; then
80
+ if [[ -z "${HF_TOKEN:-}" && -z "${HUGGINGFACE_HUB_TOKEN:-}" ]]; then
81
+ echo "PUSH_TO_HF is set but neither HF_TOKEN nor HUGGINGFACE_HUB_TOKEN is set." >&2
82
+ exit 1
83
+ fi
84
+ # push_to_hub.py reads HF_TOKEN; Jobs often set HUGGINGFACE_HUB_TOKEN
85
+ export HF_TOKEN="${HF_TOKEN:-${HUGGINGFACE_HUB_TOKEN:-}}"
86
+ echo "==> Pushing to https://huggingface.co/${HF_GRPO_REPO} ..."
87
+ export HF_REPO_ID="${HF_GRPO_REPO}"
88
+ python -m training.push_to_hub --model "${OUT_ABS}" --repo "${HF_GRPO_REPO}"
89
+ else
90
+ echo "==> PUSH_TO_HF disabled; model saved at ${OUT_ABS}"
91
+ fi
92
+
93
+ echo "==> Done."
training/GRPO_HF_RUNBOOK.md ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Run Parlay GRPO on Hugging Face (Jobs + your credits)
2
+
3
+ ## Checklist — what you actually do
4
+
5
+ 1. **On your PC:** install the HF CLI → `hf auth login` (token with read + write).
6
+ 2. **Money:** confirm [pre-paid credits / billing](https://huggingface.co/settings/billing) ($30 is enough for a modest A100 run if you cap time and steps).
7
+ 3. **Repo:** commit and push this repo (including `scripts/hf_grpo_entry.sh`) to GitHub — or set `GITHUB_CLONE` in the job to a URL/branch that has the script.
8
+ 4. **Start the job** from your terminal using the **`hf jobs run ... huggingface/trl ...`** command in [§3.1](#31-command-template). This runbook uses **`--timeout 6h`** on **`a100-large`** (see [§3.2](#32-is-6-hours-on-a100-enough)).
9
+ 5. **Watch** [huggingface.co/jobs](https://huggingface.co/jobs) until it finishes or errors.
10
+ 6. **Artifacts:** the trained adapter lands in `HF_GRPO_REPO` (default `sh4shv4t/parlay-grpo-1-5b`). **Plots** are generated during training and copied into **`training_plots/`** inside that upload (see [Visualizations](#visualizations--curves)).
11
+ 7. **Optional — local app / README:** download `grpo_reward_curve.png` / `grpo_loss_curve.png` from the model repo’s `training_plots/` (or from the job’s `results/` folder) and put them in your clone under `results/` so the Space **Training Results** page can show them ([`dashboard/api.py`](../dashboard/api.py) looks for `results/grpo_reward_curve.png`).
12
+
13
+ ---
14
+
15
+ This walkthrough uses assets that are **already on the Hub**:
16
+
17
+ | What | Hub ID |
18
+ |------|--------|
19
+ | Episodes JSONL (dataset) | `sh4shv4t/parlay-episodes` (file: `episodes_v2.jsonl`) |
20
+ | SFT LoRA (starting point) | `sh4shv4t/parlay-sft-1-5b` |
21
+ | Optional: upload GRPO output here | `sh4shv4t/parlay-grpo-1-5b` (default in `scripts/hf_grpo_entry.sh`) |
22
+
23
+ `training/grpo_train.py` loads the SFT **adapter** from the Hub (it looks for `adapter_config.json` and fetches the **base** model name from that file, usually `Qwen/Qwen2.5-1.5B-Instruct`).
24
+
25
+ **Requirements:** a Hugging Face account with [pre-paid credits](https://huggingface.co/pricing) and a [fine-grained or classic token](https://huggingface.co/settings/tokens) with **read** (dataset + models) and **write** (if you push the trained adapter to your model repo).
26
+
27
+ ---
28
+
29
+ ## 0. What you are paying for
30
+
31
+ - [Jobs pricing](https://huggingface.co/docs/hub/jobs-pricing) is **per minute** while the job is **starting** or **running** (rough guide: 1× A100 large ≈ $2.50/hr; **$30** is on the order of **~12 h** on that flavor if you used it nonstop—always check the current table).
32
+ - The default timeout for Jobs is **short**; you must set **`--timeout`**. This guide standardizes on **`6h`** for **1× A100** with `GRPO_STEPS=120` and `G=4` (see §3.2). You are still billed **per minute of actual runtime**, not for the full timeout (see current [pricing](https://huggingface.co/docs/hub/jobs-pricing)).
33
+
34
+ ---
35
+
36
+ ## 1. One-time setup on your laptop
37
+
38
+ 1. **Install the HF CLI** (see [Quickstart](https://huggingface.co/docs/hub/jobs-quickstart)):
39
+ - e.g. `curl -LsSf https://hf.co/cli/install.sh | bash` (macOS/Linux) or the Windows installer from the same doc page.
40
+ 2. **Log in**:
41
+ - `hf auth login`
42
+ - Paste a token with **read** and **write** to the Hub.
43
+ 3. **Confirm credits** at [huggingface.co/settings/billing](https://huggingface.co/settings/billing).
44
+ 4. **Open the Jobs UI** to watch runs: [huggingface.co/jobs](https://huggingface.co/jobs) (or your org’s jobs page).
45
+
46
+ ---
47
+
48
+ ## 2. What the repo provides
49
+
50
+ From the **repository root** after `git clone`:
51
+
52
+ - **`scripts/hf_grpo_entry.sh`**
53
+ - Downloads `episodes_v2.jsonl` from `sh4shv4t/parlay-episodes`
54
+ - Runs `python -m training.grpo_train` with the Hub SFT adapter
55
+ - Optionally runs `python -m training.push_to_hub` to upload the output folder
56
+
57
+ You only need a **GPU Linux** environment with **git**, **Python**, and **pip**; the rest is installed from `requirements-train.txt`.
58
+
59
+ ---
60
+
61
+ ## 3. Recommended: Hugging Face Job with the TRL image
62
+
63
+ Hugging Face documents a ready image for TRL workflows: `huggingface/trl` (see [Popular images](https://huggingface.co/docs/hub/jobs-popular-images)).
64
+
65
+ ### 3.1 Command template
66
+
67
+ Run this from **your** machine; it starts the job in the cloud and streams logs. Adjust `GITHUB_CLONE` if you use a fork (your MetaHackathon mirror should push to GitHub or you must change the URL).
68
+
69
+ ```bash
70
+ # `--secrets HF_TOKEN` passes your Hub token (from `hf auth login`) into the job for downloads + push.
71
+ # Image is the first *positional* argument after flags (same pattern as: hf jobs run ubuntu echo hi).
72
+ # Point GITHUB_CLONE at your fork if needed.
73
+ hf jobs run \
74
+ --flavor a100-large \
75
+ --timeout 6h \
76
+ --secrets HF_TOKEN \
77
+ --env GITHUB_CLONE=https://github.com/sh4shv4t/Parlay.git \
78
+ --env GRPO_STEPS=120 \
79
+ --env GRPO_G=4 \
80
+ --env PUSH_TO_HF=1 \
81
+ --env HF_GRPO_REPO=sh4shv4t/parlay-grpo-1-5b \
82
+ huggingface/trl \
83
+ bash -lc 'set -e; apt-get update -qq && apt-get install -y -qq git; git clone --depth 1 "$GITHUB_CLONE" /work; cd /work; pip install -q -r requirements-train.txt; bash scripts/hf_grpo_entry.sh'
84
+ ```
85
+
86
+ **Note:** if your CLI version differs, run `hf jobs run --help`. If the image already has `git`, drop the `apt-get update` / `apt-get install` steps from the `bash -lc` command. The TRL image is listed under [Popular images](https://huggingface.co/docs/hub/jobs-popular-images). For a UV-based job instead, see `hf jobs uv run --image huggingface/trl ...` on the same page (usually a small `train.py`); Parlay’s GRPO needs the **full** repo, so the `git clone` pattern above is the straightforward fit.
87
+
88
+ **Windows:** run this block from **Git Bash** or **WSL** (so `bash -lc '... "$GITHUB_CLONE" ...'` is parsed like Linux). In plain PowerShell, quote escaping is different—open this file in an editor and paste the block there.
89
+
90
+ **Environment variables** you can add with `--env`:
91
+
92
+ | Variable | Default (in `hf_grpo_entry.sh`) | Purpose |
93
+ |----------|---------------------------------|---------|
94
+ | `DATASET_ID` | `sh4shv4t/parlay-episodes` | JSONL source |
95
+ | `EPISODE_FILE` | `episodes_v2.jsonl` | File inside the dataset repo |
96
+ | `SFT_MODEL` | `sh4shv4t/parlay-sft-1-5b` | Hub LoRA to continue from |
97
+ | `GRPO_STEPS` | `120` | Training steps |
98
+ | `GRPO_G` | `4` | Group size (lower if OOM) |
99
+ | `MIN_REWARD` | `-50.0` | Skips very bad train rows |
100
+ | `OUTPUT_DIR` | `outputs/grpo_run` | Local output in the job |
101
+ | `PUSH_TO_HF` | `1` | Set to `0` to skip upload |
102
+ | `HF_GRPO_REPO` | `sh4shv4t/parlay-grpo-1-5b` | Push target |
103
+
104
+ `HF_TOKEN` is provided by **`--secrets HF_TOKEN`** (or the short form your CLI supports) so `huggingface-cli` / `huggingface_hub` can push; it must be allowed to write to `HF_GRPO_REPO`.
105
+
106
+ ### 3.2 Is 6 hours on A100 enough?
107
+
108
+ **Usually yes** for the template defaults: **Qwen2.5-1.5B** + SFT LoRA, **`GRPO_STEPS=120`**, **`GRPO_G=4`**. In practice, training often finishes in **on the order of 1–3 hours** of wall time; **6h** is a **safety cap** for Hub downloads, first-time `pip install`, and plotting—so the job is not cut off at the default short timeout. If you **increase** steps into the many hundreds, either raise **`--timeout` to `8h`** or **lower** `GRPO_STEPS` / `G` to stay inside 6h.
109
+
110
+ ### 3.3 If the job OOMs
111
+
112
+ Lower **`GRPO_G`** first (e.g. `2` or `1`), then **`GRPO_STEPS`**. A100 80GB usually runs `G=4` for this 1.5B setup; T4-style VRAM may need `G=2`.
113
+
114
+ ### 3.4 If the job times out
115
+
116
+ Increase **`--timeout`**, or run two shorter jobs: first run saves under `OUTPUT_DIR` / Hub; a second could continue from a saved adapter (only if you wire resume in your workflow—the stock `grpo_train` is single-stage; the practical fix is a longer timeout or fewer steps per job and manual continuation by pointing `--model` at the last save).
117
+
118
+ ### 3.5 Cheaper hardware
119
+
120
+ Use `--flavor t4-small` (or `l4x1` per [hardware list](https://huggingface.co/docs/hub/jobs-pricing)) and reduce steps + `G` so the job **finishes** within the timeout; you trade wall-clock and quality for cost.
121
+
122
+ ### 3.6 After the run
123
+
124
+ - **Model:** [huggingface.co/sh4shv4t/parlay-grpo-1-5b](https://huggingface.co/sh4shv4t/parlay-grpo-1-5b) (if you pushed to that repo).
125
+ - **Curves:** also under that repo in **`training_plots/`** (bundled by `scripts/hf_grpo_entry.sh` before upload).
126
+
127
+ ---
128
+
129
+ ## Visualizations & curves
130
+
131
+ GRPO already **builds charts in code** (`training/grpo_train.py` → `_save_training_plots`):
132
+
133
+ | Output | Where it is written during training | What it shows |
134
+ |--------|-------------------------------------|----------------|
135
+ | Reward curve | `results/grpo_reward_curve.png` and `<output>/plots/grpo_reward.png` | Mean reward vs step (blue) |
136
+ | Loss curve | `results/grpo_loss_curve.png` and `<output>/plots/grpo_loss.png` | Training loss vs step |
137
+ | Raw TRL logs | `<output>/plots/grpo_log.json` | Full `log_history` for your own plotting |
138
+
139
+ **`scripts/hf_grpo_entry.sh`** copies those into **`<OUTPUT_DIR>/training_plots/`** before `push_to_hub`, so after a successful job you get **PNGs + JSON in the model repo** (under `training_plots/`) next to the adapter (no extra step).
140
+
141
+ **Training Results page (`/train`):** the API exposes `plots_available.reward_curve` when **`results/grpo_reward_curve.png`** exists in the deployed app’s repo. After you download the PNGs from the Hub (or copy from a job artifact), add them under `results/` in the git repo you deploy to Spaces and redeploy.
142
+
143
+ **Optional extras (not wired in repo by default):**
144
+
145
+ - **W&B / TensorBoard:** set `report_to` in `GRPOConfig` inside `training/grpo_train.py` if you want live dashboards (adds setup and secrets).
146
+ - **Eval bar chart:** after GRPO, run `python -m training.evaluate ...` locally or in a small job to regenerate `results/eval_results.json` and comparison figures (see main README).
147
+
148
+ ---
149
+
150
+ ## 4. Alternative: `hf jobs uv run` (TRL quickstart style)
151
+
152
+ The [Jobs quickstart](https://huggingface.co/docs/hub/jobs-quickstart) uses `hf jobs uv run` with `--with trl` and a small `train.py`. Parlay’s GRPO needs the **full repo** (for `parlay_env` and reward functions), so the **`git clone` + `huggingface/trl` image** pattern in §3 is usually simpler than inlining the whole project into one file.
153
+
154
+ ---
155
+
156
+ ## 5. Alternative: Colab (no Jobs)
157
+
158
+ Use `notebooks/parlay_grpo_colab.ipynb`. In the **config** cell, set:
159
+
160
+ ```python
161
+ JSONL_VIA_HF = ("sh4shv4t/parlay-episodes", "episodes_v2.jsonl")
162
+ SFT_MODEL_HF = "sh4shv4t/parlay-sft-1-5b"
163
+ # Optional: use Colab Pro+ or a longer runtime; HF Jobs avoids Colab’s usage caps.
164
+ ```
165
+
166
+ ---
167
+
168
+ ## 6. Sanity checks before spending credits
169
+
170
+ - **Non-empty train split:** `grpo_train` only uses JSONL lines with `"split": "train"`. If the script prints `0 remaining for GRPO`, fix the JSONL or filters (`MIN_REWARD`).
171
+ - **Token:** `python -c "from huggingface_hub import hf_hub_download; print(hf_hub_download('sh4shv4t/parlay-episodes','episodes_v2.jsonl', repo_type='dataset'))"` on your machine should print a local path.
172
+ - **Config pre-flight:** `python scripts/check_training_config.py` (read-only) reviews env defaults for SFT/GRPO.
173
+
174
+ If anything in this file drifts (repo names, file names), align with `README.md` and `scripts/push_dataset.py` (`episodes_v2.jsonl`).
training/notebooks/build_sft_qwen_colab.py CHANGED
@@ -38,7 +38,11 @@ episodes using **Supervised Fine-Tuning (SFT)** with LoRA via
38
  3. Formats to ChatML, trains with LoRA (r=16)
39
  4. Plots, evaluates before/after, pushes adapter to `sh4shv4t/parlay-negotiator`
40
 
41
- **Runtime:** ~25 min on T4 · **Cost:** $0 (free tier).""")
 
 
 
 
42
 
43
  add_md("## Step 1 — Install dependencies")
44
 
@@ -52,18 +56,39 @@ subprocess.run([
52
  subprocess.run([
53
  sys.executable, "-m", "pip", "install",
54
  "trl>=0.8.6", "peft>=0.10.0", "accelerate>=0.28.0",
55
- "datasets>=2.18.0", "huggingface-hub>=0.22.0",
56
  "bitsandbytes>=0.43.0", "xformers", "--quiet",
57
  ], check=True)
58
- print("OK: dependencies")""")
 
 
 
 
 
 
 
 
 
 
59
 
60
- add_md("## Step 2 GPU + config")
 
 
 
 
61
 
62
  add_code("""import os, json
63
  import torch
64
  from google.colab import userdata
65
 
66
- assert torch.cuda.is_available(), "Use Runtime Change runtime type T4 GPU"
 
 
 
 
 
 
 
 
67
  print(torch.cuda.get_device_name(0), torch.cuda.get_device_properties(0).total_memory / 1e9, "GB VRAM")
68
  print("torch", torch.__version__, "cuda", torch.version.cuda)
69
 
@@ -80,10 +105,37 @@ MIN_REWARD_KEEP = 0.25""")
80
 
81
  add_md("## Step 3 — Load dataset")
82
 
83
- add_code("""from datasets import load_dataset
84
  import pandas as pd
85
-
86
- raw = load_dataset(DATASET_ID, token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  df = raw["train"].to_pandas()
88
  print(len(df), "rows", list(df.columns))
89
  if "reward" in df.columns:
@@ -137,7 +189,10 @@ eval_data = Dataset.from_list(formatted[split:])""")
137
 
138
  add_md("## Step 5 — Model + LoRA (Unsloth)")
139
 
140
- add_code("""from unsloth import FastLanguageModel
 
 
 
141
  from unsloth.chat_templates import get_chat_template
142
 
143
  model, tokenizer = FastLanguageModel.from_pretrained(
@@ -146,6 +201,7 @@ model, tokenizer = FastLanguageModel.from_pretrained(
146
  dtype=None,
147
  load_in_4bit=True,
148
  token=HF_TOKEN,
 
149
  )
150
  tokenizer = get_chat_template(tokenizer, chat_template="chatml")
151
  model = FastLanguageModel.get_peft_model(
@@ -187,7 +243,7 @@ args = SFTConfig(
187
  fp16=not torch.cuda.is_bf16_supported(),
188
  bf16=torch.cuda.is_bf16_supported(),
189
  logging_steps=5,
190
- evaluation_strategy="epoch",
191
  save_strategy="epoch",
192
  report_to="none",
193
  seed=42,
 
38
  3. Formats to ChatML, trains with LoRA (r=16)
39
  4. Plots, evaluates before/after, pushes adapter to `sh4shv4t/parlay-negotiator`
40
 
41
+ **Runtime:** ~25 min on T4 · **Cost:** $0 (free tier).
42
+
43
+ **If `load_dataset` fails on `Feature type 'Json'`:** the install cell upgrades `datasets`;
44
+ if it still fails, use **Runtime → Restart runtime**, then run from Step 2 onward. The
45
+ data cell also falls back to downloading `episodes_v2.jsonl` directly (no `Json` schema).""")
46
 
47
  add_md("## Step 1 — Install dependencies")
48
 
 
56
  subprocess.run([
57
  sys.executable, "-m", "pip", "install",
58
  "trl>=0.8.6", "peft>=0.10.0", "accelerate>=0.28.0",
 
59
  "bitsandbytes>=0.43.0", "xformers", "--quiet",
60
  ], check=True)
61
+ # Hub dataset uses `Json` features — needs datasets 3.x (Colab ships an older build)
62
+ subprocess.run([
63
+ sys.executable, "-m", "pip", "install",
64
+ "-U", "datasets>=3.0.0", "huggingface-hub>=0.23.0", "pyarrow>=14.0.0",
65
+ "--quiet",
66
+ ], check=True)
67
+ import os
68
+ os.environ.setdefault("UNSLOTH_DISABLE_STATISTICS", "1")
69
+ print("OK: dependencies (datasets>=3 for Json dtype on Hub)")""")
70
+
71
+ add_md("""## Step 2 — GPU + config
72
 
73
+ **GPU required** (Unsloth + 4-bit QLoRA will not run on CPU).
74
+
75
+ In Colab: **Runtime** → **Change runtime type** → **Hardware accelerator** = **T4** (or L4 / A100 / V100) → **Save**, then re-run this cell. Free tier: choose **T4** if offered.
76
+
77
+ **Unsloth:** we set `UNSLOTH_DISABLE_STATISTICS=1` below so Colab does not hang 120s on Unsloth’s Hugging Face “statistics” fetch (often mis-reported as “HF is down”).""")
78
 
79
  add_code("""import os, json
80
  import torch
81
  from google.colab import userdata
82
 
83
+ # Before any `import unsloth` in later cells avoids TimeoutError on HF stats ping
84
+ os.environ.setdefault("UNSLOTH_DISABLE_STATISTICS", "1")
85
+
86
+ if not torch.cuda.is_available():
87
+ raise RuntimeError(
88
+ "No GPU visible to PyTorch. Colab: Runtime → Change runtime type → set "
89
+ "Hardware accelerator to GPU (T4, L4, A100, …) → Save, then re-run. "
90
+ "If you are on **CPU** only, this notebook cannot train; connect a GPU session."
91
+ )
92
  print(torch.cuda.get_device_name(0), torch.cuda.get_device_properties(0).total_memory / 1e9, "GB VRAM")
93
  print("torch", torch.__version__, "cuda", torch.version.cuda)
94
 
 
105
 
106
  add_md("## Step 3 — Load dataset")
107
 
108
+ add_code("""import json as _json
109
  import pandas as pd
110
+ from datasets import Dataset, DatasetDict, load_dataset
111
+ from huggingface_hub import hf_hub_download
112
+
113
+
114
+ def load_parlay_episodes():
115
+ \"\"\"Prefer `load_dataset`; fall back to raw JSONL if Colab's old `datasets` can't parse `Json` features.\"\"\"
116
+ try:
117
+ return load_dataset(DATASET_ID, token=HF_TOKEN)
118
+ except (ValueError, KeyError) as e:
119
+ msg = str(e)
120
+ if "Json" not in msg and "Feature type" not in msg:
121
+ raise
122
+ path = hf_hub_download(
123
+ repo_id=DATASET_ID,
124
+ repo_type="dataset",
125
+ filename="episodes_v2.jsonl",
126
+ token=HF_TOKEN,
127
+ )
128
+ rows = []
129
+ with open(path, encoding="utf-8") as f:
130
+ for line in f:
131
+ line = line.strip()
132
+ if line:
133
+ rows.append(_json.loads(line))
134
+ df = pd.DataFrame(rows)
135
+ return DatasetDict({"train": Dataset.from_pandas(df, preserve_index=False)})
136
+
137
+
138
+ raw = load_parlay_episodes()
139
  df = raw["train"].to_pandas()
140
  print(len(df), "rows", list(df.columns))
141
  if "reward" in df.columns:
 
189
 
190
  add_md("## Step 5 — Model + LoRA (Unsloth)")
191
 
192
+ add_code("""import os
193
+ os.environ.setdefault("UNSLOTH_DISABLE_STATISTICS", "1")
194
+
195
+ from unsloth import FastLanguageModel
196
  from unsloth.chat_templates import get_chat_template
197
 
198
  model, tokenizer = FastLanguageModel.from_pretrained(
 
201
  dtype=None,
202
  load_in_4bit=True,
203
  token=HF_TOKEN,
204
+ disable_log_stats=True,
205
  )
206
  tokenizer = get_chat_template(tokenizer, chat_template="chatml")
207
  model = FastLanguageModel.get_peft_model(
 
243
  fp16=not torch.cuda.is_bf16_supported(),
244
  bf16=torch.cuda.is_bf16_supported(),
245
  logging_steps=5,
246
+ eval_strategy="epoch",
247
  save_strategy="epoch",
248
  report_to="none",
249
  seed=42,
training/notebooks/sft_qwen_colab.ipynb CHANGED
The diff for this file is too large to render. See raw diff