anugrahhu committed on
Commit
c2c4674
·
verified ·
1 Parent(s): 2b97998

sft+reward-fix: space/training/app.py

Browse files
Files changed (1) hide show
  1. space/training/app.py +130 -5
space/training/app.py CHANGED
@@ -28,7 +28,7 @@ from datetime import datetime, timezone
28
  from pathlib import Path
29
  from typing import Any, Dict, List, Optional
30
 
31
- from fastapi import FastAPI, HTTPException
32
  from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse, Response
33
  from fastapi.staticfiles import StaticFiles
34
 
@@ -97,6 +97,10 @@ def _detect_gpus() -> int:
97
  _NUM_GPUS = _detect_gpus()
98
 
99
 
 
 
 
 
100
  CONFIG = {
101
  "training_backend": _env("TRAINING_BACKEND", "vanilla"),
102
  "model_name": _env("MODEL_NAME", "HuggingFaceTB/SmolLM2-360M-Instruct"),
@@ -119,6 +123,15 @@ CONFIG = {
119
  f"{_env('HF_USERNAME', 'anugrahhu')}/cernenv-grpo-smollm2-360m",
120
  ),
121
  "autostart": _env("AUTOSTART", "0") == "1",
 
 
 
 
 
 
 
 
 
122
  }
123
 
124
 
@@ -177,8 +190,35 @@ def _stream_subprocess(cmd: list[str], log_handle) -> int:
177
  return rc
178
 
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  def _build_training_cmd(config: Dict[str, Any]) -> list[str]:
181
- """Compose the selected training launcher."""
 
 
 
 
 
 
 
182
  backend = str(config.get("training_backend", "vanilla")).lower()
183
  if backend == "vanilla":
184
  python_bin = "/usr/local/bin/python" if Path("/usr/local/bin/python").exists() else sys.executable
@@ -360,6 +400,31 @@ def _training_pipeline(config: Dict[str, Any]) -> None:
360
  log.write(f"\n[warn] pre-train eval failed (rc={rc}); continuing without baseline\n")
361
  log.flush()
362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  log.write(f"\n--- GRPO training ({backend}, {config['num_gpus']} GPU process(es)) ---\n")
364
  log.flush()
365
  rc = _stream_subprocess(_build_training_cmd(config), log)
@@ -813,6 +878,9 @@ _HTML = """\
813
  <img id=dist src="/evidence/reward_distribution.png" onerror="this.style.display='none'">
814
  <div id=dist_missing class=muted style="display:none">(generated after post-train eval)</div>
815
  </div>
 
 
 
816
  </div>
817
 
818
  <h2>Before / after metrics</h2>
@@ -888,6 +956,27 @@ async function refresh() {
888
  probe.src = baseSrc + bust;
889
  }
890
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
891
  const logs = await fetch('/logs?tail=200').then(r => r.text());
892
  document.getElementById('logs').textContent = logs || '(no logs yet)';
893
  }
@@ -929,6 +1018,23 @@ def metrics() -> JSONResponse:
929
  return JSONResponse({"pre": None, "post": None, "delta": None})
930
 
931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
932
  @app.get("/evidence")
933
  def evidence_index() -> JSONResponse:
934
  """List every evidence artifact currently on disk."""
@@ -986,12 +1092,31 @@ def logs(tail: int = 400) -> PlainTextResponse:
986
 
987
 
988
  @app.post("/train")
989
- def train() -> JSONResponse:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
990
  try:
991
- _start_training(dict(CONFIG))
992
  except RuntimeError as exc:
993
  raise HTTPException(status_code=409, detail=str(exc))
994
- return JSONResponse({"status": "started", "config": CONFIG})
995
 
996
 
997
  @app.on_event("startup")
 
28
  from pathlib import Path
29
  from typing import Any, Dict, List, Optional
30
 
31
+ from fastapi import FastAPI, HTTPException, Request
32
  from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse, Response
33
  from fastapi.staticfiles import StaticFiles
34
 
 
97
  _NUM_GPUS = _detect_gpus()
98
 
99
 
100
def _bool_env(name: str, default: str) -> bool:
    """Read env var *name* via ``_env`` and interpret it as a boolean flag.

    Accepts "1", "true", "yes", "on" (case-insensitive, surrounding
    whitespace ignored); anything else is False.
    """
    raw = _env(name, default)
    truthy = {"1", "true", "yes", "on"}
    return raw.strip().lower() in truthy
102
+
103
+
104
  CONFIG = {
105
  "training_backend": _env("TRAINING_BACKEND", "vanilla"),
106
  "model_name": _env("MODEL_NAME", "HuggingFaceTB/SmolLM2-360M-Instruct"),
 
123
  f"{_env('HF_USERNAME', 'anugrahhu')}/cernenv-grpo-smollm2-360m",
124
  ),
125
  "autostart": _env("AUTOSTART", "0") == "1",
126
+ # ── SFT warm-start phase (defeats v1's claim-avoidance reward hack
127
+ # by giving GRPO a non-zero prior over correct trajectories) ─────
128
+ "sft_warmstart": _bool_env("SFT_WARMSTART", "false"),
129
+ "sft_num_episodes": int(_env("SFT_NUM_EPISODES", "200")),
130
+ "sft_max_steps": int(_env("SFT_MAX_STEPS", "8")),
131
+ "sft_epochs": int(_env("SFT_EPOCHS", "1")),
132
+ "sft_lr": float(_env("SFT_LR", "1e-5")),
133
+ "sft_difficulty": _env("SFT_DIFFICULTY", "mixed"),
134
+ "sft_out_dir": _env("SFT_OUT_DIR", "runs/sft-warmstart"),
135
  }
136
 
137
 
 
190
  return rc
191
 
192
 
193
+ def _build_sft_warmstart_cmd(config: Dict[str, Any]) -> list[str]:
194
+ """Compose the SFT-warm-start subprocess command.
195
+
196
+ Always uses the system Python so GRPO + SFT share the same
197
+ transformers + trl pin in space/training/requirements.txt.
198
+ """
199
+ python_bin = "/usr/local/bin/python" if Path("/usr/local/bin/python").exists() else sys.executable
200
+ return [
201
+ python_bin, "-m", "training.sft_warmstart",
202
+ "--out_dir", config["sft_out_dir"],
203
+ "--num_episodes", str(config["sft_num_episodes"]),
204
+ "--max_steps", str(config["sft_max_steps"]),
205
+ "--epochs", str(config["sft_epochs"]),
206
+ "--lr", str(config["sft_lr"]),
207
+ "--base_model", config["model_name"],
208
+ "--difficulty", config["sft_difficulty"],
209
+ "--evidence_dir", config["evidence_dir"],
210
+ ]
211
+
212
+
213
  def _build_training_cmd(config: Dict[str, Any]) -> list[str]:
214
+ """Compose the selected training launcher.
215
+
216
+ When ``sft_warmstart`` is on, ``model_name`` is expected to already
217
+ have been overwritten with the SFT output directory by the caller
218
+ (``_training_pipeline``), so this function never has to know about
219
+ the SFT phase explicitly — it just trains GRPO from whatever path
220
+ is sitting in ``model_name``.
221
+ """
222
  backend = str(config.get("training_backend", "vanilla")).lower()
223
  if backend == "vanilla":
224
  python_bin = "/usr/local/bin/python" if Path("/usr/local/bin/python").exists() else sys.executable
 
400
  log.write(f"\n[warn] pre-train eval failed (rc={rc}); continuing without baseline\n")
401
  log.flush()
402
 
403
+ if config.get("sft_warmstart"):
404
+ # Phase 1 — SFT warm-start. Produces a *full* causal-LM
405
+ # checkpoint at config['sft_out_dir'] (LoRA adapters are
406
+ # merged in by training/sft_warmstart.py) so we can hand
407
+ # it to GRPO as a drop-in --model_name.
408
+ sft_out = config["sft_out_dir"]
409
+ log.write(
410
+ f"\n--- SFT warm-start ({config['sft_num_episodes']} oracle "
411
+ f"episodes, epochs={config['sft_epochs']}, → {sft_out}) ---\n"
412
+ )
413
+ log.flush()
414
+ sft_rc = _stream_subprocess(_build_sft_warmstart_cmd(config), log)
415
+ if sft_rc != 0:
416
+ raise RuntimeError(f"SFT warm-start failed (rc={sft_rc})")
417
+ log.write(
418
+ f"\n[ok] SFT done; switching GRPO base model "
419
+ f"{config['model_name']} → {sft_out}\n"
420
+ )
421
+ log.flush()
422
+ config["model_name"] = sft_out
423
+ # Keep the *base* HF id around for evaluator commands —
424
+ # tokenizer files in the SFT directory are saved by the
425
+ # SFT script, but evaluation will load from this dir
426
+ # directly, so no further path bookkeeping is required.
427
+
428
  log.write(f"\n--- GRPO training ({backend}, {config['num_gpus']} GPU process(es)) ---\n")
429
  log.flush()
430
  rc = _stream_subprocess(_build_training_cmd(config), log)
 
878
  <img id=dist src="/evidence/reward_distribution.png" onerror="this.style.display='none'">
879
  <div id=dist_missing class=muted style="display:none">(generated after post-train eval)</div>
880
  </div>
881
+ <div class=card><b>Warm-start (SFT)</b><br>
882
+ <div id=sft_card class=muted>(SFT_WARMSTART disabled — set the env var to enable)</div>
883
+ </div>
884
  </div>
885
 
886
  <h2>Before / after metrics</h2>
 
956
  probe.src = baseSrc + bust;
957
  }
958
 
959
+ // SFT warm-start card. /sft_summary returns 404 until the SFT phase
960
+ // has written evidence/sft_summary.json — when it does, render the
961
+ // headline numbers (final loss, oracle success rate, duration) so a
962
+ // reviewer can sanity-check the warm-start at a glance.
963
+ const sft_resp = await fetch('/sft_summary');
964
+ const sft_card = document.getElementById('sft_card');
965
+ if (sft_resp.ok) {
966
+ try {
967
+ const sft = await sft_resp.json();
968
+ sft_card.classList.remove('muted');
969
+ sft_card.innerHTML =
970
+ `<table>` +
971
+ `<tr><td><b>final loss</b></td><td><code>${fmt(sft.final_loss)}</code></td></tr>` +
972
+ `<tr><td><b>oracle success</b></td><td><code>${fmt(sft.oracle_success_rate)}</code></td></tr>` +
973
+ `<tr><td><b>transitions trained</b></td><td><code>${sft.num_train_rows ?? '–'}</code></td></tr>` +
974
+ `<tr><td><b>duration</b></td><td><code>${fmt(sft.duration_s)} s</code></td></tr>` +
975
+ `<tr><td><b>base → SFT dir</b></td><td><code>${sft.base_model} → ${sft.out_dir}</code></td></tr>` +
976
+ `</table>`;
977
+ } catch (e) { /* keep placeholder */ }
978
+ }
979
+
980
  const logs = await fetch('/logs?tail=200').then(r => r.text());
981
  document.getElementById('logs').textContent = logs || '(no logs yet)';
982
  }
 
1018
  return JSONResponse({"pre": None, "post": None, "delta": None})
1019
 
1020
 
1021
@app.get("/sft_summary")
def sft_summary() -> JSONResponse:
    """Return the SFT warm-start summary if it exists.

    Powers the dashboard's "Warm-start (SFT)" card: shows the final
    training loss, oracle success rate, and wall-clock duration once
    the SFT phase has written ``evidence/sft_summary.json``.
    """
    summary_path = EVIDENCE_DIR / "sft_summary.json"
    if not summary_path.exists():
        # Nothing written yet — the dashboard treats 404 as "disabled/pending".
        return JSONResponse({}, status_code=404)
    try:
        payload = json.loads(summary_path.read_text())
    except Exception:
        return JSONResponse({"error": "sft_summary unreadable"}, status_code=500)
    return JSONResponse(payload)
1036
+
1037
+
1038
  @app.get("/evidence")
1039
  def evidence_index() -> JSONResponse:
1040
  """List every evidence artifact currently on disk."""
 
1092
 
1093
 
1094
@app.post("/train")
async def train(request: Request) -> JSONResponse:
    """Start a training run.

    The request body (JSON object, optional) is merged into a copy of
    the global ``CONFIG`` for *this* run only, so API callers can flip
    ``sft_warmstart`` (or any other config key) without redeploying the
    Space. Unknown keys are accepted as-is — type coercion is the
    caller's responsibility.

    Raises:
        HTTPException: 400 if the body is malformed JSON or not a JSON
            object; 409 if a run is already in progress (propagated
            from ``_start_training``).
    """
    overrides: Dict[str, Any] = {}
    try:
        body = await request.body()
        if body:
            overrides = json.loads(body)
            if not isinstance(overrides, dict):
                raise ValueError("request body must be a JSON object")
    # json.JSONDecodeError subclasses ValueError, so one clause covers both;
    # chain the cause so the original parse error survives in tracebacks.
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"bad request body: {exc}") from exc
    cfg = dict(CONFIG)
    cfg.update(overrides)
    try:
        _start_training(cfg)
    except RuntimeError as exc:
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    return JSONResponse({"status": "started", "config": cfg})
1120
 
1121
 
1122
  @app.on_event("startup")