anugrahhu committed on
Commit 30adf48 · verified · 1 Parent(s): 8f805e2

fix: switch trainer Space to vanilla GRPO path

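The Space now reads a `TRAINING_BACKEND` environment variable (default `vanilla`) and dispatches the launcher, the evaluation flags, and the Hub push on it: the vanilla path runs `training.training_script` on `HuggingFaceTB/SmolLM2-360M-Instruct` with smaller episode and eval budgets and uploads the full model directory, while `TRAINING_BACKEND=unsloth` keeps the previous `training.training_unsloth` + LoRA-adapter flow. A condensed sketch of the new dispatch (flag list trimmed for brevity; see `_build_training_cmd` in the diff below for the full argument set):

```python
# Condensed from the new _build_training_cmd; only a subset of CLI flags shown.
import sys
from pathlib import Path


def build_training_cmd(config: dict) -> list[str]:
    backend = str(config.get("training_backend", "vanilla")).lower()
    if backend == "vanilla":
        # Vanilla GRPO: plain single-process python, no Unsloth/accelerate.
        python_bin = ("/usr/local/bin/python"
                      if Path("/usr/local/bin/python").exists() else sys.executable)
        return [python_bin, "-m", "training.training_script",
                "--model_name", config["model_name"],
                "--output_dir", config["output_dir"]]
    if backend != "unsloth":
        raise ValueError(f"unknown TRAINING_BACKEND={backend!r}")
    # Unsloth GRPO: wrap in `accelerate launch` when more than one GPU is requested.
    base = ["-m", "training.training_unsloth",
            "--model_name", config["model_name"],
            "--output_dir", config["output_dir"]]
    if int(config.get("num_gpus", 1)) > 1:
        return ["accelerate", "launch", "--num_processes", str(config["num_gpus"]),
                "--mixed_precision", "bf16"] + base
    return [sys.executable] + base
```

On the vanilla path the evaluation commands additionally get `--no_unsloth` appended, and the post-training push switches from `scripts.push_to_hub` (adapters) to `huggingface_hub.HfApi.upload_folder` over the whole output directory.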
Files changed (1)
  1. space/training/app.py +747 -673
space/training/app.py CHANGED
@@ -1,673 +1,747 @@
1
- """FastAPI control panel for the CERNenv trainer Space.
2
-
3
- Endpoints:
4
- GET / → status page (HTML)
5
- GET /status → JSON status of the current training run
6
- GET /metrics → JSON snapshot of reward / success rate
7
- GET /logs → tail of the training log
8
- POST /train → start (or restart) a training run
9
- GET /health → liveness probe
10
-
11
- Designed to run on a Hugging Face Space with `sdk: docker`. Heavy training
12
- work runs in a background thread so the HTTP server stays responsive.
13
- """
14
-
15
- from __future__ import annotations
16
-
17
- import json
18
- import logging
19
- import os
20
- import subprocess
21
- import sys
22
- import threading
23
- import time
24
- from datetime import datetime, timezone
25
- from pathlib import Path
26
- from typing import Any, Dict, Optional
27
-
28
- from fastapi import FastAPI, HTTPException
29
- from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse
30
- from fastapi.staticfiles import StaticFiles
31
-
32
-
33
- logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
34
- logger = logging.getLogger(__name__)
35
-
36
-
37
- def _resolve_repo_root() -> Path:
38
- env_root = os.environ.get("CERNENV_ROOT")
39
- candidates = []
40
- if env_root:
41
- candidates.append(Path(env_root))
42
- candidates.extend([
43
- Path("/home/user/app"),
44
- Path(__file__).resolve().parent.parent.parent,
45
- ])
46
- for p in candidates:
47
- try:
48
- if p.exists():
49
- return p.resolve()
50
- except OSError:
51
- continue
52
- return candidates[-1].resolve()
53
-
54
-
55
- REPO_ROOT = _resolve_repo_root()
56
- LOG_DIR = REPO_ROOT / "training" / "runs"
57
- try:
58
- LOG_DIR.mkdir(parents=True, exist_ok=True)
59
- except OSError as exc: # pragma: no cover - read-only filesystem fallback
60
- logger.warning("could not create %s (%s); using /tmp", LOG_DIR, exc)
61
- LOG_DIR = Path("/tmp/cernenv-runs")
62
- LOG_DIR.mkdir(parents=True, exist_ok=True)
63
- LOG_FILE = LOG_DIR / "training.log"
64
- EVIDENCE_DIR = REPO_ROOT / "evidence"
65
- try:
66
- EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)
67
- except OSError: # pragma: no cover
68
- EVIDENCE_DIR = Path("/tmp/cernenv-evidence")
69
- EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)
70
- METRICS_FILE = EVIDENCE_DIR / "before_after_metrics.json"
71
-
72
-
73
- def _env(name: str, default: str) -> str:
74
- return os.environ.get(name, default)
75
-
76
-
77
- def _detect_gpus() -> int:
78
- try:
79
- import torch # type: ignore
80
- if torch.cuda.is_available():
81
- return torch.cuda.device_count()
82
- except Exception:
83
- pass
84
- try:
85
- out = subprocess.run(
86
- ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
87
- capture_output=True, text=True, timeout=5,
88
- )
89
- return len([l for l in out.stdout.splitlines() if l.strip()])
90
- except Exception:
91
- return 0
92
-
93
-
94
- _NUM_GPUS = _detect_gpus()
95
-
96
-
97
- CONFIG = {
98
- "model_name": _env("MODEL_NAME", "unsloth/Qwen2.5-3B-Instruct"),
99
- "difficulty": _env("DIFFICULTY", "easy"),
100
- "curriculum": _env("CURRICULUM", "1") == "1",
101
- "curriculum_promote": float(_env("CURRICULUM_PROMOTE", "0.55")),
102
- "curriculum_demote": float(_env("CURRICULUM_DEMOTE", "0.10")),
103
- "total_episodes": int(_env("TOTAL_EPISODES", "1500")),
104
- "max_steps": int(_env("MAX_STEPS", "18")),
105
- "num_generations": int(_env("NUM_GENERATIONS", "8")),
106
- "checkpoint_eval_steps": int(_env("CHECKPOINT_EVAL_STEPS", "25")),
107
- "checkpoint_eval_episodes": int(_env("CHECKPOINT_EVAL_EPISODES", "8")),
108
- "eval_episodes": int(_env("EVAL_EPISODES", "32")),
109
- "output_dir": _env("OUTPUT_DIR", "runs/unsloth-grpo"),
110
- "evidence_dir": _env("EVIDENCE_DIR", "evidence"),
111
- "num_gpus": int(_env("NUM_GPUS", str(_NUM_GPUS or 1))),
112
- "hf_username": _env("HF_USERNAME", "anugrah55"),
113
- "push_repo": _env(
114
- "PUSH_REPO",
115
- f"{_env('HF_USERNAME', 'anugrah55')}/cernenv-grpo-qwen2.5-3b",
116
- ),
117
- "autostart": _env("AUTOSTART", "0") == "1",
118
- }
119
-
120
-
121
- # ── Run state ────────────────────────────────────────────────────────────
122
-
123
-
124
- class RunState:
125
- def __init__(self) -> None:
126
- self.lock = threading.Lock()
127
- self.thread: Optional[threading.Thread] = None
128
- self.process: Optional[subprocess.Popen] = None
129
- self.status: str = "idle" # idle | running | finished | failed
130
- self.started_at: Optional[str] = None
131
- self.finished_at: Optional[str] = None
132
- self.last_error: Optional[str] = None
133
- self.last_config: Dict[str, Any] = {}
134
-
135
- def to_dict(self) -> Dict[str, Any]:
136
- with self.lock:
137
- return {
138
- "status": self.status,
139
- "started_at": self.started_at,
140
- "finished_at": self.finished_at,
141
- "last_error": self.last_error,
142
- "last_config": self.last_config,
143
- }
144
-
145
-
146
- STATE = RunState()
147
-
148
-
149
- # ── Training pipeline ────────────────────────────────────────────────────
150
-
151
-
152
- def _stream_subprocess(cmd: list[str], log_handle) -> int:
153
- log_handle.write(f"\n$ {' '.join(cmd)}\n")
154
- log_handle.flush()
155
- proc = subprocess.Popen(
156
- cmd,
157
- cwd=str(REPO_ROOT),
158
- stdout=subprocess.PIPE,
159
- stderr=subprocess.STDOUT,
160
- bufsize=1,
161
- universal_newlines=True,
162
- env={**os.environ, "PYTHONPATH": str(REPO_ROOT)},
163
- )
164
- STATE.process = proc
165
- assert proc.stdout is not None
166
- for line in proc.stdout:
167
- log_handle.write(line)
168
- log_handle.flush()
169
- rc = proc.wait()
170
- log_handle.write(f"[exit code {rc}]\n")
171
- log_handle.flush()
172
- STATE.process = None
173
- return rc
174
-
175
-
176
- def _build_training_cmd(config: Dict[str, Any]) -> list[str]:
177
- """Compose the training launcher (single-GPU python or multi-GPU accelerate)."""
178
- base = [
179
- "-m", "training.training_unsloth",
180
- "--model_name", config["model_name"],
181
- "--difficulty", config["difficulty"],
182
- "--total_episodes", str(config["total_episodes"]),
183
- "--max_steps", str(config["max_steps"]),
184
- "--num_generations", str(config["num_generations"]),
185
- "--checkpoint_eval_steps", str(config["checkpoint_eval_steps"]),
186
- "--checkpoint_eval_episodes", str(config["checkpoint_eval_episodes"]),
187
- "--output_dir", config["output_dir"],
188
- "--evidence_dir", config["evidence_dir"],
189
- ]
190
- if config.get("curriculum"):
191
- base.extend([
192
- "--curriculum",
193
- "--curriculum_promote", str(config["curriculum_promote"]),
194
- "--curriculum_demote", str(config["curriculum_demote"]),
195
- ])
196
- n = max(int(config.get("num_gpus", 1)), 1)
197
- if n > 1:
198
- return ["accelerate", "launch", "--num_processes", str(n), "--mixed_precision", "bf16"] + base
199
- return [sys.executable] + base
200
-
201
-
202
- def _push_evidence_to_hub(*, evidence_dir: Path, repo_id: str, log) -> None:
203
- """Upload the entire evidence/ directory to the model repo."""
204
- token = os.environ.get("HF_TOKEN")
205
- if not token:
206
- log.write("\n[skip] HF_TOKEN not set — evidence not pushed\n")
207
- log.flush()
208
- return
209
- try:
210
- from huggingface_hub import HfApi
211
- api = HfApi(token=token)
212
- api.upload_folder(
213
- folder_path=str(evidence_dir),
214
- repo_id=repo_id,
215
- repo_type="model",
216
- path_in_repo="evidence",
217
- commit_message="Upload CERNenv training evidence (curves, evals, plots)",
218
- )
219
- log.write(f"\n[ok] uploaded evidence/ → https://huggingface.co/{repo_id}/tree/main/evidence\n")
220
- log.flush()
221
- except Exception as exc:
222
- log.write(f"\n[warn] evidence push failed: {exc}\n")
223
- log.flush()
224
-
225
-
226
- def _training_pipeline(config: Dict[str, Any]) -> None:
227
- started = datetime.now(timezone.utc).isoformat()
228
- with STATE.lock:
229
- STATE.status = "running"
230
- STATE.started_at = started
231
- STATE.finished_at = None
232
- STATE.last_error = None
233
- STATE.last_config = dict(config)
234
-
235
- evidence_dir = Path(config["evidence_dir"]).resolve()
236
- evidence_dir.mkdir(parents=True, exist_ok=True)
237
-
238
- LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
239
- with open(LOG_FILE, "a") as log:
240
- log.write(f"\n=== Training started {started} ===\n")
241
- log.write(json.dumps(config, indent=2) + "\n")
242
- log.flush()
243
- try:
244
- output_dir = config["output_dir"]
245
- difficulty = config["difficulty"]
246
- max_steps = str(config["max_steps"])
247
- eval_episodes = str(config["eval_episodes"])
248
- model_name = config["model_name"]
249
- push_repo = config["push_repo"]
250
- evidence_str = config["evidence_dir"]
251
- pre_jsonl = f"{evidence_str}/pre_eval.jsonl"
252
- post_jsonl = f"{evidence_str}/post_eval.jsonl"
253
-
254
- log.write("\n--- baseline sanity check (random / heuristic / oracle) ---\n")
255
- log.flush()
256
- for agent in ("random", "heuristic", "oracle"):
257
- _stream_subprocess(
258
- [
259
- sys.executable, "-m", "scripts.run_agent",
260
- "--agent", agent, "--difficulty", difficulty,
261
- "--episodes", "3", "--quiet",
262
- ],
263
- log,
264
- )
265
-
266
- log.write(f"\n--- pre-train evaluation ({eval_episodes} eps) ---\n")
267
- log.flush()
268
- rc = _stream_subprocess(
269
- [
270
- sys.executable, "-m", "training.evaluate",
271
- "--model_name", model_name,
272
- "--difficulty", difficulty,
273
- "--episodes", eval_episodes,
274
- "--max_steps", max_steps,
275
- "--tag", "pre_train",
276
- "--out", pre_jsonl,
277
- ],
278
- log,
279
- )
280
- if rc != 0:
281
- # don't abort — we still want training + post-eval evidence.
282
- log.write(f"\n[warn] pre-train eval failed (rc={rc}); continuing without baseline\n")
283
- log.flush()
284
-
285
- log.write(f"\n--- GRPO training ({config['num_gpus']} GPU process(es)) ---\n")
286
- log.flush()
287
- rc = _stream_subprocess(_build_training_cmd(config), log)
288
- if rc != 0:
289
- raise RuntimeError(f"training failed (rc={rc})")
290
-
291
- # ── LoRA save-and-reload smoke test ─────────────────────
292
- # Hackathon FAQ Q9: "Do not upcast a 4-bit model to 16-bit
293
- # and then merge the LoRA weights naively" — the canonical
294
- # cause of a broken push. Before we burn time on the full
295
- # post-train evaluation (32 eps), do a 2-episode cold-load
296
- # rollout against the saved adapters. If that fails, abort
297
- # immediately so we surface a save problem, not a 30-min
298
- # eval timeout.
299
- log.write(
300
- f"\n--- adapter save/reload smoke test "
301
- f"(loading {output_dir} cold-start, 2 eps) ---\n"
302
- )
303
- log.flush()
304
- rc = _stream_subprocess(
305
- [
306
- sys.executable, "-m", "training.evaluate",
307
- "--model_name", model_name,
308
- "--adapter_dir", output_dir,
309
- "--difficulty", difficulty,
310
- "--episodes", "2",
311
- "--max_steps", max_steps,
312
- "--tag", "smoke",
313
- "--out", f"{evidence_str}/smoke_eval.jsonl",
314
- ],
315
- log,
316
- )
317
- if rc != 0:
318
- raise RuntimeError(
319
- f"adapter smoke test failed (rc={rc}); refusing to push "
320
- f"unloadable adapters to the Hub. Inspect {output_dir} and "
321
- "verify adapter_config.json + adapter_model.safetensors exist."
322
- )
323
-
324
- log.write(f"\n--- post-train evaluation ({eval_episodes} eps) ---\n")
325
- log.flush()
326
- rc = _stream_subprocess(
327
- [
328
- sys.executable, "-m", "training.evaluate",
329
- "--model_name", model_name,
330
- "--adapter_dir", output_dir,
331
- "--difficulty", difficulty,
332
- "--episodes", eval_episodes,
333
- "--max_steps", max_steps,
334
- "--tag", "post_train",
335
- "--out", post_jsonl,
336
- ],
337
- log,
338
- )
339
- if rc != 0:
340
- log.write(f"\n[warn] post-train eval failed (rc={rc}); evidence will be partial\n")
341
- log.flush()
342
-
343
- log.write("\n--- evidence: before/after summary, distribution, trajectories ---\n")
344
- log.flush()
345
- try:
346
- from training.evidence import (
347
- EvidencePaths,
348
- render_before_after,
349
- render_sample_trajectories,
350
- render_training_curve,
351
- render_reward_components,
352
- render_checkpoint_progression,
353
- )
354
- paths = EvidencePaths(root=Path(evidence_str))
355
- paths.ensure()
356
- metrics = render_before_after(
357
- pre_jsonl=Path(pre_jsonl),
358
- post_jsonl=Path(post_jsonl),
359
- summary_png=paths.before_after_summary_png,
360
- distribution_png=paths.reward_distribution_png,
361
- metrics_json=paths.before_after_metrics_json,
362
- )
363
- render_sample_trajectories(
364
- pre_jsonl=Path(pre_jsonl),
365
- post_jsonl=Path(post_jsonl),
366
- md_path=paths.sample_trajectories_md,
367
- )
368
- render_training_curve(paths.training_log_csv, paths.training_curve_png)
369
- render_reward_components(
370
- paths.reward_components_csv, paths.reward_components_png,
371
- )
372
- render_checkpoint_progression(
373
- paths.checkpoint_evals_csv, paths.checkpoint_progression_png,
374
- )
375
- log.write(json.dumps(metrics, indent=2) + "\n")
376
- log.flush()
377
- except Exception as exc:
378
- log.write(f"[warn] evidence rendering failed: {exc}\n")
379
- log.flush()
380
-
381
- if os.environ.get("HF_TOKEN"):
382
- log.write("\n--- push adapters to Hub ---\n")
383
- log.flush()
384
- _stream_subprocess(
385
- [
386
- sys.executable, "-m", "scripts.push_to_hub", "model",
387
- "--adapter_dir", output_dir,
388
- "--repo_id", push_repo,
389
- "--base_model", model_name,
390
- ],
391
- log,
392
- )
393
- _push_evidence_to_hub(
394
- evidence_dir=evidence_dir,
395
- repo_id=push_repo,
396
- log=log,
397
- )
398
- else:
399
- log.write("\n[skip] HF_TOKEN not set — not pushing to Hub\n")
400
- log.flush()
401
-
402
- with STATE.lock:
403
- STATE.status = "finished"
404
- except Exception as exc:
405
- logger.exception("training pipeline failed")
406
- with STATE.lock:
407
- STATE.status = "failed"
408
- STATE.last_error = str(exc)
409
- finally:
410
- finished = datetime.now(timezone.utc).isoformat()
411
- log.write(f"\n=== Training ended {finished} ===\n")
412
- log.flush()
413
- with STATE.lock:
414
- STATE.finished_at = finished
415
-
416
-
417
- def _start_training(config: Dict[str, Any]) -> None:
418
- with STATE.lock:
419
- if STATE.status == "running":
420
- raise RuntimeError("a training run is already in progress")
421
- STATE.thread = threading.Thread(
422
- target=_training_pipeline,
423
- args=(config,),
424
- name="cernenv-trainer",
425
- daemon=True,
426
- )
427
- STATE.thread.start()
428
-
429
-
430
- # ── FastAPI app ──────────────────────────────────────────────────────────
431
-
432
-
433
- app = FastAPI(title="CERNenv Trainer", version="0.1.0")
434
-
435
-
436
- _HTML = """\
437
- <!doctype html>
438
- <html lang=en>
439
- <head>
440
- <meta charset=utf-8>
441
- <title>CERNenv Trainer</title>
442
- <style>
443
- body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 2rem auto;
444
- max-width: 1000px; color:#111; padding: 0 1rem; line-height:1.5 }
445
- h1 { margin-bottom: 0 }
446
- h2 { margin-top: 2rem; border-bottom:1px solid #eee; padding-bottom:.25rem }
447
- .muted { color:#666 }
448
- pre { background:#0e1116; color:#e6edf3; padding:1rem; border-radius:6px;
449
- overflow-x:auto; max-height:40vh; font-size:.85em }
450
- button { font-size:1rem; padding:.6rem 1rem; border-radius:6px; border:1px solid #888;
451
- background:#fff; cursor:pointer; margin-right:.4rem }
452
- .pill { display:inline-block; padding:.1rem .55rem; border-radius:999px;
453
- background:#eef; color:#225; font-size:.85em }
454
- .ok { background:#dfd; color:#272 }
455
- .fail { background:#fdd; color:#822 }
456
- .run { background:#fdf6d8; color:#774 }
457
- table { border-collapse:collapse; margin:.5rem 0 }
458
- td, th { padding:.25rem .8rem .25rem 0; vertical-align: top; text-align:left }
459
- th { color:#444; font-weight:600 }
460
- .grid { display:grid; grid-template-columns:1fr 1fr; gap:1rem }
461
- .card { border:1px solid #e5e7eb; border-radius:8px; padding:.75rem; background:#fafafa }
462
- .card img { max-width:100%; border-radius:4px }
463
- .delta-pos { color:#15803d; font-weight:600 }
464
- .delta-neg { color:#b91c1c; font-weight:600 }
465
- code { background:#f4f4f4; padding:.05rem .35rem; border-radius:4px }
466
- a { color:#1d4ed8 }
467
- </style>
468
- </head>
469
- <body>
470
- <h1>⚛️ CERNenv Trainer</h1>
471
- <p class=muted>GRPO + Unsloth + LoRA on the CERNenv LHC discovery environment. Multi-GPU on Hugging Face Spaces.</p>
472
-
473
- <h2>Run status</h2>
474
- <p>Status: <span id=status class=pill>?</span></p>
475
- <table id=meta></table>
476
- <p>
477
- <button onclick="startRun()">▶ Start training</button>
478
- <button onclick="refresh()">↻ Refresh</button>
479
- <a href="/evidence" target=_blank><button>📁 Evidence index</button></a>
480
- <a href="/docs" target=_blank><button>🛠 API</button></a>
481
- </p>
482
-
483
- <h2>Training-progress evidence</h2>
484
- <p class=muted>Auto-updated as training runs. All artifacts are also saved to <code>evidence/</code> and pushed to the model repo on the Hub.</p>
485
- <div class=grid>
486
- <div class=card><b>Per-step training curve</b><br>
487
- <img id=curve src="/evidence/training_curve.png" onerror="this.style.display='none'">
488
- <div id=curve_missing class=muted style="display:none">(not yet — waiting for first GRPO step)</div>
489
- </div>
490
- <div class=card><b>Reward components (terminal vs shaping)</b><br>
491
- <img id=components src="/evidence/reward_components.png" onerror="this.style.display='none'">
492
- <div id=components_missing class=muted style="display:none">(populated after a few rollouts — watches verifier hacks)</div>
493
- </div>
494
- <div class=card><b>Mid-training checkpoint progression</b><br>
495
- <img id=ckpt src="/evidence/checkpoint_progression.png" onerror="this.style.display='none'">
496
- <div id=ckpt_missing class=muted style="display:none">(not yet — waiting for first checkpoint eval)</div>
497
- </div>
498
- <div class=card><b>Before vs after summary</b><br>
499
- <img id=summary src="/evidence/before_after_summary.png" onerror="this.style.display='none'">
500
- <div id=summary_missing class=muted style="display:none">(generated after post-train eval)</div>
501
- </div>
502
- <div class=card><b>Reward distribution: pre vs post</b><br>
503
- <img id=dist src="/evidence/reward_distribution.png" onerror="this.style.display='none'">
504
- <div id=dist_missing class=muted style="display:none">(generated after post-train eval)</div>
505
- </div>
506
- </div>
507
-
508
- <h2>Before / after metrics</h2>
509
- <table id=metrics_table>
510
- <tr><th>metric</th><th>pre</th><th>post</th><th>Δ</th></tr>
511
- </table>
512
-
513
- <h2>Live logs (tail)</h2>
514
- <pre id=logs>loading…</pre>
515
-
516
- <script>
517
- function fmt(v) {
518
- if (v == null) return '–';
519
- if (typeof v === 'number') return v.toFixed(3);
520
- return v;
521
- }
522
- function fmtDelta(d) {
523
- if (d == null || isNaN(d)) return '–';
524
- const sign = d >= 0 ? '+' : '';
525
- const cls = d >= 0 ? 'delta-pos' : 'delta-neg';
526
- return `<span class="${cls}">${sign}${d.toFixed(3)}</span>`;
527
- }
528
-
529
- async function refresh() {
530
- // status
531
- const s = await fetch('/status').then(r => r.json());
532
- const pill = document.getElementById('status');
533
- pill.textContent = s.status;
534
- pill.className = 'pill ' + ({idle:'',running:'run',finished:'ok',failed:'fail'}[s.status] || '');
535
-
536
- const meta = document.getElementById('meta');
537
- meta.innerHTML = '';
538
- const obj = {
539
- started_at: s.started_at, finished_at: s.finished_at, error: s.last_error,
540
- ...(s.last_config || {}),
541
- };
542
- for (const [k, v] of Object.entries(obj)) {
543
- if (v == null || v === '') continue;
544
- const tr = document.createElement('tr');
545
- tr.innerHTML = `<td><b>${k}</b></td><td><code>${v}</code></td>`;
546
- meta.appendChild(tr);
547
- }
548
-
549
- // metrics
550
- const m = await fetch('/metrics').then(r => r.json()).catch(() => ({pre:null, post:null}));
551
- const tbody = document.getElementById('metrics_table');
552
- tbody.innerHTML = '<tr><th>metric</th><th>pre</th><th>post</th><th>Δ</th></tr>';
553
- const fields = ['mean_reward', 'success_rate', 'mass_acc', 'channel_acc', 'median_reward'];
554
- for (const f of fields) {
555
- const pre = m.pre && m.pre[f];
556
- const post = m.post && m.post[f];
557
- const delta = m.delta && m.delta[f];
558
- const tr = document.createElement('tr');
559
- tr.innerHTML = `<td><code>${f}</code></td><td>${fmt(pre)}</td><td>${fmt(post)}</td><td>${fmtDelta(delta)}</td>`;
560
- tbody.appendChild(tr);
561
- }
562
-
563
- // bust caches on plots
564
- const bust = '?t=' + Date.now();
565
- for (const [imgId, missingId] of [
566
- ['curve', 'curve_missing'],
567
- ['components', 'components_missing'],
568
- ['ckpt', 'ckpt_missing'],
569
- ['summary', 'summary_missing'],
570
- ['dist', 'dist_missing'],
571
- ]) {
572
- const img = document.getElementById(imgId);
573
- const miss = document.getElementById(missingId);
574
- const baseSrc = img.getAttribute('src').split('?')[0];
575
- const probe = new Image();
576
- probe.onload = () => { img.src = baseSrc + bust; img.style.display=''; miss.style.display='none'; };
577
- probe.onerror = () => { img.style.display='none'; miss.style.display=''; };
578
- probe.src = baseSrc + bust;
579
- }
580
-
581
- const logs = await fetch('/logs?tail=200').then(r => r.text());
582
- document.getElementById('logs').textContent = logs || '(no logs yet)';
583
- }
584
- async function startRun() {
585
- const r = await fetch('/train', {method:'POST'});
586
- if (!r.ok) alert((await r.json()).detail || 'failed');
587
- setTimeout(refresh, 500);
588
- }
589
- refresh();
590
- setInterval(refresh, 5000);
591
- </script>
592
- </body>
593
- </html>
594
- """
595
-
596
-
597
- @app.get("/", response_class=HTMLResponse)
598
- def index() -> HTMLResponse:
599
- return HTMLResponse(_HTML)
600
-
601
-
602
- @app.get("/health")
603
- def health() -> Dict[str, str]:
604
- return {"status": "ok"}
605
-
606
-
607
- @app.get("/status")
608
- def status() -> JSONResponse:
609
- return JSONResponse(STATE.to_dict())
610
-
611
-
612
- @app.get("/metrics")
613
- def metrics() -> JSONResponse:
614
- if METRICS_FILE.exists():
615
- try:
616
- return JSONResponse(json.loads(METRICS_FILE.read_text()))
617
- except Exception:
618
- return JSONResponse({"error": "metrics file unreadable"}, status_code=500)
619
- return JSONResponse({"pre": None, "post": None, "delta": None})
620
-
621
-
622
- @app.get("/evidence")
623
- def evidence_index() -> JSONResponse:
624
- """List every evidence artifact currently on disk."""
625
- files = []
626
- if EVIDENCE_DIR.exists():
627
- for p in sorted(EVIDENCE_DIR.iterdir()):
628
- if p.is_file():
629
- files.append({
630
- "name": p.name,
631
- "size": p.stat().st_size,
632
- "url": f"/evidence/{p.name}",
633
- })
634
- return JSONResponse({"dir": str(EVIDENCE_DIR), "files": files})
635
-
636
-
637
- @app.get("/evidence/{name}")
638
- def evidence_file(name: str):
639
- """Serve a single evidence artifact (PNG/CSV/JSON/MD) by filename."""
640
- if "/" in name or ".." in name:
641
- raise HTTPException(status_code=400, detail="invalid name")
642
- target = EVIDENCE_DIR / name
643
- if not target.exists() or not target.is_file():
644
- raise HTTPException(status_code=404, detail=f"{name} not found")
645
- return FileResponse(target)
646
-
647
-
648
- @app.get("/logs", response_class=PlainTextResponse)
649
- def logs(tail: int = 400) -> PlainTextResponse:
650
- if not LOG_FILE.exists():
651
- return PlainTextResponse("")
652
- text = LOG_FILE.read_text()
653
- lines = text.splitlines()
654
- return PlainTextResponse("\n".join(lines[-max(tail, 1):]))
655
-
656
-
657
- @app.post("/train")
658
- def train() -> JSONResponse:
659
- try:
660
- _start_training(dict(CONFIG))
661
- except RuntimeError as exc:
662
- raise HTTPException(status_code=409, detail=str(exc))
663
- return JSONResponse({"status": "started", "config": CONFIG})
664
-
665
-
666
- @app.on_event("startup")
667
- def _maybe_autostart() -> None:
668
- if CONFIG["autostart"]:
669
- try:
670
- _start_training(dict(CONFIG))
671
- logger.info("autostarted training run")
672
- except RuntimeError as exc:
673
- logger.warning("autostart skipped: %s", exc)
1
+ """FastAPI control panel for the CERNenv trainer Space.
2
+
3
+ Endpoints:
4
+ GET / → status page (HTML)
5
+ GET /status → JSON status of the current training run
6
+ GET /metrics → JSON snapshot of reward / success rate
7
+ GET /logs → tail of the training log
8
+ POST /train → start (or restart) a training run
9
+ GET /health → liveness probe
10
+
11
+ Designed to run on a Hugging Face Space with `sdk: docker`. Heavy training
12
+ work runs in a background thread so the HTTP server stays responsive.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import logging
19
+ import os
20
+ import subprocess
21
+ import sys
22
+ import threading
23
+ import time
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+ from typing import Any, Dict, Optional
27
+
28
+ from fastapi import FastAPI, HTTPException
29
+ from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse
30
+ from fastapi.staticfiles import StaticFiles
31
+
32
+
33
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ def _resolve_repo_root() -> Path:
38
+ env_root = os.environ.get("CERNENV_ROOT")
39
+ candidates = []
40
+ if env_root:
41
+ candidates.append(Path(env_root))
42
+ candidates.extend([
43
+ Path("/home/user/app"),
44
+ Path(__file__).resolve().parent.parent.parent,
45
+ ])
46
+ for p in candidates:
47
+ try:
48
+ if p.exists():
49
+ return p.resolve()
50
+ except OSError:
51
+ continue
52
+ return candidates[-1].resolve()
53
+
54
+
55
+ REPO_ROOT = _resolve_repo_root()
56
+ LOG_DIR = REPO_ROOT / "training" / "runs"
57
+ try:
58
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
59
+ except OSError as exc: # pragma: no cover - read-only filesystem fallback
60
+ logger.warning("could not create %s (%s); using /tmp", LOG_DIR, exc)
61
+ LOG_DIR = Path("/tmp/cernenv-runs")
62
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
63
+ LOG_FILE = LOG_DIR / "training.log"
64
+ EVIDENCE_DIR = REPO_ROOT / "evidence"
65
+ try:
66
+ EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)
67
+ except OSError: # pragma: no cover
68
+ EVIDENCE_DIR = Path("/tmp/cernenv-evidence")
69
+ EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)
70
+ METRICS_FILE = EVIDENCE_DIR / "before_after_metrics.json"
71
+
72
+
73
+ def _env(name: str, default: str) -> str:
74
+ return os.environ.get(name, default)
75
+
76
+
77
+ def _detect_gpus() -> int:
78
+ try:
79
+ import torch # type: ignore
80
+ if torch.cuda.is_available():
81
+ return torch.cuda.device_count()
82
+ except Exception:
83
+ pass
84
+ try:
85
+ out = subprocess.run(
86
+ ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
87
+ capture_output=True, text=True, timeout=5,
88
+ )
89
+ return len([l for l in out.stdout.splitlines() if l.strip()])
90
+ except Exception:
91
+ return 0
92
+
93
+
94
+ _NUM_GPUS = _detect_gpus()
95
+
96
+
97
+ CONFIG = {
98
+ "training_backend": _env("TRAINING_BACKEND", "vanilla"),
99
+ "model_name": _env("MODEL_NAME", "HuggingFaceTB/SmolLM2-360M-Instruct"),
100
+ "difficulty": _env("DIFFICULTY", "easy"),
101
+ "curriculum": _env("CURRICULUM", "0") == "1",
102
+ "curriculum_promote": float(_env("CURRICULUM_PROMOTE", "0.55")),
103
+ "curriculum_demote": float(_env("CURRICULUM_DEMOTE", "0.10")),
104
+ "total_episodes": int(_env("TOTAL_EPISODES", "120")),
105
+ "max_steps": int(_env("MAX_STEPS", "12")),
106
+ "num_generations": int(_env("NUM_GENERATIONS", "4")),
107
+ "checkpoint_eval_steps": int(_env("CHECKPOINT_EVAL_STEPS", "25")),
108
+ "checkpoint_eval_episodes": int(_env("CHECKPOINT_EVAL_EPISODES", "8")),
109
+ "eval_episodes": int(_env("EVAL_EPISODES", "8")),
110
+ "output_dir": _env("OUTPUT_DIR", "runs/vanilla-grpo"),
111
+ "evidence_dir": _env("EVIDENCE_DIR", "evidence"),
112
+ "num_gpus": int(_env("NUM_GPUS", "1")),
113
+ "hf_username": _env("HF_USERNAME", "anugrahhu"),
114
+ "push_repo": _env(
115
+ "PUSH_REPO",
116
+ f"{_env('HF_USERNAME', 'anugrahhu')}/cernenv-grpo-smollm2-360m",
117
+ ),
118
+ "autostart": _env("AUTOSTART", "0") == "1",
119
+ }
120
+
121
+
122
+ # ── Run state ────────────────────────────────────────────────────────────
123
+
124
+
125
+ class RunState:
126
+ def __init__(self) -> None:
127
+ self.lock = threading.Lock()
128
+ self.thread: Optional[threading.Thread] = None
129
+ self.process: Optional[subprocess.Popen] = None
130
+ self.status: str = "idle" # idle | running | finished | failed
131
+ self.started_at: Optional[str] = None
132
+ self.finished_at: Optional[str] = None
133
+ self.last_error: Optional[str] = None
134
+ self.last_config: Dict[str, Any] = {}
135
+
136
+ def to_dict(self) -> Dict[str, Any]:
137
+ with self.lock:
138
+ return {
139
+ "status": self.status,
140
+ "started_at": self.started_at,
141
+ "finished_at": self.finished_at,
142
+ "last_error": self.last_error,
143
+ "last_config": self.last_config,
144
+ }
145
+
146
+
147
+ STATE = RunState()
148
+
149
+
150
+ # ── Training pipeline ────────────────────────────────────────────────────
151
+
152
+
153
+ def _stream_subprocess(cmd: list[str], log_handle) -> int:
154
+ log_handle.write(f"\n$ {' '.join(cmd)}\n")
155
+ log_handle.flush()
156
+ proc = subprocess.Popen(
157
+ cmd,
158
+ cwd=str(REPO_ROOT),
159
+ stdout=subprocess.PIPE,
160
+ stderr=subprocess.STDOUT,
161
+ bufsize=1,
162
+ universal_newlines=True,
163
+ env={**os.environ, "PYTHONPATH": str(REPO_ROOT)},
164
+ )
165
+ STATE.process = proc
166
+ assert proc.stdout is not None
167
+ for line in proc.stdout:
168
+ log_handle.write(line)
169
+ log_handle.flush()
170
+ rc = proc.wait()
171
+ log_handle.write(f"[exit code {rc}]\n")
172
+ log_handle.flush()
173
+ STATE.process = None
174
+ return rc
175
+
176
+
177
+ def _build_training_cmd(config: Dict[str, Any]) -> list[str]:
178
+ """Compose the selected training launcher."""
179
+ backend = str(config.get("training_backend", "vanilla")).lower()
180
+ if backend == "vanilla":
181
+ python_bin = "/usr/local/bin/python" if Path("/usr/local/bin/python").exists() else sys.executable
182
+ return [
183
+ python_bin, "-m", "training.training_script",
184
+ "--model_name", config["model_name"],
185
+ "--difficulty", config["difficulty"],
186
+ "--total_episodes", str(config["total_episodes"]),
187
+ "--max_steps", str(config["max_steps"]),
188
+ "--num_generations", str(config["num_generations"]),
189
+ "--output_dir", config["output_dir"],
190
+ ]
191
+
192
+ if backend != "unsloth":
193
+ raise ValueError(f"unknown TRAINING_BACKEND={backend!r}")
194
+
195
+ base = [
196
+ "-m", "training.training_unsloth",
197
+ "--model_name", config["model_name"],
198
+ "--difficulty", config["difficulty"],
199
+ "--total_episodes", str(config["total_episodes"]),
200
+ "--max_steps", str(config["max_steps"]),
201
+ "--num_generations", str(config["num_generations"]),
202
+ "--checkpoint_eval_steps", str(config["checkpoint_eval_steps"]),
203
+ "--checkpoint_eval_episodes", str(config["checkpoint_eval_episodes"]),
204
+ "--output_dir", config["output_dir"],
205
+ "--evidence_dir", config["evidence_dir"],
206
+ ]
207
+ if config.get("curriculum"):
208
+ base.extend([
209
+ "--curriculum",
210
+ "--curriculum_promote", str(config["curriculum_promote"]),
211
+ "--curriculum_demote", str(config["curriculum_demote"]),
212
+ ])
213
+ n = max(int(config.get("num_gpus", 1)), 1)
214
+ if n > 1:
215
+ return ["accelerate", "launch", "--num_processes", str(n), "--mixed_precision", "bf16"] + base
216
+ return [sys.executable] + base
217
+
218
+
219
+ def _build_eval_cmd(
220
+ *,
221
+ model_name: str,
222
+ difficulty: str,
223
+ episodes: str,
224
+ max_steps: str,
225
+ tag: str,
226
+ out: str,
227
+ backend: str,
228
+ adapter_dir: Optional[str] = None,
229
+ ) -> list[str]:
230
+ cmd = [
231
+ sys.executable, "-m", "training.evaluate",
232
+ "--model_name", model_name,
233
+ "--difficulty", difficulty,
234
+ "--episodes", episodes,
235
+ "--max_steps", max_steps,
236
+ "--tag", tag,
237
+ "--out", out,
238
+ ]
239
+ if adapter_dir:
240
+ cmd.extend(["--adapter_dir", adapter_dir])
241
+ if backend == "vanilla":
242
+ cmd.append("--no_unsloth")
243
+ return cmd
244
+
245
+
246
+ def _push_model_folder_to_hub(*, output_dir: Path, repo_id: str, base_model: str, log) -> None:
247
+ """Upload a vanilla transformers model directory to the Hub."""
248
+ token = os.environ.get("HF_TOKEN")
249
+ if not token:
250
+ log.write("\n[skip] HF_TOKEN not set — model not pushed\n")
251
+ log.flush()
252
+ return
253
+ try:
254
+ from huggingface_hub import HfApi
255
+ api = HfApi(token=token)
256
+ api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
257
+ api.upload_folder(
258
+ folder_path=str(output_dir),
259
+ repo_id=repo_id,
260
+ repo_type="model",
261
+ commit_message=f"Upload vanilla GRPO model based on {base_model}",
262
+ )
263
+ log.write(f"\n[ok] uploaded model → https://huggingface.co/{repo_id}\n")
264
+ log.flush()
265
+ except Exception as exc:
266
+ log.write(f"\n[warn] model push failed: {exc}\n")
267
+ log.flush()
268
+
269
+
270
+ def _push_evidence_to_hub(*, evidence_dir: Path, repo_id: str, log) -> None:
271
+ """Upload the entire evidence/ directory to the model repo."""
272
+ token = os.environ.get("HF_TOKEN")
273
+ if not token:
274
+ log.write("\n[skip] HF_TOKEN not set — evidence not pushed\n")
275
+ log.flush()
276
+ return
277
+ try:
278
+ from huggingface_hub import HfApi
279
+ api = HfApi(token=token)
280
+ api.upload_folder(
281
+ folder_path=str(evidence_dir),
282
+ repo_id=repo_id,
283
+ repo_type="model",
284
+ path_in_repo="evidence",
285
+ commit_message="Upload CERNenv training evidence (curves, evals, plots)",
286
+ )
287
+ log.write(f"\n[ok] uploaded evidence/ → https://huggingface.co/{repo_id}/tree/main/evidence\n")
288
+ log.flush()
289
+ except Exception as exc:
290
+ log.write(f"\n[warn] evidence push failed: {exc}\n")
291
+ log.flush()
292
+
293
+
294
+ def _training_pipeline(config: Dict[str, Any]) -> None:
295
+ started = datetime.now(timezone.utc).isoformat()
296
+ with STATE.lock:
297
+ STATE.status = "running"
298
+ STATE.started_at = started
299
+ STATE.finished_at = None
300
+ STATE.last_error = None
301
+ STATE.last_config = dict(config)
302
+
303
+ evidence_dir = Path(config["evidence_dir"]).resolve()
304
+ evidence_dir.mkdir(parents=True, exist_ok=True)
305
+
306
+ LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
307
+ with open(LOG_FILE, "a") as log:
308
+ log.write(f"\n=== Training started {started} ===\n")
309
+ log.write(json.dumps(config, indent=2) + "\n")
310
+ log.flush()
311
+ try:
312
+ output_dir = config["output_dir"]
313
+ difficulty = config["difficulty"]
314
+ max_steps = str(config["max_steps"])
315
+ eval_episodes = str(config["eval_episodes"])
316
+ model_name = config["model_name"]
317
+ push_repo = config["push_repo"]
318
+ evidence_str = config["evidence_dir"]
319
+ backend = str(config.get("training_backend", "vanilla")).lower()
320
+ pre_jsonl = f"{evidence_str}/pre_eval.jsonl"
321
+ post_jsonl = f"{evidence_str}/post_eval.jsonl"
322
+
323
+ log.write("\n--- baseline sanity check (random / heuristic / oracle) ---\n")
324
+ log.flush()
325
+ for agent in ("random", "heuristic", "oracle"):
326
+ _stream_subprocess(
327
+ [
328
+ sys.executable, "-m", "scripts.run_agent",
329
+ "--agent", agent, "--difficulty", difficulty,
330
+ "--episodes", "3", "--quiet",
331
+ ],
332
+ log,
333
+ )
334
+
335
+ log.write(f"\n--- pre-train evaluation ({eval_episodes} eps) ---\n")
336
+ log.flush()
337
+ rc = _stream_subprocess(
338
+ _build_eval_cmd(
339
+ model_name=model_name,
340
+ difficulty=difficulty,
341
+ episodes=eval_episodes,
342
+ max_steps=max_steps,
343
+ tag="pre_train",
344
+ out=pre_jsonl,
345
+ backend=backend,
346
+ ),
347
+ log,
348
+ )
349
+ if rc != 0:
350
+ # don't abort — we still want training + post-eval evidence.
351
+ log.write(f"\n[warn] pre-train eval failed (rc={rc}); continuing without baseline\n")
352
+ log.flush()
353
+
354
+ log.write(f"\n--- GRPO training ({backend}, {config['num_gpus']} GPU process(es)) ---\n")
355
+ log.flush()
356
+ rc = _stream_subprocess(_build_training_cmd(config), log)
357
+ if rc != 0:
358
+ raise RuntimeError(f"training failed (rc={rc})")
359
+
360
+ # Cold-load the trained artifact before burning time on post-eval.
361
+ log.write(
362
+ f"\n--- trained artifact smoke test "
363
+ f"(loading {output_dir} cold-start, 2 eps) ---\n"
364
+ )
365
+ log.flush()
366
+ smoke_model = output_dir if backend == "vanilla" else model_name
367
+ smoke_adapter = None if backend == "vanilla" else output_dir
368
+ rc = _stream_subprocess(
369
+ _build_eval_cmd(
370
+ model_name=smoke_model,
371
+ adapter_dir=smoke_adapter,
372
+ difficulty=difficulty,
373
+ episodes="2",
374
+ max_steps=max_steps,
375
+ tag="smoke",
376
+ out=f"{evidence_str}/smoke_eval.jsonl",
377
+ backend=backend,
378
+ ),
379
+ log,
380
+ )
381
+ if rc != 0:
382
+ raise RuntimeError(
383
+ f"trained artifact smoke test failed (rc={rc}); refusing to push "
384
+ f"unloadable output to the Hub. Inspect {output_dir}."
385
+ )
386
+
387
+ log.write(f"\n--- post-train evaluation ({eval_episodes} eps) ---\n")
388
+ log.flush()
389
+ post_model = output_dir if backend == "vanilla" else model_name
390
+ post_adapter = None if backend == "vanilla" else output_dir
391
+ rc = _stream_subprocess(
392
+ _build_eval_cmd(
393
+ model_name=post_model,
394
+ adapter_dir=post_adapter,
395
+ difficulty=difficulty,
396
+ episodes=eval_episodes,
397
+ max_steps=max_steps,
398
+ tag="post_train",
399
+ out=post_jsonl,
400
+ backend=backend,
401
+ ),
402
+ log,
403
+ )
404
+ if rc != 0:
405
+ log.write(f"\n[warn] post-train eval failed (rc={rc}); evidence will be partial\n")
406
+ log.flush()
407
+
408
+ log.write("\n--- evidence: before/after summary, distribution, trajectories ---\n")
409
+ log.flush()
410
+ try:
411
+ from training.evidence import (
412
+ EvidencePaths,
413
+ render_before_after,
414
+ render_sample_trajectories,
415
+ render_training_curve,
416
+ render_reward_components,
417
+ render_checkpoint_progression,
418
+ )
419
+ paths = EvidencePaths(root=Path(evidence_str))
420
+ paths.ensure()
421
+ metrics = render_before_after(
422
+ pre_jsonl=Path(pre_jsonl),
423
+ post_jsonl=Path(post_jsonl),
424
+ summary_png=paths.before_after_summary_png,
425
+ distribution_png=paths.reward_distribution_png,
426
+ metrics_json=paths.before_after_metrics_json,
427
+ )
428
+ render_sample_trajectories(
429
+ pre_jsonl=Path(pre_jsonl),
430
+ post_jsonl=Path(post_jsonl),
431
+ md_path=paths.sample_trajectories_md,
432
+ )
433
+ render_training_curve(paths.training_log_csv, paths.training_curve_png)
434
+ render_reward_components(
435
+ paths.reward_components_csv, paths.reward_components_png,
436
+ )
437
+ render_checkpoint_progression(
438
+ paths.checkpoint_evals_csv, paths.checkpoint_progression_png,
439
+ )
440
+ log.write(json.dumps(metrics, indent=2) + "\n")
441
+ log.flush()
442
+ except Exception as exc:
443
+ log.write(f"[warn] evidence rendering failed: {exc}\n")
444
+ log.flush()
445
+
446
+ if os.environ.get("HF_TOKEN"):
447
+ if backend == "vanilla":
448
+ log.write("\n--- push vanilla model to Hub ---\n")
449
+ log.flush()
450
+ _push_model_folder_to_hub(
451
+ output_dir=Path(output_dir),
452
+ repo_id=push_repo,
453
+ base_model=model_name,
454
+ log=log,
455
+ )
456
+ else:
457
+ log.write("\n--- push adapters to Hub ---\n")
458
+ log.flush()
459
+ _stream_subprocess(
460
+ [
461
+ sys.executable, "-m", "scripts.push_to_hub", "model",
462
+ "--adapter_dir", output_dir,
463
+ "--repo_id", push_repo,
464
+ "--base_model", model_name,
465
+ ],
466
+ log,
467
+ )
468
+ _push_evidence_to_hub(
469
+ evidence_dir=evidence_dir,
470
+ repo_id=push_repo,
471
+ log=log,
472
+ )
473
+ else:
474
+ log.write("\n[skip] HF_TOKEN not set — not pushing to Hub\n")
475
+ log.flush()
476
+ with STATE.lock:
477
+ STATE.status = "finished"
478
+ except Exception as exc:
479
+ logger.exception("training pipeline failed")
480
+ with STATE.lock:
481
+ STATE.status = "failed"
482
+ STATE.last_error = str(exc)
483
+ finally:
484
+ finished = datetime.now(timezone.utc).isoformat()
485
+ log.write(f"\n=== Training ended {finished} ===\n")
486
+ log.flush()
487
+ with STATE.lock:
488
+ STATE.finished_at = finished
489
+
490
+
491
+ def _start_training(config: Dict[str, Any]) -> None:
492
+ with STATE.lock:
493
+ if STATE.status == "running":
494
+ raise RuntimeError("a training run is already in progress")
495
+ STATE.thread = threading.Thread(
496
+ target=_training_pipeline,
497
+ args=(config,),
498
+ name="cernenv-trainer",
499
+ daemon=True,
500
+ )
501
+ STATE.thread.start()
502
+
503
+
504
+ # ── FastAPI app ──────────────────────────────────────────────────────────
505
+
506
+
507
+ app = FastAPI(title="CERNenv Trainer", version="0.1.0")
508
+
509
+
510
+ _HTML = """\
511
+ <!doctype html>
512
+ <html lang=en>
513
+ <head>
514
+ <meta charset=utf-8>
515
+ <title>CERNenv Trainer</title>
516
+ <style>
517
+ body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 2rem auto;
518
+ max-width: 1000px; color:#111; padding: 0 1rem; line-height:1.5 }
519
+ h1 { margin-bottom: 0 }
520
+ h2 { margin-top: 2rem; border-bottom:1px solid #eee; padding-bottom:.25rem }
521
+ .muted { color:#666 }
522
+ pre { background:#0e1116; color:#e6edf3; padding:1rem; border-radius:6px;
523
+ overflow-x:auto; max-height:40vh; font-size:.85em }
524
+ button { font-size:1rem; padding:.6rem 1rem; border-radius:6px; border:1px solid #888;
525
+ background:#fff; cursor:pointer; margin-right:.4rem }
526
+ .pill { display:inline-block; padding:.1rem .55rem; border-radius:999px;
527
+ background:#eef; color:#225; font-size:.85em }
528
+ .ok { background:#dfd; color:#272 }
529
+ .fail { background:#fdd; color:#822 }
530
+ .run { background:#fdf6d8; color:#774 }
531
+ table { border-collapse:collapse; margin:.5rem 0 }
532
+ td, th { padding:.25rem .8rem .25rem 0; vertical-align: top; text-align:left }
533
+ th { color:#444; font-weight:600 }
534
+ .grid { display:grid; grid-template-columns:1fr 1fr; gap:1rem }
535
+ .card { border:1px solid #e5e7eb; border-radius:8px; padding:.75rem; background:#fafafa }
536
+ .card img { max-width:100%; border-radius:4px }
537
+ .delta-pos { color:#15803d; font-weight:600 }
538
+ .delta-neg { color:#b91c1c; font-weight:600 }
539
+ code { background:#f4f4f4; padding:.05rem .35rem; border-radius:4px }
540
+ a { color:#1d4ed8 }
541
+ </style>
542
+ </head>
543
+ <body>
544
+ <h1>⚛️ CERNenv Trainer</h1>
545
+ <p class=muted>GRPO on the CERNenv LHC discovery environment, with a vanilla (default) or Unsloth + LoRA backend, on Hugging Face Spaces.</p>
546
+
547
+ <h2>Run status</h2>
548
+ <p>Status: <span id=status class=pill>?</span></p>
549
+ <table id=meta></table>
550
+ <p>
551
+ <button onclick="startRun()">▶ Start training</button>
552
+ <button onclick="refresh()">↻ Refresh</button>
553
+ <a href="/evidence" target=_blank><button>📁 Evidence index</button></a>
554
+ <a href="/docs" target=_blank><button>🛠 API</button></a>
555
+ </p>
556
+
557
+ <h2>Training-progress evidence</h2>
558
+ <p class=muted>Auto-updated as training runs. All artifacts are also saved to <code>evidence/</code> and pushed to the model repo on the Hub.</p>
559
+ <div class=grid>
560
+ <div class=card><b>Per-step training curve</b><br>
561
+ <img id=curve src="/evidence/training_curve.png" onerror="this.style.display='none'">
562
+ <div id=curve_missing class=muted style="display:none">(not yet — waiting for first GRPO step)</div>
563
+ </div>
564
+ <div class=card><b>Reward components (terminal vs shaping)</b><br>
565
+ <img id=components src="/evidence/reward_components.png" onerror="this.style.display='none'">
566
+ <div id=components_missing class=muted style="display:none">(populated after a few rollouts — watches verifier hacks)</div>
567
+ </div>
568
+ <div class=card><b>Mid-training checkpoint progression</b><br>
569
+ <img id=ckpt src="/evidence/checkpoint_progression.png" onerror="this.style.display='none'">
570
+ <div id=ckpt_missing class=muted style="display:none">(not yet — waiting for first checkpoint eval)</div>
571
+ </div>
572
+ <div class=card><b>Before vs after summary</b><br>
573
+ <img id=summary src="/evidence/before_after_summary.png" onerror="this.style.display='none'">
574
+ <div id=summary_missing class=muted style="display:none">(generated after post-train eval)</div>
575
+ </div>
576
+ <div class=card><b>Reward distribution: pre vs post</b><br>
577
+ <img id=dist src="/evidence/reward_distribution.png" onerror="this.style.display='none'">
578
+ <div id=dist_missing class=muted style="display:none">(generated after post-train eval)</div>
579
+ </div>
580
+ </div>
581
+
582
+ <h2>Before / after metrics</h2>
583
+ <table id=metrics_table>
584
+ <tr><th>metric</th><th>pre</th><th>post</th><th>Δ</th></tr>
585
+ </table>
586
+
587
+ <h2>Live logs (tail)</h2>
588
+ <pre id=logs>loading…</pre>
589
+
590
+ <script>
591
+ function fmt(v) {
592
+ if (v == null) return '–';
593
+ if (typeof v === 'number') return v.toFixed(3);
594
+ return v;
595
+ }
596
+ function fmtDelta(d) {
597
+ if (d == null || isNaN(d)) return '–';
598
+ const sign = d >= 0 ? '+' : '';
599
+ const cls = d >= 0 ? 'delta-pos' : 'delta-neg';
600
+ return `<span class="${cls}">${sign}${d.toFixed(3)}</span>`;
601
+ }
602
+
603
+ async function refresh() {
604
+ // status
605
+ const s = await fetch('/status').then(r => r.json());
606
+ const pill = document.getElementById('status');
607
+ pill.textContent = s.status;
608
+ pill.className = 'pill ' + ({idle:'',running:'run',finished:'ok',failed:'fail'}[s.status] || '');
609
+
610
+ const meta = document.getElementById('meta');
611
+ meta.innerHTML = '';
612
+ const obj = {
613
+ started_at: s.started_at, finished_at: s.finished_at, error: s.last_error,
614
+ ...(s.last_config || {}),
615
+ };
616
+ for (const [k, v] of Object.entries(obj)) {
617
+ if (v == null || v === '') continue;
618
+ const tr = document.createElement('tr');
619
+ tr.innerHTML = `<td><b>${k}</b></td><td><code>${v}</code></td>`;
620
+ meta.appendChild(tr);
621
+ }
622
+
623
+ // metrics
624
+ const m = await fetch('/metrics').then(r => r.json()).catch(() => ({pre:null, post:null}));
625
+ const tbody = document.getElementById('metrics_table');
626
+ tbody.innerHTML = '<tr><th>metric</th><th>pre</th><th>post</th><th>Δ</th></tr>';
627
+ const fields = ['mean_reward', 'success_rate', 'mass_acc', 'channel_acc', 'median_reward'];
628
+ for (const f of fields) {
629
+ const pre = m.pre && m.pre[f];
630
+ const post = m.post && m.post[f];
631
+ const delta = m.delta && m.delta[f];
632
+ const tr = document.createElement('tr');
633
+ tr.innerHTML = `<td><code>${f}</code></td><td>${fmt(pre)}</td><td>${fmt(post)}</td><td>${fmtDelta(delta)}</td>`;
634
+ tbody.appendChild(tr);
635
+ }
636
+
637
+ // bust caches on plots
638
+ const bust = '?t=' + Date.now();
639
+ for (const [imgId, missingId] of [
640
+ ['curve', 'curve_missing'],
641
+ ['components', 'components_missing'],
642
+ ['ckpt', 'ckpt_missing'],
643
+ ['summary', 'summary_missing'],
644
+ ['dist', 'dist_missing'],
645
+ ]) {
646
+ const img = document.getElementById(imgId);
647
+ const miss = document.getElementById(missingId);
648
+ const baseSrc = img.getAttribute('src').split('?')[0];
649
+ const probe = new Image();
650
+ probe.onload = () => { img.src = baseSrc + bust; img.style.display=''; miss.style.display='none'; };
651
+ probe.onerror = () => { img.style.display='none'; miss.style.display=''; };
652
+ probe.src = baseSrc + bust;
653
+ }
654
+
655
+ const logs = await fetch('/logs?tail=200').then(r => r.text());
656
+ document.getElementById('logs').textContent = logs || '(no logs yet)';
657
+ }
658
+ async function startRun() {
659
+ const r = await fetch('/train', {method:'POST'});
660
+ if (!r.ok) alert((await r.json()).detail || 'failed');
661
+ setTimeout(refresh, 500);
662
+ }
663
+ refresh();
664
+ setInterval(refresh, 5000);
665
+ </script>
666
+ </body>
667
+ </html>
668
+ """
669
+
670
+
671
+ @app.get("/", response_class=HTMLResponse)
672
+ def index() -> HTMLResponse:
673
+ return HTMLResponse(_HTML)
674
+
675
+
676
+ @app.get("/health")
677
+ def health() -> Dict[str, str]:
678
+ return {"status": "ok"}
679
+
680
+
681
+ @app.get("/status")
682
+ def status() -> JSONResponse:
683
+ return JSONResponse(STATE.to_dict())
684
+
685
+
686
+ @app.get("/metrics")
687
+ def metrics() -> JSONResponse:
688
+ if METRICS_FILE.exists():
689
+ try:
690
+ return JSONResponse(json.loads(METRICS_FILE.read_text()))
691
+ except Exception:
692
+ return JSONResponse({"error": "metrics file unreadable"}, status_code=500)
693
+ return JSONResponse({"pre": None, "post": None, "delta": None})
694
+
695
+
696
+ @app.get("/evidence")
697
+ def evidence_index() -> JSONResponse:
698
+ """List every evidence artifact currently on disk."""
699
+ files = []
700
+ if EVIDENCE_DIR.exists():
701
+ for p in sorted(EVIDENCE_DIR.iterdir()):
702
+ if p.is_file():
703
+ files.append({
704
+ "name": p.name,
705
+ "size": p.stat().st_size,
706
+ "url": f"/evidence/{p.name}",
707
+ })
708
+ return JSONResponse({"dir": str(EVIDENCE_DIR), "files": files})
709
+
710
+
711
+ @app.get("/evidence/{name}")
712
+ def evidence_file(name: str):
713
+ """Serve a single evidence artifact (PNG/CSV/JSON/MD) by filename."""
714
+ if "/" in name or ".." in name:
715
+ raise HTTPException(status_code=400, detail="invalid name")
716
+ target = EVIDENCE_DIR / name
717
+ if not target.exists() or not target.is_file():
718
+ raise HTTPException(status_code=404, detail=f"{name} not found")
719
+ return FileResponse(target)
720
+
721
+
722
+ @app.get("/logs", response_class=PlainTextResponse)
723
+ def logs(tail: int = 400) -> PlainTextResponse:
724
+ if not LOG_FILE.exists():
725
+ return PlainTextResponse("")
726
+ text = LOG_FILE.read_text()
727
+ lines = text.splitlines()
728
+ return PlainTextResponse("\n".join(lines[-max(tail, 1):]))
729
+
730
+
731
+ @app.post("/train")
732
+ def train() -> JSONResponse:
733
+ try:
734
+ _start_training(dict(CONFIG))
735
+ except RuntimeError as exc:
736
+ raise HTTPException(status_code=409, detail=str(exc))
737
+ return JSONResponse({"status": "started", "config": CONFIG})
738
+
739
+
740
+ @app.on_event("startup")
741
+ def _maybe_autostart() -> None:
742
+ if CONFIG["autostart"]:
743
+ try:
744
+ _start_training(dict(CONFIG))
745
+ logger.info("autostarted training run")
746
+ except RuntimeError as exc:
747
+ logger.warning("autostart skipped: %s", exc)