| """FastAPI control panel for the CERNenv trainer Space. | |
| Endpoints: | |
| GET / → status page (HTML) | |
| GET /status → JSON status of the current training run | |
| GET /metrics → JSON snapshot of reward / success rate | |
| GET /logs → tail of the training log | |
| POST /train → start (or restart) a training run | |
| GET /health → liveness probe | |
| Designed to run on a Hugging Face Space with `sdk: docker`. Heavy training | |
| work runs in a background thread so the HTTP server stays responsive. | |
| """ | |

from __future__ import annotations

import json
import logging
import os
import subprocess
import sys
import threading
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)


def _resolve_repo_root() -> Path:
    """Locate the repo root: $CERNENV_ROOT, the Space app dir, then parents of this file."""
    env_root = os.environ.get("CERNENV_ROOT")
    candidates = []
    if env_root:
        candidates.append(Path(env_root))
    candidates.extend([
        Path("/home/user/app"),
        Path(__file__).resolve().parent.parent.parent,
    ])
    for p in candidates:
        try:
            if p.exists():
                return p.resolve()
        except OSError:
            continue
    return candidates[-1].resolve()


REPO_ROOT = _resolve_repo_root()

LOG_DIR = REPO_ROOT / "training" / "runs"
try:
    LOG_DIR.mkdir(parents=True, exist_ok=True)
except OSError as exc:  # pragma: no cover - read-only filesystem fallback
    logger.warning("could not create %s (%s); using /tmp", LOG_DIR, exc)
    LOG_DIR = Path("/tmp/cernenv-runs")
    LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / "training.log"

EVIDENCE_DIR = REPO_ROOT / "evidence"
try:
    EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)
except OSError:  # pragma: no cover
    EVIDENCE_DIR = Path("/tmp/cernenv-evidence")
    EVIDENCE_DIR.mkdir(parents=True, exist_ok=True)
METRICS_FILE = EVIDENCE_DIR / "before_after_metrics.json"


def _env(name: str, default: str) -> str:
    return os.environ.get(name, default)


def _detect_gpus() -> int:
    try:
        import torch  # type: ignore

        if torch.cuda.is_available():
            return torch.cuda.device_count()
    except Exception:
        pass
    try:
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
            capture_output=True, text=True, timeout=5,
        )
        return len([line for line in out.stdout.splitlines() if line.strip()])
    except Exception:
        return 0


_NUM_GPUS = _detect_gpus()

CONFIG = {
    "model_name": _env("MODEL_NAME", "unsloth/Qwen2.5-3B-Instruct"),
    "difficulty": _env("DIFFICULTY", "easy"),
    "curriculum": _env("CURRICULUM", "1") == "1",
    "curriculum_promote": float(_env("CURRICULUM_PROMOTE", "0.55")),
    "curriculum_demote": float(_env("CURRICULUM_DEMOTE", "0.10")),
    "total_episodes": int(_env("TOTAL_EPISODES", "1500")),
    "max_steps": int(_env("MAX_STEPS", "18")),
    "num_generations": int(_env("NUM_GENERATIONS", "8")),
    "checkpoint_eval_steps": int(_env("CHECKPOINT_EVAL_STEPS", "25")),
    "checkpoint_eval_episodes": int(_env("CHECKPOINT_EVAL_EPISODES", "8")),
    "eval_episodes": int(_env("EVAL_EPISODES", "32")),
    "output_dir": _env("OUTPUT_DIR", "runs/unsloth-grpo"),
    "evidence_dir": _env("EVIDENCE_DIR", "evidence"),
    "num_gpus": int(_env("NUM_GPUS", str(_NUM_GPUS or 1))),
    "hf_username": _env("HF_USERNAME", "anugrah55"),
    "push_repo": _env(
        "PUSH_REPO",
        f"{_env('HF_USERNAME', 'anugrah55')}/cernenv-grpo-qwen2.5-3b",
    ),
    "autostart": _env("AUTOSTART", "0") == "1",
}
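
# Every knob above comes from an environment variable, so a short smoke-scale
# run needs no code change (values here are illustrative; `main:app` assumes
# this module is named main.py — adjust to the real module path):
#
#   TOTAL_EPISODES=100 EVAL_EPISODES=8 AUTOSTART=1 \
#       python -m uvicorn main:app --host 0.0.0.0 --port 7860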


# ── Run state ──────────────────────────────────────────────────────────

class RunState:
    def __init__(self) -> None:
        self.lock = threading.Lock()
        self.thread: Optional[threading.Thread] = None
        self.process: Optional[subprocess.Popen] = None
        self.status: str = "idle"  # idle | running | finished | failed
        self.started_at: Optional[str] = None
        self.finished_at: Optional[str] = None
        self.last_error: Optional[str] = None
        self.last_config: Dict[str, Any] = {}

    def to_dict(self) -> Dict[str, Any]:
        with self.lock:
            return {
                "status": self.status,
                "started_at": self.started_at,
                "finished_at": self.finished_at,
                "last_error": self.last_error,
                "last_config": self.last_config,
            }


STATE = RunState()
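
# Shape of a /status response (illustrative values), mirroring to_dict():
#   {"status": "running", "started_at": "2025-01-01T00:00:00+00:00",
#    "finished_at": null, "last_error": null, "last_config": {...}}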


# ── Training pipeline ──────────────────────────────────────────────────

def _stream_subprocess(cmd: list[str], log_handle) -> int:
    """Run `cmd` from the repo root, mirroring its merged stdout/stderr into the log."""
    log_handle.write(f"\n$ {' '.join(cmd)}\n")
    log_handle.flush()
    proc = subprocess.Popen(
        cmd,
        cwd=str(REPO_ROOT),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        bufsize=1,
        universal_newlines=True,
        env={**os.environ, "PYTHONPATH": str(REPO_ROOT)},
    )
    STATE.process = proc
    assert proc.stdout is not None
    for line in proc.stdout:
        log_handle.write(line)
        log_handle.flush()
    rc = proc.wait()
    log_handle.write(f"[exit code {rc}]\n")
    log_handle.flush()
    STATE.process = None
    return rc


def _build_training_cmd(config: Dict[str, Any]) -> list[str]:
    """Compose the training launcher (single-GPU python or multi-GPU accelerate)."""
    base = [
        "-m", "training.training_unsloth",
        "--model_name", config["model_name"],
        "--difficulty", config["difficulty"],
        "--total_episodes", str(config["total_episodes"]),
        "--max_steps", str(config["max_steps"]),
        "--num_generations", str(config["num_generations"]),
        "--checkpoint_eval_steps", str(config["checkpoint_eval_steps"]),
        "--checkpoint_eval_episodes", str(config["checkpoint_eval_episodes"]),
        "--output_dir", config["output_dir"],
        "--evidence_dir", config["evidence_dir"],
    ]
    if config.get("curriculum"):
        base.extend([
            "--curriculum",
            "--curriculum_promote", str(config["curriculum_promote"]),
            "--curriculum_demote", str(config["curriculum_demote"]),
        ])
    n = max(int(config.get("num_gpus", 1)), 1)
    if n > 1:
        return ["accelerate", "launch", "--num_processes", str(n), "--mixed_precision", "bf16"] + base
    return [sys.executable] + base
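
# Illustrative output of _build_training_cmd (remaining flags elided as "..."):
#   num_gpus=2 → accelerate launch --num_processes 2 --mixed_precision bf16 \
#                    -m training.training_unsloth --model_name ... --difficulty ...
#   num_gpus=1 → <python> -m training.training_unsloth --model_name ... --difficulty ...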


def _push_evidence_to_hub(*, evidence_dir: Path, repo_id: str, log) -> None:
    """Upload the entire evidence/ directory to the model repo."""
    token = os.environ.get("HF_TOKEN")
    if not token:
        log.write("\n[skip] HF_TOKEN not set — evidence not pushed\n")
        log.flush()
        return
    try:
        from huggingface_hub import HfApi

        api = HfApi(token=token)
        api.upload_folder(
            folder_path=str(evidence_dir),
            repo_id=repo_id,
            repo_type="model",
            path_in_repo="evidence",
            commit_message="Upload CERNenv training evidence (curves, evals, plots)",
        )
        log.write(f"\n[ok] uploaded evidence/ → https://huggingface.co/{repo_id}/tree/main/evidence\n")
        log.flush()
    except Exception as exc:
        log.write(f"\n[warn] evidence push failed: {exc}\n")
        log.flush()


def _training_pipeline(config: Dict[str, Any]) -> None:
    started = datetime.now(timezone.utc).isoformat()
    with STATE.lock:
        STATE.status = "running"
        STATE.started_at = started
        STATE.finished_at = None
        STATE.last_error = None
        STATE.last_config = dict(config)

    evidence_dir = Path(config["evidence_dir"]).resolve()
    evidence_dir.mkdir(parents=True, exist_ok=True)
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

    with open(LOG_FILE, "a") as log:
        log.write(f"\n=== Training started {started} ===\n")
        log.write(json.dumps(config, indent=2) + "\n")
        log.flush()
        try:
            output_dir = config["output_dir"]
            difficulty = config["difficulty"]
            max_steps = str(config["max_steps"])
            eval_episodes = str(config["eval_episodes"])
            model_name = config["model_name"]
            push_repo = config["push_repo"]
            evidence_str = config["evidence_dir"]
            pre_jsonl = f"{evidence_str}/pre_eval.jsonl"
            post_jsonl = f"{evidence_str}/post_eval.jsonl"

            log.write("\n--- baseline sanity check (random / heuristic / oracle) ---\n")
            log.flush()
            for agent in ("random", "heuristic", "oracle"):
                _stream_subprocess(
                    [
                        sys.executable, "-m", "scripts.run_agent",
                        "--agent", agent, "--difficulty", difficulty,
                        "--episodes", "3", "--quiet",
                    ],
                    log,
                )

            log.write(f"\n--- pre-train evaluation ({eval_episodes} eps) ---\n")
            log.flush()
            rc = _stream_subprocess(
                [
                    sys.executable, "-m", "training.evaluate",
                    "--model_name", model_name,
                    "--difficulty", difficulty,
                    "--episodes", eval_episodes,
                    "--max_steps", max_steps,
                    "--tag", "pre_train",
                    "--out", pre_jsonl,
                ],
                log,
            )
            if rc != 0:
                # don't abort — we still want training + post-eval evidence.
                log.write(f"\n[warn] pre-train eval failed (rc={rc}); continuing without baseline\n")
                log.flush()

            log.write(f"\n--- GRPO training ({config['num_gpus']} GPU process(es)) ---\n")
            log.flush()
            rc = _stream_subprocess(_build_training_cmd(config), log)
            if rc != 0:
                raise RuntimeError(f"training failed (rc={rc})")

            # ── LoRA save-and-reload smoke test ───────────────────────
            # Hackathon FAQ Q9: "Do not upcast a 4-bit model to 16-bit
            # and then merge the LoRA weights naively" — the canonical
            # cause of a broken push. Before we burn time on the full
            # post-train evaluation (32 eps), do a 2-episode cold-load
            # rollout against the saved adapters. If that fails, abort
            # immediately so we surface a save problem, not a 30-min
            # eval timeout.
            log.write(
                f"\n--- adapter save/reload smoke test "
                f"(loading {output_dir} cold-start, 2 eps) ---\n"
            )
            log.flush()
            rc = _stream_subprocess(
                [
                    sys.executable, "-m", "training.evaluate",
                    "--model_name", model_name,
                    "--adapter_dir", output_dir,
                    "--difficulty", difficulty,
                    "--episodes", "2",
                    "--max_steps", max_steps,
                    "--tag", "smoke",
                    "--out", f"{evidence_str}/smoke_eval.jsonl",
                ],
                log,
            )
            if rc != 0:
                raise RuntimeError(
                    f"adapter smoke test failed (rc={rc}); refusing to push "
                    f"unloadable adapters to the Hub. Inspect {output_dir} and "
                    "verify adapter_config.json + adapter_model.safetensors exist."
                )

            log.write(f"\n--- post-train evaluation ({eval_episodes} eps) ---\n")
            log.flush()
            rc = _stream_subprocess(
                [
                    sys.executable, "-m", "training.evaluate",
                    "--model_name", model_name,
                    "--adapter_dir", output_dir,
                    "--difficulty", difficulty,
                    "--episodes", eval_episodes,
                    "--max_steps", max_steps,
                    "--tag", "post_train",
                    "--out", post_jsonl,
                ],
                log,
            )
            if rc != 0:
                log.write(f"\n[warn] post-train eval failed (rc={rc}); evidence will be partial\n")
                log.flush()

            log.write("\n--- evidence: before/after summary, distribution, trajectories ---\n")
            log.flush()
            try:
                from training.evidence import (
                    EvidencePaths,
                    render_before_after,
                    render_sample_trajectories,
                    render_training_curve,
                    render_reward_components,
                    render_checkpoint_progression,
                )

                paths = EvidencePaths(root=Path(evidence_str))
                paths.ensure()
                metrics = render_before_after(
                    pre_jsonl=Path(pre_jsonl),
                    post_jsonl=Path(post_jsonl),
                    summary_png=paths.before_after_summary_png,
                    distribution_png=paths.reward_distribution_png,
                    metrics_json=paths.before_after_metrics_json,
                )
                render_sample_trajectories(
                    pre_jsonl=Path(pre_jsonl),
                    post_jsonl=Path(post_jsonl),
                    md_path=paths.sample_trajectories_md,
                )
                render_training_curve(paths.training_log_csv, paths.training_curve_png)
                render_reward_components(
                    paths.reward_components_csv, paths.reward_components_png,
                )
                render_checkpoint_progression(
                    paths.checkpoint_evals_csv, paths.checkpoint_progression_png,
                )
                log.write(json.dumps(metrics, indent=2) + "\n")
                log.flush()
            except Exception as exc:
                log.write(f"[warn] evidence rendering failed: {exc}\n")
                log.flush()

            if os.environ.get("HF_TOKEN"):
                log.write("\n--- push adapters to Hub ---\n")
                log.flush()
                _stream_subprocess(
                    [
                        sys.executable, "-m", "scripts.push_to_hub", "model",
                        "--adapter_dir", output_dir,
                        "--repo_id", push_repo,
                        "--base_model", model_name,
                    ],
                    log,
                )
                _push_evidence_to_hub(
                    evidence_dir=evidence_dir,
                    repo_id=push_repo,
                    log=log,
                )
            else:
                log.write("\n[skip] HF_TOKEN not set — not pushing to Hub\n")
                log.flush()

            with STATE.lock:
                STATE.status = "finished"
        except Exception as exc:
            logger.exception("training pipeline failed")
            with STATE.lock:
                STATE.status = "failed"
                STATE.last_error = str(exc)
        finally:
            finished = datetime.now(timezone.utc).isoformat()
            log.write(f"\n=== Training ended {finished} ===\n")
            log.flush()
            with STATE.lock:
                STATE.finished_at = finished


def _start_training(config: Dict[str, Any]) -> None:
    with STATE.lock:
        if STATE.status == "running":
            raise RuntimeError("a training run is already in progress")
        STATE.thread = threading.Thread(
            target=_training_pipeline,
            args=(config,),
            name="cernenv-trainer",
            daemon=True,
        )
        STATE.thread.start()


# ── FastAPI app ────────────────────────────────────────────────────────

app = FastAPI(title="CERNenv Trainer", version="0.1.0")
| _HTML = """\ | |
| <!doctype html> | |
| <html lang=en> | |
| <head> | |
| <meta charset=utf-8> | |
| <title>CERNenv Trainer</title> | |
| <style> | |
| body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 2rem auto; | |
| max-width: 1000px; color:#111; padding: 0 1rem; line-height:1.5 } | |
| h1 { margin-bottom: 0 } | |
| h2 { margin-top: 2rem; border-bottom:1px solid #eee; padding-bottom:.25rem } | |
| .muted { color:#666 } | |
| pre { background:#0e1116; color:#e6edf3; padding:1rem; border-radius:6px; | |
| overflow-x:auto; max-height:40vh; font-size:.85em } | |
| button { font-size:1rem; padding:.6rem 1rem; border-radius:6px; border:1px solid #888; | |
| background:#fff; cursor:pointer; margin-right:.4rem } | |
| .pill { display:inline-block; padding:.1rem .55rem; border-radius:999px; | |
| background:#eef; color:#225; font-size:.85em } | |
| .ok { background:#dfd; color:#272 } | |
| .fail { background:#fdd; color:#822 } | |
| .run { background:#fdf6d8; color:#774 } | |
| table { border-collapse:collapse; margin:.5rem 0 } | |
| td, th { padding:.25rem .8rem .25rem 0; vertical-align: top; text-align:left } | |
| th { color:#444; font-weight:600 } | |
| .grid { display:grid; grid-template-columns:1fr 1fr; gap:1rem } | |
| .card { border:1px solid #e5e7eb; border-radius:8px; padding:.75rem; background:#fafafa } | |
| .card img { max-width:100%; border-radius:4px } | |
| .delta-pos { color:#15803d; font-weight:600 } | |
| .delta-neg { color:#b91c1c; font-weight:600 } | |
| code { background:#f4f4f4; padding:.05rem .35rem; border-radius:4px } | |
| a { color:#1d4ed8 } | |
| </style> | |
| </head> | |
| <body> | |
| <h1>⚛️ CERNenv Trainer</h1> | |
| <p class=muted>GRPO + Unsloth + LoRA on the CERNenv LHC discovery environment. Multi-GPU on Hugging Face Spaces.</p> | |
| <h2>Run status</h2> | |
| <p>Status: <span id=status class=pill>?</span></p> | |
| <table id=meta></table> | |
| <p> | |
| <button onclick="startRun()">▶ Start training</button> | |
| <button onclick="refresh()">↻ Refresh</button> | |
| <a href="/evidence" target=_blank><button>📁 Evidence index</button></a> | |
| <a href="/docs" target=_blank><button>🛠 API</button></a> | |
| </p> | |
| <h2>Training-progress evidence</h2> | |
| <p class=muted>Auto-updated as training runs. All artifacts are also saved to <code>evidence/</code> and pushed to the model repo on the Hub.</p> | |
| <div class=grid> | |
| <div class=card><b>Per-step training curve</b><br> | |
| <img id=curve src="/evidence/training_curve.png" onerror="this.style.display='none'"> | |
| <div id=curve_missing class=muted style="display:none">(not yet — waiting for first GRPO step)</div> | |
| </div> | |
| <div class=card><b>Reward components (terminal vs shaping)</b><br> | |
| <img id=components src="/evidence/reward_components.png" onerror="this.style.display='none'"> | |
| <div id=components_missing class=muted style="display:none">(populated after a few rollouts — watches verifier hacks)</div> | |
| </div> | |
| <div class=card><b>Mid-training checkpoint progression</b><br> | |
| <img id=ckpt src="/evidence/checkpoint_progression.png" onerror="this.style.display='none'"> | |
| <div id=ckpt_missing class=muted style="display:none">(not yet — waiting for first checkpoint eval)</div> | |
| </div> | |
| <div class=card><b>Before vs after summary</b><br> | |
| <img id=summary src="/evidence/before_after_summary.png" onerror="this.style.display='none'"> | |
| <div id=summary_missing class=muted style="display:none">(generated after post-train eval)</div> | |
| </div> | |
| <div class=card><b>Reward distribution: pre vs post</b><br> | |
| <img id=dist src="/evidence/reward_distribution.png" onerror="this.style.display='none'"> | |
| <div id=dist_missing class=muted style="display:none">(generated after post-train eval)</div> | |
| </div> | |
| </div> | |
| <h2>Before / after metrics</h2> | |
| <table id=metrics_table> | |
| <tr><th>metric</th><th>pre</th><th>post</th><th>Δ</th></tr> | |
| </table> | |
| <h2>Live logs (tail)</h2> | |
| <pre id=logs>loading…</pre> | |
| <script> | |
| function fmt(v) { | |
| if (v == null) return '–'; | |
| if (typeof v === 'number') return v.toFixed(3); | |
| return v; | |
| } | |
| function fmtDelta(d) { | |
| if (d == null || isNaN(d)) return '–'; | |
| const sign = d >= 0 ? '+' : ''; | |
| const cls = d >= 0 ? 'delta-pos' : 'delta-neg'; | |
| return `<span class="${cls}">${sign}${d.toFixed(3)}</span>`; | |
| } | |
| async function refresh() { | |
| // status | |
| const s = await fetch('/status').then(r => r.json()); | |
| const pill = document.getElementById('status'); | |
| pill.textContent = s.status; | |
| pill.className = 'pill ' + ({idle:'',running:'run',finished:'ok',failed:'fail'}[s.status] || ''); | |
| const meta = document.getElementById('meta'); | |
| meta.innerHTML = ''; | |
| const obj = { | |
| started_at: s.started_at, finished_at: s.finished_at, error: s.last_error, | |
| ...(s.last_config || {}), | |
| }; | |
| for (const [k, v] of Object.entries(obj)) { | |
| if (v == null || v === '') continue; | |
| const tr = document.createElement('tr'); | |
| tr.innerHTML = `<td><b>${k}</b></td><td><code>${v}</code></td>`; | |
| meta.appendChild(tr); | |
| } | |
| // metrics | |
| const m = await fetch('/metrics').then(r => r.json()).catch(() => ({pre:null, post:null})); | |
| const tbody = document.getElementById('metrics_table'); | |
| tbody.innerHTML = '<tr><th>metric</th><th>pre</th><th>post</th><th>Δ</th></tr>'; | |
| const fields = ['mean_reward', 'success_rate', 'mass_acc', 'channel_acc', 'median_reward']; | |
| for (const f of fields) { | |
| const pre = m.pre && m.pre[f]; | |
| const post = m.post && m.post[f]; | |
| const delta = m.delta && m.delta[f]; | |
| const tr = document.createElement('tr'); | |
| tr.innerHTML = `<td><code>${f}</code></td><td>${fmt(pre)}</td><td>${fmt(post)}</td><td>${fmtDelta(delta)}</td>`; | |
| tbody.appendChild(tr); | |
| } | |
| // bust caches on plots | |
| const bust = '?t=' + Date.now(); | |
| for (const [imgId, missingId] of [ | |
| ['curve', 'curve_missing'], | |
| ['components', 'components_missing'], | |
| ['ckpt', 'ckpt_missing'], | |
| ['summary', 'summary_missing'], | |
| ['dist', 'dist_missing'], | |
| ]) { | |
| const img = document.getElementById(imgId); | |
| const miss = document.getElementById(missingId); | |
| const baseSrc = img.getAttribute('src').split('?')[0]; | |
| const probe = new Image(); | |
| probe.onload = () => { img.src = baseSrc + bust; img.style.display=''; miss.style.display='none'; }; | |
| probe.onerror = () => { img.style.display='none'; miss.style.display=''; }; | |
| probe.src = baseSrc + bust; | |
| } | |
| const logs = await fetch('/logs?tail=200').then(r => r.text()); | |
| document.getElementById('logs').textContent = logs || '(no logs yet)'; | |
| } | |
| async function startRun() { | |
| const r = await fetch('/train', {method:'POST'}); | |
| if (!r.ok) alert((await r.json()).detail || 'failed'); | |
| setTimeout(refresh, 500); | |
| } | |
| refresh(); | |
| setInterval(refresh, 5000); | |
| </script> | |
| </body> | |
| </html> | |
| """ | |


@app.get("/")
def index() -> HTMLResponse:
    return HTMLResponse(_HTML)


@app.get("/health")
def health() -> Dict[str, str]:
    return {"status": "ok"}


@app.get("/status")
def status() -> JSONResponse:
    return JSONResponse(STATE.to_dict())


@app.get("/metrics")
def metrics() -> JSONResponse:
    if METRICS_FILE.exists():
        try:
            return JSONResponse(json.loads(METRICS_FILE.read_text()))
        except Exception:
            return JSONResponse({"error": "metrics file unreadable"}, status_code=500)
    return JSONResponse({"pre": None, "post": None, "delta": None})


@app.get("/evidence")
def evidence_index() -> JSONResponse:
    """List every evidence artifact currently on disk."""
    files = []
    if EVIDENCE_DIR.exists():
        for p in sorted(EVIDENCE_DIR.iterdir()):
            if p.is_file():
                files.append({
                    "name": p.name,
                    "size": p.stat().st_size,
                    "url": f"/evidence/{p.name}",
                })
    return JSONResponse({"dir": str(EVIDENCE_DIR), "files": files})


@app.get("/evidence/{name}")
def evidence_file(name: str):
    """Serve a single evidence artifact (PNG/CSV/JSON/MD) by filename."""
    if "/" in name or ".." in name:
        raise HTTPException(status_code=400, detail="invalid name")
    target = EVIDENCE_DIR / name
    if not target.exists() or not target.is_file():
        raise HTTPException(status_code=404, detail=f"{name} not found")
    return FileResponse(target)


@app.get("/logs")
def logs(tail: int = 400) -> PlainTextResponse:
    if not LOG_FILE.exists():
        return PlainTextResponse("")
    text = LOG_FILE.read_text()
    lines = text.splitlines()
    return PlainTextResponse("\n".join(lines[-max(tail, 1):]))


@app.post("/train")
def train() -> JSONResponse:
    try:
        _start_training(dict(CONFIG))
    except RuntimeError as exc:
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    return JSONResponse({"status": "started", "config": CONFIG})


@app.on_event("startup")
def _maybe_autostart() -> None:
    # Kick off a run at boot when AUTOSTART=1, so a freshly (re)started Space
    # begins training without a manual POST /train.
    if CONFIG["autostart"]:
        try:
            _start_training(dict(CONFIG))
            logger.info("autostarted training run")
        except RuntimeError as exc:
            logger.warning("autostart skipped: %s", exc)
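

# Minimal local entrypoint (a sketch, assuming uvicorn is installed; on Spaces
# the Dockerfile may launch the server itself instead). The PORT fallback of
# 7860 matches the docker-Space default.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860")))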