"""Public leaderboard endpoint for the Chakravyuh OpenEnv submission. Hackathon plan E.10: expose a `/submit` POST + `/leaderboard` GET pair so external researchers can post their results against `chakravyuh-bench-v0` and have them ranked in one place. Seeded with three internal entries (scripted, v1, v2) so the leaderboard is non-empty at launch. Design choices: - **In-memory + JSONL persistence.** Submissions are written to ``logs/leaderboard.jsonl`` (append-only) so the leaderboard survives server restarts and is auditable. The in-memory list is reloaded at startup. No external DB. - **Open submission, light validation.** Anyone can POST a submission; we validate the payload shape but make no claim that the numbers are correct. The leaderboard URL itself is the audit trail — if someone submits inflated numbers they have to publish a reproducible artifact pointer or be obviously fake. - **MCP-compliance safe.** Routes are ``/leaderboard`` and ``/submit`` — neither name shadows OpenEnv core (``reset``/``step``/``state``/ ``close``). Pinned by ``tests/test_mcp_compliance.py``. - **Headline metric only.** The ranked column is ``f1`` (single number, handles both detection-heavy and FPR-heavy methods fairly). Rich data (per-difficulty, per-language, CIs) is preserved on each entry but not used to rank. """ from __future__ import annotations import json import logging import os import threading from datetime import datetime, timezone from pathlib import Path from typing import Any from fastapi import APIRouter, FastAPI, HTTPException, status from pydantic import BaseModel, ConfigDict, Field logger = logging.getLogger("chakravyuh.leaderboard") DEFAULT_STORE_PATH = Path("logs/leaderboard.jsonl") MAX_ENTRIES_RETURNED = 200 class LeaderboardSubmission(BaseModel): """Payload for ``POST /submit``.""" model_config = ConfigDict(extra="forbid") method: str = Field(min_length=1, max_length=120, description="Method name, e.g. 'GPT-4o (zero-shot)'") submitter: str = Field(min_length=1, max_length=80, description="Person or team handle") detection: float = Field(ge=0.0, le=1.0) fpr: float = Field(ge=0.0, le=1.0) f1: float = Field(ge=0.0, le=1.0) bench_version: str = Field(default="chakravyuh-bench-v0", max_length=40) n_evaluated: int = Field(ge=1, default=174) artifact_url: str | None = Field(default=None, max_length=400, description="HF Hub repo / GitHub gist / blog URL backing the numbers") notes: str | None = Field(default=None, max_length=1000) class LeaderboardEntry(BaseModel): """Stored / returned entry — `LeaderboardSubmission` plus server fields.""" model_config = ConfigDict(extra="ignore") method: str submitter: str detection: float fpr: float f1: float bench_version: str = "chakravyuh-bench-v0" n_evaluated: int = 174 artifact_url: str | None = None notes: str | None = None submitted_at: str # ISO-8601 UTC seeded: bool = False # internal vs external entry _SEED_ENTRIES: list[dict[str, Any]] = [ { "method": "Scripted baseline (rule-based)", "submitter": "chakravyuh-team", "detection": 0.701, "fpr": 0.290, "f1": 0.795, "n_evaluated": 174, "artifact_url": "https://github.com/UjjwalPardeshi/Chakravyuh/blob/main/data/chakravyuh-bench-v0/baselines.json", "notes": "11-signal taxonomy + legit-SMS allowlist. Threshold = 0.50.", "seeded": True, }, { "method": "Chakravyuh-Qwen2.5-LoRA v1 (reward-hacked)", "submitter": "chakravyuh-team", "detection": 1.0, "fpr": 0.360, "f1": 0.96, "n_evaluated": 135, "artifact_url": "https://github.com/UjjwalPardeshi/Chakravyuh/blob/main/logs/eval_v2.json", "notes": "Diagnostic baseline: this is the textbook reward-hacking fingerprint. Kept on the board to motivate v2.", "seeded": True, }, { "method": "Chakravyuh-Qwen2.5-LoRA v2", "submitter": "chakravyuh-team", "detection": 0.993, "fpr": 0.067, "f1": 0.99, "n_evaluated": 174, "artifact_url": "https://huggingface.co/ujjwalpardeshi/chakravyuh-analyzer-lora-v2", "notes": "Bootstrap 95% CIs at logs/bootstrap_v2.json: detection [0.979, 1.000], FPR [0.000, 0.167], F1 [0.976, 1.000].", "seeded": True, }, ] class LeaderboardStore: """Thread-safe in-memory leaderboard with JSONL persistence.""" def __init__(self, store_path: Path = DEFAULT_STORE_PATH) -> None: self.store_path = store_path self._lock = threading.Lock() self._entries: list[LeaderboardEntry] = [] self._load_or_seed() def _load_or_seed(self) -> None: """Load persisted entries; if none, seed and persist.""" if self.store_path.exists(): with self.store_path.open() as f: for line in f: line = line.strip() if not line: continue try: self._entries.append(LeaderboardEntry(**json.loads(line))) except Exception as exc: # pragma: no cover — diagnostic logger.warning("Skipping malformed entry: %s", exc) if self._entries: return # First-run seeding now = datetime.now(timezone.utc).isoformat(timespec="seconds") self.store_path.parent.mkdir(parents=True, exist_ok=True) with self.store_path.open("w") as f: for seed in _SEED_ENTRIES: entry = LeaderboardEntry(submitted_at=now, **seed) self._entries.append(entry) f.write(json.dumps(entry.model_dump()) + "\n") def add(self, submission: LeaderboardSubmission) -> LeaderboardEntry: entry = LeaderboardEntry( **submission.model_dump(), submitted_at=datetime.now(timezone.utc).isoformat(timespec="seconds"), seeded=False, ) with self._lock: self._entries.append(entry) with self.store_path.open("a") as f: f.write(json.dumps(entry.model_dump()) + "\n") return entry def list_ranked(self, limit: int = MAX_ENTRIES_RETURNED) -> list[LeaderboardEntry]: """Return entries sorted by F1 descending; stable on submitted_at.""" with self._lock: ordered = sorted( self._entries, key=lambda e: (-e.f1, e.fpr, e.submitted_at), ) return ordered[:limit] def reset_for_tests(self) -> None: """Test helper — clear entries and reseed. Not exposed via HTTP.""" with self._lock: self._entries = [] try: self.store_path.unlink() except FileNotFoundError: pass self._load_or_seed() def build_router(store: LeaderboardStore | None = None) -> APIRouter: """Return a FastAPI router with `/leaderboard` and `/submit` routes. The router is registered onto the main OpenEnv app in ``server/app.py``. """ if store is None: store_path = Path(os.getenv("CHAKRAVYUH_LEADERBOARD_PATH", str(DEFAULT_STORE_PATH))) store = LeaderboardStore(store_path=store_path) router = APIRouter(tags=["leaderboard"]) @router.get("/leaderboard") def get_leaderboard() -> dict[str, Any]: entries = store.list_ranked() return { "bench": "chakravyuh-bench-v0", "ranked_by": "f1", "n_entries": len(entries), "entries": [e.model_dump() for e in entries], } @router.post("/submit", status_code=status.HTTP_201_CREATED) def post_submit(submission: LeaderboardSubmission) -> dict[str, Any]: try: entry = store.add(submission) except Exception as exc: # pragma: no cover — schema validation upstream raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"failed to persist submission: {exc}", ) from exc rank = next( (i + 1 for i, e in enumerate(store.list_ranked()) if e.submitted_at == entry.submitted_at and e.method == entry.method), None, ) return { "ok": True, "rank": rank, "n_total": len(store.list_ranked(limit=10_000)), "entry": entry.model_dump(), } return router def attach_to_app(app: FastAPI, store: LeaderboardStore | None = None) -> None: """Register the leaderboard router on a FastAPI app instance.""" app.include_router(build_router(store))