Spaces:
Running
Running
| """Public leaderboard endpoint for the Chakravyuh OpenEnv submission. | |
| Hackathon plan E.10: expose a `/submit` POST + `/leaderboard` GET pair so | |
| external researchers can post their results against `chakravyuh-bench-v0` | |
| and have them ranked in one place. Seeded with three internal entries | |
| (scripted, v1, v2) so the leaderboard is non-empty at launch. | |
| Design choices: | |
| - **In-memory + JSONL persistence.** Submissions are written to | |
| ``logs/leaderboard.jsonl`` (append-only) so the leaderboard survives | |
| server restarts and is auditable. The in-memory list is reloaded at | |
| startup. No external DB. | |
| - **Open submission, light validation.** Anyone can POST a submission; | |
| we validate the payload shape but make no claim that the numbers are | |
| correct. The leaderboard URL itself is the audit trail — if someone | |
| submits inflated numbers they have to publish a reproducible artifact | |
| pointer or be obviously fake. | |
| - **MCP-compliance safe.** Routes are ``/leaderboard`` and ``/submit`` | |
| — neither name shadows OpenEnv core (``reset``/``step``/``state``/ | |
| ``close``). Pinned by ``tests/test_mcp_compliance.py``. | |
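| - **Headline metric only.** The ranked column is ``f1`` (single number, | |
| handles both detection-heavy and FPR-heavy methods fairly); ties are | |
| broken by lower ``fpr``, then by earlier submission time. Rich data | |
| (per-difficulty, per-language, CIs) is not used to rank, and the | |
| submission schema has no dedicated fields for it — link it through | |
| ``artifact_url`` or summarize it in ``notes``. | |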
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import os | |
| import threading | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Any | |
| from fastapi import APIRouter, FastAPI, HTTPException, status | |
| from pydantic import BaseModel, ConfigDict, Field | |
# Module-level logger; the name is fixed rather than derived from __name__.
logger = logging.getLogger("chakravyuh.leaderboard")

# Where submissions are persisted when no explicit store path is supplied.
DEFAULT_STORE_PATH = Path("logs") / "leaderboard.jsonl"

# Default cap on the number of ranked entries handed back in one call.
MAX_ENTRIES_RETURNED = 200
class LeaderboardSubmission(BaseModel):
    """Payload for ``POST /submit``."""

    # Unknown keys cause a validation error instead of being dropped silently.
    model_config = ConfigDict(extra="forbid")

    # --- identity of the submission ---
    method: str = Field(
        description="Method name, e.g. 'GPT-4o (zero-shot)'",
        min_length=1,
        max_length=120,
    )
    submitter: str = Field(
        description="Person or team handle",
        min_length=1,
        max_length=80,
    )

    # --- headline metrics, each constrained to the closed interval [0, 1] ---
    detection: float = Field(le=1.0, ge=0.0)
    fpr: float = Field(le=1.0, ge=0.0)
    f1: float = Field(le=1.0, ge=0.0)

    # --- evaluation context ---
    bench_version: str = Field(max_length=40, default="chakravyuh-bench-v0")
    n_evaluated: int = Field(default=174, ge=1)

    # --- optional supporting material ---
    artifact_url: str | None = Field(
        description="HF Hub repo / GitHub gist / blog URL backing the numbers",
        max_length=400,
        default=None,
    )
    notes: str | None = Field(max_length=1000, default=None)
class LeaderboardEntry(BaseModel):
    """Stored / returned entry — `LeaderboardSubmission` plus server fields."""

    # Extra keys are ignored (not rejected) when an entry is constructed,
    # e.g. from a persisted JSONL line.
    model_config = ConfigDict(extra="ignore")

    # Fields mirrored from the submission payload.
    method: str
    submitter: str
    detection: float
    fpr: float
    f1: float
    bench_version: str = "chakravyuh-bench-v0"
    n_evaluated: int = 174
    artifact_url: str | None = None
    notes: str | None = None

    # Fields set by the server.
    submitted_at: str  # ISO-8601 UTC timestamp string
    seeded: bool = False  # True for internal seed rows, False for external ones
# Internal entries written on first run so the board is never empty.
# Each mapping is expanded into a LeaderboardEntry together with a
# ``submitted_at`` timestamp when the store seeds itself.
_SEED_ENTRIES: list[dict[str, Any]] = [
    dict(
        method="Scripted baseline (rule-based)",
        submitter="chakravyuh-team",
        detection=0.701,
        fpr=0.290,
        f1=0.795,
        n_evaluated=174,
        artifact_url="https://github.com/UjjwalPardeshi/Chakravyuh/blob/main/data/chakravyuh-bench-v0/baselines.json",
        notes="11-signal taxonomy + legit-SMS allowlist. Threshold = 0.50.",
        seeded=True,
    ),
    dict(
        method="Chakravyuh-Qwen2.5-LoRA v1 (reward-hacked)",
        submitter="chakravyuh-team",
        detection=1.0,
        fpr=0.360,
        f1=0.96,
        n_evaluated=135,
        artifact_url="https://github.com/UjjwalPardeshi/Chakravyuh/blob/main/logs/eval_v2.json",
        notes="Diagnostic baseline: this is the textbook reward-hacking fingerprint. Kept on the board to motivate v2.",
        seeded=True,
    ),
    dict(
        method="Chakravyuh-Qwen2.5-LoRA v2",
        submitter="chakravyuh-team",
        detection=0.993,
        fpr=0.067,
        f1=0.99,
        n_evaluated=174,
        artifact_url="https://huggingface.co/ujjwalpardeshi/chakravyuh-analyzer-lora-v2",
        notes="Bootstrap 95% CIs at logs/bootstrap_v2.json: detection [0.979, 1.000], FPR [0.000, 0.167], F1 [0.976, 1.000].",
        seeded=True,
    ),
]
class LeaderboardStore:
    """Thread-safe in-memory leaderboard with JSONL persistence.

    Entries live in a list guarded by a lock. Every accepted submission is
    also appended as one JSON object per line to ``store_path``, and that
    file is read back when the store is constructed.
    """

    def __init__(self, store_path: Path = DEFAULT_STORE_PATH) -> None:
        """Create the store and populate it from ``store_path`` (or seed it).

        Args:
            store_path: JSONL file used for persistence.
        """
        self.store_path = store_path
        self._lock = threading.Lock()
        self._entries: list[LeaderboardEntry] = []
        self._load_or_seed()

    def _load_or_seed(self) -> None:
        """Load persisted entries; if none, seed and persist.

        Lines that cannot be parsed or validated are logged and skipped so
        one bad record does not prevent the rest from loading. This method
        does not take the lock itself.
        """
        if self.store_path.exists():
            # errors="replace" turns undecodable bytes into a line that then
            # fails JSON parsing and is skipped, instead of aborting the load.
            with self.store_path.open(encoding="utf-8", errors="replace") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        self._entries.append(LeaderboardEntry(**json.loads(line)))
                    except Exception as exc:  # pragma: no cover — diagnostic
                        # Deliberately broad: loading is best-effort.
                        logger.warning("Skipping malformed entry: %s", exc)
        if self._entries:
            return

        # First-run seeding. Build the rows first and publish them to memory
        # only after the file write succeeded, so a failed write cannot leave
        # a half-seeded in-memory list behind.
        now = datetime.now(timezone.utc).isoformat(timespec="seconds")
        seeds = [LeaderboardEntry(submitted_at=now, **seed) for seed in _SEED_ENTRIES]
        self.store_path.parent.mkdir(parents=True, exist_ok=True)
        with self.store_path.open("w", encoding="utf-8") as f:
            f.writelines(json.dumps(entry.model_dump()) + "\n" for entry in seeds)
        self._entries.extend(seeds)

    def add(self, submission: LeaderboardSubmission) -> LeaderboardEntry:
        """Persist a submission and add it to the in-memory board.

        Args:
            submission: Validated payload to record.

        Returns:
            The stored entry, stamped with the current UTC time and
            ``seeded=False``.

        Raises:
            OSError: If the entry cannot be written to ``store_path``. In
                that case the in-memory list is left unchanged.
        """
        entry = LeaderboardEntry(
            **submission.model_dump(),
            submitted_at=datetime.now(timezone.utc).isoformat(timespec="seconds"),
            seeded=False,
        )
        # Serialize outside the lock; only the file append and the list
        # mutation need to be mutually exclusive.
        record = json.dumps(entry.model_dump()) + "\n"
        with self._lock:
            # Write to disk BEFORE touching memory: if the write fails the
            # caller sees an error and the board does not contain an entry
            # that would vanish on the next restart.
            self.store_path.parent.mkdir(parents=True, exist_ok=True)
            with self.store_path.open("a", encoding="utf-8") as f:
                f.write(record)
            self._entries.append(entry)
        return entry

    def list_ranked(self, limit: int = MAX_ENTRIES_RETURNED) -> list[LeaderboardEntry]:
        """Return entries sorted by F1 descending.

        Ties on F1 are broken by lower FPR, then by the ``submitted_at``
        string in ascending order.

        Args:
            limit: Maximum number of entries to return. Negative values are
                treated as zero.

        Returns:
            A new list holding at most ``limit`` entries, best first.
        """
        with self._lock:
            ordered = sorted(
                self._entries,
                key=lambda e: (-e.f1, e.fpr, e.submitted_at),
            )
        # Clamp so a negative limit cannot turn into "all but the last N".
        return ordered[: max(limit, 0)]

    def reset_for_tests(self) -> None:
        """Test helper — clear entries and reseed. Not exposed via HTTP."""
        with self._lock:
            self._entries = []
            self.store_path.unlink(missing_ok=True)
            # _load_or_seed does not acquire the lock, so calling it while
            # holding the (non-reentrant) lock is safe.
            self._load_or_seed()
def build_router(store: LeaderboardStore | None = None) -> APIRouter:
    """Return a FastAPI router with `/leaderboard` and `/submit` routes.

    The router is registered onto the main OpenEnv app in ``server/app.py``.

    Args:
        store: Store backing the routes. When omitted, a store is created at
            the path given by the ``CHAKRAVYUH_LEADERBOARD_PATH`` environment
            variable, falling back to ``DEFAULT_STORE_PATH``.

    Returns:
        A router exposing ``GET /leaderboard`` and ``POST /submit``.
    """
    if store is None:
        store_path = Path(os.getenv("CHAKRAVYUH_LEADERBOARD_PATH", str(DEFAULT_STORE_PATH)))
        store = LeaderboardStore(store_path=store_path)

    router = APIRouter(tags=["leaderboard"])

    # Upper bound used when the full board is needed to compute a rank.
    rank_window = 10_000

    # The handlers must be registered on the router explicitly; merely
    # defining them inside this function exposes no routes.
    @router.get("/leaderboard")
    def get_leaderboard() -> dict[str, Any]:
        """Return the ranked board (capped at the store's default limit)."""
        entries = store.list_ranked()
        return {
            "bench": "chakravyuh-bench-v0",
            "ranked_by": "f1",
            "n_entries": len(entries),
            "entries": [e.model_dump() for e in entries],
        }

    @router.post("/submit")
    def post_submit(submission: LeaderboardSubmission) -> dict[str, Any]:
        """Record a submission and report where it landed on the board."""
        try:
            entry = store.add(submission)
        except Exception as exc:  # pragma: no cover — schema validation upstream
            # Boundary handler: log with traceback, then surface as a 500.
            logger.exception("Failed to persist leaderboard submission")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"failed to persist submission: {exc}",
            ) from exc

        # Take ONE snapshot for both rank and total so the two numbers agree,
        # and use the wide window so an entry outside the default page of
        # results still gets a rank.
        ranked = store.list_ranked(limit=rank_window)
        # Match by identity: the store hands back the very object it stored,
        # so two submissions of the same method within the same second are
        # not confused with one another. ``rank`` is None only if the entry
        # falls outside the snapshot window.
        rank = next(
            (position for position, e in enumerate(ranked, start=1) if e is entry),
            None,
        )
        return {
            "ok": True,
            "rank": rank,
            "n_total": len(ranked),
            "entry": entry.model_dump(),
        }

    return router
def attach_to_app(app: FastAPI, store: LeaderboardStore | None = None) -> None:
    """Mount the leaderboard routes on ``app``.

    Args:
        app: FastAPI application that receives the router.
        store: Optional store forwarded unchanged to ``build_router``.
    """
    leaderboard_router = build_router(store)
    app.include_router(leaderboard_router)