from __future__ import annotations import json from datetime import datetime, timezone from pathlib import Path from typing import Any def _utc_now() -> str: return datetime.now(tz=timezone.utc).replace(microsecond=0).isoformat() def load_leaderboard(path: str | Path) -> list[dict[str, Any]]: file_path = Path(path) if not file_path.exists(): return [] with file_path.open("r", encoding="utf-8") as f: data = json.load(f) if not isinstance(data, list): return [] return data def save_leaderboard(path: str | Path, records: list[dict[str, Any]]) -> None: file_path = Path(path) file_path.parent.mkdir(parents=True, exist_ok=True) with file_path.open("w", encoding="utf-8") as f: json.dump(records, f, indent=2, sort_keys=True) def _metric_value(record: dict[str, Any], sort_by: str) -> float: metrics = record.get("metrics", {}) return float(metrics.get(sort_by, 0.0)) def sorted_leaderboard(records: list[dict[str, Any]], sort_by: str = "leaderboard_score") -> list[dict[str, Any]]: return sorted(records, key=lambda r: _metric_value(r, sort_by), reverse=True) def append_leaderboard_record( path: str | Path, summary: dict[str, Any], episodes: int, run_name: str | None = None, config: dict[str, Any] | None = None, ) -> dict[str, Any]: records = load_leaderboard(path) run_id = f"run_{len(records) + 1:04d}" record = { "run_id": run_id, "run_name": run_name or run_id, "created_at": _utc_now(), "episodes": int(episodes), "config": config or {}, "metrics": summary, } records.append(record) save_leaderboard(path, records) return record def render_leaderboard_table(records: list[dict[str, Any]], top_k: int = 20, sort_by: str = "leaderboard_score") -> str: ranked = sorted_leaderboard(records, sort_by=sort_by)[:top_k] header = "| rank | run | score | success | graph_f1 | retrieval | structural | spawn | reward | tool_eff |\n" sep = "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|\n" rows: list[str] = [] for idx, rec in enumerate(ranked, start=1): m = rec.get("metrics", {}) rows.append( "| {rank} | {run} | {score:.4f} | {succ:.3f} | {f1:.3f} | {retrieval:.3f} | {structural:.3f} | {spawn:.3f} | {reward:.3f} | {tool:.3f} |".format( rank=idx, run=rec.get("run_name", rec.get("run_id", "run")), score=float(m.get("leaderboard_score", 0.0)), succ=float(m.get("task_success_rate", 0.0)), f1=float(m.get("avg_graph_f1", 0.0)), retrieval=float(m.get("retrieval_signal", 0.0)), structural=float(m.get("structural_signal", 0.0)), spawn=float(m.get("spawn_signal", 0.0)), reward=float(m.get("avg_reward", 0.0)), tool=float(m.get("tool_efficiency", 0.0)), ) ) return header + sep + "\n".join(rows)