Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
adding details
Browse files- app.py +39 -1
- backend/config.py +1 -0
- backend/data_loader.py +431 -1
- frontend/leaderboard.html +140 -5
- requirements.txt +2 -0
app.py
CHANGED
|
@@ -6,7 +6,13 @@ from contextlib import asynccontextmanager
|
|
| 6 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 7 |
import logging
|
| 8 |
|
| 9 |
-
from backend.data_loader import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
from backend.submission_handler import submit_model
|
| 11 |
from backend.config import TASKS, API, hf_api_token
|
| 12 |
from fastapi import FastAPI, Request, Form, BackgroundTasks, HTTPException
|
|
@@ -17,6 +23,7 @@ logging.getLogger("apscheduler").setLevel(logging.WARNING)
|
|
| 17 |
# --- Global Cache Variables ---
|
| 18 |
GLOBAL_LEADERBOARD_DATA = []
|
| 19 |
GLOBAL_QUEUE_DATA = {}
|
|
|
|
| 20 |
|
| 21 |
ACCEPTED_PAGES = ["about.html", "header.html", "leaderboard.html", "submit.html"]
|
| 22 |
|
|
@@ -84,6 +91,15 @@ def update_queue_cache():
|
|
| 84 |
except Exception as e:
|
| 85 |
logging.error(f"❌ Error updating queue cache: {e}")
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
# --- Lifespan & Scheduler ---
|
| 88 |
@asynccontextmanager
|
| 89 |
async def lifespan(app: FastAPI):
|
|
@@ -91,6 +107,7 @@ async def lifespan(app: FastAPI):
|
|
| 91 |
download_dataset_snapshots()
|
| 92 |
update_leaderboard_cache()
|
| 93 |
update_queue_cache()
|
|
|
|
| 94 |
|
| 95 |
# 2. Schedule periodic updates
|
| 96 |
scheduler = BackgroundScheduler()
|
|
@@ -101,6 +118,7 @@ async def lifespan(app: FastAPI):
|
|
| 101 |
# Cache updates (every 10 mins)
|
| 102 |
scheduler.add_job(update_leaderboard_cache, "interval", minutes=10)
|
| 103 |
scheduler.add_job(update_queue_cache, "interval", minutes=10)
|
|
|
|
| 104 |
|
| 105 |
scheduler.start()
|
| 106 |
|
|
@@ -143,6 +161,26 @@ async def get_model_likes(
|
|
| 143 |
logging.error(f"Error fetching likes for {model_name}: {e}")
|
| 144 |
return JSONResponse(content={"error": str(e)}, status_code=400)
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
@app.post("/api/submit")
|
| 147 |
async def handle_submission(
|
| 148 |
model_name: str = Form(...),
|
|
|
|
| 6 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 7 |
import logging
|
| 8 |
|
| 9 |
+
from backend.data_loader import (
|
| 10 |
+
download_dataset_snapshots,
|
| 11 |
+
load_scoreboard,
|
| 12 |
+
load_requests,
|
| 13 |
+
build_details_index,
|
| 14 |
+
load_benchmark_details,
|
| 15 |
+
)
|
| 16 |
from backend.submission_handler import submit_model
|
| 17 |
from backend.config import TASKS, API, hf_api_token
|
| 18 |
from fastapi import FastAPI, Request, Form, BackgroundTasks, HTTPException
|
|
|
|
| 23 |
# --- Global Cache Variables ---
|
| 24 |
GLOBAL_LEADERBOARD_DATA = []
|
| 25 |
GLOBAL_QUEUE_DATA = {}
|
| 26 |
+
GLOBAL_DETAILS_INDEX = {}
|
| 27 |
|
| 28 |
ACCEPTED_PAGES = ["about.html", "header.html", "leaderboard.html", "submit.html"]
|
| 29 |
|
|
|
|
| 91 |
except Exception as e:
|
| 92 |
logging.error(f"❌ Error updating queue cache: {e}")
|
| 93 |
|
| 94 |
+
|
| 95 |
+
def update_details_cache():
    """Rebuild the details-parquet index and publish it in GLOBAL_DETAILS_INDEX.

    Any exception raised while building the index is logged and swallowed, in
    which case the previously cached index is left untouched.
    """
    global GLOBAL_DETAILS_INDEX
    try:
        fresh_index = build_details_index()
    except Exception as e:
        logging.error(f"❌ Error updating details cache: {e}")
    else:
        # Only replace the cache once the build has fully succeeded.
        GLOBAL_DETAILS_INDEX = fresh_index
|
| 102 |
+
|
| 103 |
# --- Lifespan & Scheduler ---
|
| 104 |
@asynccontextmanager
|
| 105 |
async def lifespan(app: FastAPI):
|
|
|
|
| 107 |
download_dataset_snapshots()
|
| 108 |
update_leaderboard_cache()
|
| 109 |
update_queue_cache()
|
| 110 |
+
update_details_cache()
|
| 111 |
|
| 112 |
# 2. Schedule periodic updates
|
| 113 |
scheduler = BackgroundScheduler()
|
|
|
|
| 118 |
# Cache updates (every 10 mins)
|
| 119 |
scheduler.add_job(update_leaderboard_cache, "interval", minutes=10)
|
| 120 |
scheduler.add_job(update_queue_cache, "interval", minutes=10)
|
| 121 |
+
scheduler.add_job(update_details_cache, "interval", minutes=10)
|
| 122 |
|
| 123 |
scheduler.start()
|
| 124 |
|
|
|
|
| 161 |
logging.error(f"Error fetching likes for {model_name}: {e}")
|
| 162 |
return JSONResponse(content={"error": str(e)}, status_code=400)
|
| 163 |
|
| 164 |
+
|
| 165 |
+
@app.post("/api/benchmark-details")
async def get_benchmark_details(
    model_name: str = Form(...),
    benchmark: str = Form(...),
):
    """Fetches per-question details for a specific model benchmark score.

    Args:
        model_name: Model identifier submitted as a form field.
        benchmark: Benchmark display name submitted as a form field.

    Returns:
        A JSONResponse carrying the payload built by ``load_benchmark_details``,
        or ``{"error": ...}`` with status 400 if anything raises.
    """
    # Local import keeps this block self-contained; FastAPI re-exports
    # Starlette's thread-pool helper.
    from fastapi.concurrency import run_in_threadpool

    try:
        # Both the index rebuild (filesystem scan) and the details load (parquet
        # reads) are blocking, so run them off the event loop.
        if not GLOBAL_DETAILS_INDEX:
            await run_in_threadpool(update_details_cache)
        payload = await run_in_threadpool(
            load_benchmark_details,
            model_name=model_name,
            benchmark_display=benchmark,
            # Read after the optional rebuild so a freshly built index is used.
            details_index=GLOBAL_DETAILS_INDEX,
        )
        return JSONResponse(content=payload)
    except Exception as e:
        logging.error(f"Error fetching benchmark details for {model_name}/{benchmark}: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=400)
|
| 183 |
+
|
| 184 |
@app.post("/api/submit")
|
| 185 |
async def handle_submission(
|
| 186 |
model_name: str = Form(...),
|
backend/config.py
CHANGED
|
@@ -7,6 +7,7 @@ OWNER: str = "qimma"
|
|
| 7 |
REPO_ID: str = f"{OWNER}/Qimma-Leaderboard"
|
| 8 |
RESULTS_REPO_ID: str = f"{OWNER}/leaderboard-results"
|
| 9 |
REQUESTS_REPO_ID: str = f"{OWNER}/leaderboard-requests"
|
|
|
|
| 10 |
|
| 11 |
SLACK_WEBHOOK_URL = os.getenv("SLACK_WEBHOOK_URL", "")
|
| 12 |
|
|
|
|
| 7 |
REPO_ID: str = f"{OWNER}/Qimma-Leaderboard"
|
| 8 |
RESULTS_REPO_ID: str = f"{OWNER}/leaderboard-results"
|
| 9 |
REQUESTS_REPO_ID: str = f"{OWNER}/leaderboard-requests"
|
| 10 |
+
DETAILS_REPO_ID: str = f"{OWNER}/leaderboard-details"
|
| 11 |
|
| 12 |
SLACK_WEBHOOK_URL = os.getenv("SLACK_WEBHOOK_URL", "")
|
| 13 |
|
backend/data_loader.py
CHANGED
|
@@ -5,16 +5,22 @@ import os
|
|
| 5 |
import contextlib
|
| 6 |
import io
|
| 7 |
import logging
|
|
|
|
|
|
|
| 8 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
from pathlib import Path
|
| 10 |
from typing import Dict, List, Any, Optional
|
|
|
|
| 11 |
|
| 12 |
import numpy as np
|
| 13 |
import pandas as pd
|
|
|
|
| 14 |
from huggingface_hub import snapshot_download
|
| 15 |
from datetime import datetime
|
|
|
|
| 16 |
from backend.config import (
|
| 17 |
API,
|
|
|
|
| 18 |
REQUESTS_REPO_ID,
|
| 19 |
RESULTS_REPO_ID,
|
| 20 |
TASKS,
|
|
@@ -37,15 +43,136 @@ _TASKS_BY_SOURCE = {
|
|
| 37 |
}
|
| 38 |
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
# -----------------------------------------------------------------------------
|
| 41 |
# Utilities
|
| 42 |
# -----------------------------------------------------------------------------
|
| 43 |
|
| 44 |
def silent_snapshot_download(**kwargs):
    """Call ``snapshot_download`` with stdout and stderr redirected to throwaway buffers."""
    discarded_out = io.StringIO()
    discarded_err = io.StringIO()
    with contextlib.redirect_stdout(discarded_out):
        with contextlib.redirect_stderr(discarded_err):
            return snapshot_download(**kwargs)
|
| 47 |
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def download_datasets():
|
| 50 |
"""
|
| 51 |
Download requests + results datasets (read-only, anonymous).
|
|
@@ -64,6 +191,10 @@ def download_datasets():
|
|
| 64 |
)
|
| 65 |
os.environ["EVAL_RESULTS_PATH"] = res_path
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# -----------------------------------------------------------------------------
|
| 69 |
# Requests
|
|
@@ -154,6 +285,305 @@ def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
|
|
| 154 |
return row
|
| 155 |
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
def _fetch_hf_metadata(model_name: str) -> Dict[str, Any]:
|
| 158 |
try:
|
| 159 |
info = API.model_info(repo_id=model_name, token=hf_api_token)
|
|
|
|
| 5 |
import contextlib
|
| 6 |
import io
|
| 7 |
import logging
|
| 8 |
+
import re
|
| 9 |
+
import ast
|
| 10 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 11 |
from pathlib import Path
|
| 12 |
from typing import Dict, List, Any, Optional
|
| 13 |
+
from urllib.parse import quote
|
| 14 |
|
| 15 |
import numpy as np
|
| 16 |
import pandas as pd
|
| 17 |
+
import requests
|
| 18 |
from huggingface_hub import snapshot_download
|
| 19 |
from datetime import datetime
|
| 20 |
+
from huggingface_hub.constants import HF_HUB_CACHE
|
| 21 |
from backend.config import (
|
| 22 |
API,
|
| 23 |
+
DETAILS_REPO_ID,
|
| 24 |
REQUESTS_REPO_ID,
|
| 25 |
RESULTS_REPO_ID,
|
| 26 |
TASKS,
|
|
|
|
| 43 |
}
|
| 44 |
|
| 45 |
|
| 46 |
+
def _extract_task_bases(task_key: Any) -> List[str]:
|
| 47 |
+
if isinstance(task_key, list):
|
| 48 |
+
bases: List[str] = []
|
| 49 |
+
for item in task_key:
|
| 50 |
+
bases.extend(_extract_task_bases(item))
|
| 51 |
+
return bases
|
| 52 |
+
|
| 53 |
+
if not isinstance(task_key, str):
|
| 54 |
+
return []
|
| 55 |
+
|
| 56 |
+
key = task_key.strip()
|
| 57 |
+
if not key:
|
| 58 |
+
return []
|
| 59 |
+
|
| 60 |
+
return [key.split(":", 1)[0].split("|", 1)[0].strip()]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# Maps each benchmark display name from TASKS to the ordered, de-duplicated
# base names extracted from its task key(s).
BENCHMARK_DISPLAY_TO_BASES: Dict[str, List[str]] = {}
for task_key, _, display in TASKS:
    bases = BENCHMARK_DISPLAY_TO_BASES.setdefault(display, [])
    for base in _extract_task_bases(task_key):
        # Skip empty names and names already recorded for this display name.
        if not base or base in bases:
            continue
        bases.append(base)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
# -----------------------------------------------------------------------------
|
| 72 |
# Utilities
|
| 73 |
# -----------------------------------------------------------------------------
|
| 74 |
|
| 75 |
def silent_snapshot_download(**kwargs):
    """Forward all keyword arguments to ``snapshot_download`` and return its result."""
    # NOTE(review): the stdout/stderr redirection below is commented out, so
    # despite its name this wrapper does not silence anything at the moment.
    # Confirm whether that is intentional or a debugging leftover.
    # with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
    return snapshot_download(**kwargs)
|
| 78 |
|
| 79 |
|
| 80 |
+
def _resolve_details_base_path() -> Path:
    """Pick the local directory that holds the details dataset files.

    Looks under ``HF_HUB_CACHE/datasets--<owner>--<repo>/snapshots`` and returns
    the sub-directory with the newest modification time. If there is none, a
    ``manual-snapshot`` directory is created next to ``snapshots`` and returned.
    """
    cache_dir = Path(HF_HUB_CACHE) / f"datasets--{DETAILS_REPO_ID.replace('/', '--')}"
    snapshot_parent = cache_dir / "snapshots"

    snapshot_dirs = (
        [entry for entry in snapshot_parent.iterdir() if entry.is_dir()]
        if snapshot_parent.exists()
        else []
    )
    if snapshot_dirs:
        return max(snapshot_dirs, key=lambda entry: entry.stat().st_mtime)

    fallback_dir = cache_dir / "manual-snapshot"
    fallback_dir.mkdir(parents=True, exist_ok=True)
    return fallback_dir
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _download_details_file(relative_path: str, base_path: Path, retries: int = 3) -> bool:
    """Download one file of the details dataset into *base_path*.

    The file is streamed to a sibling ``.part`` file and moved into place with
    ``os.replace`` only after the transfer finished, so a partially written
    file never appears under the final name.

    Args:
        relative_path: Path of the file inside the dataset repo ("/"-separated).
        base_path: Local directory the repo layout is mirrored into.
        retries: Maximum number of attempts.

    Returns:
        True once the file is in place, False if every attempt failed or the
        server answered with a non-retryable client error.
    """
    encoded_rel_path = quote(relative_path, safe="/")
    url = f"https://huggingface.co/datasets/{DETAILS_REPO_ID}/resolve/main/{encoded_rel_path}"
    headers = {}
    if hf_api_token:
        headers["Authorization"] = f"Bearer {hf_api_token}"

    target_path = base_path / relative_path
    target_path.parent.mkdir(parents=True, exist_ok=True)
    partial_path = target_path.with_suffix(target_path.suffix + ".part")

    for attempt in range(1, retries + 1):
        try:
            with requests.get(url, stream=True, timeout=(10, 90), headers=headers) as resp:
                resp.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, target_path)
            return True
        except Exception as e:
            # Best-effort cleanup of a half-written temp file.
            with contextlib.suppress(Exception):
                partial_path.unlink(missing_ok=True)
            logger.warning(
                "Retry %s/%s for details file '%s' failed: %s",
                attempt,
                retries,
                relative_path,
                e,
            )
            # A 4xx answer (other than 408 timeout / 429 rate limit) will not
            # change on the next attempt, so stop instead of re-requesting.
            status = getattr(getattr(e, "response", None), "status_code", None)
            if (
                isinstance(e, requests.HTTPError)
                and status is not None
                and 400 <= status < 500
                and status not in (408, 429)
            ):
                break
    return False
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _sync_details_dataset(base_path: Path):
    """Download every remote details ``.parquet`` file that is missing under *base_path*.

    The remote file list comes from ``API.list_repo_files``; if listing fails
    the function logs a warning and returns without downloading anything.
    Missing files are fetched one by one via ``_download_details_file`` and the
    outcome is logged. Nothing is returned or raised on partial failure.

    Args:
        base_path: Local directory mirroring the dataset repo layout.
    """
    # NOTE(review): the listing is requested without an explicit token while
    # the per-file download sends hf_api_token when set — confirm whether the
    # details repo is expected to be public.
    try:
        remote_files = [
            f for f in API.list_repo_files(repo_id=DETAILS_REPO_ID, repo_type="dataset")
            if f.endswith(".parquet")
        ]
    except Exception as e:
        logger.warning("Could not list files for details repo '%s': %s", DETAILS_REPO_ID, e)
        return

    local_files = {
        str(p.relative_to(base_path)).replace(os.sep, "/")
        for p in base_path.rglob("*.parquet")
    }
    missing_files = [f for f in remote_files if f not in local_files]

    total_count = len(remote_files)
    # Count only remote files that already exist locally. Counting every local
    # parquet could report more "ready" files than the remote total when stale
    # or unrelated files are present.
    local_count = total_count - len(missing_files)
    if not missing_files:
        logger.info("Details files ready: %s/%s", local_count, total_count)
        return

    logger.info(
        "Details files ready: %s/%s. Downloading %s missing files...",
        local_count,
        total_count,
        len(missing_files),
    )

    failed_files: List[str] = []
    total_missing = len(missing_files)
    for idx, rel_path in enumerate(missing_files, start=1):
        logger.info("Downloading missing details file %s/%s: %s", idx, total_missing, rel_path)
        if not _download_details_file(rel_path, base_path):
            failed_files.append(rel_path)

    if failed_files:
        logger.warning(
            "Details sync incomplete. Downloaded %s/%s missing files. Still missing %s files.",
            total_missing - len(failed_files),
            total_missing,
            len(failed_files),
        )
        for rel_path in failed_files:
            logger.warning("Still missing: %s", rel_path)
    else:
        logger.info("Details sync complete: downloaded %s/%s missing files.", total_missing, total_missing)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
def download_datasets():
|
| 177 |
"""
|
| 178 |
Download requests + results datasets (read-only, anonymous).
|
|
|
|
| 191 |
)
|
| 192 |
os.environ["EVAL_RESULTS_PATH"] = res_path
|
| 193 |
|
| 194 |
+
details_base = _resolve_details_base_path()
|
| 195 |
+
_sync_details_dataset(details_base)
|
| 196 |
+
os.environ["EVAL_DETAILS_PATH"] = str(details_base)
|
| 197 |
+
|
| 198 |
|
| 199 |
# -----------------------------------------------------------------------------
|
| 200 |
# Requests
|
|
|
|
| 285 |
return row
|
| 286 |
|
| 287 |
|
| 288 |
+
def _parse_details_filename(path: Path) -> Optional[Dict[str, Any]]:
|
| 289 |
+
stem = path.stem
|
| 290 |
+
if "_" not in stem:
|
| 291 |
+
return None
|
| 292 |
+
|
| 293 |
+
details_part, dt_str = stem.rsplit("_", 1)
|
| 294 |
+
if not details_part.startswith("details_"):
|
| 295 |
+
return None
|
| 296 |
+
|
| 297 |
+
try:
|
| 298 |
+
parsed_dt = datetime.strptime(dt_str, "%Y-%m-%dT%H-%M-%S.%f")
|
| 299 |
+
except Exception:
|
| 300 |
+
return None
|
| 301 |
+
|
| 302 |
+
task_full = details_part[len("details_"):].strip()
|
| 303 |
+
if not task_full:
|
| 304 |
+
return None
|
| 305 |
+
|
| 306 |
+
benchmark_base = task_full.split(":", 1)[0].split("|", 1)[0].strip()
|
| 307 |
+
if ":" in task_full:
|
| 308 |
+
subtask = task_full.split(":", 1)[1].strip()
|
| 309 |
+
else:
|
| 310 |
+
subtask = benchmark_base
|
| 311 |
+
|
| 312 |
+
subtask = re.sub(r"\|\d+$", "", subtask).strip() or "overall"
|
| 313 |
+
|
| 314 |
+
return {
|
| 315 |
+
"benchmark_base": benchmark_base,
|
| 316 |
+
"subtask": subtask,
|
| 317 |
+
"datetime": parsed_dt,
|
| 318 |
+
"task_full": task_full,
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def build_details_index() -> Dict[str, Dict[str, Dict[str, Dict[str, Any]]]]:
    """
    Index the newest details parquet per model / benchmark base / subtask.

    Scans the directory named by the ``EVAL_DETAILS_PATH`` environment variable
    recursively for ``*.parquet`` files. The model name is the file's parent
    path relative to that directory, joined with "/". Returns ``{}`` when the
    variable is unset or the directory does not exist. Each leaf entry holds
    ``path``, ``datetime`` and ``task_full`` of the most recent file.
    """
    root_env = os.getenv("EVAL_DETAILS_PATH")
    if not root_env:
        return {}

    root = Path(root_env)
    if not root.exists():
        return {}

    index: Dict[str, Dict[str, Dict[str, Dict[str, Any]]]] = {}

    for parquet_file in root.rglob("*.parquet"):
        meta = _parse_details_filename(parquet_file)
        if not meta:
            continue

        try:
            parts = parquet_file.relative_to(root).parts
        except Exception:
            continue
        # Files directly under the root have no model directory; skip them.
        if len(parts) < 2:
            continue

        model_name = "/".join(parts[:-1]).strip("/")
        if not model_name:
            continue

        per_subtask = index.setdefault(model_name, {}).setdefault(meta["benchmark_base"], {})
        previous = per_subtask.get(meta["subtask"])
        # Keep the existing entry unless this file is strictly newer.
        if previous is not None and not (meta["datetime"] > previous["datetime"]):
            continue

        per_subtask[meta["subtask"]] = {
            "path": str(parquet_file),
            "datetime": meta["datetime"],
            "task_full": meta["task_full"],
        }

    return index
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def _as_list(value: Any) -> List[Any]:
|
| 370 |
+
if value is None:
|
| 371 |
+
return []
|
| 372 |
+
if isinstance(value, list):
|
| 373 |
+
return value
|
| 374 |
+
if isinstance(value, tuple):
|
| 375 |
+
return list(value)
|
| 376 |
+
if isinstance(value, np.ndarray):
|
| 377 |
+
return value.tolist()
|
| 378 |
+
return [value]
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def _as_dict(value: Any) -> Dict[str, Any]:
|
| 382 |
+
if isinstance(value, dict):
|
| 383 |
+
return value
|
| 384 |
+
|
| 385 |
+
if isinstance(value, (bytes, bytearray)):
|
| 386 |
+
try:
|
| 387 |
+
value = value.decode("utf-8", errors="ignore")
|
| 388 |
+
except Exception:
|
| 389 |
+
return {}
|
| 390 |
+
|
| 391 |
+
if isinstance(value, str):
|
| 392 |
+
s = value.strip()
|
| 393 |
+
if not s:
|
| 394 |
+
return {}
|
| 395 |
+
try:
|
| 396 |
+
parsed = json.loads(s)
|
| 397 |
+
return parsed if isinstance(parsed, dict) else {}
|
| 398 |
+
except Exception:
|
| 399 |
+
try:
|
| 400 |
+
parsed = ast.literal_eval(s)
|
| 401 |
+
return parsed if isinstance(parsed, dict) else {}
|
| 402 |
+
except Exception:
|
| 403 |
+
return {}
|
| 404 |
+
|
| 405 |
+
if isinstance(value, list):
|
| 406 |
+
# Some parquet backends can expose map-like structs as list of pairs.
|
| 407 |
+
try:
|
| 408 |
+
if all(isinstance(item, (list, tuple)) and len(item) == 2 for item in value):
|
| 409 |
+
return {str(k): v for k, v in value}
|
| 410 |
+
except Exception:
|
| 411 |
+
return {}
|
| 412 |
+
|
| 413 |
+
return {}
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
def _py_scalar(value: Any) -> Any:
|
| 417 |
+
if isinstance(value, np.generic):
|
| 418 |
+
return value.item()
|
| 419 |
+
return value
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
def _extract_predicted_answer(model_response: Dict[str, Any], choices: List[Any]) -> Any:
    """Derive the model's predicted answer from a response record.

    When ``logprobs`` is present and *choices* is non-empty, the choice at the
    index of the largest logprob is returned (if that index is within range).
    Otherwise the first element of ``text_post_processed`` and then of ``text``
    is used. Returns ``None`` when none of these yields a value.
    """
    raw_logprobs = model_response.get("logprobs")
    if raw_logprobs is not None and choices:
        scores = _as_list(raw_logprobs)
        try:
            best = int(np.argmax(np.asarray(scores, dtype=float)))
        except Exception:
            # Unusable logprobs (empty, ragged, non-numeric): fall back to text.
            best = -1
        if 0 <= best < len(choices):
            return choices[best]

    for field in ("text_post_processed", "text"):
        candidates = _as_list(model_response.get(field))
        if candidates:
            return candidates[0]

    return None
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
def _first_non_empty(values: Any) -> Optional[str]:
    """Return the first entry of *values* whose stripped string form is non-empty.

    *values* is normalised with ``_as_list``; ``None`` entries are skipped.
    Returns ``None`` if no entry qualifies.
    """
    stripped = (str(v).strip() for v in _as_list(values) if v is not None)
    return next((s for s in stripped if s), None)
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
def _read_detail_parquet(path: str, subtask: str) -> List[Dict[str, Any]]:
    """Read one details parquet file and flatten it into JSON-friendly rows.

    Each record's ``doc``, ``metric`` and ``model_response`` columns are coerced
    to dicts with ``_as_dict``. Only the first metric entry is reported.

    Args:
        path: Location of the parquet file.
        subtask: Label copied into every returned row.

    Returns:
        One dict per record. An unreadable file is logged and yields ``[]``.
    """
    try:
        df = pd.read_parquet(path)
    except Exception as e:
        logger.warning("Could not read details parquet '%s': %s", path, e)
        return []

    rows: List[Dict[str, Any]] = []
    for record in df.to_dict(orient="records"):
        doc = _as_dict(record.get("doc"))
        metric = _as_dict(record.get("metric"))
        model_response = _as_dict(record.get("model_response"))

        choices = [_py_scalar(c) for c in _as_list(doc.get("choices"))]
        gold_idx = doc.get("gold_index")
        gold_answer = None
        # Only a single in-range integer index is resolved to a gold answer.
        if isinstance(gold_idx, (int, np.integer)) and 0 <= int(gold_idx) < len(choices):
            gold_answer = choices[int(gold_idx)]

        metric_value = None
        metric_name = None
        if metric:
            metric_name = next(iter(metric.keys()))
            try:
                metric_value = float(next(iter(metric.values())))
            except Exception:
                metric_value = None
            # NaN/inf are not valid JSON numbers; a strict encoder (as used when
            # the payload is returned as a JSON response) would reject the whole
            # payload, so treat them as "no metric".
            if metric_value is not None and not np.isfinite(metric_value):
                metric_value = None

        predicted_answer = _extract_predicted_answer(model_response, choices)
        output_text = _first_non_empty(model_response.get("text_post_processed"))
        if output_text is None:
            output_text = _first_non_empty(model_response.get("text"))
        if output_text is None and predicted_answer is not None:
            output_text = str(predicted_answer)

        # Correctness is only inferred from an exactly binary metric.
        is_correct = None
        if metric_value is not None and metric_value in (0.0, 1.0):
            is_correct = bool(metric_value)

        prompt = (
            doc.get("query")
            or doc.get("original_query")
            or doc.get("instruction")
            or model_response.get("input")
            or ""
        )

        rows.append({
            "subtask": subtask,
            "question_id": _py_scalar(doc.get("id")),
            "task_name": _py_scalar(doc.get("task_name")),
            "prompt": prompt,
            "input_prompt": model_response.get("input"),
            "output": output_text,
            "choices": [str(c) for c in choices],
            "gold_answer": _py_scalar(gold_answer),
            "predicted_answer": _py_scalar(predicted_answer),
            "is_correct": is_correct,
            "metric_name": metric_name,
            "metric": metric_value,
        })

    return rows
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
def load_benchmark_details(
    model_name: str,
    benchmark_display: str,
    details_index: Dict[str, Dict[str, Dict[str, Dict[str, Any]]]],
    max_rows: int = 250,
) -> Dict[str, Any]:
    """
    Collect per-question rows and per-subtask summaries for one model/benchmark.

    The model and each benchmark base are looked up exactly first and then
    case-insensitively (ignoring surrounding whitespace). Subtasks are processed
    in case-insensitive alphabetical order. Summaries cover every row read,
    while the returned ``rows`` list is cut to *max_rows*.
    """

    def _lookup(mapping: Dict[str, Any], key: str) -> Any:
        # Exact hit wins; otherwise take the first case-insensitive match.
        exact = mapping.get(key, {})
        if exact:
            return exact
        wanted = key.strip().lower()
        for candidate_key, candidate in mapping.items():
            if candidate_key.strip().lower() == wanted:
                return candidate
        return exact

    model_bucket = _lookup(details_index, model_name)

    # Unknown display names are tried verbatim as a base name.
    benchmark_bases = BENCHMARK_DISPLAY_TO_BASES.get(benchmark_display, []) or [benchmark_display]

    selected_entries = [
        (subtask, info)
        for base in benchmark_bases
        for subtask, info in _lookup(model_bucket, base).items()
    ]
    if not selected_entries:
        return {"benchmark": benchmark_display, "subtasks": [], "rows": []}

    all_rows: List[Dict[str, Any]] = []
    subtasks_summary: List[Dict[str, Any]] = []
    for subtask, info in sorted(selected_entries, key=lambda entry: entry[0].lower()):
        rows = _read_detail_parquet(info["path"], subtask)
        all_rows.extend(rows)

        # Only rows with a boolean verdict count towards accuracy.
        verdicts = [r["is_correct"] for r in rows if isinstance(r.get("is_correct"), bool)]
        correct = sum(1 for verdict in verdicts if verdict)
        total = len(verdicts)
        subtasks_summary.append({
            "subtask": subtask,
            "total": len(rows),
            "scored": total,
            "correct": correct,
            "accuracy": round((correct / total) * 100, 2) if total > 0 else None,
        })

    return {
        "benchmark": benchmark_display,
        "subtasks": subtasks_summary,
        "rows": all_rows[:max_rows] if len(all_rows) > max_rows else all_rows,
    }
|
| 585 |
+
|
| 586 |
+
|
| 587 |
def _fetch_hf_metadata(model_name: str) -> Dict[str, Any]:
|
| 588 |
try:
|
| 589 |
info = API.model_info(repo_id=model_name, token=hf_api_token)
|
frontend/leaderboard.html
CHANGED
|
@@ -546,6 +546,48 @@
|
|
| 546 |
</div>
|
| 547 |
</div>
|
| 548 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
<script>
|
| 550 |
(function () {
|
| 551 |
const $ = s => document.querySelector(s);
|
|
@@ -564,6 +606,12 @@
|
|
| 564 |
const n = toNumber(v);
|
| 565 |
return n === null ? "Unknown" : String(Math.floor(n));
|
| 566 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
|
| 568 |
let lbData = [], grid, maxMeta = 100, minMeta = 0, tableColumns = [];
|
| 569 |
let currentSort = { colId: null, dir: 'none' };
|
|
@@ -683,6 +731,81 @@
|
|
| 683 |
applyFilters();
|
| 684 |
}
|
| 685 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
// --- MODAL LOGIC ---
|
| 687 |
window.openModelDetails = function (modelName) {
|
| 688 |
const model = lbData.find(r => r["Model Name"] === modelName);
|
|
@@ -910,6 +1033,7 @@
|
|
| 910 |
function prepareColumns(data) {
|
| 911 |
const keys = Object.keys(data[0] || {});
|
| 912 |
const typeIdx = keys.findIndex(k => ["T", "Type", "Full Type"].includes(k));
|
|
|
|
| 913 |
const vis = tableColumns.reduce((acc, c) => ({ ...acc, [c.id]: c.hidden }), {});
|
| 914 |
|
| 915 |
tableColumns = keys.map(key => {
|
|
@@ -960,15 +1084,23 @@
|
|
| 960 |
} else if (isScore || isAvg) {
|
| 961 |
|
| 962 |
// -- NEW LOGIC FOR SCORE DISPLAY STATUS --
|
| 963 |
-
const renderBar = (c) => {
|
| 964 |
const n = parseFloat(c); if (isNaN(n)) return c;
|
| 965 |
const h = (Math.max(0, Math.min(100, n)) / 100) * 120;
|
| 966 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 967 |
};
|
| 968 |
|
| 969 |
-
const renderRaw = (c) => {
|
| 970 |
const n = parseFloat(c); if (isNaN(n)) return c;
|
| 971 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 972 |
};
|
| 973 |
|
| 974 |
let shouldUseBar = false;
|
|
@@ -981,7 +1113,10 @@
|
|
| 981 |
shouldUseBar = false;
|
| 982 |
}
|
| 983 |
|
| 984 |
-
def.formatter =
|
|
|
|
|
|
|
|
|
|
| 985 |
|
| 986 |
} else if (key === "Rank") {
|
| 987 |
def.width = '110px';
|
|
|
|
| 546 |
</div>
|
| 547 |
</div>
|
| 548 |
|
| 549 |
+
<div id="benchmarkModal" class="hidden fixed inset-0 z-[110]" aria-labelledby="benchmarkModalTitle" role="dialog"
|
| 550 |
+
aria-modal="true">
|
| 551 |
+
<div class="fixed inset-0 bg-slate-900/60 backdrop-blur-sm transition-opacity modal-backdrop"
|
| 552 |
+
onclick="window.closeBenchmarkDetails()"></div>
|
| 553 |
+
|
| 554 |
+
<div
|
| 555 |
+
class="fixed top-1/2 left-1/2 -translate-x-1/2 -translate-y-1/2 z-[111] w-[95%] md:w-[75%] max-h-[90vh] overflow-y-auto bg-white dark:bg-slate-900 rounded-2xl shadow-2xl border border-slate-200 dark:border-slate-700 modal-content">
|
| 556 |
+
<div
|
| 557 |
+
class="sticky top-0 z-10 flex items-start justify-between px-6 py-5 bg-white/80 dark:bg-slate-900/80 backdrop-blur-md border-b border-slate-100 dark:border-slate-800">
|
| 558 |
+
<div>
|
| 559 |
+
<h3 id="benchmarkModalTitle"
|
| 560 |
+
class="text-xl md:text-2xl font-bold text-slate-900 dark:text-white leading-tight break-words pr-4">
|
| 561 |
+
</h3>
|
| 562 |
+
</div>
|
| 563 |
+
<button type="button" onclick="window.closeBenchmarkDetails()"
|
| 564 |
+
class="text-slate-400 hover:text-slate-600 dark:hover:text-slate-300 transition-colors p-1 rounded-full hover:bg-slate-100 dark:hover:bg-slate-800">
|
| 565 |
+
<i data-lucide="x" class="w-6 h-6"></i>
|
| 566 |
+
</button>
|
| 567 |
+
</div>
|
| 568 |
+
|
| 569 |
+
<div class="p-6 space-y-6">
|
| 570 |
+
<div id="benchmarkSummary" class="grid grid-cols-1 md:grid-cols-3 gap-3"></div>
|
| 571 |
+
<div class="border border-slate-200 dark:border-slate-700 rounded-xl overflow-hidden">
|
| 572 |
+
<div
|
| 573 |
+
class="grid grid-cols-12 text-xs font-bold uppercase tracking-wide bg-slate-50 dark:bg-slate-800 text-slate-500 dark:text-slate-400 px-4 py-3">
|
| 574 |
+
<div class="col-span-2">Subtask</div>
|
| 575 |
+
<div class="col-span-6">Prompt / Output</div>
|
| 576 |
+
<div class="col-span-2">Gold</div>
|
| 577 |
+
<div class="col-span-2">Predicted</div>
|
| 578 |
+
</div>
|
| 579 |
+
<div id="benchmarkRows" class="divide-y divide-slate-100 dark:divide-slate-800"></div>
|
| 580 |
+
</div>
|
| 581 |
+
</div>
|
| 582 |
+
|
| 583 |
+
<div
|
| 584 |
+
class="bg-slate-50 dark:bg-slate-800/50 px-6 py-4 border-t border-slate-100 dark:border-slate-800 text-center">
|
| 585 |
+
<button onclick="window.closeBenchmarkDetails()"
|
| 586 |
+
class="text-sm text-slate-500 hover:text-slate-800 dark:hover:text-slate-200">Close Details</button>
|
| 587 |
+
</div>
|
| 588 |
+
</div>
|
| 589 |
+
</div>
|
| 590 |
+
|
| 591 |
<script>
|
| 592 |
(function () {
|
| 593 |
const $ = s => document.querySelector(s);
|
|
|
|
| 606 |
const n = toNumber(v);
|
| 607 |
return n === null ? "Unknown" : String(Math.floor(n));
|
| 608 |
};
|
| 609 |
+
const escapeHtml = (value) => String(value ?? "")
|
| 610 |
+
.replace(/&/g, "&amp;")
|
| 611 |
+
.replace(/</g, "&lt;")
|
| 612 |
+
.replace(/>/g, "&gt;")
|
| 613 |
+
.replace(/\"/g, "&quot;")
|
| 614 |
+
.replace(/'/g, "&#39;");
|
| 615 |
|
| 616 |
let lbData = [], grid, maxMeta = 100, minMeta = 0, tableColumns = [];
|
| 617 |
let currentSort = { colId: null, dir: 'none' };
|
|
|
|
| 731 |
applyFilters();
|
| 732 |
}
|
| 733 |
|
| 734 |
+
// --- BENCHMARK DETAILS MODAL ---
|
| 735 |
+
window.openBenchmarkDetails = async function (modelName, benchmark) {
|
| 736 |
+
$('#benchmarkModalTitle').innerText = `${benchmark} Details — ${modelName}`;
|
| 737 |
+
$('#benchmarkSummary').innerHTML = "";
|
| 738 |
+
$('#benchmarkRows').innerHTML = `<div class="p-6 text-sm text-slate-500 dark:text-slate-400">Loading details...</div>`;
|
| 739 |
+
|
| 740 |
+
$('#benchmarkModal').classList.remove('hidden');
|
| 741 |
+
document.body.style.overflow = 'hidden';
|
| 742 |
+
if (window.lucide) lucide.createIcons();
|
| 743 |
+
|
| 744 |
+
const formData = new FormData();
|
| 745 |
+
formData.append("model_name", modelName);
|
| 746 |
+
formData.append("benchmark", benchmark);
|
| 747 |
+
|
| 748 |
+
try {
|
| 749 |
+
const res = await fetch("/api/benchmark-details", { method: "POST", body: formData });
|
| 750 |
+
const payload = await res.json();
|
| 751 |
+
if (!res.ok) throw new Error(payload.error || "Failed to load details");
|
| 752 |
+
|
| 753 |
+
const subtasks = payload.subtasks || [];
|
| 754 |
+
const rows = payload.rows || [];
|
| 755 |
+
|
| 756 |
+
if (!subtasks.length && !rows.length) {
|
| 757 |
+
$('#benchmarkSummary').innerHTML = `<div class="col-span-full p-4 rounded-lg bg-slate-50 dark:bg-slate-800 text-sm text-slate-500 dark:text-slate-400">No details found for this benchmark/model.</div>`;
|
| 758 |
+
$('#benchmarkRows').innerHTML = "";
|
| 759 |
+
return;
|
| 760 |
+
}
|
| 761 |
+
|
| 762 |
+
$('#benchmarkSummary').innerHTML = subtasks.map(s => `
|
| 763 |
+
<div class="p-3 rounded-lg border border-slate-200 dark:border-slate-700 bg-slate-50 dark:bg-slate-800/70">
|
| 764 |
+
<div class="text-xs text-slate-500 dark:text-slate-400">${escapeHtml(s.subtask)}</div>
|
| 765 |
+
<div class="text-sm font-bold text-slate-800 dark:text-slate-100 mt-1">${s.accuracy === null ? "Unknown" : `${s.accuracy}%`}</div>
|
| 766 |
+
<div class="text-xs text-slate-500 dark:text-slate-400 mt-0.5">${s.correct}/${s.scored} correct</div>
|
| 767 |
+
</div>
|
| 768 |
+
`).join("");
|
| 769 |
+
|
| 770 |
+
$('#benchmarkRows').innerHTML = rows.map(r => {
|
| 771 |
+
const correctBadge = r.is_correct === true
|
| 772 |
+
? `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`
|
| 773 |
+
: (r.is_correct === false
|
| 774 |
+
? `<span class="text-rose-600 dark:text-rose-400 font-semibold">Wrong</span>`
|
| 775 |
+
: `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`);
|
| 776 |
+
const prompt = escapeHtml(asUnknown(r.prompt));
|
| 777 |
+
const output = escapeHtml(asUnknown(r.output));
|
| 778 |
+
const sampleMeta = [
|
| 779 |
+
r.question_id ? `id: ${escapeHtml(r.question_id)}` : null,
|
| 780 |
+
r.metric_name ? `metric: ${escapeHtml(r.metric_name)}` : null,
|
| 781 |
+
r.metric !== null && r.metric !== undefined ? `score: ${escapeHtml(r.metric)}` : null,
|
| 782 |
+
].filter(Boolean).join(" | ");
|
| 783 |
+
|
| 784 |
+
return `
|
| 785 |
+
<div class="grid grid-cols-12 gap-3 px-4 py-3 text-xs">
|
| 786 |
+
<div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(r.subtask)}</div>
|
| 787 |
+
<div class="col-span-6 text-slate-700 dark:text-slate-200">
|
| 788 |
+
<div class="font-semibold text-slate-800 dark:text-slate-100 whitespace-pre-wrap">${prompt}</div>
|
| 789 |
+
<div class="mt-2 text-slate-500 dark:text-slate-400 whitespace-pre-wrap"><span class="font-semibold">Output:</span> ${output}</div>
|
| 790 |
+
${sampleMeta ? `<div class="mt-1 text-slate-400 dark:text-slate-500">${sampleMeta}</div>` : ``}
|
| 791 |
+
<div class="mt-1">${correctBadge}</div>
|
| 792 |
+
</div>
|
| 793 |
+
<div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(asUnknown(r.gold_answer))}</div>
|
| 794 |
+
<div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(asUnknown(r.predicted_answer))}</div>
|
| 795 |
+
</div>
|
| 796 |
+
`;
|
| 797 |
+
}).join("");
|
| 798 |
+
} catch (err) {
|
| 799 |
+
$('#benchmarkSummary').innerHTML = "";
|
| 800 |
+
$('#benchmarkRows').innerHTML = `<div class="p-6 text-sm text-rose-600 dark:text-rose-400">${escapeHtml(err.message || "Failed to load details")}</div>`;
|
| 801 |
+
}
|
| 802 |
+
};
|
| 803 |
+
|
| 804 |
+
window.closeBenchmarkDetails = function () {
|
| 805 |
+
$('#benchmarkModal').classList.add('hidden');
|
| 806 |
+
document.body.style.overflow = '';
|
| 807 |
+
};
|
| 808 |
+
|
| 809 |
// --- MODAL LOGIC ---
|
| 810 |
window.openModelDetails = function (modelName) {
|
| 811 |
const model = lbData.find(r => r["Model Name"] === modelName);
|
|
|
|
| 1033 |
function prepareColumns(data) {
|
| 1034 |
const keys = Object.keys(data[0] || {});
|
| 1035 |
const typeIdx = keys.findIndex(k => ["T", "Type", "Full Type"].includes(k));
|
| 1036 |
+
const modelNameIdx = keys.findIndex(k => k === "Model Name");
|
| 1037 |
const vis = tableColumns.reduce((acc, c) => ({ ...acc, [c.id]: c.hidden }), {});
|
| 1038 |
|
| 1039 |
tableColumns = keys.map(key => {
|
|
|
|
| 1084 |
} else if (isScore || isAvg) {
|
| 1085 |
|
| 1086 |
// -- NEW LOGIC FOR SCORE DISPLAY STATUS --
|
| 1087 |
+
const renderBar = (c, modelName, benchmark) => {
|
| 1088 |
const n = parseFloat(c); if (isNaN(n)) return c;
|
| 1089 |
const h = (Math.max(0, Math.min(100, n)) / 100) * 120;
|
| 1090 |
+
const bar = `<div class="flex justify-center"><div style="background: linear-gradient(to right, hsla(${h},85%,50%,0.3) ${n}%, hsla(${h},85%,50%,0.05) ${n}%); border: 1px solid hsla(${h},85%,40%,0.3);" class="w-24 py-1 rounded-md text-center text-xs font-bold text-slate-700 dark:text-slate-200 shadow-sm">${n.toFixed(2)}<span class="text-[10px] font-normal opacity-70 ml-0.5">%</span></div></div>`;
|
| 1091 |
+
if (!isScore || !modelName) return gridjs.html(bar);
|
| 1092 |
+
const em = encodeURIComponent(modelName);
|
| 1093 |
+
const eb = encodeURIComponent(benchmark);
|
| 1094 |
+
return gridjs.html(`<button onclick="window.openBenchmarkDetails(decodeURIComponent('${em}'), decodeURIComponent('${eb}'))" class="w-full text-left hover:opacity-90 transition-opacity" title="Click for per-question details">${bar}</button>`);
|
| 1095 |
};
|
| 1096 |
|
| 1097 |
+
const renderRaw = (c, modelName, benchmark) => {
|
| 1098 |
const n = parseFloat(c); if (isNaN(n)) return c;
|
| 1099 |
+
const raw = `<div class="flex justify-center text-xs font-bold text-slate-700 dark:text-slate-300 py-1">${n.toFixed(2)}</div>`;
|
| 1100 |
+
if (!isScore || !modelName) return gridjs.html(raw);
|
| 1101 |
+
const em = encodeURIComponent(modelName);
|
| 1102 |
+
const eb = encodeURIComponent(benchmark);
|
| 1103 |
+
return gridjs.html(`<button onclick="window.openBenchmarkDetails(decodeURIComponent('${em}'), decodeURIComponent('${eb}'))" class="w-full text-left hover:opacity-90 transition-opacity" title="Click for per-question details">${raw}</button>`);
|
| 1104 |
};
|
| 1105 |
|
| 1106 |
let shouldUseBar = false;
|
|
|
|
| 1113 |
shouldUseBar = false;
|
| 1114 |
}
|
| 1115 |
|
| 1116 |
+
def.formatter = (c, r) => {
|
| 1117 |
+
const modelName = (modelNameIdx > -1 && r.cells[modelNameIdx]) ? r.cells[modelNameIdx].data : "";
|
| 1118 |
+
return shouldUseBar ? renderBar(c, modelName, key) : renderRaw(c, modelName, key);
|
| 1119 |
+
};
|
| 1120 |
|
| 1121 |
} else if (key === "Rank") {
|
| 1122 |
def.width = '110px';
|
requirements.txt
CHANGED
|
@@ -10,3 +10,5 @@ transformers==5.1.0
|
|
| 10 |
Jinja2==3.1.6
|
| 11 |
python-multipart==0.0.22
|
| 12 |
tiktoken
|
|
|
|
|
|
|
|
|
| 10 |
Jinja2==3.1.6
|
| 11 |
python-multipart==0.0.22
|
| 12 |
tiktoken
|
| 13 |
+
# fastparquet
|
| 14 |
+
# pyarrow
|