Alyafeai committed on
Commit
3725eb1
·
1 Parent(s): 163662f

adding details

Browse files
app.py CHANGED
@@ -6,7 +6,13 @@ from contextlib import asynccontextmanager
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
  import logging
8
 
9
- from backend.data_loader import download_dataset_snapshots, load_scoreboard, load_requests
 
 
 
 
 
 
10
  from backend.submission_handler import submit_model
11
  from backend.config import TASKS, API, hf_api_token
12
  from fastapi import FastAPI, Request, Form, BackgroundTasks, HTTPException
@@ -17,6 +23,7 @@ logging.getLogger("apscheduler").setLevel(logging.WARNING)
17
  # --- Global Cache Variables ---
18
  GLOBAL_LEADERBOARD_DATA = []
19
  GLOBAL_QUEUE_DATA = {}
 
20
 
21
  ACCEPTED_PAGES = ["about.html", "header.html", "leaderboard.html", "submit.html"]
22
 
@@ -84,6 +91,15 @@ def update_queue_cache():
84
  except Exception as e:
85
  logging.error(f"❌ Error updating queue cache: {e}")
86
 
 
 
 
 
 
 
 
 
 
87
  # --- Lifespan & Scheduler ---
88
  @asynccontextmanager
89
  async def lifespan(app: FastAPI):
@@ -91,6 +107,7 @@ async def lifespan(app: FastAPI):
91
  download_dataset_snapshots()
92
  update_leaderboard_cache()
93
  update_queue_cache()
 
94
 
95
  # 2. Schedule periodic updates
96
  scheduler = BackgroundScheduler()
@@ -101,6 +118,7 @@ async def lifespan(app: FastAPI):
101
  # Cache updates (every 10 mins)
102
  scheduler.add_job(update_leaderboard_cache, "interval", minutes=10)
103
  scheduler.add_job(update_queue_cache, "interval", minutes=10)
 
104
 
105
  scheduler.start()
106
 
@@ -143,6 +161,26 @@ async def get_model_likes(
143
  logging.error(f"Error fetching likes for {model_name}: {e}")
144
  return JSONResponse(content={"error": str(e)}, status_code=400)
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  @app.post("/api/submit")
147
  async def handle_submission(
148
  model_name: str = Form(...),
 
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
  import logging
8
 
9
+ from backend.data_loader import (
10
+ download_dataset_snapshots,
11
+ load_scoreboard,
12
+ load_requests,
13
+ build_details_index,
14
+ load_benchmark_details,
15
+ )
16
  from backend.submission_handler import submit_model
17
  from backend.config import TASKS, API, hf_api_token
18
  from fastapi import FastAPI, Request, Form, BackgroundTasks, HTTPException
 
23
  # --- Global Cache Variables ---
24
  GLOBAL_LEADERBOARD_DATA = []
25
  GLOBAL_QUEUE_DATA = {}
26
+ GLOBAL_DETAILS_INDEX = {}
27
 
28
  ACCEPTED_PAGES = ["about.html", "header.html", "leaderboard.html", "submit.html"]
29
 
 
91
  except Exception as e:
92
  logging.error(f"❌ Error updating queue cache: {e}")
93
 
94
+
95
def update_details_cache():
    """Rebuild the details-parquet index and publish it to the module-level cache.

    If building the index raises, the error is logged and the previously
    cached ``GLOBAL_DETAILS_INDEX`` is left as it was.
    """
    global GLOBAL_DETAILS_INDEX
    try:
        rebuilt_index = build_details_index()
    except Exception as e:
        logging.error(f"❌ Error updating details cache: {e}")
    else:
        # Only swap the cache once a complete index is available.
        GLOBAL_DETAILS_INDEX = rebuilt_index
102
+
103
  # --- Lifespan & Scheduler ---
104
  @asynccontextmanager
105
  async def lifespan(app: FastAPI):
 
107
  download_dataset_snapshots()
108
  update_leaderboard_cache()
109
  update_queue_cache()
110
+ update_details_cache()
111
 
112
  # 2. Schedule periodic updates
113
  scheduler = BackgroundScheduler()
 
118
  # Cache updates (every 10 mins)
119
  scheduler.add_job(update_leaderboard_cache, "interval", minutes=10)
120
  scheduler.add_job(update_queue_cache, "interval", minutes=10)
121
+ scheduler.add_job(update_details_cache, "interval", minutes=10)
122
 
123
  scheduler.start()
124
 
 
161
  logging.error(f"Error fetching likes for {model_name}: {e}")
162
  return JSONResponse(content={"error": str(e)}, status_code=400)
163
 
164
+
165
@app.post("/api/benchmark-details")
async def get_benchmark_details(
    model_name: str = Form(...),
    benchmark: str = Form(...),
):
    """Return per-question details for one model/benchmark score.

    Args:
        model_name: Model identifier submitted as a form field.
        benchmark: Benchmark display name submitted as a form field.

    Returns:
        A ``JSONResponse`` with the payload produced by
        ``load_benchmark_details``; on any failure, a JSON body
        ``{"error": ...}`` with status code 400.
    """
    # Local import keeps this block self-contained.
    import asyncio

    try:
        # Index building walks the filesystem and detail loading reads parquet
        # files. Both are blocking, so they run in worker threads instead of
        # stalling the event loop for every other request.
        if not GLOBAL_DETAILS_INDEX:
            await asyncio.to_thread(update_details_cache)
        payload = await asyncio.to_thread(
            load_benchmark_details,
            model_name=model_name,
            benchmark_display=benchmark,
            details_index=GLOBAL_DETAILS_INDEX,
        )
        return JSONResponse(content=payload)
    except Exception as e:
        logging.error(f"Error fetching benchmark details for {model_name}/{benchmark}: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=400)
183
+
184
  @app.post("/api/submit")
185
  async def handle_submission(
186
  model_name: str = Form(...),
backend/config.py CHANGED
@@ -7,6 +7,7 @@ OWNER: str = "qimma"
7
  REPO_ID: str = f"{OWNER}/Qimma-Leaderboard"
8
  RESULTS_REPO_ID: str = f"{OWNER}/leaderboard-results"
9
  REQUESTS_REPO_ID: str = f"{OWNER}/leaderboard-requests"
 
10
 
11
  SLACK_WEBHOOK_URL = os.getenv("SLACK_WEBHOOK_URL", "")
12
 
 
7
  REPO_ID: str = f"{OWNER}/Qimma-Leaderboard"
8
  RESULTS_REPO_ID: str = f"{OWNER}/leaderboard-results"
9
  REQUESTS_REPO_ID: str = f"{OWNER}/leaderboard-requests"
10
+ DETAILS_REPO_ID: str = f"{OWNER}/leaderboard-details"
11
 
12
  SLACK_WEBHOOK_URL = os.getenv("SLACK_WEBHOOK_URL", "")
13
 
backend/data_loader.py CHANGED
@@ -5,16 +5,22 @@ import os
5
  import contextlib
6
  import io
7
  import logging
 
 
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
  from pathlib import Path
10
  from typing import Dict, List, Any, Optional
 
11
 
12
  import numpy as np
13
  import pandas as pd
 
14
  from huggingface_hub import snapshot_download
15
  from datetime import datetime
 
16
  from backend.config import (
17
  API,
 
18
  REQUESTS_REPO_ID,
19
  RESULTS_REPO_ID,
20
  TASKS,
@@ -37,15 +43,136 @@ _TASKS_BY_SOURCE = {
37
  }
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # -----------------------------------------------------------------------------
41
  # Utilities
42
  # -----------------------------------------------------------------------------
43
 
44
  def silent_snapshot_download(**kwargs):
45
- with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
46
  return snapshot_download(**kwargs)
47
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def download_datasets():
50
  """
51
  Download requests + results datasets (read-only, anonymous).
@@ -64,6 +191,10 @@ def download_datasets():
64
  )
65
  os.environ["EVAL_RESULTS_PATH"] = res_path
66
 
 
 
 
 
67
 
68
  # -----------------------------------------------------------------------------
69
  # Requests
@@ -154,6 +285,305 @@ def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
154
  return row
155
 
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  def _fetch_hf_metadata(model_name: str) -> Dict[str, Any]:
158
  try:
159
  info = API.model_info(repo_id=model_name, token=hf_api_token)
 
5
  import contextlib
6
  import io
7
  import logging
8
+ import re
9
+ import ast
10
  from concurrent.futures import ThreadPoolExecutor, as_completed
11
  from pathlib import Path
12
  from typing import Dict, List, Any, Optional
13
+ from urllib.parse import quote
14
 
15
  import numpy as np
16
  import pandas as pd
17
+ import requests
18
  from huggingface_hub import snapshot_download
19
  from datetime import datetime
20
+ from huggingface_hub.constants import HF_HUB_CACHE
21
  from backend.config import (
22
  API,
23
+ DETAILS_REPO_ID,
24
  REQUESTS_REPO_ID,
25
  RESULTS_REPO_ID,
26
  TASKS,
 
43
  }
44
 
45
 
46
def _extract_task_bases(task_key: Any) -> List[str]:
    """Return the base benchmark name(s) encoded in a task key.

    A string key is stripped, then cut at the first ``:`` and at the first
    ``|``; the stripped remainder is returned as a one-element list. A list or
    tuple of keys is flattened recursively, preserving order. Any other value,
    or a blank string, yields an empty list.

    Args:
        task_key: A task key string, or a (possibly nested) list/tuple of them.

    Returns:
        The extracted base names, in input order (duplicates are kept).
    """
    # Tuples are accepted as well as lists so a tuple-valued task key is not
    # silently ignored.
    if isinstance(task_key, (list, tuple)):
        return [base for item in task_key for base in _extract_task_bases(item)]

    if not isinstance(task_key, str):
        return []

    key = task_key.strip()
    if not key:
        return []

    return [key.split(":", 1)[0].split("|", 1)[0].strip()]
61
+
62
+
63
def _build_benchmark_display_to_bases() -> Dict[str, List[str]]:
    """Map each display name in TASKS to its ordered, de-duplicated task bases."""
    mapping: Dict[str, List[str]] = {}
    for task_key, _, display in TASKS:
        known_bases = mapping.setdefault(display, [])
        for candidate in _extract_task_bases(task_key):
            # Skip empty bases and ones already recorded for this display name.
            if candidate and candidate not in known_bases:
                known_bases.append(candidate)
    return mapping


BENCHMARK_DISPLAY_TO_BASES: Dict[str, List[str]] = _build_benchmark_display_to_bases()
69
+
70
+
71
  # -----------------------------------------------------------------------------
72
  # Utilities
73
  # -----------------------------------------------------------------------------
74
 
75
def silent_snapshot_download(**kwargs):
    """Forward all keyword arguments to ``snapshot_download`` and return its result.

    NOTE(review): the stdout/stderr redirection that used to wrap this call is
    disabled, so the wrapper no longer suppresses any output. Confirm whether
    that is intentional before restoring it or renaming the function.
    """
    result = snapshot_download(**kwargs)
    return result
78
 
79
 
80
def _resolve_details_base_path() -> Path:
    """Pick the local directory that holds the details dataset files.

    Looks under ``HF_HUB_CACHE`` for the repo's ``snapshots`` folder and
    returns the most recently modified snapshot directory. When there is no
    snapshot directory, a ``manual-snapshot`` folder is created (if needed)
    and returned instead.
    """
    cache_dir_name = f"datasets--{DETAILS_REPO_ID.replace('/', '--')}"
    repo_cache_root = Path(HF_HUB_CACHE) / cache_dir_name
    snapshots_root = repo_cache_root / "snapshots"

    snapshot_dirs = (
        [entry for entry in snapshots_root.iterdir() if entry.is_dir()]
        if snapshots_root.exists()
        else []
    )
    if snapshot_dirs:
        # Newest by modification time wins.
        return max(snapshot_dirs, key=lambda entry: entry.stat().st_mtime)

    fallback_root = repo_cache_root / "manual-snapshot"
    fallback_root.mkdir(parents=True, exist_ok=True)
    return fallback_root
91
+
92
+
93
def _download_details_file(relative_path: str, base_path: Path, retries: int = 3) -> bool:
    """Download one file of the details dataset into ``base_path``.

    The file is streamed to a sibling ``.part`` path and moved into place only
    after the transfer finishes, so a failed attempt never leaves a partial
    file at the final location.

    Args:
        relative_path: Repo-relative path of the file to fetch.
        base_path: Local directory the repo layout is mirrored under.
        retries: Maximum number of attempts.

    Returns:
        True once an attempt succeeds, False when every attempt failed.
    """
    encoded_rel_path = quote(relative_path, safe="/")
    url = f"https://huggingface.co/datasets/{DETAILS_REPO_ID}/resolve/main/{encoded_rel_path}"
    headers = {"Authorization": f"Bearer {hf_api_token}"} if hf_api_token else {}

    destination = base_path / relative_path
    destination.parent.mkdir(parents=True, exist_ok=True)
    staging = destination.with_suffix(destination.suffix + ".part")

    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            with requests.get(url, stream=True, timeout=(10, 90), headers=headers) as resp:
                resp.raise_for_status()
                with open(staging, "wb") as out:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            out.write(chunk)
            os.replace(staging, destination)
        except Exception as e:
            # Best-effort removal of the partial file before the next attempt.
            with contextlib.suppress(Exception):
                staging.unlink(missing_ok=True)
            logger.warning(
                "Retry %s/%s for details file '%s' failed: %s",
                attempt,
                retries,
                relative_path,
                e,
            )
        else:
            return True
    return False
125
+
126
+
127
def _sync_details_dataset(base_path: Path):
    """Download every remote details parquet file that is missing under ``base_path``.

    Lists the ``.parquet`` files of ``DETAILS_REPO_ID``, compares them with the
    parquet files already present below ``base_path`` (by repo-relative path),
    and downloads the missing ones one at a time. Listing and download
    failures are logged; nothing is raised to the caller.

    Args:
        base_path: Local directory mirroring the details repository layout.
    """
    try:
        remote_files = [
            name
            for name in API.list_repo_files(
                repo_id=DETAILS_REPO_ID,
                repo_type="dataset",
                # Use the same credential the per-file download sends. A falsy
                # token becomes None so no empty token is passed.
                token=hf_api_token or None,
            )
            if name.endswith(".parquet")
        ]
    except Exception as e:
        logger.warning("Could not list files for details repo '%s': %s", DETAILS_REPO_ID, e)
        return

    local_files = {
        str(p.relative_to(base_path)).replace(os.sep, "/")
        for p in base_path.rglob("*.parquet")
    }
    missing_files = [f for f in remote_files if f not in local_files]

    total_count = len(remote_files)
    # Count only remote files that exist locally; stray local parquet files
    # must not inflate the "ready" figure.
    ready_count = total_count - len(missing_files)
    if not missing_files:
        logger.info("Details files ready: %s/%s", ready_count, total_count)
        return

    logger.info(
        "Details files ready: %s/%s. Downloading %s missing files...",
        ready_count,
        total_count,
        len(missing_files),
    )

    failed_files: List[str] = []
    total_missing = len(missing_files)
    for idx, rel_path in enumerate(missing_files, start=1):
        logger.info("Downloading missing details file %s/%s: %s", idx, total_missing, rel_path)
        if not _download_details_file(rel_path, base_path):
            failed_files.append(rel_path)

    if failed_files:
        logger.warning(
            "Details sync incomplete. Downloaded %s/%s missing files. Still missing %s files.",
            total_missing - len(failed_files),
            total_missing,
            len(failed_files),
        )
        for rel_path in failed_files:
            logger.warning("Still missing: %s", rel_path)
    else:
        logger.info("Details sync complete: downloaded %s/%s missing files.", total_missing, total_missing)
174
+
175
+
176
  def download_datasets():
177
  """
178
  Download requests + results datasets (read-only, anonymous).
 
191
  )
192
  os.environ["EVAL_RESULTS_PATH"] = res_path
193
 
194
+ details_base = _resolve_details_base_path()
195
+ _sync_details_dataset(details_base)
196
+ os.environ["EVAL_DETAILS_PATH"] = str(details_base)
197
+
198
 
199
  # -----------------------------------------------------------------------------
200
  # Requests
 
285
  return row
286
 
287
 
288
def _parse_details_filename(path: Path) -> Optional[Dict[str, Any]]:
    """Parse a ``details_<task>_<timestamp>.parquet`` file name.

    The stem is split at its last underscore into a ``details_<task>`` part
    and a timestamp. The benchmark base is the task text before the first
    ``:`` and ``|``. The subtask is the text after the first ``:`` (or the
    base when there is no ``:``) with a trailing ``|<digits>`` removed,
    falling back to ``"overall"`` when nothing is left.

    Args:
        path: Path whose file name should be parsed.

    Returns:
        A dict with ``benchmark_base``, ``subtask``, ``datetime`` and
        ``task_full``, or None when the name does not match the expected
        pattern.
    """
    details_part, separator, dt_str = path.stem.rpartition("_")
    if not separator or not details_part.startswith("details_"):
        return None

    # Accept timestamps with and without fractional seconds. A timestamp
    # rendered via datetime.isoformat() omits the fraction when microseconds
    # are zero, and such a file would otherwise be dropped.
    parsed_dt = None
    for fmt in ("%Y-%m-%dT%H-%M-%S.%f", "%Y-%m-%dT%H-%M-%S"):
        try:
            parsed_dt = datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
        break
    if parsed_dt is None:
        return None

    task_full = details_part[len("details_"):].strip()
    if not task_full:
        return None

    base_part, colon, subtask_part = task_full.partition(":")
    benchmark_base = base_part.split("|", 1)[0].strip()
    subtask = subtask_part.strip() if colon else benchmark_base

    # Drop a trailing "|<digits>" suffix from the subtask label.
    subtask = re.sub(r"\|\d+$", "", subtask).strip() or "overall"

    return {
        "benchmark_base": benchmark_base,
        "subtask": subtask,
        "datetime": parsed_dt,
        "task_full": task_full,
    }
320
+
321
+
322
def build_details_index() -> Dict[str, Dict[str, Dict[str, Dict[str, Any]]]]:
    """
    Index the newest details parquet file per model / benchmark base / subtask.

    Walks every ``*.parquet`` file below the directory named by the
    ``EVAL_DETAILS_PATH`` environment variable. The model name is the file's
    parent path relative to that directory; benchmark base, subtask and
    timestamp come from the file name. When several files share the same
    model/benchmark/subtask, the one with the latest timestamp is kept.

    Returns an empty dict when the variable is unset or the directory does
    not exist.
    """
    details_base = os.getenv("EVAL_DETAILS_PATH")
    base_path = Path(details_base) if details_base else None
    if base_path is None or not base_path.exists():
        return {}

    index: Dict[str, Dict[str, Dict[str, Dict[str, Any]]]] = {}

    for parquet_path in base_path.rglob("*.parquet"):
        meta = _parse_details_filename(parquet_path)
        if not meta:
            continue

        try:
            rel_parts = parquet_path.relative_to(base_path).parts
        except Exception:
            continue
        # Files directly under the base directory have no model folder.
        if len(rel_parts) < 2:
            continue

        model_name = "/".join(rel_parts[:-1]).strip("/")
        if not model_name:
            continue

        slot = index.setdefault(model_name, {}).setdefault(meta["benchmark_base"], {})
        previous = slot.get(meta["subtask"])
        if previous is not None and meta["datetime"] <= previous["datetime"]:
            # An equally new or newer file is already recorded.
            continue

        slot[meta["subtask"]] = {
            "path": str(parquet_path),
            "datetime": meta["datetime"],
            "task_full": meta["task_full"],
        }

    return index
367
+
368
+
369
def _as_list(value: Any) -> List[Any]:
    """Coerce ``value`` to a list.

    Lists are returned unchanged (same object), ``None`` becomes ``[]``,
    tuples are copied into a list, NumPy arrays go through ``tolist()``, and
    anything else is wrapped in a one-element list.
    """
    if isinstance(value, list):
        return value
    if value is None:
        return []
    if isinstance(value, np.ndarray):
        return value.tolist()
    return list(value) if isinstance(value, tuple) else [value]
379
+
380
+
381
def _as_dict(value: Any) -> Dict[str, Any]:
    """Coerce ``value`` to a dict, returning ``{}`` when that is not possible.

    Handled inputs:
      * dicts are returned unchanged;
      * bytes/bytearray are decoded as UTF-8 (undecodable bytes ignored) and
        then treated like strings;
      * strings are parsed as JSON first and, if that fails, as a Python
        literal; a parsed value that is not a dict yields ``{}``;
      * lists whose items are all 2-element lists/tuples are read as
        key/value pairs, with keys converted to ``str``.
    """
    if isinstance(value, dict):
        return value

    if isinstance(value, (bytes, bytearray)):
        try:
            value = value.decode("utf-8", errors="ignore")
        except Exception:
            return {}

    if isinstance(value, str):
        text = value.strip()
        if not text:
            return {}
        # The first parser that accepts the text decides the result.
        for parser in (json.loads, ast.literal_eval):
            try:
                candidate = parser(text)
            except Exception:
                continue
            return candidate if isinstance(candidate, dict) else {}
        return {}

    if isinstance(value, list):
        # Some parquet backends can expose map-like structs as list of pairs.
        try:
            pairs_only = all(
                isinstance(item, (list, tuple)) and len(item) == 2 for item in value
            )
            if pairs_only:
                return {str(k): v for k, v in value}
        except Exception:
            return {}

    return {}
414
+
415
+
416
def _py_scalar(value: Any) -> Any:
    """Convert a NumPy scalar to its Python equivalent; pass anything else through."""
    return value.item() if isinstance(value, np.generic) else value
420
+
421
+
422
def _extract_predicted_answer(model_response: Dict[str, Any], choices: List[Any]) -> Any:
    """Derive the model's answer from a response record.

    When both ``logprobs`` and ``choices`` are present, the choice at the
    position of the largest logprob is returned, provided that position is a
    valid index into ``choices``. Otherwise the first entry of
    ``text_post_processed`` is used, then the first entry of ``text``.
    Returns None when none of these produce a value.
    """
    logprobs = model_response.get("logprobs")
    if logprobs is not None and choices:
        scores = _as_list(logprobs)
        try:
            best = int(np.argmax(np.asarray(scores, dtype=float)))
            if 0 <= best < len(choices):
                return choices[best]
        except Exception:
            # Unusable logprobs: fall back to the text fields below.
            pass

    for field in ("text_post_processed", "text"):
        candidates = _as_list(model_response.get(field))
        if candidates:
            return candidates[0]

    return None
442
+
443
+
444
def _first_non_empty(values: Any) -> Optional[str]:
    """Return the first entry whose stripped string form is non-empty.

    ``values`` is coerced to a list first; ``None`` entries are skipped.
    Returns None when no entry qualifies.
    """
    stripped = (str(item).strip() for item in _as_list(values) if item is not None)
    return next((text for text in stripped if text), None)
452
+
453
+
454
def _read_detail_parquet(path: str, subtask: str) -> List[Dict[str, Any]]:
    """Read one details parquet file and flatten it into display rows.

    Each record's ``doc``, ``metric`` and ``model_response`` columns are
    coerced to dicts, and one output row is built per record.

    Args:
        path: Location of the parquet file.
        subtask: Label copied into every returned row.

    Returns:
        One dict per record with the keys ``subtask``, ``question_id``,
        ``task_name``, ``prompt``, ``input_prompt``, ``output``, ``choices``,
        ``gold_answer``, ``predicted_answer``, ``is_correct``,
        ``metric_name`` and ``metric``. An empty list is returned (and a
        warning logged) when the file cannot be read.
    """
    try:
        df = pd.read_parquet(path)
    except Exception as e:
        logger.warning("Could not read details parquet '%s': %s", path, e)
        return []

    rows: List[Dict[str, Any]] = []
    for record in df.to_dict(orient="records"):
        doc = _as_dict(record.get("doc"))
        metric = _as_dict(record.get("metric"))
        model_response = _as_dict(record.get("model_response"))

        choices = [_py_scalar(c) for c in _as_list(doc.get("choices"))]

        gold_idx = doc.get("gold_index")
        gold_answer = None
        if isinstance(gold_idx, (int, np.integer)) and 0 <= int(gold_idx) < len(choices):
            gold_answer = choices[int(gold_idx)]

        # Only the first metric entry is surfaced.
        metric_name = None
        metric_value = None
        if metric:
            metric_name, raw_metric = next(iter(metric.items()))
            try:
                metric_value = float(raw_metric)
            except (TypeError, ValueError, OverflowError):
                metric_value = None
            else:
                # NaN/inf cannot be emitted by a strict JSON encoder; letting
                # one through would fail the whole response, so treat it as
                # "no score".
                if not np.isfinite(metric_value):
                    metric_value = None

        predicted_answer = _extract_predicted_answer(model_response, choices)
        output_text = _first_non_empty(model_response.get("text_post_processed"))
        if output_text is None:
            output_text = _first_non_empty(model_response.get("text"))
        if output_text is None and predicted_answer is not None:
            output_text = str(predicted_answer)

        # A metric of exactly 0 or 1 is read as wrong/correct; any other value
        # leaves correctness unknown.
        is_correct = None
        if metric_value is not None and metric_value in (0.0, 1.0):
            is_correct = bool(metric_value)

        prompt = (
            doc.get("query")
            or doc.get("original_query")
            or doc.get("instruction")
            or model_response.get("input")
            or ""
        )

        rows.append({
            "subtask": subtask,
            "question_id": _py_scalar(doc.get("id")),
            "task_name": _py_scalar(doc.get("task_name")),
            "prompt": prompt,
            "input_prompt": model_response.get("input"),
            "output": output_text,
            "choices": [str(c) for c in choices],
            "gold_answer": _py_scalar(gold_answer),
            "predicted_answer": _py_scalar(predicted_answer),
            "is_correct": is_correct,
            "metric_name": metric_name,
            "metric": metric_value,
        })

    return rows
519
+
520
+
521
def load_benchmark_details(
    model_name: str,
    benchmark_display: str,
    details_index: Dict[str, Dict[str, Dict[str, Dict[str, Any]]]],
    max_rows: int = 250,
) -> Dict[str, Any]:
    """
    Load per-question benchmark details for a model from indexed parquet files.

    The model and each benchmark base are looked up exactly first, then
    case-insensitively (ignoring surrounding whitespace). Benchmark bases come
    from ``BENCHMARK_DISPLAY_TO_BASES``; when the display name is unknown
    there, it is used as the base itself.

    Args:
        model_name: Model identifier to look up in ``details_index``.
        benchmark_display: Benchmark display name.
        details_index: Index of model -> benchmark base -> subtask -> file info.
        max_rows: Maximum number of rows returned (expected to be
            non-negative). Per-subtask summaries always cover every row read.

    Returns:
        A dict with ``benchmark``, ``subtasks`` (one summary per subtask,
        sorted case-insensitively by name) and ``rows`` (at most ``max_rows``
        rows, taken in subtask order).
    """
    model_bucket = details_index.get(model_name, {})
    if not model_bucket:
        target_model = model_name.strip().lower()
        for indexed_model, bucket in details_index.items():
            if indexed_model.strip().lower() == target_model:
                model_bucket = bucket
                break

    benchmark_bases = BENCHMARK_DISPLAY_TO_BASES.get(benchmark_display, [])
    if not benchmark_bases:
        benchmark_bases = [benchmark_display]

    selected_entries: List[tuple[str, Dict[str, Any]]] = []
    for base in benchmark_bases:
        subtasks = model_bucket.get(base, {})
        if not subtasks:
            base_l = base.strip().lower()
            for indexed_base, bucket in model_bucket.items():
                if indexed_base.strip().lower() == base_l:
                    subtasks = bucket
                    break
        selected_entries.extend(subtasks.items())

    if not selected_entries:
        return {"benchmark": benchmark_display, "subtasks": [], "rows": []}

    selected_entries.sort(key=lambda entry: entry[0].lower())

    all_rows: List[Dict[str, Any]] = []
    subtasks_summary: List[Dict[str, Any]] = []
    for subtask, info in selected_entries:
        rows = _read_detail_parquet(info["path"], subtask)

        # Keep only as many rows as still fit under max_rows, rather than
        # holding every row of every subtask and truncating at the end.
        remaining = max_rows - len(all_rows)
        if remaining > 0:
            all_rows.extend(rows[:remaining])

        # Accuracy is computed over rows with a definite correct/wrong verdict.
        valid = [r for r in rows if isinstance(r.get("is_correct"), bool)]
        correct = sum(1 for r in valid if r["is_correct"])
        total = len(valid)
        accuracy = round((correct / total) * 100, 2) if total > 0 else None
        subtasks_summary.append({
            "subtask": subtask,
            "total": len(rows),
            "scored": total,
            "correct": correct,
            "accuracy": accuracy,
        })

    return {
        "benchmark": benchmark_display,
        "subtasks": subtasks_summary,
        "rows": all_rows,
    }
585
+
586
+
587
  def _fetch_hf_metadata(model_name: str) -> Dict[str, Any]:
588
  try:
589
  info = API.model_info(repo_id=model_name, token=hf_api_token)
frontend/leaderboard.html CHANGED
@@ -546,6 +546,48 @@
546
  </div>
547
  </div>
548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
  <script>
550
  (function () {
551
  const $ = s => document.querySelector(s);
@@ -564,6 +606,12 @@
564
  const n = toNumber(v);
565
  return n === null ? "Unknown" : String(Math.floor(n));
566
  };
 
 
 
 
 
 
567
 
568
  let lbData = [], grid, maxMeta = 100, minMeta = 0, tableColumns = [];
569
  let currentSort = { colId: null, dir: 'none' };
@@ -683,6 +731,81 @@
683
  applyFilters();
684
  }
685
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
  // --- MODAL LOGIC ---
687
  window.openModelDetails = function (modelName) {
688
  const model = lbData.find(r => r["Model Name"] === modelName);
@@ -910,6 +1033,7 @@
910
  function prepareColumns(data) {
911
  const keys = Object.keys(data[0] || {});
912
  const typeIdx = keys.findIndex(k => ["T", "Type", "Full Type"].includes(k));
 
913
  const vis = tableColumns.reduce((acc, c) => ({ ...acc, [c.id]: c.hidden }), {});
914
 
915
  tableColumns = keys.map(key => {
@@ -960,15 +1084,23 @@
960
  } else if (isScore || isAvg) {
961
 
962
  // -- NEW LOGIC FOR SCORE DISPLAY STATUS --
963
- const renderBar = (c) => {
964
  const n = parseFloat(c); if (isNaN(n)) return c;
965
  const h = (Math.max(0, Math.min(100, n)) / 100) * 120;
966
- return gridjs.html(`<div class="flex justify-center"><div style="background: linear-gradient(to right, hsla(${h},85%,50%,0.3) ${n}%, hsla(${h},85%,50%,0.05) ${n}%); border: 1px solid hsla(${h},85%,40%,0.3);" class="w-24 py-1 rounded-md text-center text-xs font-bold text-slate-700 dark:text-slate-200 shadow-sm">${n.toFixed(2)}<span class="text-[10px] font-normal opacity-70 ml-0.5">%</span></div></div>`);
 
 
 
 
967
  };
968
 
969
- const renderRaw = (c) => {
970
  const n = parseFloat(c); if (isNaN(n)) return c;
971
- return gridjs.html(`<div class="flex justify-center text-xs font-bold text-slate-700 dark:text-slate-300 py-1">${n.toFixed(2)}</div>`);
 
 
 
 
972
  };
973
 
974
  let shouldUseBar = false;
@@ -981,7 +1113,10 @@
981
  shouldUseBar = false;
982
  }
983
 
984
- def.formatter = shouldUseBar ? renderBar : renderRaw;
 
 
 
985
 
986
  } else if (key === "Rank") {
987
  def.width = '110px';
 
546
  </div>
547
  </div>
548
 
549
+ <div id="benchmarkModal" class="hidden fixed inset-0 z-[110]" aria-labelledby="benchmarkModalTitle" role="dialog"
550
+ aria-modal="true">
551
+ <div class="fixed inset-0 bg-slate-900/60 backdrop-blur-sm transition-opacity modal-backdrop"
552
+ onclick="window.closeBenchmarkDetails()"></div>
553
+
554
+ <div
555
+ class="fixed top-1/2 left-1/2 -translate-x-1/2 -translate-y-1/2 z-[111] w-[95%] md:w-[75%] max-h-[90vh] overflow-y-auto bg-white dark:bg-slate-900 rounded-2xl shadow-2xl border border-slate-200 dark:border-slate-700 modal-content">
556
+ <div
557
+ class="sticky top-0 z-10 flex items-start justify-between px-6 py-5 bg-white/80 dark:bg-slate-900/80 backdrop-blur-md border-b border-slate-100 dark:border-slate-800">
558
+ <div>
559
+ <h3 id="benchmarkModalTitle"
560
+ class="text-xl md:text-2xl font-bold text-slate-900 dark:text-white leading-tight break-words pr-4">
561
+ </h3>
562
+ </div>
563
+ <button type="button" onclick="window.closeBenchmarkDetails()"
564
+ class="text-slate-400 hover:text-slate-600 dark:hover:text-slate-300 transition-colors p-1 rounded-full hover:bg-slate-100 dark:hover:bg-slate-800">
565
+ <i data-lucide="x" class="w-6 h-6"></i>
566
+ </button>
567
+ </div>
568
+
569
+ <div class="p-6 space-y-6">
570
+ <div id="benchmarkSummary" class="grid grid-cols-1 md:grid-cols-3 gap-3"></div>
571
+ <div class="border border-slate-200 dark:border-slate-700 rounded-xl overflow-hidden">
572
+ <div
573
+ class="grid grid-cols-12 text-xs font-bold uppercase tracking-wide bg-slate-50 dark:bg-slate-800 text-slate-500 dark:text-slate-400 px-4 py-3">
574
+ <div class="col-span-2">Subtask</div>
575
+ <div class="col-span-6">Prompt / Output</div>
576
+ <div class="col-span-2">Gold</div>
577
+ <div class="col-span-2">Predicted</div>
578
+ </div>
579
+ <div id="benchmarkRows" class="divide-y divide-slate-100 dark:divide-slate-800"></div>
580
+ </div>
581
+ </div>
582
+
583
+ <div
584
+ class="bg-slate-50 dark:bg-slate-800/50 px-6 py-4 border-t border-slate-100 dark:border-slate-800 text-center">
585
+ <button onclick="window.closeBenchmarkDetails()"
586
+ class="text-sm text-slate-500 hover:text-slate-800 dark:hover:text-slate-200">Close Details</button>
587
+ </div>
588
+ </div>
589
+ </div>
590
+
591
  <script>
592
  (function () {
593
  const $ = s => document.querySelector(s);
 
606
  const n = toNumber(v);
607
  return n === null ? "Unknown" : String(Math.floor(n));
608
  };
609
const escapeHtml = (value) => {
    // Replace each HTML-significant character with its entity in one pass;
    // null/undefined are rendered as an empty string.
    const entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&#039;",
    };
    return String(value ?? "").replace(/[&<>"']/g, (ch) => entities[ch]);
};
615
 
616
  let lbData = [], grid, maxMeta = 100, minMeta = 0, tableColumns = [];
617
  let currentSort = { colId: null, dir: 'none' };
 
731
  applyFilters();
732
  }
733
 
734
+ // --- BENCHMARK DETAILS MODAL ---
735
window.openBenchmarkDetails = async function (modelName, benchmark) {
    // Tag this call so that a slower, earlier request cannot overwrite the
    // modal after details for a different cell have been requested.
    const requestToken = (window.openBenchmarkDetails._latestRequest || 0) + 1;
    window.openBenchmarkDetails._latestRequest = requestToken;
    const isStale = () => window.openBenchmarkDetails._latestRequest !== requestToken;

    // Unlike a truthiness test, this keeps legitimate values such as 0.
    const hasValue = (v) => v !== null && v !== undefined && v !== "";

    $('#benchmarkModalTitle').innerText = `${benchmark} Details — ${modelName}`;
    $('#benchmarkSummary').innerHTML = "";
    $('#benchmarkRows').innerHTML = `<div class="p-6 text-sm text-slate-500 dark:text-slate-400">Loading details...</div>`;

    $('#benchmarkModal').classList.remove('hidden');
    document.body.style.overflow = 'hidden';
    if (window.lucide) lucide.createIcons();

    const formData = new FormData();
    formData.append("model_name", modelName);
    formData.append("benchmark", benchmark);

    try {
        const res = await fetch("/api/benchmark-details", { method: "POST", body: formData });

        // An error response may not carry JSON; do not let a parse failure
        // replace the real error with a confusing syntax message.
        let payload = null;
        try {
            payload = await res.json();
        } catch (parseErr) {
            payload = null;
        }
        if (isStale()) return;
        if (!res.ok || payload === null) {
            throw new Error((payload && payload.error) || "Failed to load details");
        }

        const subtasks = payload.subtasks || [];
        const rows = payload.rows || [];

        if (!subtasks.length && !rows.length) {
            $('#benchmarkSummary').innerHTML = `<div class="col-span-full p-4 rounded-lg bg-slate-50 dark:bg-slate-800 text-sm text-slate-500 dark:text-slate-400">No details found for this benchmark/model.</div>`;
            $('#benchmarkRows').innerHTML = "";
            return;
        }

        // Every server-provided value is escaped before being placed in markup.
        $('#benchmarkSummary').innerHTML = subtasks.map(s => `
            <div class="p-3 rounded-lg border border-slate-200 dark:border-slate-700 bg-slate-50 dark:bg-slate-800/70">
                <div class="text-xs text-slate-500 dark:text-slate-400">${escapeHtml(s.subtask)}</div>
                <div class="text-sm font-bold text-slate-800 dark:text-slate-100 mt-1">${hasValue(s.accuracy) ? `${escapeHtml(s.accuracy)}%` : "Unknown"}</div>
                <div class="text-xs text-slate-500 dark:text-slate-400 mt-0.5">${escapeHtml(s.correct)}/${escapeHtml(s.scored)} correct</div>
            </div>
        `).join("");

        $('#benchmarkRows').innerHTML = rows.map(r => {
            const correctBadge = r.is_correct === true
                ? `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`
                : (r.is_correct === false
                    ? `<span class="text-rose-600 dark:text-rose-400 font-semibold">Wrong</span>`
                    : `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`);
            const prompt = escapeHtml(asUnknown(r.prompt));
            const output = escapeHtml(asUnknown(r.output));
            const sampleMeta = [
                hasValue(r.question_id) ? `id: ${escapeHtml(r.question_id)}` : null,
                r.metric_name ? `metric: ${escapeHtml(r.metric_name)}` : null,
                r.metric !== null && r.metric !== undefined ? `score: ${escapeHtml(r.metric)}` : null,
            ].filter(Boolean).join(" | ");

            return `
                <div class="grid grid-cols-12 gap-3 px-4 py-3 text-xs">
                    <div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(r.subtask)}</div>
                    <div class="col-span-6 text-slate-700 dark:text-slate-200">
                        <div class="font-semibold text-slate-800 dark:text-slate-100 whitespace-pre-wrap">${prompt}</div>
                        <div class="mt-2 text-slate-500 dark:text-slate-400 whitespace-pre-wrap"><span class="font-semibold">Output:</span> ${output}</div>
                        ${sampleMeta ? `<div class="mt-1 text-slate-400 dark:text-slate-500">${sampleMeta}</div>` : ``}
                        <div class="mt-1">${correctBadge}</div>
                    </div>
                    <div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(asUnknown(r.gold_answer))}</div>
                    <div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(asUnknown(r.predicted_answer))}</div>
                </div>
            `;
        }).join("");
    } catch (err) {
        // A newer request owns the modal now; leave its content alone.
        if (isStale()) return;
        $('#benchmarkSummary').innerHTML = "";
        $('#benchmarkRows').innerHTML = `<div class="p-6 text-sm text-rose-600 dark:text-rose-400">${escapeHtml(err.message || "Failed to load details")}</div>`;
    }
};
803
+
804
window.closeBenchmarkDetails = function () {
    // Hide the benchmark dialog and clear the inline overflow style on the body.
    const modal = $('#benchmarkModal');
    modal.classList.add('hidden');
    document.body.style.overflow = '';
};
808
+
809
  // --- MODAL LOGIC ---
810
  window.openModelDetails = function (modelName) {
811
  const model = lbData.find(r => r["Model Name"] === modelName);
 
1033
  function prepareColumns(data) {
1034
  const keys = Object.keys(data[0] || {});
1035
  const typeIdx = keys.findIndex(k => ["T", "Type", "Full Type"].includes(k));
1036
+ const modelNameIdx = keys.findIndex(k => k === "Model Name");
1037
  const vis = tableColumns.reduce((acc, c) => ({ ...acc, [c.id]: c.hidden }), {});
1038
 
1039
  tableColumns = keys.map(key => {
 
1084
  } else if (isScore || isAvg) {
1085
 
1086
  // -- NEW LOGIC FOR SCORE DISPLAY STATUS --
1087
// Render a score as a coloured percentage bar. For benchmark score cells with
// a known model name, the bar is wrapped in a button that opens the
// per-question details modal.
const renderBar = (c, modelName, benchmark) => {
    const n = parseFloat(c); if (isNaN(n)) return c;
    const h = (Math.max(0, Math.min(100, n)) / 100) * 120;
    const bar = `<div class="flex justify-center"><div style="background: linear-gradient(to right, hsla(${h},85%,50%,0.3) ${n}%, hsla(${h},85%,50%,0.05) ${n}%); border: 1px solid hsla(${h},85%,40%,0.3);" class="w-24 py-1 rounded-md text-center text-xs font-bold text-slate-700 dark:text-slate-200 shadow-sm">${n.toFixed(2)}<span class="text-[10px] font-normal opacity-70 ml-0.5">%</span></div></div>`;
    if (!isScore || !modelName) return gridjs.html(bar);
    // encodeURIComponent leaves "'" untouched, which would end the
    // single-quoted string inside the inline handler; percent-encode it too.
    // decodeURIComponent restores it at click time.
    const encodeForInlineJs = (s) => encodeURIComponent(s).replace(/'/g, "%27");
    const em = encodeForInlineJs(modelName);
    const eb = encodeForInlineJs(benchmark);
    return gridjs.html(`<button onclick="window.openBenchmarkDetails(decodeURIComponent('${em}'), decodeURIComponent('${eb}'))" class="w-full text-left hover:opacity-90 transition-opacity" title="Click for per-question details">${bar}</button>`);
};
1096
 
1097
// Render a score as a plain two-decimal number. For benchmark score cells
// with a known model name, the number is wrapped in a button that opens the
// per-question details modal.
const renderRaw = (c, modelName, benchmark) => {
    const n = parseFloat(c); if (isNaN(n)) return c;
    const raw = `<div class="flex justify-center text-xs font-bold text-slate-700 dark:text-slate-300 py-1">${n.toFixed(2)}</div>`;
    if (!isScore || !modelName) return gridjs.html(raw);
    // encodeURIComponent leaves "'" untouched, which would end the
    // single-quoted string inside the inline handler; percent-encode it too.
    // decodeURIComponent restores it at click time.
    const encodeForInlineJs = (s) => encodeURIComponent(s).replace(/'/g, "%27");
    const em = encodeForInlineJs(modelName);
    const eb = encodeForInlineJs(benchmark);
    return gridjs.html(`<button onclick="window.openBenchmarkDetails(decodeURIComponent('${em}'), decodeURIComponent('${eb}'))" class="w-full text-left hover:opacity-90 transition-opacity" title="Click for per-question details">${raw}</button>`);
};
1105
 
1106
  let shouldUseBar = false;
 
1113
  shouldUseBar = false;
1114
  }
1115
 
1116
+ def.formatter = (c, r) => {
1117
+ const modelName = (modelNameIdx > -1 && r.cells[modelNameIdx]) ? r.cells[modelNameIdx].data : "";
1118
+ return shouldUseBar ? renderBar(c, modelName, key) : renderRaw(c, modelName, key);
1119
+ };
1120
 
1121
  } else if (key === "Rank") {
1122
  def.width = '110px';
requirements.txt CHANGED
@@ -10,3 +10,5 @@ transformers==5.1.0
10
  Jinja2==3.1.6
11
  python-multipart==0.0.22
12
  tiktoken
 
 
 
10
  Jinja2==3.1.6
11
  python-multipart==0.0.22
12
  tiktoken
13
+ # fastparquet
14
+ # pyarrow