LeenAlQadi committed on
Commit
f20c7d0
·
1 Parent(s): aa18243

Added Task Type and Domain filters and fixed the adaptive filtered average

Browse files
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  **/__pycache__/**
2
  .vscode/
3
  scripts/*
4
- samples/*
 
 
1
  **/__pycache__/**
2
  .vscode/
3
  scripts/*
4
+ samples/*
5
+ *.log
app.py CHANGED
@@ -16,7 +16,7 @@ from backend.data_loader import (
16
  load_benchmark_details,
17
  )
18
  from backend.submission_handler import submit_model
19
- from backend.config import TASKS, API, hf_api_token, BENCHMARK_DETAILS_PAGE_SIZE
20
  from fastapi import FastAPI, Request, Form, BackgroundTasks, HTTPException
21
 
22
  # Logging setup
@@ -48,7 +48,7 @@ def update_leaderboard_cache():
48
  df = df.drop(columns=["Model Size Filter"], errors="ignore")
49
 
50
  # Keep scores numeric, but show Unknown for missing metadata fields.
51
- score_cols = [t[2] for t in TASKS] + ["Average", "Rank"]
52
  for col in score_cols:
53
  if col in df.columns:
54
  df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
@@ -164,7 +164,11 @@ templates = Jinja2Templates(directory="frontend")
164
  @app.get("/", response_class=HTMLResponse)
165
  async def read_root(request: Request):
166
  eval_columns = [t[2] for t in TASKS]
167
- return templates.TemplateResponse("index.html", {"request": request, "eval_columns": eval_columns})
 
 
 
 
168
 
169
  @app.get("/api/leaderboard")
170
  async def get_leaderboard_data():
@@ -268,4 +272,4 @@ async def read_page(request: Request, page_name: str):
268
  return templates.TemplateResponse(page_name, {"request": request})
269
 
270
  if __name__ == "__main__":
271
- uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True, access_log=False)
 
16
  load_benchmark_details,
17
  )
18
  from backend.submission_handler import submit_model
19
+ from backend.config import TASKS, HIDDEN_TASKS, BENCHMARK_METADATA, API, hf_api_token, BENCHMARK_DETAILS_PAGE_SIZE
20
  from fastapi import FastAPI, Request, Form, BackgroundTasks, HTTPException
21
 
22
  # Logging setup
 
48
  df = df.drop(columns=["Model Size Filter"], errors="ignore")
49
 
50
  # Keep scores numeric, but show Unknown for missing metadata fields.
51
+ score_cols = [t[2] for t in TASKS] + [t[2] for t in HIDDEN_TASKS] + ["Average", "Rank"]
52
  for col in score_cols:
53
  if col in df.columns:
54
  df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
 
164
  @app.get("/", response_class=HTMLResponse)
165
  async def read_root(request: Request):
166
  eval_columns = [t[2] for t in TASKS]
167
+ return templates.TemplateResponse("index.html", {
168
+ "request": request,
169
+ "eval_columns": eval_columns,
170
+ "benchmark_metadata": BENCHMARK_METADATA,
171
+ })
172
 
173
  @app.get("/api/leaderboard")
174
  async def get_leaderboard_data():
 
272
  return templates.TemplateResponse(page_name, {"request": request})
273
 
274
  if __name__ == "__main__":
275
+ uvicorn.run("app:app", host="0.0.0.0", port=7866, reload=True, access_log=False)
backend/config.py CHANGED
@@ -62,6 +62,40 @@ TASK_SOURCES: Dict[str, Dict[str, Any]] = {
62
 
63
  TASKS: List[Tuple[Any, Any, str]] = RESULTS_TASKS + CODE_TASKS + FANNFLOP_TASKS
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  # --- 3. Model Metadata ---
66
 
67
  MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
 
62
 
63
  TASKS: List[Tuple[Any, Any, str]] = RESULTS_TASKS + CODE_TASKS + FANNFLOP_TASKS
64
 
65
+ # Hidden tasks: extracted from result files but excluded from EVAL_COLUMNS and
66
+ # the Average calculation. Used to expose MedArabiQ subset-level scores when
67
+ # the Task Type filter is set to MCQ-only or QA-only on the frontend.
68
+ HIDDEN_TASKS: List[Tuple[Any, Any, str]] = [
69
+ (
70
+ "qimma-MedArabiQ:_average|0",
71
+ "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",
72
+ "MedArabiQ_MCQ",
73
+ ),
74
+ (
75
+ "qimma-MedArabicQ-QA:_average|0",
76
+ "BERTScore-F",
77
+ "MedArabiQ_QA",
78
+ ),
79
+ ]
80
+
81
+ # Per-benchmark metadata consumed by the frontend Task Type and Domain filters.
82
+ BENCHMARK_METADATA: Dict[str, Dict[str, str]] = {
83
+ "AraDiCE-Culture": {"domain": "Cultural", "task_type": "MCQ"},
84
+ "ArabicMMLU": {"domain": "STEM", "task_type": "MCQ"},
85
+ "ArabCulture": {"domain": "Cultural", "task_type": "MCQ"},
86
+ "PALMX": {"domain": "Cultural", "task_type": "MCQ"},
87
+ "3LM STEM": {"domain": "STEM", "task_type": "MCQ"},
88
+ "AraTrust": {"domain": "Safety", "task_type": "MCQ"},
89
+ "MizanQA": {"domain": "Legal", "task_type": "MCQ"},
90
+ "MedArabiQ": {"domain": "Medical", "task_type": "mixed"},
91
+ "ArabLegalQA": {"domain": "Legal", "task_type": "QA"},
92
+ "GAT": {"domain": "STEM", "task_type": "MCQ"},
93
+ "MedAraBench": {"domain": "Medical", "task_type": "MCQ"},
94
+ "HumanEval+": {"domain": "Coding", "task_type": "Code"},
95
+ "MBPP+": {"domain": "Coding", "task_type": "Code"},
96
+ "FannOrFlop": {"domain": "Poetry", "task_type": "MCQ"},
97
+ }
98
+
99
  # --- 3. Model Metadata ---
100
 
101
  MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
backend/data_loader.py CHANGED
@@ -26,6 +26,7 @@ from backend.config import (
26
  RESULTS_REPO_ID,
27
  TASKS,
28
  TASK_SOURCES,
 
29
  MODEL_TYPE_TO_EMOJI,
30
  hf_api_token,
31
  )
@@ -42,6 +43,9 @@ _TASKS_BY_SOURCE = {
42
  source: cfg.get("tasks", [])
43
  for source, cfg in TASK_SOURCES.items()
44
  }
 
 
 
45
  _RESULT_SCORE_CACHE: Dict[tuple[str, str], Optional[float]] = {}
46
 
47
 
@@ -1250,7 +1254,10 @@ def load_scoreboard() -> pd.DataFrame:
1250
  df = df.drop_duplicates(subset=["Model Name", "Source Type"], keep="first")
1251
 
1252
  task_cols = [t[2] for t in TASKS]
1253
- for col in task_cols:
 
 
 
1254
  if col not in df.columns:
1255
  df[col] = np.nan
1256
 
@@ -1273,11 +1280,11 @@ def load_scoreboard() -> pd.DataFrame:
1273
  "datetime": "max",
1274
  "Precision": first_valid_precision,
1275
  }
1276
- agg_map.update({col: first_non_null for col in task_cols})
1277
  df = df.groupby("Model Name", as_index=False).agg(agg_map)
1278
 
1279
- # numeric
1280
- for col in task_cols:
1281
  df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
1282
  df["Average"] = df[task_cols].mean(axis=1).round(2)
1283
 
 
26
  RESULTS_REPO_ID,
27
  TASKS,
28
  TASK_SOURCES,
29
+ HIDDEN_TASKS,
30
  MODEL_TYPE_TO_EMOJI,
31
  hf_api_token,
32
  )
 
43
  source: cfg.get("tasks", [])
44
  for source, cfg in TASK_SOURCES.items()
45
  }
46
+ # Wire hidden tasks into the "results" source so _parse_result_file extracts
47
+ # them alongside normal tasks without touching the shared TASK_SOURCES dict.
48
+ _TASKS_BY_SOURCE["results"] = list(_TASKS_BY_SOURCE.get("results", [])) + HIDDEN_TASKS
49
  _RESULT_SCORE_CACHE: Dict[tuple[str, str], Optional[float]] = {}
50
 
51
 
 
1254
  df = df.drop_duplicates(subset=["Model Name", "Source Type"], keep="first")
1255
 
1256
  task_cols = [t[2] for t in TASKS]
1257
+ hidden_cols = [t[2] for t in HIDDEN_TASKS]
1258
+ all_score_cols = task_cols + hidden_cols
1259
+
1260
+ for col in all_score_cols:
1261
  if col not in df.columns:
1262
  df[col] = np.nan
1263
 
 
1280
  "datetime": "max",
1281
  "Precision": first_valid_precision,
1282
  }
1283
+ agg_map.update({col: first_non_null for col in all_score_cols})
1284
  df = df.groupby("Model Name", as_index=False).agg(agg_map)
1285
 
1286
+ # numeric — hidden_cols converted but excluded from Average
1287
+ for col in all_score_cols:
1288
  df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
1289
  df["Average"] = df[task_cols].mean(axis=1).round(2)
1290
 
frontend/index.html CHANGED
@@ -60,9 +60,13 @@
60
  <script id="eval-columns-data" type="application/json">
61
  {{ eval_columns | tojson }}
62
  </script>
 
 
 
63
 
64
  <script>
65
  window.EVAL_COLUMNS = JSON.parse(document.getElementById('eval-columns-data').textContent);
 
66
  const $ = s => document.querySelector(s);
67
 
68
  function toggleDarkMode() {
 
60
  <script id="eval-columns-data" type="application/json">
61
  {{ eval_columns | tojson }}
62
  </script>
63
+ <script id="benchmark-metadata-data" type="application/json">
64
+ {{ benchmark_metadata | tojson }}
65
+ </script>
66
 
67
  <script>
68
  window.EVAL_COLUMNS = JSON.parse(document.getElementById('eval-columns-data').textContent);
69
+ window.BENCHMARK_METADATA = JSON.parse(document.getElementById('benchmark-metadata-data').textContent);
70
  const $ = s => document.querySelector(s);
71
 
72
  function toggleDarkMode() {
frontend/leaderboard.html CHANGED
@@ -703,7 +703,7 @@
703
  </button>
704
  <!-- Table Options Menu -->
705
  <div id="tableOpsMenu"
706
- class="hidden absolute top-10 right-40 z-50 w-72 p-4 bg-white dark:bg-slate-800 rounded-xl shadow-xl border border-slate-200 dark:border-slate-700">
707
 
708
  <!-- Header with Reset -->
709
  <div class="flex justify-between items-center mb-4 pb-2 border-b border-slate-100 dark:border-slate-700">
@@ -723,8 +723,8 @@
723
  <div
724
  class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
725
  <p>
726
- <strong>Static:</strong> shows the average value across all benchmarks.<br>
727
- <strong>Dynamic:</strong> updates the rank based on the current sort and filters.
728
  </p>
729
  <div
730
  class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
@@ -733,11 +733,11 @@
733
  </div>
734
 
735
  </h4>
736
- <div class="flex bg-slate-100 dark:bg-slate-700/50 p-1 rounded-lg">
737
  <button onclick="window.setRankMode('static')" id="rankBtnStatic"
738
- class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all bg-white dark:bg-slate-600 shadow-sm text-indigo-600 dark:text-indigo-300">Static</button>
739
  <button onclick="window.setRankMode('dynamic')" id="rankBtnDynamic"
740
- class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200">Dynamic</button>
741
  </div>
742
  </div>
743
 
@@ -751,9 +751,8 @@
751
  <div
752
  class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
753
  <p>
754
- <strong>All Scores</strong> Calculates the average across all benchmarks.<br>
755
- <strong>Visible Only</strong> Recalculates the average using only the evaluation columns
756
- currently shown.
757
  </p>
758
  <div
759
  class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
@@ -762,46 +761,26 @@
762
 
763
  </div>
764
  </h4>
765
- <div class="flex bg-slate-100 dark:bg-slate-700/50 p-1 rounded-lg">
766
  <button onclick="window.setAvgMode('all')" id="avgBtnAll"
767
- class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all bg-white dark:bg-slate-600 shadow-sm text-indigo-600 dark:text-indigo-300">All
768
- Scores</button>
769
  <button onclick="window.setAvgMode('visible')" id="avgBtnVisible"
770
- class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200">Visible
771
- Only</button>
772
  </div>
773
  </div>
774
 
775
- <!-- Section 3: Display Score -->
776
  <div class="mt-5">
777
- <h4 class="text-xs font-bold text-slate-400 mb-3 flex items-center gap-2">
778
- Score Status Display
779
- <div class="relative tooltip-trigger group cursor-help">
780
- <i data-lucide="info"
781
- class="w-3.5 h-3.5 text-slate-400 hover:text-slate-600 dark:hover:text-slate-200"></i>
782
- <div
783
- class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
784
- <p>
785
- <strong>All:</strong> Visual progress bars on all score columns.<br>
786
- <strong>Avg Only:</strong> Progress bar on Average, raw text on other score columns.<br>
787
- <strong>Raw:</strong> Raw text numbers on all score columns.
788
- </p>
789
- <div
790
- class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
791
- </div>
792
- </div>
793
- </div>
794
- </h4>
795
- <div class="flex bg-slate-100 dark:bg-slate-700/50 p-1 rounded-lg">
796
- <button onclick="window.setScoreDisplay('all')" id="scoreBtnAll"
797
- class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all bg-white dark:bg-slate-600 shadow-sm text-indigo-600 dark:text-indigo-300">All</button>
798
- <button onclick="window.setScoreDisplay('avg')" id="scoreBtnAvg"
799
- class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200">Avg
800
- Only</button>
801
- <button onclick="window.setScoreDisplay('raw')" id="scoreBtnRaw"
802
- class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200">Raw</button>
803
- </div>
804
  </div>
 
 
 
 
 
 
 
805
  </div>
806
 
807
  <!-- Col Visibility -->
@@ -1079,11 +1058,15 @@
1079
 
1080
  // New State
1081
  let tableOptions = {
1082
- rankMode: 'static', // 'static' | 'dynamic'
1083
- avgMode: 'all', // 'all' | 'visible'
1084
- scoreDisplay: 'all' // 'all' | 'avg' | 'raw'
 
1085
  };
1086
 
 
 
 
1087
  window.initLeaderboard = function (data) {
1088
  lbData = data;
1089
 
@@ -1103,6 +1086,7 @@
1103
  prepareColumns(lbData);
1104
  renderTable(lbData);
1105
  applyFilters();
 
1106
  buildColMenuDOM();
1107
  setupScoreClickHintToast();
1108
 
@@ -1163,7 +1147,6 @@
1163
  window.toggleTableOps = toggleTableOps;
1164
  window.setRankMode = setRankMode;
1165
  window.setAvgMode = setAvgMode;
1166
- window.setScoreDisplay = setScoreDisplay;
1167
 
1168
  function toggleColMenu(e) {
1169
  e?.stopPropagation();
@@ -1183,19 +1166,24 @@
1183
  window.resetTableOps = function () {
1184
  tableOptions.rankMode = 'static';
1185
  tableOptions.avgMode = 'all';
1186
- tableOptions.scoreDisplay = 'all';
 
 
 
 
 
1187
  updateOptionUI();
1188
- prepareColumns(lbData); // Re-prepare columns to reset formatters
1189
  applyFilters();
1190
  };
1191
 
1192
  function updateOptionUI() {
1193
- const activeClass = "bg-white dark:bg-slate-600 shadow-sm text-indigo-600 dark:text-indigo-300";
1194
- const inactiveClass = "text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200";
1195
 
1196
  const setBtn = (id, isActive) => {
1197
  const btn = $(id);
1198
- if (btn) btn.className = `flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all ${isActive ? activeClass : inactiveClass}`;
1199
  }
1200
 
1201
  setBtn('#rankBtnStatic', tableOptions.rankMode === 'static');
@@ -1203,12 +1191,90 @@
1203
  setBtn('#avgBtnAll', tableOptions.avgMode === 'all');
1204
  setBtn('#avgBtnVisible', tableOptions.avgMode === 'visible');
1205
 
1206
- // New UI Update
1207
- setBtn('#scoreBtnAll', tableOptions.scoreDisplay === 'all');
1208
- setBtn('#scoreBtnAvg', tableOptions.scoreDisplay === 'avg');
1209
- setBtn('#scoreBtnRaw', tableOptions.scoreDisplay === 'raw');
1210
  }
1211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1212
  function setRankMode(mode) {
1213
  tableOptions.rankMode = mode;
1214
  updateOptionUI();
@@ -1221,14 +1287,7 @@
1221
  applyFilters();
1222
  }
1223
 
1224
- function setScoreDisplay(mode) {
1225
- tableOptions.scoreDisplay = mode;
1226
- updateOptionUI();
1227
- prepareColumns(lbData); // Formatting changed, so we must rebuild columns
1228
- applyFilters();
1229
- }
1230
-
1231
- window.toggleExpandText = function (idBase, btn) {
1232
  const shortEl = document.getElementById(`${idBase}-short`);
1233
  const fullEl = document.getElementById(`${idBase}-full`);
1234
  if (!shortEl || !fullEl || !btn) return;
@@ -1364,8 +1423,23 @@
1364
  if (!res.ok) throw new Error(payload.error || "Failed to load details");
1365
  if (requestId !== benchmarkDetailsState.requestId) return;
1366
 
1367
- const subtasks = payload.subtasks || [];
1368
- const rows = payload.rows || [];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1369
  if (reset) {
1370
  const benchmarkNorm = String(benchmarkDetailsState.benchmark || "").trim().toLowerCase();
1371
  const hasDistinctSubtask = subtasks.some(s => {
@@ -1774,16 +1848,24 @@
1774
  (!lics.length || lics.includes(r["License"]))
1775
  ).map(row => ({ ...row }));
1776
 
1777
- // 2. Handle Average Score Calculation (If Visible Only)
 
1778
  if (tableOptions.avgMode === 'visible') {
 
 
 
 
 
 
 
 
 
 
1779
  const visibleCols = tableColumns.filter(c => EVAL_COLUMNS.includes(c.id) && !c.hidden).map(c => c.id);
1780
  filtered.forEach(row => {
1781
- if (visibleCols.length > 0) {
1782
- const sum = visibleCols.reduce((acc, col) => acc + (parseFloat(row[col]) || 0), 0);
1783
- row['Average'] = (sum / visibleCols.length);
1784
- } else {
1785
- row['Average'] = 0;
1786
- }
1787
  });
1788
  }
1789
 
@@ -1842,7 +1924,19 @@
1842
 
1843
  tableColumns = keys.map(key => {
1844
  const isScore = EVAL_COLUMNS.includes(key), isAvg = key.includes("Average");
 
 
 
 
1845
  let name = isAvg ? key.toUpperCase() : key;
 
 
 
 
 
 
 
 
1846
  let icon = `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="lucide lucide-chevrons-up-down"><path d="m7 15 5 5 5-5"/><path d="m7 9 5-5 5 5"/></svg>`;
1847
  let cls = "text-slate-400 hover:text-slate-600 dark:text-slate-500 dark:hover:text-slate-300";
1848
 
@@ -1908,15 +2002,7 @@
1908
  return gridjs.html(`<button onclick="window.openBenchmarkDetails(decodeURIComponent('${em}'), decodeURIComponent('${eb}'))" class="w-full text-left hover:opacity-90 transition-opacity" title="Click for per-question details">${raw}</button>`);
1909
  };
1910
 
1911
- let shouldUseBar = false;
1912
-
1913
- if (tableOptions.scoreDisplay === 'all') {
1914
- shouldUseBar = true;
1915
- } else if (tableOptions.scoreDisplay === 'avg') {
1916
- shouldUseBar = isAvg;
1917
- } else if (tableOptions.scoreDisplay === 'raw') {
1918
- shouldUseBar = false;
1919
- }
1920
 
1921
  def.formatter = (c, r) => {
1922
  const modelName = (modelNameIdx > -1 && r.cells[modelNameIdx]) ? r.cells[modelNameIdx].data : "";
 
703
  </button>
704
  <!-- Table Options Menu -->
705
  <div id="tableOpsMenu"
706
+ class="hidden absolute top-10 right-40 z-50 w-80 p-4 bg-white dark:bg-slate-800 rounded-xl shadow-xl border border-slate-200 dark:border-slate-700 max-h-[80vh] overflow-y-auto">
707
 
708
  <!-- Header with Reset -->
709
  <div class="flex justify-between items-center mb-4 pb-2 border-b border-slate-100 dark:border-slate-700">
 
723
  <div
724
  class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
725
  <p>
726
+ <strong>Overall Rank:</strong> shows the rank based on the global average across all benchmarks.<br>
727
+ <strong>Current View Rank:</strong> updates the rank based on the current sort and filters.
728
  </p>
729
  <div
730
  class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
 
733
  </div>
734
 
735
  </h4>
736
+ <div class="flex flex-wrap gap-1.5">
737
  <button onclick="window.setRankMode('static')" id="rankBtnStatic"
738
+ class="px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none bg-indigo-50 border-indigo-400 text-indigo-600 dark:bg-indigo-900/30 dark:border-indigo-500 dark:text-indigo-300">Overall Rank</button>
739
  <button onclick="window.setRankMode('dynamic')" id="rankBtnDynamic"
740
+ class="px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none bg-white border-slate-200 text-slate-500 hover:bg-slate-50 dark:bg-slate-700 dark:border-slate-600 dark:text-slate-300 dark:hover:bg-slate-600">Current View Rank</button>
741
  </div>
742
  </div>
743
 
 
751
  <div
752
  class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
753
  <p>
754
+ <strong>Global Average:</strong> Calculates the average across all benchmarks.<br>
755
+ <strong>Filtered Average:</strong> Recalculates the average using only the currently visible evaluation columns.
 
756
  </p>
757
  <div
758
  class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
 
761
 
762
  </div>
763
  </h4>
764
+ <div class="flex flex-wrap gap-1.5">
765
  <button onclick="window.setAvgMode('all')" id="avgBtnAll"
766
+ class="px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none bg-indigo-50 border-indigo-400 text-indigo-600 dark:bg-indigo-900/30 dark:border-indigo-500 dark:text-indigo-300">Global Average</button>
 
767
  <button onclick="window.setAvgMode('visible')" id="avgBtnVisible"
768
+ class="px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none bg-white border-slate-200 text-slate-500 hover:bg-slate-50 dark:bg-slate-700 dark:border-slate-600 dark:text-slate-300 dark:hover:bg-slate-600">Filtered Average</button>
 
769
  </div>
770
  </div>
771
 
772
+ <!-- Section 3: Task Type -->
773
  <div class="mt-5">
774
+ <h4 class="text-xs font-bold text-slate-400 mb-2">Task Type</h4>
775
+ <div id="taskTypeFilterBtns" class="flex flex-wrap gap-1.5"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
776
  </div>
777
+
778
+ <!-- Section 4: Domain -->
779
+ <div class="mt-5">
780
+ <h4 class="text-xs font-bold text-slate-400 mb-2">Domain</h4>
781
+ <div id="domainFilterBtns" class="flex flex-wrap gap-1.5"></div>
782
+ </div>
783
+
784
  </div>
785
 
786
  <!-- Col Visibility -->
 
1058
 
1059
  // New State
1060
  let tableOptions = {
1061
+ rankMode: 'static', // 'static' | 'dynamic'
1062
+ avgMode: 'all', // 'all' | 'visible'
1063
+ taskTypes: [], // [] = All; otherwise selected subset
1064
+ domains: [], // [] = All; otherwise selected subset
1065
  };
1066
 
1067
+ const ALL_TASK_TYPES = ['Code', 'MCQ', 'QA'];
1068
+ const ALL_DOMAINS = ['Cultural', 'STEM', 'Legal', 'Medical', 'Safety', 'Coding', 'Poetry'];
1069
+
1070
  window.initLeaderboard = function (data) {
1071
  lbData = data;
1072
 
 
1086
  prepareColumns(lbData);
1087
  renderTable(lbData);
1088
  applyFilters();
1089
+ setupFilterPills();
1090
  buildColMenuDOM();
1091
  setupScoreClickHintToast();
1092
 
 
1147
  window.toggleTableOps = toggleTableOps;
1148
  window.setRankMode = setRankMode;
1149
  window.setAvgMode = setAvgMode;
 
1150
 
1151
  function toggleColMenu(e) {
1152
  e?.stopPropagation();
 
1166
  window.resetTableOps = function () {
1167
  tableOptions.rankMode = 'static';
1168
  tableOptions.avgMode = 'all';
1169
+ tableOptions.taskTypes = [];
1170
+ tableOptions.domains = [];
1171
+ // Restore all benchmark columns that were hidden by filters
1172
+ tableColumns.forEach(col => {
1173
+ if (EVAL_COLUMNS.includes(col.id)) col.hidden = false;
1174
+ });
1175
  updateOptionUI();
1176
+ prepareColumns(lbData);
1177
  applyFilters();
1178
  };
1179
 
1180
  function updateOptionUI() {
1181
+ const activeClass = "bg-indigo-50 border-indigo-400 text-indigo-600 dark:bg-indigo-900/30 dark:border-indigo-500 dark:text-indigo-300";
1182
+ const inactiveClass = "bg-white border-slate-200 text-slate-500 hover:bg-slate-50 dark:bg-slate-700 dark:border-slate-600 dark:text-slate-300 dark:hover:bg-slate-600";
1183
 
1184
  const setBtn = (id, isActive) => {
1185
  const btn = $(id);
1186
+ if (btn) btn.className = `px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none ${isActive ? activeClass : inactiveClass}`;
1187
  }
1188
 
1189
  setBtn('#rankBtnStatic', tableOptions.rankMode === 'static');
 
1191
  setBtn('#avgBtnAll', tableOptions.avgMode === 'all');
1192
  setBtn('#avgBtnVisible', tableOptions.avgMode === 'visible');
1193
 
1194
+ setupFilterPills();
 
 
 
1195
  }
1196
 
1197
+ function setupFilterPills() {
1198
+ const pill = (active) =>
1199
+ `px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none ` +
1200
+ (active
1201
+ ? 'bg-indigo-50 border-indigo-400 text-indigo-600 dark:bg-indigo-900/30 dark:border-indigo-500 dark:text-indigo-300'
1202
+ : 'bg-white border-slate-200 text-slate-500 hover:bg-slate-50 dark:bg-slate-700 dark:border-slate-600 dark:text-slate-300 dark:hover:bg-slate-600');
1203
+
1204
+ const renderPills = (containerId, items, activeList, toggleFn) => {
1205
+ const container = $(`#${containerId}`);
1206
+ if (!container) return;
1207
+ const allActive = activeList.length === 0;
1208
+ container.innerHTML = [
1209
+ `<button onclick="window.${toggleFn}('all')" class="${pill(allActive)}">All</button>`,
1210
+ ...items.map(item =>
1211
+ `<button onclick="window.${toggleFn}('${item}')" class="${pill(activeList.includes(item))}">${item}</button>`)
1212
+ ].join('');
1213
+ };
1214
+
1215
+ renderPills('taskTypeFilterBtns', ALL_TASK_TYPES, tableOptions.taskTypes, 'toggleTaskType');
1216
+ renderPills('domainFilterBtns', ALL_DOMAINS, tableOptions.domains, 'toggleDomain');
1217
+ }
1218
+
1219
+ function applyColumnFilters() {
1220
+ const meta = window.BENCHMARK_METADATA || {};
1221
+ const activeTasks = tableOptions.taskTypes;
1222
+ const activeDomains = tableOptions.domains;
1223
+
1224
+ tableColumns.forEach(col => {
1225
+ if (!EVAL_COLUMNS.includes(col.id)) return;
1226
+ const m = meta[col.id];
1227
+ if (!m) return;
1228
+
1229
+ // mixed (MedArabiQ): visible if MCQ or QA is among selected types
1230
+ const taskMatch = activeTasks.length === 0 ||
1231
+ (m.task_type === 'mixed'
1232
+ ? activeTasks.some(t => t === 'MCQ' || t === 'QA')
1233
+ : activeTasks.includes(m.task_type));
1234
+
1235
+ const domainMatch = activeDomains.length === 0 ||
1236
+ activeDomains.includes(m.domain);
1237
+
1238
+ col.hidden = !(taskMatch && domainMatch);
1239
+ });
1240
+ }
1241
+
1242
+ window.toggleTaskType = function (type) {
1243
+ if (type === 'all') {
1244
+ tableOptions.taskTypes = [];
1245
+ } else {
1246
+ const idx = tableOptions.taskTypes.indexOf(type);
1247
+ if (idx === -1) tableOptions.taskTypes.push(type);
1248
+ else tableOptions.taskTypes.splice(idx, 1);
1249
+ if (tableOptions.taskTypes.length === ALL_TASK_TYPES.length)
1250
+ tableOptions.taskTypes = [];
1251
+ }
1252
+ const hasFilters = tableOptions.taskTypes.length > 0 || tableOptions.domains.length > 0;
1253
+ tableOptions.avgMode = hasFilters ? 'visible' : 'all';
1254
+ applyColumnFilters();
1255
+ updateOptionUI();
1256
+ prepareColumns(lbData);
1257
+ applyFilters();
1258
+ };
1259
+
1260
+ window.toggleDomain = function (domain) {
1261
+ if (domain === 'all') {
1262
+ tableOptions.domains = [];
1263
+ } else {
1264
+ const idx = tableOptions.domains.indexOf(domain);
1265
+ if (idx === -1) tableOptions.domains.push(domain);
1266
+ else tableOptions.domains.splice(idx, 1);
1267
+ if (tableOptions.domains.length === ALL_DOMAINS.length)
1268
+ tableOptions.domains = [];
1269
+ }
1270
+ const hasFilters = tableOptions.taskTypes.length > 0 || tableOptions.domains.length > 0;
1271
+ tableOptions.avgMode = hasFilters ? 'visible' : 'all';
1272
+ applyColumnFilters();
1273
+ updateOptionUI();
1274
+ prepareColumns(lbData);
1275
+ applyFilters();
1276
+ };
1277
+
1278
  function setRankMode(mode) {
1279
  tableOptions.rankMode = mode;
1280
  updateOptionUI();
 
1287
  applyFilters();
1288
  }
1289
 
1290
+ window.toggleExpandText = function (idBase, btn) {
 
 
 
 
 
 
 
1291
  const shortEl = document.getElementById(`${idBase}-short`);
1292
  const fullEl = document.getElementById(`${idBase}-full`);
1293
  if (!shortEl || !fullEl || !btn) return;
 
1423
  if (!res.ok) throw new Error(payload.error || "Failed to load details");
1424
  if (requestId !== benchmarkDetailsState.requestId) return;
1425
 
1426
+ let subtasks = payload.subtasks || [];
1427
+ let rows = payload.rows || [];
1428
+
1429
+ // Filter MedArabiQ subtasks/rows when MCQ-only or QA-only filter is active
1430
+ if (benchmarkDetailsState.benchmark === 'MedArabiQ') {
1431
+ const _at = tableOptions.taskTypes;
1432
+ const _mcqOnly = _at.length > 0 && _at.every(t => t === 'MCQ');
1433
+ const _qaOnly = _at.length > 0 && _at.every(t => t === 'QA');
1434
+ if (_mcqOnly) {
1435
+ subtasks = subtasks.filter(s => String(s.subtask || '').toLowerCase().includes('mcq'));
1436
+ rows = rows.filter(r => String(r.subtask || '').toLowerCase().includes('mcq'));
1437
+ } else if (_qaOnly) {
1438
+ subtasks = subtasks.filter(s => !String(s.subtask || '').toLowerCase().includes('mcq'));
1439
+ rows = rows.filter(r => !String(r.subtask || '').toLowerCase().includes('mcq'));
1440
+ }
1441
+ }
1442
+
1443
  if (reset) {
1444
  const benchmarkNorm = String(benchmarkDetailsState.benchmark || "").trim().toLowerCase();
1445
  const hasDistinctSubtask = subtasks.some(s => {
 
1848
  (!lics.length || lics.includes(r["License"]))
1849
  ).map(row => ({ ...row }));
1850
 
1851
+ // 2 & 3. When "Filtered Average" is active: swap MedArabiQ to the
1852
+ // relevant subset score, then recalculate average over visible cols.
1853
  if (tableOptions.avgMode === 'visible') {
1854
+ const _activeTasks = tableOptions.taskTypes;
1855
+ const _mcqOnly = _activeTasks.length > 0 && _activeTasks.every(t => t === 'MCQ');
1856
+ const _qaOnly = _activeTasks.length > 0 && _activeTasks.every(t => t === 'QA');
1857
+ if (_mcqOnly || _qaOnly) {
1858
+ const subKey = _mcqOnly ? 'MedArabiQ_MCQ' : 'MedArabiQ_QA';
1859
+ filtered.forEach(row => {
1860
+ const sub = parseFloat(row[subKey]);
1861
+ if (!isNaN(sub) && sub > 0) row['MedArabiQ'] = sub;
1862
+ });
1863
+ }
1864
  const visibleCols = tableColumns.filter(c => EVAL_COLUMNS.includes(c.id) && !c.hidden).map(c => c.id);
1865
  filtered.forEach(row => {
1866
+ row['Average'] = visibleCols.length > 0
1867
+ ? visibleCols.reduce((acc, col) => acc + (parseFloat(row[col]) || 0), 0) / visibleCols.length
1868
+ : 0;
 
 
 
1869
  });
1870
  }
1871
 
 
1924
 
1925
  tableColumns = keys.map(key => {
1926
  const isScore = EVAL_COLUMNS.includes(key), isAvg = key.includes("Average");
1927
+ // Skip hidden subset columns — they are data-only, never rendered as columns
1928
+ if (key === 'MedArabiQ_MCQ' || key === 'MedArabiQ_QA') {
1929
+ return { id: key, name: key, hidden: true, sort: false };
1930
+ }
1931
  let name = isAvg ? key.toUpperCase() : key;
1932
+ // MedArabiQ: show subset subtitle when a single task type is active
1933
+ if (key === 'MedArabiQ') {
1934
+ const at = tableOptions.taskTypes;
1935
+ if (at.length === 1 && at[0] === 'MCQ')
1936
+ name = `MedArabiQ<br><span style="font-size:9px;font-weight:400;line-height:1;opacity:0.85;">(MCQ subsets)</span>`;
1937
+ else if (at.length === 1 && at[0] === 'QA')
1938
+ name = `MedArabiQ<br><span style="font-size:9px;font-weight:400;line-height:1;opacity:0.85;">(QA subsets)</span>`;
1939
+ }
1940
  let icon = `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="lucide lucide-chevrons-up-down"><path d="m7 15 5 5 5-5"/><path d="m7 9 5-5 5 5"/></svg>`;
1941
  let cls = "text-slate-400 hover:text-slate-600 dark:text-slate-500 dark:hover:text-slate-300";
1942
 
 
2002
  return gridjs.html(`<button onclick="window.openBenchmarkDetails(decodeURIComponent('${em}'), decodeURIComponent('${eb}'))" class="w-full text-left hover:opacity-90 transition-opacity" title="Click for per-question details">${raw}</button>`);
2003
  };
2004
 
2005
+ const shouldUseBar = true;
 
 
 
 
 
 
 
 
2006
 
2007
  def.formatter = (c, r) => {
2008
  const modelName = (modelNameIdx > -1 && r.cells[modelNameIdx]) ? r.cells[modelNameIdx].data : "";