Spaces:

qimma
/

leaderboard

Running on CPU Upgrade

App Files Files Community

LeenAlQadi committed on Mar 23

Commit

f20c7d0

1 Parent(s): aa18243

added filters and fixed adaptive filtered average

Browse files

Files changed (6) hide show

.gitignore +2 -1
app.py +8 -4
backend/config.py +34 -0
backend/data_loader.py +11 -4
frontend/index.html +4 -0
frontend/leaderboard.html +167 -81

.gitignore CHANGED Viewed

@@ -1,4 +1,5 @@
 **/__pycache__/**
 .vscode/
 scripts/*
-samples/*

 **/__pycache__/**
 .vscode/
 scripts/*
+samples/*
+*.log

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ from backend.data_loader import (
     load_benchmark_details,
 )
 from backend.submission_handler import submit_model
-from backend.config import TASKS, API, hf_api_token, BENCHMARK_DETAILS_PAGE_SIZE
 from fastapi import FastAPI, Request, Form, BackgroundTasks, HTTPException
 # Logging setup
@@ -48,7 +48,7 @@ def update_leaderboard_cache():
             df = df.drop(columns=["Model Size Filter"], errors="ignore")
             # Keep scores numeric, but show Unknown for missing metadata fields.
-            score_cols = [t[2] for t in TASKS] + ["Average", "Rank"]
             for col in score_cols:
                 if col in df.columns:
                     df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
@@ -164,7 +164,11 @@ templates = Jinja2Templates(directory="frontend")
 @app.get("/", response_class=HTMLResponse)
 async def read_root(request: Request):
     eval_columns = [t[2] for t in TASKS]
-    return templates.TemplateResponse("index.html", {"request": request, "eval_columns": eval_columns})
 @app.get("/api/leaderboard")
 async def get_leaderboard_data():
@@ -268,4 +272,4 @@ async def read_page(request: Request, page_name: str):
     return templates.TemplateResponse(page_name, {"request": request})
 if __name__ == "__main__":
-    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True, access_log=False)

     load_benchmark_details,
 )
 from backend.submission_handler import submit_model
+from backend.config import TASKS, HIDDEN_TASKS, BENCHMARK_METADATA, API, hf_api_token, BENCHMARK_DETAILS_PAGE_SIZE
 from fastapi import FastAPI, Request, Form, BackgroundTasks, HTTPException
 # Logging setup
             df = df.drop(columns=["Model Size Filter"], errors="ignore")
             # Keep scores numeric, but show Unknown for missing metadata fields.
+            score_cols = [t[2] for t in TASKS] + [t[2] for t in HIDDEN_TASKS] + ["Average", "Rank"]
             for col in score_cols:
                 if col in df.columns:
                     df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
 @app.get("/", response_class=HTMLResponse)
 async def read_root(request: Request):
     eval_columns = [t[2] for t in TASKS]
+    return templates.TemplateResponse("index.html", {
+        "request": request,
+        "eval_columns": eval_columns,
+        "benchmark_metadata": BENCHMARK_METADATA,
+    })
 @app.get("/api/leaderboard")
 async def get_leaderboard_data():
     return templates.TemplateResponse(page_name, {"request": request})
 if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7866, reload=True, access_log=False)

backend/config.py CHANGED Viewed

@@ -62,6 +62,40 @@ TASK_SOURCES: Dict[str, Dict[str, Any]] = {
 TASKS: List[Tuple[Any, Any, str]] = RESULTS_TASKS + CODE_TASKS + FANNFLOP_TASKS
 # --- 3. Model Metadata ---
 MODEL_TYPE_TO_EMOJI: Dict[str, str] = {

 TASKS: List[Tuple[Any, Any, str]] = RESULTS_TASKS + CODE_TASKS + FANNFLOP_TASKS
+# Hidden tasks: scores that are extracted from result files but are kept out of
+# TASKS, so they are excluded from EVAL_COLUMNS and from the Average
+# calculation.  Used to expose MedArabiQ subset-level scores when the Task Type
+# filter is set to MCQ-only or QA-only on the frontend.
+# Each entry is a 3-tuple like the entries of TASKS; the third element is the
+# score column name.  Presumably the first two are the result-file task key and
+# the metric key -- confirm against the result parser.
+# NOTE(review): the second task key spells "MedArabicQ" (with a "c") while the
+# first spells "MedArabiQ" -- confirm it matches the key in the result files.
+HIDDEN_TASKS: List[Tuple[Any, Any, str]] = [
+    (
+        "qimma-MedArabiQ:_average|0",
+        "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",
+        "MedArabiQ_MCQ",
+    ),
+    (
+        "qimma-MedArabicQ-QA:_average|0",
+        "BERTScore-F",
+        "MedArabiQ_QA",
+    ),
+]
+# Per-benchmark metadata consumed by the frontend Task Type and Domain filters.
+# Keys are benchmark column names; the frontend looks an entry up by column id.
+# "task_type" is "MCQ", "QA", "Code" or "mixed"; the frontend shows a "mixed"
+# benchmark whenever MCQ or QA is among the selected task types.
+# NOTE(review): the frontend keeps its own hard-coded lists of task types and
+# domains (ALL_TASK_TYPES / ALL_DOMAINS), so values added here must be mirrored
+# there to get a filter button.
+BENCHMARK_METADATA: Dict[str, Dict[str, str]] = {
+    "AraDiCE-Culture": {"domain": "Cultural", "task_type": "MCQ"},
+    "ArabicMMLU":      {"domain": "STEM",     "task_type": "MCQ"},
+    "ArabCulture":     {"domain": "Cultural", "task_type": "MCQ"},
+    "PALMX":           {"domain": "Cultural", "task_type": "MCQ"},
+    "3LM STEM":        {"domain": "STEM",     "task_type": "MCQ"},
+    "AraTrust":        {"domain": "Safety",   "task_type": "MCQ"},
+    "MizanQA":         {"domain": "Legal",    "task_type": "MCQ"},
+    "MedArabiQ":       {"domain": "Medical",  "task_type": "mixed"},
+    "ArabLegalQA":     {"domain": "Legal",    "task_type": "QA"},
+    "GAT":             {"domain": "STEM",     "task_type": "MCQ"},
+    "MedAraBench":     {"domain": "Medical",  "task_type": "MCQ"},
+    "HumanEval+":      {"domain": "Coding",   "task_type": "Code"},
+    "MBPP+":           {"domain": "Coding",   "task_type": "Code"},
+    "FannOrFlop":      {"domain": "Poetry",   "task_type": "MCQ"},
+}
 # --- 3. Model Metadata ---
 MODEL_TYPE_TO_EMOJI: Dict[str, str] = {

backend/data_loader.py CHANGED Viewed

@@ -26,6 +26,7 @@ from backend.config import (
     RESULTS_REPO_ID,
     TASKS,
     TASK_SOURCES,
     MODEL_TYPE_TO_EMOJI,
     hf_api_token,
 )
@@ -42,6 +43,9 @@ _TASKS_BY_SOURCE = {
     source: cfg.get("tasks", [])
     for source, cfg in TASK_SOURCES.items()
 }
 _RESULT_SCORE_CACHE: Dict[tuple[str, str], Optional[float]] = {}
@@ -1250,7 +1254,10 @@ def load_scoreboard() -> pd.DataFrame:
     df = df.drop_duplicates(subset=["Model Name", "Source Type"], keep="first")
     task_cols = [t[2] for t in TASKS]
-    for col in task_cols:
         if col not in df.columns:
             df[col] = np.nan
@@ -1273,11 +1280,11 @@ def load_scoreboard() -> pd.DataFrame:
         "datetime": "max",
         "Precision": first_valid_precision,
     }
-    agg_map.update({col: first_non_null for col in task_cols})
     df = df.groupby("Model Name", as_index=False).agg(agg_map)
-    # numeric
-    for col in task_cols:
         df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
     df["Average"] = df[task_cols].mean(axis=1).round(2)

     RESULTS_REPO_ID,
     TASKS,
     TASK_SOURCES,
+    HIDDEN_TASKS,
     MODEL_TYPE_TO_EMOJI,
     hf_api_token,
 )
     source: cfg.get("tasks", [])
     for source, cfg in TASK_SOURCES.items()
 }
+# Wire hidden tasks into the "results" source so _parse_result_file extracts
+# them alongside normal tasks without touching the shared TASK_SOURCES dict.
+_TASKS_BY_SOURCE["results"] = list(_TASKS_BY_SOURCE.get("results", [])) + HIDDEN_TASKS
 _RESULT_SCORE_CACHE: Dict[tuple[str, str], Optional[float]] = {}
     df = df.drop_duplicates(subset=["Model Name", "Source Type"], keep="first")
     task_cols = [t[2] for t in TASKS]
+    hidden_cols = [t[2] for t in HIDDEN_TASKS]
+    all_score_cols = task_cols + hidden_cols
+    for col in all_score_cols:
         if col not in df.columns:
             df[col] = np.nan
         "datetime": "max",
         "Precision": first_valid_precision,
     }
+    agg_map.update({col: first_non_null for col in all_score_cols})
     df = df.groupby("Model Name", as_index=False).agg(agg_map)
+    # numeric — hidden_cols converted but excluded from Average
+    for col in all_score_cols:
         df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
     df["Average"] = df[task_cols].mean(axis=1).round(2)

frontend/index.html CHANGED Viewed

@@ -60,9 +60,13 @@
     <script id="eval-columns-data" type="application/json">
         {{ eval_columns | tojson }}
     </script>
     <script>
         window.EVAL_COLUMNS = JSON.parse(document.getElementById('eval-columns-data').textContent);
         const $ = s => document.querySelector(s);
         function toggleDarkMode() {

     <script id="eval-columns-data" type="application/json">
         {{ eval_columns | tojson }}
     </script>
+    <script id="benchmark-metadata-data" type="application/json">
+        {{ benchmark_metadata | tojson }}
+    </script>
     <script>
         window.EVAL_COLUMNS = JSON.parse(document.getElementById('eval-columns-data').textContent);
+        window.BENCHMARK_METADATA = JSON.parse(document.getElementById('benchmark-metadata-data').textContent);
         const $ = s => document.querySelector(s);
         function toggleDarkMode() {

frontend/leaderboard.html CHANGED Viewed

@@ -703,7 +703,7 @@
         </button>
         <!-- Table Options Menu -->
         <div id="tableOpsMenu"
-            class="hidden absolute top-10 right-40 z-50 w-72 p-4 bg-white dark:bg-slate-800 rounded-xl shadow-xl border border-slate-200 dark:border-slate-700">
             <!-- Header with Reset -->
             <div class="flex justify-between items-center mb-4 pb-2 border-b border-slate-100 dark:border-slate-700">
@@ -723,8 +723,8 @@
                         <div
                             class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
                             <p>
-                                <strong>Static:</strong> shows the average value across all benchmarks.<br>
-                                <strong>Dynamic:</strong> updates the rank based on the current sort and filters.
                             </p>
                             <div
                                 class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
@@ -733,11 +733,11 @@
                     </div>
                 </h4>
-                <div class="flex bg-slate-100 dark:bg-slate-700/50 p-1 rounded-lg">
                     <button onclick="window.setRankMode('static')" id="rankBtnStatic"
-                        class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all bg-white dark:bg-slate-600 shadow-sm text-indigo-600 dark:text-indigo-300">Static</button>
                     <button onclick="window.setRankMode('dynamic')" id="rankBtnDynamic"
-                        class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200">Dynamic</button>
                 </div>
             </div>
@@ -751,9 +751,8 @@
                         <div
                             class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
                             <p>
-                                <strong>All Scores</strong> Calculates the average across all benchmarks.<br>
-                                <strong>Visible Only</strong> Recalculates the average using only the evaluation columns
-                                currently shown.
                             </p>
                             <div
                                 class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
@@ -762,46 +761,26 @@
                     </div>
                 </h4>
-                <div class="flex bg-slate-100 dark:bg-slate-700/50 p-1 rounded-lg">
                     <button onclick="window.setAvgMode('all')" id="avgBtnAll"
-                        class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all bg-white dark:bg-slate-600 shadow-sm text-indigo-600 dark:text-indigo-300">All
-                        Scores</button>
                     <button onclick="window.setAvgMode('visible')" id="avgBtnVisible"
-                        class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200">Visible
-                        Only</button>
                 </div>
             </div>
-            <!-- Section 3: Display Score -->
             <div class="mt-5">
-                <h4 class="text-xs font-bold text-slate-400 mb-3 flex items-center gap-2">
-                    Score Status Display
-                    <div class="relative tooltip-trigger group cursor-help">
-                        <i data-lucide="info"
-                            class="w-3.5 h-3.5 text-slate-400 hover:text-slate-600 dark:hover:text-slate-200"></i>
-                        <div
-                            class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
-                            <p>
-                                <strong>All:</strong> Visual progress bars on all score columns.<br>
-                                <strong>Avg Only:</strong> Progress bar on Average, raw text on other score columns.<br>
-                                <strong>Raw:</strong> Raw text numbers on all score columns.
-                            </p>
-                            <div
-                                class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
-                            </div>
-                        </div>
-                    </div>
-                </h4>
-                <div class="flex bg-slate-100 dark:bg-slate-700/50 p-1 rounded-lg">
-                    <button onclick="window.setScoreDisplay('all')" id="scoreBtnAll"
-                        class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all bg-white dark:bg-slate-600 shadow-sm text-indigo-600 dark:text-indigo-300">All</button>
-                    <button onclick="window.setScoreDisplay('avg')" id="scoreBtnAvg"
-                        class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200">Avg
-                        Only</button>
-                    <button onclick="window.setScoreDisplay('raw')" id="scoreBtnRaw"
-                        class="flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200">Raw</button>
-                </div>
             </div>
         </div>
         <!-- Col Visibility -->
@@ -1079,11 +1058,15 @@
                 // New State
                 let tableOptions = {
-                    rankMode: 'static', // 'static' | 'dynamic'
-                    avgMode: 'all',      // 'all' | 'visible'
-                    scoreDisplay: 'all'  // 'all' | 'avg' | 'raw'
                 };
                 window.initLeaderboard = function (data) {
                     lbData = data;
@@ -1103,6 +1086,7 @@
                     prepareColumns(lbData);
                     renderTable(lbData);
                     applyFilters();
                     buildColMenuDOM();
                     setupScoreClickHintToast();
@@ -1163,7 +1147,6 @@
                 window.toggleTableOps = toggleTableOps;
                 window.setRankMode = setRankMode;
                 window.setAvgMode = setAvgMode;
-                window.setScoreDisplay = setScoreDisplay;
                 function toggleColMenu(e) {
                     e?.stopPropagation();
@@ -1183,19 +1166,24 @@
                 window.resetTableOps = function () {
                     tableOptions.rankMode = 'static';
                     tableOptions.avgMode = 'all';
-                    tableOptions.scoreDisplay = 'all';
                     updateOptionUI();
-                    prepareColumns(lbData); // Re-prepare columns to reset formatters
                     applyFilters();
                 };
                 function updateOptionUI() {
-                    const activeClass = "bg-white dark:bg-slate-600 shadow-sm text-indigo-600 dark:text-indigo-300";
-                    const inactiveClass = "text-slate-500 dark:text-slate-400 hover:text-slate-700 dark:hover:text-slate-200";
                     const setBtn = (id, isActive) => {
                         const btn = $(id);
-                        if (btn) btn.className = `flex-1 px-3 py-1.5 text-xs font-medium rounded-md transition-all ${isActive ? activeClass : inactiveClass}`;
                     }
                     setBtn('#rankBtnStatic', tableOptions.rankMode === 'static');
@@ -1203,12 +1191,90 @@
                     setBtn('#avgBtnAll', tableOptions.avgMode === 'all');
                     setBtn('#avgBtnVisible', tableOptions.avgMode === 'visible');
-                    // New UI Update
-                    setBtn('#scoreBtnAll', tableOptions.scoreDisplay === 'all');
-                    setBtn('#scoreBtnAvg', tableOptions.scoreDisplay === 'avg');
-                    setBtn('#scoreBtnRaw', tableOptions.scoreDisplay === 'raw');
                 }
                 function setRankMode(mode) {
                     tableOptions.rankMode = mode;
                     updateOptionUI();
@@ -1221,14 +1287,7 @@
                     applyFilters();
                 }
-                function setScoreDisplay(mode) {
-                    tableOptions.scoreDisplay = mode;
-                    updateOptionUI();
-                    prepareColumns(lbData); // Formatting changed, so we must rebuild columns
-                    applyFilters();
-                }
-                window.toggleExpandText = function (idBase, btn) {
                     const shortEl = document.getElementById(`${idBase}-short`);
                     const fullEl = document.getElementById(`${idBase}-full`);
                     if (!shortEl || !fullEl || !btn) return;
@@ -1364,8 +1423,23 @@
                         if (!res.ok) throw new Error(payload.error || "Failed to load details");
                         if (requestId !== benchmarkDetailsState.requestId) return;
-                        const subtasks = payload.subtasks || [];
-                        const rows = payload.rows || [];
                         if (reset) {
                             const benchmarkNorm = String(benchmarkDetailsState.benchmark || "").trim().toLowerCase();
                             const hasDistinctSubtask = subtasks.some(s => {
@@ -1774,16 +1848,24 @@
                         (!lics.length || lics.includes(r["License"]))
                     ).map(row => ({ ...row }));
-                    // 2. Handle Average Score Calculation (If Visible Only)
                     if (tableOptions.avgMode === 'visible') {
                         const visibleCols = tableColumns.filter(c => EVAL_COLUMNS.includes(c.id) && !c.hidden).map(c => c.id);
                         filtered.forEach(row => {
-                            if (visibleCols.length > 0) {
-                                const sum = visibleCols.reduce((acc, col) => acc + (parseFloat(row[col]) || 0), 0);
-                                row['Average'] = (sum / visibleCols.length);
-                            } else {
-                                row['Average'] = 0;
-                            }
                         });
                     }
@@ -1842,7 +1924,19 @@
                     tableColumns = keys.map(key => {
                         const isScore = EVAL_COLUMNS.includes(key), isAvg = key.includes("Average");
                         let name = isAvg ? key.toUpperCase() : key;
                         let icon = `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="lucide lucide-chevrons-up-down"><path d="m7 15 5 5 5-5"/><path d="m7 9 5-5 5 5"/></svg>`;
                         let cls = "text-slate-400 hover:text-slate-600 dark:text-slate-500 dark:hover:text-slate-300";
@@ -1908,15 +2002,7 @@
                                 return gridjs.html(`<button onclick="window.openBenchmarkDetails(decodeURIComponent('${em}'), decodeURIComponent('${eb}'))" class="w-full text-left hover:opacity-90 transition-opacity" title="Click for per-question details">${raw}</button>`);
                             };
-                            let shouldUseBar = false;
-                            if (tableOptions.scoreDisplay === 'all') {
-                                shouldUseBar = true;
-                            } else if (tableOptions.scoreDisplay === 'avg') {
-                                shouldUseBar = isAvg;
-                            } else if (tableOptions.scoreDisplay === 'raw') {
-                                shouldUseBar = false;
-                            }
                             def.formatter = (c, r) => {
                                 const modelName = (modelNameIdx > -1 && r.cells[modelNameIdx]) ? r.cells[modelNameIdx].data : "";

         </button>
         <!-- Table Options Menu -->
         <div id="tableOpsMenu"
+            class="hidden absolute top-10 right-40 z-50 w-80 p-4 bg-white dark:bg-slate-800 rounded-xl shadow-xl border border-slate-200 dark:border-slate-700 max-h-[80vh] overflow-y-auto">
             <!-- Header with Reset -->
             <div class="flex justify-between items-center mb-4 pb-2 border-b border-slate-100 dark:border-slate-700">
                         <div
                             class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
                             <p>
+                                <strong>Overall Rank:</strong> shows the rank based on the global average across all benchmarks.<br>
+                                <strong>Current View Rank:</strong> updates the rank based on the current sort and filters.
                             </p>
                             <div
                                 class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
                     </div>
                 </h4>
+                <div class="flex flex-wrap gap-1.5">
                     <button onclick="window.setRankMode('static')" id="rankBtnStatic"
+                        class="px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none bg-indigo-50 border-indigo-400 text-indigo-600 dark:bg-indigo-900/30 dark:border-indigo-500 dark:text-indigo-300">Overall Rank</button>
                     <button onclick="window.setRankMode('dynamic')" id="rankBtnDynamic"
+                        class="px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none bg-white border-slate-200 text-slate-500 hover:bg-slate-50 dark:bg-slate-700 dark:border-slate-600 dark:text-slate-300 dark:hover:bg-slate-600">Current View Rank</button>
                 </div>
             </div>
                         <div
                             class="tooltip-content invisible opacity-0 absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-slate-800 text-white text-xs rounded shadow-lg transition-all z-50 text-left pointer-events-none">
                             <p>
+                                <strong>Global Average:</strong> Calculates the average across all benchmarks.<br>
+                                <strong>Filtered Average:</strong> Recalculates the average using only the currently visible evaluation columns.
                             </p>
                             <div
                                 class="absolute top-full left-1/2 -translate-x-1/2 border-4 border-transparent border-t-slate-800">
                     </div>
                 </h4>
+                <div class="flex flex-wrap gap-1.5">
                     <button onclick="window.setAvgMode('all')" id="avgBtnAll"
+                        class="px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none bg-indigo-50 border-indigo-400 text-indigo-600 dark:bg-indigo-900/30 dark:border-indigo-500 dark:text-indigo-300">Global Average</button>
                     <button onclick="window.setAvgMode('visible')" id="avgBtnVisible"
+                        class="px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none bg-white border-slate-200 text-slate-500 hover:bg-slate-50 dark:bg-slate-700 dark:border-slate-600 dark:text-slate-300 dark:hover:bg-slate-600">Filtered Average</button>
                 </div>
             </div>
+            <!-- Section 3: Task Type -->
             <div class="mt-5">
+                <h4 class="text-xs font-bold text-slate-400 mb-2">Task Type</h4>
+                <div id="taskTypeFilterBtns" class="flex flex-wrap gap-1.5"></div>
             </div>
+            <!-- Section 4: Domain -->
+            <div class="mt-5">
+                <h4 class="text-xs font-bold text-slate-400 mb-2">Domain</h4>
+                <div id="domainFilterBtns" class="flex flex-wrap gap-1.5"></div>
+            </div>
         </div>
         <!-- Col Visibility -->
                 // New State
                 let tableOptions = {
+                    rankMode: 'static',  // 'static' | 'dynamic'
+                    avgMode: 'all',       // 'all' | 'visible'
+taskTypes: [],        // [] = All; otherwise selected subset
+                    domains: [],          // [] = All; otherwise selected subset
                 };
+                const ALL_TASK_TYPES = ['Code', 'MCQ', 'QA'];
+                const ALL_DOMAINS    = ['Cultural', 'STEM', 'Legal', 'Medical', 'Safety', 'Coding', 'Poetry'];
                 window.initLeaderboard = function (data) {
                     lbData = data;
                     prepareColumns(lbData);
                     renderTable(lbData);
                     applyFilters();
+                    setupFilterPills();
                     buildColMenuDOM();
                     setupScoreClickHintToast();
                 window.toggleTableOps = toggleTableOps;
                 window.setRankMode = setRankMode;
                 window.setAvgMode = setAvgMode;
                 function toggleColMenu(e) {
                     e?.stopPropagation();
                 // Reset every Table Options setting (rank mode, average mode, Task Type
                 // and Domain filters) to its default, then refresh the table.
                 window.resetTableOps = function () {
                     tableOptions.rankMode = 'static';
                     tableOptions.avgMode = 'all';
+                    tableOptions.taskTypes = [];  // [] means "All"
+                    tableOptions.domains = [];    // [] means "All"
+                    // Un-hide every evaluation column so that columns hidden by the
+                    // Task Type / Domain filters become visible again.
+                    tableColumns.forEach(col => {
+                        if (EVAL_COLUMNS.includes(col.id)) col.hidden = false;
+                    });
                     updateOptionUI();
+                    prepareColumns(lbData);
                     applyFilters();
                 };
                 function updateOptionUI() {
+                    const activeClass = "bg-indigo-50 border-indigo-400 text-indigo-600 dark:bg-indigo-900/30 dark:border-indigo-500 dark:text-indigo-300";
+                    const inactiveClass = "bg-white border-slate-200 text-slate-500 hover:bg-slate-50 dark:bg-slate-700 dark:border-slate-600 dark:text-slate-300 dark:hover:bg-slate-600";
                     const setBtn = (id, isActive) => {
                         const btn = $(id);
+                        if (btn) btn.className = `px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none ${isActive ? activeClass : inactiveClass}`;
                     }
                     setBtn('#rankBtnStatic', tableOptions.rankMode === 'static');
                     setBtn('#avgBtnAll', tableOptions.avgMode === 'all');
                     setBtn('#avgBtnVisible', tableOptions.avgMode === 'visible');
+setupFilterPills();
                 }
+                function setupFilterPills() {
+                    const pill = (active) =>
+                        `px-3 py-1.5 rounded-lg border text-xs font-medium cursor-pointer transition-all select-none ` +
+                        (active
+                            ? 'bg-indigo-50 border-indigo-400 text-indigo-600 dark:bg-indigo-900/30 dark:border-indigo-500 dark:text-indigo-300'
+                            : 'bg-white border-slate-200 text-slate-500 hover:bg-slate-50 dark:bg-slate-700 dark:border-slate-600 dark:text-slate-300 dark:hover:bg-slate-600');
+                    const renderPills = (containerId, items, activeList, toggleFn) => {
+                        const container = $(`#${containerId}`);
+                        if (!container) return;
+                        const allActive = activeList.length === 0;
+                        container.innerHTML = [
+                            `<button onclick="window.${toggleFn}('all')" class="${pill(allActive)}">All</button>`,
+                            ...items.map(item =>
+                                `<button onclick="window.${toggleFn}('${item}')" class="${pill(activeList.includes(item))}">${item}</button>`)
+                        ].join('');
+                    };
+                    renderPills('taskTypeFilterBtns', ALL_TASK_TYPES, tableOptions.taskTypes, 'toggleTaskType');
+                    renderPills('domainFilterBtns',   ALL_DOMAINS,    tableOptions.domains,    'toggleDomain');
+                }
+                function applyColumnFilters() {
+                    const meta = window.BENCHMARK_METADATA || {};
+                    const activeTasks   = tableOptions.taskTypes;
+                    const activeDomains = tableOptions.domains;
+                    tableColumns.forEach(col => {
+                        if (!EVAL_COLUMNS.includes(col.id)) return;
+                        const m = meta[col.id];
+                        if (!m) return;
+                        // mixed (MedArabiQ): visible if MCQ or QA is among selected types
+                        const taskMatch = activeTasks.length === 0 ||
+                            (m.task_type === 'mixed'
+                                ? activeTasks.some(t => t === 'MCQ' || t === 'QA')
+                                : activeTasks.includes(m.task_type));
+                        const domainMatch = activeDomains.length === 0 ||
+                            activeDomains.includes(m.domain);
+                        col.hidden = !(taskMatch && domainMatch);
+                    });
+                }
+                window.toggleTaskType = function (type) {
+                    if (type === 'all') {
+                        tableOptions.taskTypes = [];
+                    } else {
+                        const idx = tableOptions.taskTypes.indexOf(type);
+                        if (idx === -1) tableOptions.taskTypes.push(type);
+                        else tableOptions.taskTypes.splice(idx, 1);
+                        if (tableOptions.taskTypes.length === ALL_TASK_TYPES.length)
+                            tableOptions.taskTypes = [];
+                    }
+                    const hasFilters = tableOptions.taskTypes.length > 0 || tableOptions.domains.length > 0;
+                    tableOptions.avgMode = hasFilters ? 'visible' : 'all';
+                    applyColumnFilters();
+                    updateOptionUI();
+                    prepareColumns(lbData);
+                    applyFilters();
+                };
+                window.toggleDomain = function (domain) {
+                    if (domain === 'all') {
+                        tableOptions.domains = [];
+                    } else {
+                        const idx = tableOptions.domains.indexOf(domain);
+                        if (idx === -1) tableOptions.domains.push(domain);
+                        else tableOptions.domains.splice(idx, 1);
+                        if (tableOptions.domains.length === ALL_DOMAINS.length)
+                            tableOptions.domains = [];
+                    }
+                    const hasFilters = tableOptions.taskTypes.length > 0 || tableOptions.domains.length > 0;
+                    tableOptions.avgMode = hasFilters ? 'visible' : 'all';
+                    applyColumnFilters();
+                    updateOptionUI();
+                    prepareColumns(lbData);
+                    applyFilters();
+                };
                 /**
                  * Store the requested rank mode in tableOptions and refresh the view.
                  *
                  * Unlike the task-type/domain toggles, this does not touch avgMode or
                  * rebuild the columns; it only calls updateOptionUI() and applyFilters().
                  *
                  * @param {*} mode - Value written to tableOptions.rankMode as-is
                  *     (the accepted values are not visible here — TODO confirm).
                  */
                 function setRankMode(mode) {
                     tableOptions.rankMode = mode;
                     // Refresh after the new mode is stored so both calls see the updated value.
                     updateOptionUI();
                     applyFilters();
                 }
+window.toggleExpandText = function (idBase, btn) {
                     const shortEl = document.getElementById(`${idBase}-short`);
                     const fullEl = document.getElementById(`${idBase}-full`);
                     if (!shortEl || !fullEl || !btn) return;
                         if (!res.ok) throw new Error(payload.error || "Failed to load details");
                         if (requestId !== benchmarkDetailsState.requestId) return;
+                        let subtasks = payload.subtasks || [];
+                        let rows = payload.rows || [];
+                        // Filter MedArabiQ subtasks/rows when MCQ-only or QA-only filter is active
+                        if (benchmarkDetailsState.benchmark === 'MedArabiQ') {
+                            const _at = tableOptions.taskTypes;
+                            const _mcqOnly = _at.length > 0 && _at.every(t => t === 'MCQ');
+                            const _qaOnly  = _at.length > 0 && _at.every(t => t === 'QA');
+                            if (_mcqOnly) {
+                                subtasks = subtasks.filter(s => String(s.subtask || '').toLowerCase().includes('mcq'));
+                                rows     = rows.filter(r => String(r.subtask || '').toLowerCase().includes('mcq'));
+                            } else if (_qaOnly) {
+                                subtasks = subtasks.filter(s => !String(s.subtask || '').toLowerCase().includes('mcq'));
+                                rows     = rows.filter(r => !String(r.subtask || '').toLowerCase().includes('mcq'));
+                            }
+                        }
                         if (reset) {
                             const benchmarkNorm = String(benchmarkDetailsState.benchmark || "").trim().toLowerCase();
                             const hasDistinctSubtask = subtasks.some(s => {
                         (!lics.length || lics.includes(r["License"]))
                     ).map(row => ({ ...row }));
+                    // 2 & 3. When "Filtered Average" is active: swap MedArabiQ to the
+                    //        relevant subset score, then recalculate average over visible cols.
                     if (tableOptions.avgMode === 'visible') {
+                        const _activeTasks = tableOptions.taskTypes;
+                        const _mcqOnly = _activeTasks.length > 0 && _activeTasks.every(t => t === 'MCQ');
+                        const _qaOnly  = _activeTasks.length > 0 && _activeTasks.every(t => t === 'QA');
+                        if (_mcqOnly || _qaOnly) {
+                            const subKey = _mcqOnly ? 'MedArabiQ_MCQ' : 'MedArabiQ_QA';
+                            filtered.forEach(row => {
+                                const sub = parseFloat(row[subKey]);
+                                if (!isNaN(sub) && sub > 0) row['MedArabiQ'] = sub;
+                            });
+                        }
                         const visibleCols = tableColumns.filter(c => EVAL_COLUMNS.includes(c.id) && !c.hidden).map(c => c.id);
                         filtered.forEach(row => {
+                            row['Average'] = visibleCols.length > 0
+                                ? visibleCols.reduce((acc, col) => acc + (parseFloat(row[col]) || 0), 0) / visibleCols.length
+                                : 0;
                         });
                     }
                     tableColumns = keys.map(key => {
                         const isScore = EVAL_COLUMNS.includes(key), isAvg = key.includes("Average");
+                        // Subset columns are data-only: emit a hidden, non-sortable definition so they are never rendered
+                        if (key === 'MedArabiQ_MCQ' || key === 'MedArabiQ_QA') {
+                            return { id: key, name: key, hidden: true, sort: false };
+                        }
                         let name = isAvg ? key.toUpperCase() : key;
+                        // MedArabiQ: show subset subtitle when a single task type is active
+                        if (key === 'MedArabiQ') {
+                            const at = tableOptions.taskTypes;
+                            if (at.length === 1 && at[0] === 'MCQ')
+                                name = `MedArabiQ<br><span style="font-size:9px;font-weight:400;line-height:1;opacity:0.85;">(MCQ subsets)</span>`;
+                            else if (at.length === 1 && at[0] === 'QA')
+                                name = `MedArabiQ<br><span style="font-size:9px;font-weight:400;line-height:1;opacity:0.85;">(QA subsets)</span>`;
+                        }
                         let icon = `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="lucide lucide-chevrons-up-down"><path d="m7 15 5 5 5-5"/><path d="m7 9 5-5 5 5"/></svg>`;
                         let cls = "text-slate-400 hover:text-slate-600 dark:text-slate-500 dark:hover:text-slate-300";
                                 return gridjs.html(`<button onclick="window.openBenchmarkDetails(decodeURIComponent('${em}'), decodeURIComponent('${eb}'))" class="w-full text-left hover:opacity-90 transition-opacity" title="Click for per-question details">${raw}</button>`);
                             };
+                            const shouldUseBar = true;
                             def.formatter = (c, r) => {
                                 const modelName = (modelNameIdx > -1 && r.cells[modelNameIdx]) ? r.cells[modelNameIdx].data : "";