Spaces:

qimma
/

leaderboard

Running on CPU Upgrade

App Files Files Community

Alyafeai committed on Feb 23

Commit

c7488db

1 Parent(s): 1482380

Change the way we read results by specifying a source type; the source type is the prefix of the filename. Separate the tasks into different lists based on the source.

Browse files

Files changed (2) hide show

backend/config.py +29 -6
backend/data_loader.py +69 -12

backend/config.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import Dict, List, Tuple, Optional
 from huggingface_hub import HfApi
 # --- 1. Repository & API Configuration ---
@@ -15,7 +15,7 @@ API = HfApi(token=hf_api_token)
 # --- 2. Task Definitions ---
 # Format: (dataset_key, metric_field, display_name)
-TASKS: List[Tuple[str, str, str]] = [
     ("qimma-AraDiCE-Culture:_average|0",    "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",     "AraDiCE-Culture"),
     ("qimma-ArabicMMLU:_average|0",         "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",     "ArabicMMLU"),
     ("qimma-ArabCulture:_average|0",        "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",     "ArabCulture"),
@@ -29,14 +29,37 @@ TASKS: List[Tuple[str, str, str]] = [
         "MedArabiQ",
     ),
     ("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
-    ("acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "qimma-GAT:_average|0", "GAT"),
     ("qimma-GAT:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "GAT"),
     ("qimma-MedAraBench:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "MedAraBench"),
-    # ("humaneval", "humaneval+", "HumanEval+"),
-    # ("mbpp", "mbpp+", "MBPP+"),
-    # ("fann_or_flop", "fann_or_flop", "FannOrFlop"),
 ]
 # --- 3. Model Metadata ---
 MODEL_TYPE_TO_EMOJI: Dict[str, str] = {

 import os
+from typing import Dict, List, Tuple, Optional, Any
 from huggingface_hub import HfApi
 # --- 1. Repository & API Configuration ---
 # --- 2. Task Definitions ---
 # Format: (dataset_key, metric_field, display_name)
+RESULTS_TASKS: List[Tuple[Any, Any, str]] = [
     ("qimma-AraDiCE-Culture:_average|0",    "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",     "AraDiCE-Culture"),
     ("qimma-ArabicMMLU:_average|0",         "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",     "ArabicMMLU"),
     ("qimma-ArabCulture:_average|0",        "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",     "ArabCulture"),
         "MedArabiQ",
     ),
     ("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
     ("qimma-GAT:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "GAT"),
     ("qimma-MedAraBench:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "MedAraBench"),
 ]
+CODE_TASKS: List[Tuple[Any, Any, str]] = [
+    ("humaneval", "humaneval+", "HumanEval+"),
+    ("mbpp", "mbpp+", "MBPP+"),
+]
+FANNFLOP_TASKS: List[Tuple[Any, Any, str]] = [
+    ("fann_or_flop", "fann_or_flop", "FannOrFlop"),
+]
+TASK_SOURCES: Dict[str, Dict[str, Any]] = {
+    "results": {
+        "prefixes": ["results"],
+        "tasks": RESULTS_TASKS,
+    },
+    "code": {
+        "prefixes": ["code"],
+        "tasks": CODE_TASKS,
+    },
+    "fannflop": {
+        # Keep this configurable for future rename.
+        "prefixes": ["fannflop"],
+        "tasks": FANNFLOP_TASKS,
+    },
+}
+TASKS: List[Tuple[Any, Any, str]] = RESULTS_TASKS + CODE_TASKS + FANNFLOP_TASKS
 # --- 3. Model Metadata ---
 MODEL_TYPE_TO_EMOJI: Dict[str, str] = {

backend/data_loader.py CHANGED Viewed

@@ -18,6 +18,7 @@ from backend.config import (
     REQUESTS_REPO_ID,
     RESULTS_REPO_ID,
     TASKS,
     MODEL_TYPE_TO_EMOJI,
     hf_api_token,
 )
@@ -25,6 +26,16 @@ from backend.helpers import unify_precision, get_model_size
 logger = logging.getLogger(__name__)
 # -----------------------------------------------------------------------------
 # Utilities
@@ -81,6 +92,14 @@ def load_requests(status: str) -> pd.DataFrame:
 # Results parsing
 # -----------------------------------------------------------------------------
 def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
     try:
         with open(path, "r", encoding="utf-8") as f:
@@ -88,28 +107,38 @@ def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
     except Exception:
         return None
     cfg = data.get("config_general", {})
     results = data.get("results", {})
-    model = cfg.get("model_name", "UNK")
     precision = unify_precision(cfg.get("model_dtype", "UNK"))
-    string_date = path.name.split("_")[-1].replace(".json", "")
     row = {
         "Model Name": model,
         "Precision": precision,
-        "datetime": datetime.strptime(string_date, "%Y-%m-%dT%H-%M-%S.%f")
     }
-    for task_key, metric_key, display in TASKS:
-        if isinstance(task_key,list):
             weight_total = 0
             metric_total = 0
             for t, (m, w) in zip(task_key, metric_key):
                 val = results.get(t, {}).get(m, 0)
-                metric_total += (val * weight_total)
                 weight_total += w
-            val = metric_total/weight_total
         else:
             val = np.nan
             if task_key in results and metric_key in results[task_key]:
@@ -174,13 +203,38 @@ def load_scoreboard() -> pd.DataFrame:
     df = pd.DataFrame(rows)
     df["datetime"] = pd.to_datetime(df["datetime"])
-    # sort by datetime to get the latest and remove the others
     df = df.sort_values("datetime", ascending=False)
-    df = df.drop_duplicates(subset="Model Name", keep="first")
-    # numeric
     task_cols = [t[2] for t in TASKS]
-    # df[task_cols] = df[task_cols].apply(pd.to_numeric, errors="coerce")
     for col in task_cols:
         df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
     df["Average"] = df[task_cols].mean(axis=1).round(2)
@@ -211,6 +265,7 @@ def load_scoreboard() -> pd.DataFrame:
     if not finished.empty:
         finished["precision"] = finished["precision"].apply(unify_precision)
         meta = finished.groupby(["model", "precision"]).last().reset_index()
         def is_missing(v: Any) -> bool:
             return v is None or (isinstance(v, str) and not v.strip()) or pd.isna(v)
@@ -220,6 +275,8 @@ def load_scoreboard() -> pd.DataFrame:
                 (meta["model"] == row["Model Name"]) &
                 (meta["precision"] == row["Precision"])
             ]
             if not m.empty:
                 m = m.iloc[0]
                 if is_missing(row.get("License")):

     REQUESTS_REPO_ID,
     RESULTS_REPO_ID,
     TASKS,
+    TASK_SOURCES,
     MODEL_TYPE_TO_EMOJI,
     hf_api_token,
 )
 logger = logging.getLogger(__name__)
+_SOURCE_BY_PREFIX = {
+    prefix.lower(): source
+    for source, cfg in TASK_SOURCES.items()
+    for prefix in cfg.get("prefixes", [])
+}
+_TASKS_BY_SOURCE = {
+    source: cfg.get("tasks", [])
+    for source, cfg in TASK_SOURCES.items()
+}
 # -----------------------------------------------------------------------------
 # Utilities
 # Results parsing
 # -----------------------------------------------------------------------------
+def _infer_source_from_filename(path: Path) -> Optional[str]:
+    stem = path.stem
+    if "_" not in stem:
+        return None
+    prefix = stem.split("_", 1)[0].lower()
+    return _SOURCE_BY_PREFIX.get(prefix)
 def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
     try:
         with open(path, "r", encoding="utf-8") as f:
     except Exception:
         return None
+    source_type = _infer_source_from_filename(path)
+    if source_type is None:
+        return None
     cfg = data.get("config_general", {})
     results = data.get("results", {})
+    model = cfg.get("model_name") or data.get("model_name", "UNK")
     precision = unify_precision(cfg.get("model_dtype", "UNK"))
+    try:
+        string_date = path.stem.split("_", 1)[1]
+        parsed_dt = datetime.strptime(string_date, "%Y-%m-%dT%H-%M-%S.%f")
+    except Exception:
+        return None
     row = {
         "Model Name": model,
         "Precision": precision,
+        "datetime": parsed_dt,
+        "Source Type": source_type,
     }
+    for task_key, metric_key, display in _TASKS_BY_SOURCE.get(source_type, []):
+        if isinstance(task_key, list):
             weight_total = 0
             metric_total = 0
             for t, (m, w) in zip(task_key, metric_key):
                 val = results.get(t, {}).get(m, 0)
+                metric_total += (val * w)
                 weight_total += w
+            val = metric_total / weight_total if weight_total > 0 else np.nan
         else:
             val = np.nan
             if task_key in results and metric_key in results[task_key]:
     df = pd.DataFrame(rows)
     df["datetime"] = pd.to_datetime(df["datetime"])
+    # Keep latest file per (model, source), then merge source metrics per model.
     df = df.sort_values("datetime", ascending=False)
+    df = df.drop_duplicates(subset=["Model Name", "Source Type"], keep="first")
     task_cols = [t[2] for t in TASKS]
+    for col in task_cols:
+        if col not in df.columns:
+            df[col] = np.nan
+    def first_non_null(values):
+        for v in values:
+            if pd.notna(v):
+                return v
+        return np.nan
+    def first_valid_precision(values):
+        for v in values:
+            if isinstance(v, str) and v.strip() and v not in {"Missing", "UNK"}:
+                return v
+        for v in values:
+            if pd.notna(v):
+                return v
+        return "UNK"
+    agg_map = {
+        "datetime": "max",
+        "Precision": first_valid_precision,
+    }
+    agg_map.update({col: first_non_null for col in task_cols})
+    df = df.groupby("Model Name", as_index=False).agg(agg_map)
+    # numeric
     for col in task_cols:
         df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
     df["Average"] = df[task_cols].mean(axis=1).round(2)
     if not finished.empty:
         finished["precision"] = finished["precision"].apply(unify_precision)
         meta = finished.groupby(["model", "precision"]).last().reset_index()
+        meta_by_model = finished.groupby(["model"]).last().reset_index()
         def is_missing(v: Any) -> bool:
             return v is None or (isinstance(v, str) and not v.strip()) or pd.isna(v)
                 (meta["model"] == row["Model Name"]) &
                 (meta["precision"] == row["Precision"])
             ]
+            if m.empty:
+                m = meta_by_model[meta_by_model["model"] == row["Model Name"]]
             if not m.empty:
                 m = m.iloc[0]
                 if is_missing(row.get("License")):