Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Change the way we read results by specifying a source type; the source type is the prefix of the filename. Separate the tasks into different lists based on the source.
Browse files
- backend/config.py +29 -6
- backend/data_loader.py +69 -12
backend/config.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
-
from typing import Dict, List, Tuple, Optional
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
# --- 1. Repository & API Configuration ---
|
|
@@ -15,7 +15,7 @@ API = HfApi(token=hf_api_token)
|
|
| 15 |
|
| 16 |
# --- 2. Task Definitions ---
|
| 17 |
# Format: (dataset_key, metric_field, display_name)
|
| 18 |
-
|
| 19 |
("qimma-AraDiCE-Culture:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "AraDiCE-Culture"),
|
| 20 |
("qimma-ArabicMMLU:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "ArabicMMLU"),
|
| 21 |
("qimma-ArabCulture:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "ArabCulture"),
|
|
@@ -29,14 +29,37 @@ TASKS: List[Tuple[str, str, str]] = [
|
|
| 29 |
"MedArabiQ",
|
| 30 |
),
|
| 31 |
("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
|
| 32 |
-
("acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "qimma-GAT:_average|0", "GAT"),
|
| 33 |
("qimma-GAT:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "GAT"),
|
| 34 |
("qimma-MedAraBench:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "MedAraBench"),
|
| 35 |
-
# ("humaneval", "humaneval+", "HumanEval+"),
|
| 36 |
-
# ("mbpp", "mbpp+", "MBPP+"),
|
| 37 |
-
# ("fann_or_flop", "fann_or_flop", "FannOrFlop"),
|
| 38 |
]
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
# --- 3. Model Metadata ---
|
| 41 |
|
| 42 |
MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
|
|
|
|
| 1 |
import os
|
| 2 |
+
from typing import Dict, List, Tuple, Optional, Any
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
# --- 1. Repository & API Configuration ---
|
|
|
|
| 15 |
|
| 16 |
# --- 2. Task Definitions ---
|
| 17 |
# Format: (dataset_key, metric_field, display_name)
|
| 18 |
+
RESULTS_TASKS: List[Tuple[Any, Any, str]] = [
|
| 19 |
("qimma-AraDiCE-Culture:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "AraDiCE-Culture"),
|
| 20 |
("qimma-ArabicMMLU:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "ArabicMMLU"),
|
| 21 |
("qimma-ArabCulture:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "ArabCulture"),
|
|
|
|
| 29 |
"MedArabiQ",
|
| 30 |
),
|
| 31 |
("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
|
|
|
|
| 32 |
("qimma-GAT:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "GAT"),
|
| 33 |
("qimma-MedAraBench:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "MedAraBench"),
|
|
|
|
|
|
|
|
|
|
| 34 |
]
|
| 35 |
|
| 36 |
+
CODE_TASKS: List[Tuple[Any, Any, str]] = [
|
| 37 |
+
("humaneval", "humaneval+", "HumanEval+"),
|
| 38 |
+
("mbpp", "mbpp+", "MBPP+"),
|
| 39 |
+
]
|
| 40 |
+
|
| 41 |
+
FANNFLOP_TASKS: List[Tuple[Any, Any, str]] = [
|
| 42 |
+
("fann_or_flop", "fann_or_flop", "FannOrFlop"),
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
TASK_SOURCES: Dict[str, Dict[str, Any]] = {
|
| 46 |
+
"results": {
|
| 47 |
+
"prefixes": ["results"],
|
| 48 |
+
"tasks": RESULTS_TASKS,
|
| 49 |
+
},
|
| 50 |
+
"code": {
|
| 51 |
+
"prefixes": ["code"],
|
| 52 |
+
"tasks": CODE_TASKS,
|
| 53 |
+
},
|
| 54 |
+
"fannflop": {
|
| 55 |
+
# Keep this configurable for future rename.
|
| 56 |
+
"prefixes": ["fannflop"],
|
| 57 |
+
"tasks": FANNFLOP_TASKS,
|
| 58 |
+
},
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
TASKS: List[Tuple[Any, Any, str]] = RESULTS_TASKS + CODE_TASKS + FANNFLOP_TASKS
|
| 62 |
+
|
| 63 |
# --- 3. Model Metadata ---
|
| 64 |
|
| 65 |
MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
|
backend/data_loader.py
CHANGED
|
@@ -18,6 +18,7 @@ from backend.config import (
|
|
| 18 |
REQUESTS_REPO_ID,
|
| 19 |
RESULTS_REPO_ID,
|
| 20 |
TASKS,
|
|
|
|
| 21 |
MODEL_TYPE_TO_EMOJI,
|
| 22 |
hf_api_token,
|
| 23 |
)
|
|
@@ -25,6 +26,16 @@ from backend.helpers import unify_precision, get_model_size
|
|
| 25 |
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# -----------------------------------------------------------------------------
|
| 30 |
# Utilities
|
|
@@ -81,6 +92,14 @@ def load_requests(status: str) -> pd.DataFrame:
|
|
| 81 |
# Results parsing
|
| 82 |
# -----------------------------------------------------------------------------
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
|
| 85 |
try:
|
| 86 |
with open(path, "r", encoding="utf-8") as f:
|
|
@@ -88,28 +107,38 @@ def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
|
|
| 88 |
except Exception:
|
| 89 |
return None
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
cfg = data.get("config_general", {})
|
| 92 |
results = data.get("results", {})
|
| 93 |
|
| 94 |
-
model = cfg.get("model_name", "UNK")
|
| 95 |
precision = unify_precision(cfg.get("model_dtype", "UNK"))
|
| 96 |
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
row = {
|
| 99 |
"Model Name": model,
|
| 100 |
"Precision": precision,
|
| 101 |
-
"datetime":
|
|
|
|
| 102 |
}
|
| 103 |
-
|
| 104 |
-
for task_key, metric_key, display in
|
| 105 |
-
if isinstance(task_key,list):
|
| 106 |
weight_total = 0
|
| 107 |
metric_total = 0
|
| 108 |
for t, (m, w) in zip(task_key, metric_key):
|
| 109 |
val = results.get(t, {}).get(m, 0)
|
| 110 |
-
metric_total += (val *
|
| 111 |
weight_total += w
|
| 112 |
-
val = metric_total/weight_total
|
| 113 |
else:
|
| 114 |
val = np.nan
|
| 115 |
if task_key in results and metric_key in results[task_key]:
|
|
@@ -174,13 +203,38 @@ def load_scoreboard() -> pd.DataFrame:
|
|
| 174 |
df = pd.DataFrame(rows)
|
| 175 |
df["datetime"] = pd.to_datetime(df["datetime"])
|
| 176 |
|
| 177 |
-
#
|
| 178 |
df = df.sort_values("datetime", ascending=False)
|
| 179 |
-
df = df.drop_duplicates(subset="Model Name", keep="first")
|
| 180 |
|
| 181 |
-
# numeric
|
| 182 |
task_cols = [t[2] for t in TASKS]
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
for col in task_cols:
|
| 185 |
df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
|
| 186 |
df["Average"] = df[task_cols].mean(axis=1).round(2)
|
|
@@ -211,6 +265,7 @@ def load_scoreboard() -> pd.DataFrame:
|
|
| 211 |
if not finished.empty:
|
| 212 |
finished["precision"] = finished["precision"].apply(unify_precision)
|
| 213 |
meta = finished.groupby(["model", "precision"]).last().reset_index()
|
|
|
|
| 214 |
|
| 215 |
def is_missing(v: Any) -> bool:
|
| 216 |
return v is None or (isinstance(v, str) and not v.strip()) or pd.isna(v)
|
|
@@ -220,6 +275,8 @@ def load_scoreboard() -> pd.DataFrame:
|
|
| 220 |
(meta["model"] == row["Model Name"]) &
|
| 221 |
(meta["precision"] == row["Precision"])
|
| 222 |
]
|
|
|
|
|
|
|
| 223 |
if not m.empty:
|
| 224 |
m = m.iloc[0]
|
| 225 |
if is_missing(row.get("License")):
|
|
|
|
| 18 |
REQUESTS_REPO_ID,
|
| 19 |
RESULTS_REPO_ID,
|
| 20 |
TASKS,
|
| 21 |
+
TASK_SOURCES,
|
| 22 |
MODEL_TYPE_TO_EMOJI,
|
| 23 |
hf_api_token,
|
| 24 |
)
|
|
|
|
| 26 |
|
| 27 |
logger = logging.getLogger(__name__)
|
| 28 |
|
| 29 |
+
_SOURCE_BY_PREFIX = {
|
| 30 |
+
prefix.lower(): source
|
| 31 |
+
for source, cfg in TASK_SOURCES.items()
|
| 32 |
+
for prefix in cfg.get("prefixes", [])
|
| 33 |
+
}
|
| 34 |
+
_TASKS_BY_SOURCE = {
|
| 35 |
+
source: cfg.get("tasks", [])
|
| 36 |
+
for source, cfg in TASK_SOURCES.items()
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
|
| 40 |
# -----------------------------------------------------------------------------
|
| 41 |
# Utilities
|
|
|
|
| 92 |
# Results parsing
|
| 93 |
# -----------------------------------------------------------------------------
|
| 94 |
|
| 95 |
+
def _infer_source_from_filename(path: Path) -> Optional[str]:
|
| 96 |
+
stem = path.stem
|
| 97 |
+
if "_" not in stem:
|
| 98 |
+
return None
|
| 99 |
+
prefix = stem.split("_", 1)[0].lower()
|
| 100 |
+
return _SOURCE_BY_PREFIX.get(prefix)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
|
| 104 |
try:
|
| 105 |
with open(path, "r", encoding="utf-8") as f:
|
|
|
|
| 107 |
except Exception:
|
| 108 |
return None
|
| 109 |
|
| 110 |
+
source_type = _infer_source_from_filename(path)
|
| 111 |
+
if source_type is None:
|
| 112 |
+
return None
|
| 113 |
+
|
| 114 |
cfg = data.get("config_general", {})
|
| 115 |
results = data.get("results", {})
|
| 116 |
|
| 117 |
+
model = cfg.get("model_name") or data.get("model_name", "UNK")
|
| 118 |
precision = unify_precision(cfg.get("model_dtype", "UNK"))
|
| 119 |
|
| 120 |
+
try:
|
| 121 |
+
string_date = path.stem.split("_", 1)[1]
|
| 122 |
+
parsed_dt = datetime.strptime(string_date, "%Y-%m-%dT%H-%M-%S.%f")
|
| 123 |
+
except Exception:
|
| 124 |
+
return None
|
| 125 |
+
|
| 126 |
row = {
|
| 127 |
"Model Name": model,
|
| 128 |
"Precision": precision,
|
| 129 |
+
"datetime": parsed_dt,
|
| 130 |
+
"Source Type": source_type,
|
| 131 |
}
|
| 132 |
+
|
| 133 |
+
for task_key, metric_key, display in _TASKS_BY_SOURCE.get(source_type, []):
|
| 134 |
+
if isinstance(task_key, list):
|
| 135 |
weight_total = 0
|
| 136 |
metric_total = 0
|
| 137 |
for t, (m, w) in zip(task_key, metric_key):
|
| 138 |
val = results.get(t, {}).get(m, 0)
|
| 139 |
+
metric_total += (val * w)
|
| 140 |
weight_total += w
|
| 141 |
+
val = metric_total / weight_total if weight_total > 0 else np.nan
|
| 142 |
else:
|
| 143 |
val = np.nan
|
| 144 |
if task_key in results and metric_key in results[task_key]:
|
|
|
|
| 203 |
df = pd.DataFrame(rows)
|
| 204 |
df["datetime"] = pd.to_datetime(df["datetime"])
|
| 205 |
|
| 206 |
+
# Keep latest file per (model, source), then merge source metrics per model.
|
| 207 |
df = df.sort_values("datetime", ascending=False)
|
| 208 |
+
df = df.drop_duplicates(subset=["Model Name", "Source Type"], keep="first")
|
| 209 |
|
|
|
|
| 210 |
task_cols = [t[2] for t in TASKS]
|
| 211 |
+
for col in task_cols:
|
| 212 |
+
if col not in df.columns:
|
| 213 |
+
df[col] = np.nan
|
| 214 |
+
|
| 215 |
+
def first_non_null(values):
|
| 216 |
+
for v in values:
|
| 217 |
+
if pd.notna(v):
|
| 218 |
+
return v
|
| 219 |
+
return np.nan
|
| 220 |
+
|
| 221 |
+
def first_valid_precision(values):
|
| 222 |
+
for v in values:
|
| 223 |
+
if isinstance(v, str) and v.strip() and v not in {"Missing", "UNK"}:
|
| 224 |
+
return v
|
| 225 |
+
for v in values:
|
| 226 |
+
if pd.notna(v):
|
| 227 |
+
return v
|
| 228 |
+
return "UNK"
|
| 229 |
+
|
| 230 |
+
agg_map = {
|
| 231 |
+
"datetime": "max",
|
| 232 |
+
"Precision": first_valid_precision,
|
| 233 |
+
}
|
| 234 |
+
agg_map.update({col: first_non_null for col in task_cols})
|
| 235 |
+
df = df.groupby("Model Name", as_index=False).agg(agg_map)
|
| 236 |
+
|
| 237 |
+
# numeric
|
| 238 |
for col in task_cols:
|
| 239 |
df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
|
| 240 |
df["Average"] = df[task_cols].mean(axis=1).round(2)
|
|
|
|
| 265 |
if not finished.empty:
|
| 266 |
finished["precision"] = finished["precision"].apply(unify_precision)
|
| 267 |
meta = finished.groupby(["model", "precision"]).last().reset_index()
|
| 268 |
+
meta_by_model = finished.groupby(["model"]).last().reset_index()
|
| 269 |
|
| 270 |
def is_missing(v: Any) -> bool:
|
| 271 |
return v is None or (isinstance(v, str) and not v.strip()) or pd.isna(v)
|
|
|
|
| 275 |
(meta["model"] == row["Model Name"]) &
|
| 276 |
(meta["precision"] == row["Precision"])
|
| 277 |
]
|
| 278 |
+
if m.empty:
|
| 279 |
+
m = meta_by_model[meta_by_model["model"] == row["Model Name"]]
|
| 280 |
if not m.empty:
|
| 281 |
m = m.iloc[0]
|
| 282 |
if is_missing(row.get("License")):
|