Alyafeai committed on
Commit
c7488db
·
1 Parent(s): 1482380

Change the way we read results by specifying a source type; the source type is the prefix of the filename. Separate the tasks into different lists based on the source.

Browse files
Files changed (2) hide show
  1. backend/config.py +29 -6
  2. backend/data_loader.py +69 -12
backend/config.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- from typing import Dict, List, Tuple, Optional
3
  from huggingface_hub import HfApi
4
 
5
  # --- 1. Repository & API Configuration ---
@@ -15,7 +15,7 @@ API = HfApi(token=hf_api_token)
15
 
16
  # --- 2. Task Definitions ---
17
  # Format: (dataset_key, metric_field, display_name)
18
- TASKS: List[Tuple[str, str, str]] = [
19
  ("qimma-AraDiCE-Culture:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "AraDiCE-Culture"),
20
  ("qimma-ArabicMMLU:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "ArabicMMLU"),
21
  ("qimma-ArabCulture:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "ArabCulture"),
@@ -29,14 +29,37 @@ TASKS: List[Tuple[str, str, str]] = [
29
  "MedArabiQ",
30
  ),
31
  ("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
32
- ("acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "qimma-GAT:_average|0", "GAT"),
33
  ("qimma-GAT:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "GAT"),
34
  ("qimma-MedAraBench:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "MedAraBench"),
35
- # ("humaneval", "humaneval+", "HumanEval+"),
36
- # ("mbpp", "mbpp+", "MBPP+"),
37
- # ("fann_or_flop", "fann_or_flop", "FannOrFlop"),
38
  ]
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # --- 3. Model Metadata ---
41
 
42
  MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
 
1
  import os
2
+ from typing import Dict, List, Tuple, Optional, Any
3
  from huggingface_hub import HfApi
4
 
5
  # --- 1. Repository & API Configuration ---
 
15
 
16
  # --- 2. Task Definitions ---
17
  # Format: (dataset_key, metric_field, display_name)
18
+ RESULTS_TASKS: List[Tuple[Any, Any, str]] = [
19
  ("qimma-AraDiCE-Culture:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "AraDiCE-Culture"),
20
  ("qimma-ArabicMMLU:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "ArabicMMLU"),
21
  ("qimma-ArabCulture:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "ArabCulture"),
 
29
  "MedArabiQ",
30
  ),
31
  ("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
 
32
  ("qimma-GAT:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "GAT"),
33
  ("qimma-MedAraBench:_average|0", "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)", "MedAraBench"),
 
 
 
34
  ]
35
 
36
+ CODE_TASKS: List[Tuple[Any, Any, str]] = [
37
+ ("humaneval", "humaneval+", "HumanEval+"),
38
+ ("mbpp", "mbpp+", "MBPP+"),
39
+ ]
40
+
41
+ FANNFLOP_TASKS: List[Tuple[Any, Any, str]] = [
42
+ ("fann_or_flop", "fann_or_flop", "FannOrFlop"),
43
+ ]
44
+
45
+ TASK_SOURCES: Dict[str, Dict[str, Any]] = {
46
+ "results": {
47
+ "prefixes": ["results"],
48
+ "tasks": RESULTS_TASKS,
49
+ },
50
+ "code": {
51
+ "prefixes": ["code"],
52
+ "tasks": CODE_TASKS,
53
+ },
54
+ "fannflop": {
55
+ # Keep this configurable for future rename.
56
+ "prefixes": ["fannflop"],
57
+ "tasks": FANNFLOP_TASKS,
58
+ },
59
+ }
60
+
61
+ TASKS: List[Tuple[Any, Any, str]] = RESULTS_TASKS + CODE_TASKS + FANNFLOP_TASKS
62
+
63
  # --- 3. Model Metadata ---
64
 
65
  MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
backend/data_loader.py CHANGED
@@ -18,6 +18,7 @@ from backend.config import (
18
  REQUESTS_REPO_ID,
19
  RESULTS_REPO_ID,
20
  TASKS,
 
21
  MODEL_TYPE_TO_EMOJI,
22
  hf_api_token,
23
  )
@@ -25,6 +26,16 @@ from backend.helpers import unify_precision, get_model_size
25
 
26
  logger = logging.getLogger(__name__)
27
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  # -----------------------------------------------------------------------------
30
  # Utilities
@@ -81,6 +92,14 @@ def load_requests(status: str) -> pd.DataFrame:
81
  # Results parsing
82
  # -----------------------------------------------------------------------------
83
 
 
 
 
 
 
 
 
 
84
  def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
85
  try:
86
  with open(path, "r", encoding="utf-8") as f:
@@ -88,28 +107,38 @@ def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
88
  except Exception:
89
  return None
90
 
 
 
 
 
91
  cfg = data.get("config_general", {})
92
  results = data.get("results", {})
93
 
94
- model = cfg.get("model_name", "UNK")
95
  precision = unify_precision(cfg.get("model_dtype", "UNK"))
96
 
97
- string_date = path.name.split("_")[-1].replace(".json", "")
 
 
 
 
 
98
  row = {
99
  "Model Name": model,
100
  "Precision": precision,
101
- "datetime": datetime.strptime(string_date, "%Y-%m-%dT%H-%M-%S.%f")
 
102
  }
103
-
104
- for task_key, metric_key, display in TASKS:
105
- if isinstance(task_key,list):
106
  weight_total = 0
107
  metric_total = 0
108
  for t, (m, w) in zip(task_key, metric_key):
109
  val = results.get(t, {}).get(m, 0)
110
- metric_total += (val * weight_total)
111
  weight_total += w
112
- val = metric_total/weight_total
113
  else:
114
  val = np.nan
115
  if task_key in results and metric_key in results[task_key]:
@@ -174,13 +203,38 @@ def load_scoreboard() -> pd.DataFrame:
174
  df = pd.DataFrame(rows)
175
  df["datetime"] = pd.to_datetime(df["datetime"])
176
 
177
- # sort by datetime to get the latest and remove the others
178
  df = df.sort_values("datetime", ascending=False)
179
- df = df.drop_duplicates(subset="Model Name", keep="first")
180
 
181
- # numeric
182
  task_cols = [t[2] for t in TASKS]
183
- # df[task_cols] = df[task_cols].apply(pd.to_numeric, errors="coerce")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  for col in task_cols:
185
  df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
186
  df["Average"] = df[task_cols].mean(axis=1).round(2)
@@ -211,6 +265,7 @@ def load_scoreboard() -> pd.DataFrame:
211
  if not finished.empty:
212
  finished["precision"] = finished["precision"].apply(unify_precision)
213
  meta = finished.groupby(["model", "precision"]).last().reset_index()
 
214
 
215
  def is_missing(v: Any) -> bool:
216
  return v is None or (isinstance(v, str) and not v.strip()) or pd.isna(v)
@@ -220,6 +275,8 @@ def load_scoreboard() -> pd.DataFrame:
220
  (meta["model"] == row["Model Name"]) &
221
  (meta["precision"] == row["Precision"])
222
  ]
 
 
223
  if not m.empty:
224
  m = m.iloc[0]
225
  if is_missing(row.get("License")):
 
18
  REQUESTS_REPO_ID,
19
  RESULTS_REPO_ID,
20
  TASKS,
21
+ TASK_SOURCES,
22
  MODEL_TYPE_TO_EMOJI,
23
  hf_api_token,
24
  )
 
26
 
27
  logger = logging.getLogger(__name__)
28
 
29
+ _SOURCE_BY_PREFIX = {
30
+ prefix.lower(): source
31
+ for source, cfg in TASK_SOURCES.items()
32
+ for prefix in cfg.get("prefixes", [])
33
+ }
34
+ _TASKS_BY_SOURCE = {
35
+ source: cfg.get("tasks", [])
36
+ for source, cfg in TASK_SOURCES.items()
37
+ }
38
+
39
 
40
  # -----------------------------------------------------------------------------
41
  # Utilities
 
92
  # Results parsing
93
  # -----------------------------------------------------------------------------
94
 
95
+ def _infer_source_from_filename(path: Path) -> Optional[str]:
96
+ stem = path.stem
97
+ if "_" not in stem:
98
+ return None
99
+ prefix = stem.split("_", 1)[0].lower()
100
+ return _SOURCE_BY_PREFIX.get(prefix)
101
+
102
+
103
  def _parse_result_file(path: Path) -> Optional[Dict[str, Any]]:
104
  try:
105
  with open(path, "r", encoding="utf-8") as f:
 
107
  except Exception:
108
  return None
109
 
110
+ source_type = _infer_source_from_filename(path)
111
+ if source_type is None:
112
+ return None
113
+
114
  cfg = data.get("config_general", {})
115
  results = data.get("results", {})
116
 
117
+ model = cfg.get("model_name") or data.get("model_name", "UNK")
118
  precision = unify_precision(cfg.get("model_dtype", "UNK"))
119
 
120
+ try:
121
+ string_date = path.stem.split("_", 1)[1]
122
+ parsed_dt = datetime.strptime(string_date, "%Y-%m-%dT%H-%M-%S.%f")
123
+ except Exception:
124
+ return None
125
+
126
  row = {
127
  "Model Name": model,
128
  "Precision": precision,
129
+ "datetime": parsed_dt,
130
+ "Source Type": source_type,
131
  }
132
+
133
+ for task_key, metric_key, display in _TASKS_BY_SOURCE.get(source_type, []):
134
+ if isinstance(task_key, list):
135
  weight_total = 0
136
  metric_total = 0
137
  for t, (m, w) in zip(task_key, metric_key):
138
  val = results.get(t, {}).get(m, 0)
139
+ metric_total += (val * w)
140
  weight_total += w
141
+ val = metric_total / weight_total if weight_total > 0 else np.nan
142
  else:
143
  val = np.nan
144
  if task_key in results and metric_key in results[task_key]:
 
203
  df = pd.DataFrame(rows)
204
  df["datetime"] = pd.to_datetime(df["datetime"])
205
 
206
+ # Keep latest file per (model, source), then merge source metrics per model.
207
  df = df.sort_values("datetime", ascending=False)
208
+ df = df.drop_duplicates(subset=["Model Name", "Source Type"], keep="first")
209
 
 
210
  task_cols = [t[2] for t in TASKS]
211
+ for col in task_cols:
212
+ if col not in df.columns:
213
+ df[col] = np.nan
214
+
215
+ def first_non_null(values):
216
+ for v in values:
217
+ if pd.notna(v):
218
+ return v
219
+ return np.nan
220
+
221
+ def first_valid_precision(values):
222
+ for v in values:
223
+ if isinstance(v, str) and v.strip() and v not in {"Missing", "UNK"}:
224
+ return v
225
+ for v in values:
226
+ if pd.notna(v):
227
+ return v
228
+ return "UNK"
229
+
230
+ agg_map = {
231
+ "datetime": "max",
232
+ "Precision": first_valid_precision,
233
+ }
234
+ agg_map.update({col: first_non_null for col in task_cols})
235
+ df = df.groupby("Model Name", as_index=False).agg(agg_map)
236
+
237
+ # numeric
238
  for col in task_cols:
239
  df[col] = (pd.to_numeric(df[col], errors="coerce") * 100).round(2)
240
  df["Average"] = df[task_cols].mean(axis=1).round(2)
 
265
  if not finished.empty:
266
  finished["precision"] = finished["precision"].apply(unify_precision)
267
  meta = finished.groupby(["model", "precision"]).last().reset_index()
268
+ meta_by_model = finished.groupby(["model"]).last().reset_index()
269
 
270
  def is_missing(v: Any) -> bool:
271
  return v is None or (isinstance(v, str) and not v.strip()) or pd.isna(v)
 
275
  (meta["model"] == row["Model Name"]) &
276
  (meta["precision"] == row["Precision"])
277
  ]
278
+ if m.empty:
279
+ m = meta_by_model[meta_by_model["model"] == row["Model Name"]]
280
  if not m.empty:
281
  m = m.iloc[0]
282
  if is_missing(row.get("License")):