Commit bdb64b6
Parent(s): 9bb611a
Fix UI leaderboard fields and commit uv.lock
- .gitignore +0 -1
- server/benchmark_store.py +12 -5
- uv.lock +0 -0
.gitignore
CHANGED
@@ -13,4 +13,3 @@ test_output.txt
 unnecessary/
 results/
 .pytest_cache/
-uv.lock
server/benchmark_store.py
CHANGED
@@ -39,10 +39,10 @@ def append_result(model: str, model_id: str, scores: Dict[str, float]) -> Dict:
     """Add a new benchmark result and persist to disk. Returns the saved entry."""
     avg = round(sum(scores.values()) / max(len(scores), 1), 4)
     entry = {
-        'model': model,
+        'model_name': model,
         'model_id': model_id,
         'scores': scores,
-        'avg': avg,
+        'average': avg,
         'type': 'full_run',
         'timestamp': datetime.utcnow().isoformat(),
     }
@@ -55,6 +55,11 @@ def append_result(model: str, model_id: str, scores: Dict[str, float]) -> Dict:
 def get_all() -> List[Dict]:
     """Return all benchmark results, newest first."""
     results = _load()
+    for r in results:
+        if 'average' not in r and 'avg' in r:
+            r['average'] = r['avg']
+        if 'model_name' not in r and 'model' in r:
+            r['model_name'] = r['model']
     return sorted(results, key=lambda x: x.get('timestamp', ''), reverse=True)
 
 
@@ -63,7 +68,9 @@ def get_leaderboard() -> List[Dict]:
     results = _load()
     best: Dict[str, Dict] = {}
     for r in results:
-        mid = r.get('model_id', r.get('model', 'unknown'))
-        if mid not in best or r.get('avg', 0) > best[mid].get('avg', 0):
+        mid = r.get('model_id', r.get('model_name', r.get('model', 'unknown')))
+        val = r.get('average', r.get('avg', 0))
+        best_val = best[mid].get('average', best[mid].get('avg', 0)) if mid in best else -1
+        if mid not in best or val > best_val:
             best[mid] = r
-    return sorted(best.values(), key=lambda x: x.get('avg', 0), reverse=True)
+    return sorted(best.values(), key=lambda x: x.get('average', x.get('avg', 0)), reverse=True)
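For reference, a minimal runnable sketch of what these hunks do, using hypothetical sample records (not data from the repo): get_all() backfills the new 'average' and 'model_name' keys onto entries written before this commit, and get_leaderboard() keeps only the best-scoring run per model_id, reading 'average' with a fallback to the legacy 'avg' key.

from typing import Dict, List

# Hypothetical sample records (not from the repo): one written before the
# commit (old 'model'/'avg' keys) and one written after ('model_name'/'average').
results: List[Dict] = [
    {'model': 'demo-model', 'model_id': 'demo', 'avg': 0.71,
     'timestamp': '2024-01-01T00:00:00'},
    {'model_name': 'demo-model', 'model_id': 'demo', 'average': 0.74,
     'timestamp': '2024-02-01T00:00:00'},
]

# get_all(): backfill the new field names so the UI can rely on them.
for r in results:
    if 'average' not in r and 'avg' in r:
        r['average'] = r['avg']
    if 'model_name' not in r and 'model' in r:
        r['model_name'] = r['model']

# get_leaderboard(): keep the highest-scoring entry per model_id.
best: Dict[str, Dict] = {}
for r in results:
    mid = r.get('model_id', r.get('model_name', r.get('model', 'unknown')))
    val = r.get('average', r.get('avg', 0))
    best_val = best[mid].get('average', best[mid].get('avg', 0)) if mid in best else -1
    if mid not in best or val > best_val:
        best[mid] = r

leaderboard = sorted(best.values(), key=lambda x: x.get('average', x.get('avg', 0)), reverse=True)
print(leaderboard[0]['average'])  # 0.74 -- the better of the two runs for 'demo'

Because the leaderboard reads 'average' with an 'avg' fallback, entries written before and after this commit can coexist in the same store without a separate migration step.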
uv.lock
ADDED
The diff for this file is too large to render. See raw diff.