Spaces:
Running
Running
Empty PR for testing
#9
by gneubig - opened
- .gitattributes +0 -1
- alternative_agents_page.py +0 -103
- app.py +4 -31
- assets/harnesses/README.md +0 -59
- assets/harnesses/claude-code.svg +0 -1
- assets/harnesses/codex-cli.svg +0 -1
- assets/harnesses/gemini-cli.svg +0 -1
- assets/harnesses/openhands.svg +0 -1
- docs/screenshots/alternative-agents.png +0 -3
- leaderboard_transformer.py +56 -211
- main_page.py +41 -81
- setup_data.py +5 -17
- simple_data_loader.py +59 -155
- tests/test_runtime_sorting.py +0 -40
- ui_components.py +19 -32
.gitattributes
CHANGED
|
@@ -1,2 +1 @@
|
|
| 1 |
|
| 2 |
-
docs/screenshots/alternative-agents.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
|
|
|
alternative_agents_page.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
| 1 |
-
"""Alternative Agents leaderboard page.
|
| 2 |
-
|
| 3 |
-
The canonical OpenHands Index leaderboard (Home + the per-category pages)
|
| 4 |
-
ranks default OpenHands agent runs from ``results/{model}/`` in the
|
| 5 |
-
openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
|
| 6 |
-
Gemini CLI, OpenHands Sub-agents, ...) live under
|
| 7 |
-
``alternative_agents/{type}/{model}/`` and aren't directly comparable to
|
| 8 |
-
default OpenHands runs (different scaffolds, different cost/runtime
|
| 9 |
-
characteristics), so they get their own standalone page instead of being
|
| 10 |
-
mixed into the same ranking.
|
| 11 |
-
|
| 12 |
-
This page is intentionally a single Overall view (no per-category
|
| 13 |
-
subpages) — the alternative-agents dataset is small (one row per
|
| 14 |
-
harness × model) and the goal is "show me all the alternatives at a
|
| 15 |
-
glance", not "drill into Issue Resolution for Codex".
|
| 16 |
-
|
| 17 |
-
To make same-model comparisons easier, the page also appends canonical
|
| 18 |
-
OpenHands rows for any language model that appears in the alternative
|
| 19 |
-
agent dataset. The match is exact, so ``Gemini-3-Pro`` and
|
| 20 |
-
``Gemini-3.1-Pro`` remain distinct entries.
|
| 21 |
-
"""
|
| 22 |
-
import matplotlib
|
| 23 |
-
matplotlib.use('Agg')
|
| 24 |
-
import pandas as pd
|
| 25 |
-
import gradio as gr
|
| 26 |
-
|
| 27 |
-
from simple_data_loader import SimpleLeaderboardViewer
|
| 28 |
-
from ui_components import (
|
| 29 |
-
create_leaderboard_display,
|
| 30 |
-
get_full_leaderboard_data,
|
| 31 |
-
)
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
ALTERNATIVE_AGENTS_INTRO = """
|
| 35 |
-
<div id="alternative-agents-intro">
|
| 36 |
-
<h2>Alternative Agents</h2>
|
| 37 |
-
<p>
|
| 38 |
-
Third-party agent harnesses running the OpenHands Index benchmarks.
|
| 39 |
-
To make direct comparisons easier, this page also includes the
|
| 40 |
-
canonical OpenHands row whenever the exact same language model appears
|
| 41 |
-
under an alternative harness. Cost and runtime numbers still come from
|
| 42 |
-
each harness's own instrumentation and aren't directly comparable
|
| 43 |
-
across harnesses.
|
| 44 |
-
</p>
|
| 45 |
-
</div>
|
| 46 |
-
"""
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
def _append_openhands_shared_models(
|
| 50 |
-
alternative_df: pd.DataFrame,
|
| 51 |
-
split: str,
|
| 52 |
-
) -> pd.DataFrame:
|
| 53 |
-
if alternative_df.empty or "Language Model" not in alternative_df.columns:
|
| 54 |
-
return alternative_df
|
| 55 |
-
|
| 56 |
-
openhands_df, _ = get_full_leaderboard_data(
|
| 57 |
-
split,
|
| 58 |
-
agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
|
| 59 |
-
)
|
| 60 |
-
if openhands_df.empty or "Language Model" not in openhands_df.columns:
|
| 61 |
-
return alternative_df
|
| 62 |
-
|
| 63 |
-
alternative_models = set(
|
| 64 |
-
alternative_df["Language Model"].dropna().astype(str).str.strip()
|
| 65 |
-
)
|
| 66 |
-
if not alternative_models:
|
| 67 |
-
return alternative_df
|
| 68 |
-
|
| 69 |
-
openhands_shared_df = openhands_df[
|
| 70 |
-
openhands_df["Language Model"].astype(str).str.strip().isin(alternative_models)
|
| 71 |
-
].copy()
|
| 72 |
-
if openhands_shared_df.empty:
|
| 73 |
-
return alternative_df
|
| 74 |
-
|
| 75 |
-
return pd.concat([alternative_df, openhands_shared_df], ignore_index=True, sort=False)
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
def build_page():
|
| 79 |
-
gr.HTML(ALTERNATIVE_AGENTS_INTRO)
|
| 80 |
-
|
| 81 |
-
gr.Markdown("---")
|
| 82 |
-
|
| 83 |
-
test_df, test_tag_map = get_full_leaderboard_data(
|
| 84 |
-
"test",
|
| 85 |
-
agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
|
| 86 |
-
)
|
| 87 |
-
|
| 88 |
-
if test_df.empty:
|
| 89 |
-
gr.Markdown(
|
| 90 |
-
"No alternative agent submissions yet. New runs land in "
|
| 91 |
-
"`alternative_agents/{type}/{model}/` in "
|
| 92 |
-
"[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
|
| 93 |
-
)
|
| 94 |
-
return
|
| 95 |
-
|
| 96 |
-
test_df = _append_openhands_shared_models(test_df, split="test")
|
| 97 |
-
|
| 98 |
-
create_leaderboard_display(
|
| 99 |
-
full_df=test_df,
|
| 100 |
-
tag_map=test_tag_map,
|
| 101 |
-
category_name="Overall",
|
| 102 |
-
split_name="test",
|
| 103 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -35,7 +35,6 @@ from app_creation import build_page as build_app_creation_page
|
|
| 35 |
from frontend_development import build_page as build_frontend_page
|
| 36 |
from test_generation import build_page as build_test_generation_page
|
| 37 |
from information_gathering import build_page as build_information_gathering_page
|
| 38 |
-
from alternative_agents_page import build_page as build_alternative_agents_page
|
| 39 |
from about import build_page as build_about_page
|
| 40 |
|
| 41 |
logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
|
|
@@ -374,46 +373,20 @@ with demo.route("Testing", "/testing"):
|
|
| 374 |
with demo.route("Information Gathering", "/information-gathering"):
|
| 375 |
build_information_gathering_page()
|
| 376 |
|
| 377 |
-
with demo.route("Alternative Agents", "/alternative-agents"):
|
| 378 |
-
build_alternative_agents_page()
|
| 379 |
-
|
| 380 |
with demo.route("About", "/about"):
|
| 381 |
build_about_page()
|
| 382 |
|
| 383 |
logger.info("All routes configured")
|
| 384 |
|
| 385 |
# Mount the REST API on /api
|
| 386 |
-
from fastapi import FastAPI
|
| 387 |
-
from fastapi.responses import RedirectResponse
|
| 388 |
-
from starlette.middleware.base import BaseHTTPMiddleware
|
| 389 |
from api import api_app
|
| 390 |
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
"""Middleware to redirect root path "/" to "/home".
|
| 394 |
-
|
| 395 |
-
This fixes the 307 trailing slash redirect issue (Gradio bug #11071) that
|
| 396 |
-
occurs when Gradio is mounted at "/" - FastAPI's default behavior redirects
|
| 397 |
-
"/" to "//", which breaks routing on HuggingFace Spaces.
|
| 398 |
-
|
| 399 |
-
See: https://github.com/gradio-app/gradio/issues/11071
|
| 400 |
-
"""
|
| 401 |
-
async def dispatch(self, request: Request, call_next):
|
| 402 |
-
if request.url.path == "/":
|
| 403 |
-
return RedirectResponse(url="/home", status_code=302)
|
| 404 |
-
return await call_next(request)
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
# Create a parent FastAPI app with redirect_slashes=False to prevent
|
| 408 |
-
# automatic trailing slash redirects that cause issues with Gradio
|
| 409 |
-
root_app = FastAPI(redirect_slashes=False)
|
| 410 |
-
|
| 411 |
-
# Add middleware to handle root path redirect to /home
|
| 412 |
-
root_app.add_middleware(RootRedirectMiddleware)
|
| 413 |
-
|
| 414 |
root_app.mount("/api", api_app)
|
| 415 |
|
| 416 |
-
# Mount Gradio app
|
| 417 |
app = gr.mount_gradio_app(root_app, demo, path="/")
|
| 418 |
logger.info("REST API mounted at /api, Gradio app mounted at /")
|
| 419 |
|
|
|
|
| 35 |
from frontend_development import build_page as build_frontend_page
|
| 36 |
from test_generation import build_page as build_test_generation_page
|
| 37 |
from information_gathering import build_page as build_information_gathering_page
|
|
|
|
| 38 |
from about import build_page as build_about_page
|
| 39 |
|
| 40 |
logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
|
|
|
|
| 373 |
with demo.route("Information Gathering", "/information-gathering"):
|
| 374 |
build_information_gathering_page()
|
| 375 |
|
|
|
|
|
|
|
|
|
|
| 376 |
with demo.route("About", "/about"):
|
| 377 |
build_about_page()
|
| 378 |
|
| 379 |
logger.info("All routes configured")
|
| 380 |
|
| 381 |
# Mount the REST API on /api
|
| 382 |
+
from fastapi import FastAPI
|
|
|
|
|
|
|
| 383 |
from api import api_app
|
| 384 |
|
| 385 |
+
# Create a parent FastAPI app that will host both the API and Gradio
|
| 386 |
+
root_app = FastAPI()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
root_app.mount("/api", api_app)
|
| 388 |
|
| 389 |
+
# Mount Gradio app - root redirect is handled by the proxy
|
| 390 |
app = gr.mount_gradio_app(root_app, demo, path="/")
|
| 391 |
logger.info("REST API mounted at /api, Gradio app mounted at /")
|
| 392 |
|
assets/harnesses/README.md
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
# Agent harness logos
|
| 2 |
-
|
| 3 |
-
This folder holds the **bottom half** of the composite scatter markers used
|
| 4 |
-
on the [Alternative Agents](../../alternative_agents_page.py) page. Each
|
| 5 |
-
point on that scatter stacks two logos: the model provider on top (from
|
| 6 |
-
`assets/logo-*.svg`) and the harness on the bottom (from this folder).
|
| 7 |
-
|
| 8 |
-
## Expected filenames
|
| 9 |
-
|
| 10 |
-
The scatter code looks up a logo by the exact `agent_name` string that the
|
| 11 |
-
`push-to-index` workflow writes into the index repo's `metadata.json`, then
|
| 12 |
-
maps it through `HARNESS_LOGO_STEMS` in `leaderboard_transformer.py`. Keep
|
| 13 |
-
these filenames in sync with that map.
|
| 14 |
-
|
| 15 |
-
| `agent_name` (in index repo) | File in this folder |
|
| 16 |
-
| --- | --- |
|
| 17 |
-
| `Claude Code` | `claude-code.svg` or `claude-code.png` |
|
| 18 |
-
| `Codex` | `codex-cli.svg` or `codex-cli.png` |
|
| 19 |
-
| `Gemini CLI` | `gemini-cli.svg` or `gemini-cli.png` |
|
| 20 |
-
| `OpenHands` | `openhands.svg` or `openhands.png` |
|
| 21 |
-
| `OpenHands Sub-agents` | `openhands.svg` or `openhands.png` (shared with `OpenHands`) |
|
| 22 |
-
|
| 23 |
-
Both `.svg` and `.png` are accepted — the resolver tries `.svg` first, then
|
| 24 |
-
`.png`. **Prefer SVG when possible**: the HuggingFace Space rejects new
|
| 25 |
-
binary files on plain `git push` and routes PNGs through Xet, so an SVG is
|
| 26 |
-
one less thing to set up.
|
| 27 |
-
|
| 28 |
-
## When a file is missing
|
| 29 |
-
|
| 30 |
-
The scatter falls back to a single marker (just the model provider logo) —
|
| 31 |
-
exactly the same rendering path the canonical OpenHands pages use. Nothing
|
| 32 |
-
crashes and nothing prints a warning in normal operation. This means you
|
| 33 |
-
can roll out logos one harness at a time without waiting for all four.
|
| 34 |
-
|
| 35 |
-
## Sizing and shape
|
| 36 |
-
|
| 37 |
-
- Square canvas. The composite marker is drawn at a fixed aspect ratio, so
|
| 38 |
-
a non-square logo will get squished.
|
| 39 |
-
- Any SVG `viewBox` works — the renderer base64-encodes the file as-is and
|
| 40 |
-
Plotly scales it to the marker's `sizex` / `sizey`. Around `80×80` to
|
| 41 |
-
`256×256` is a good source size.
|
| 42 |
-
- Leave some internal padding (≈10%) so the logo doesn't touch the marker
|
| 43 |
-
edge when two are stacked.
|
| 44 |
-
- No background is required, but a rounded-square coloured tile reads well
|
| 45 |
-
at small sizes because it gives each harness a distinct silhouette even
|
| 46 |
-
when the inner glyph isn't fully legible. Look at the existing
|
| 47 |
-
`assets/logo-*.svg` files for the canonical model provider logos if you
|
| 48 |
-
want a visual reference for sizing.
|
| 49 |
-
|
| 50 |
-
## Adding a new harness
|
| 51 |
-
|
| 52 |
-
1. Decide on the exact `agent_name` that the push-to-index workflow writes
|
| 53 |
-
for the new harness (see `AGENT_NAME_BY_TYPE` in
|
| 54 |
-
`OpenHands/evaluation/push-to-index-job/scripts/push_to_index_from_archive.py`).
|
| 55 |
-
2. Add an entry to `HARNESS_LOGO_STEMS` in
|
| 56 |
-
[`leaderboard_transformer.py`](../../leaderboard_transformer.py) that
|
| 57 |
-
maps the display name to a stem.
|
| 58 |
-
3. Drop `{stem}.svg` (or `.png`) into this folder.
|
| 59 |
-
4. Reload the app and look at `/alternative-agents`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assets/harnesses/claude-code.svg
DELETED
assets/harnesses/codex-cli.svg
DELETED
assets/harnesses/gemini-cli.svg
DELETED
assets/harnesses/openhands.svg
DELETED
docs/screenshots/alternative-agents.png
DELETED
Git LFS Details
|
leaderboard_transformer.py
CHANGED
|
@@ -228,17 +228,17 @@ def get_country_from_model(model_name: str) -> dict:
|
|
| 228 |
def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
|
| 229 |
"""
|
| 230 |
Gets the appropriate icon based on the mark_by selection.
|
| 231 |
-
|
| 232 |
Args:
|
| 233 |
model_name: The model name
|
| 234 |
openness: The openness value (open/closed)
|
| 235 |
mark_by: One of "Company", "Openness", or "Country"
|
| 236 |
-
|
| 237 |
Returns:
|
| 238 |
dict with 'path' and 'name' keys
|
| 239 |
"""
|
| 240 |
from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
|
| 241 |
-
|
| 242 |
if mark_by == MARK_BY_OPENNESS:
|
| 243 |
return get_openness_icon(openness)
|
| 244 |
elif mark_by == MARK_BY_COUNTRY:
|
|
@@ -247,59 +247,6 @@ def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
|
|
| 247 |
return get_company_from_model(model_name)
|
| 248 |
|
| 249 |
|
| 250 |
-
# Map the agent_name stored in the index repo's metadata.json to a file stem
|
| 251 |
-
# inside assets/harnesses/. Kept in sync with AGENT_NAME_BY_TYPE in
|
| 252 |
-
# OpenHands/evaluation push_to_index_from_archive.py — if a new ACP harness
|
| 253 |
-
# lands there, add the corresponding display name and a matching stem here.
|
| 254 |
-
#
|
| 255 |
-
# The scatter plot looks for {stem}.svg first, then {stem}.png in
|
| 256 |
-
# assets/harnesses/. This repo intentionally ships only a README in that
|
| 257 |
-
# folder: drop the logo files in by hand (SVG preferred, PNG works too via
|
| 258 |
-
# HF Xet) and they'll be picked up on the next app restart. If the file is
|
| 259 |
-
# missing, get_harness_icon() returns None and the scatter falls back to the
|
| 260 |
-
# single-marker path — same rendering the canonical OpenHands pages use —
|
| 261 |
-
# so logos can be added one harness at a time without breaking anything.
|
| 262 |
-
HARNESS_LOGO_STEMS: dict[str, str] = {
|
| 263 |
-
"Claude Code": "claude-code",
|
| 264 |
-
"Codex": "codex-cli",
|
| 265 |
-
"Gemini CLI": "gemini-cli",
|
| 266 |
-
"OpenHands": "openhands",
|
| 267 |
-
"OpenHands Sub-agents": "openhands",
|
| 268 |
-
}
|
| 269 |
-
HARNESS_LOGO_DIR = "assets/harnesses"
|
| 270 |
-
HARNESS_LOGO_EXTENSIONS = ("svg", "png")
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
def get_harness_icon(agent_name: Optional[str]) -> Optional[dict]:
|
| 274 |
-
"""Return {'path', 'name'} for the harness logo, or None if not usable.
|
| 275 |
-
|
| 276 |
-
Consumed by the Alternative Agents scatter plot to draw a composite
|
| 277 |
-
marker (model provider on top, harness on bottom). Returns None in any
|
| 278 |
-
of three cases, all of which make the caller skip the harness layer:
|
| 279 |
-
|
| 280 |
-
- ``agent_name`` is empty or missing from the dataframe row.
|
| 281 |
-
- ``agent_name`` isn't in ``HARNESS_LOGO_STEMS`` (new harness that
|
| 282 |
-
hasn't been registered yet — register it and drop in a logo).
|
| 283 |
-
- The logo file for that stem doesn't exist in ``assets/harnesses/``
|
| 284 |
-
yet (the repo ships only the README).
|
| 285 |
-
|
| 286 |
-
That third case is the important one: it lets the Alternative Agents
|
| 287 |
-
page work immediately after checkout even when the harness logo files
|
| 288 |
-
haven't been dropped in. The corresponding points just render like a
|
| 289 |
-
canonical-page marker (model logo only) until the file is added.
|
| 290 |
-
"""
|
| 291 |
-
if not agent_name:
|
| 292 |
-
return None
|
| 293 |
-
stem = HARNESS_LOGO_STEMS.get(str(agent_name).strip())
|
| 294 |
-
if stem is None:
|
| 295 |
-
return None
|
| 296 |
-
for ext in HARNESS_LOGO_EXTENSIONS:
|
| 297 |
-
path = f"{HARNESS_LOGO_DIR}/{stem}.{ext}"
|
| 298 |
-
if os.path.exists(path):
|
| 299 |
-
return {"path": path, "name": agent_name}
|
| 300 |
-
return None
|
| 301 |
-
|
| 302 |
-
|
| 303 |
# Standard layout configuration for all charts
|
| 304 |
STANDARD_LAYOUT = dict(
|
| 305 |
template="plotly_white",
|
|
@@ -708,7 +655,6 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 708 |
# Case 1: Handle fixed, special-case mappings first.
|
| 709 |
fixed_mappings = {
|
| 710 |
'id': 'id',
|
| 711 |
-
'agent_name': 'Agent',
|
| 712 |
'SDK version': 'SDK Version',
|
| 713 |
'Openhands version': 'SDK Version', # Legacy support
|
| 714 |
'Language model': 'Language Model',
|
|
@@ -869,21 +815,7 @@ class DataTransformer:
|
|
| 869 |
df_view = df_sorted.copy()
|
| 870 |
|
| 871 |
# --- 3. Add Columns for Agent Openness ---
|
| 872 |
-
|
| 873 |
-
# more than one distinct agent. On the canonical OpenHands pages
|
| 874 |
-
# every row says "OpenHands", so adding the column is just noise;
|
| 875 |
-
# on the Alternative Agents page rows differ (Claude Code / Codex
|
| 876 |
-
# / Gemini CLI / OpenHands Sub-agents), so the column carries
|
| 877 |
-
# signal and disambiguates same-model rows from different
|
| 878 |
-
# harnesses.
|
| 879 |
-
has_mixed_agents = (
|
| 880 |
-
"Agent" in df_view.columns
|
| 881 |
-
and df_view["Agent"].dropna().nunique() > 1
|
| 882 |
-
)
|
| 883 |
-
if has_mixed_agents:
|
| 884 |
-
base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
|
| 885 |
-
else:
|
| 886 |
-
base_cols = ["id", "Language Model", "SDK Version", "Source"]
|
| 887 |
new_cols = ["Openness"]
|
| 888 |
ending_cols = ["Date", "Logs", "Visualization"]
|
| 889 |
|
|
@@ -1086,18 +1018,13 @@ def _plot_scatter_plotly(
|
|
| 1086 |
"""
|
| 1087 |
Builds the complete HTML string for the plot's hover tooltip.
|
| 1088 |
Format: {lm_name} (SDK {version})
|
| 1089 |
-
Harness: {agent} (only when the row carries an Agent —
|
| 1090 |
-
Alternative Agents page only; the
|
| 1091 |
-
canonical OpenHands pages drop the
|
| 1092 |
-
Agent column in view() so this line
|
| 1093 |
-
is skipped there)
|
| 1094 |
Average Score: {score}
|
| 1095 |
Average Cost/Runtime: {value}
|
| 1096 |
Openness: {openness}
|
| 1097 |
"""
|
| 1098 |
h_pad = " "
|
| 1099 |
parts = ["<br>"]
|
| 1100 |
-
|
| 1101 |
# Get and clean the language model name
|
| 1102 |
llm_base_value = row.get('Language Model', '')
|
| 1103 |
llm_base_value = clean_llm_base_list(llm_base_value)
|
|
@@ -1105,21 +1032,13 @@ def _plot_scatter_plotly(
|
|
| 1105 |
lm_name = llm_base_value[0]
|
| 1106 |
else:
|
| 1107 |
lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
|
| 1108 |
-
|
| 1109 |
# Get SDK version
|
| 1110 |
sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
|
| 1111 |
-
|
| 1112 |
# Title line: {lm_name} (SDK {version})
|
| 1113 |
parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
|
| 1114 |
-
|
| 1115 |
-
# Harness line — only on pages where the Agent column is present
|
| 1116 |
-
# (Alternative Agents). Without this, two rows for the same LM run
|
| 1117 |
-
# under different harnesses (e.g. Claude Code vs OpenHands Sub-agents
|
| 1118 |
-
# on claude-sonnet-4-5) are indistinguishable on hover.
|
| 1119 |
-
agent_value = row.get('Agent')
|
| 1120 |
-
if agent_value is not None and pd.notna(agent_value) and str(agent_value).strip():
|
| 1121 |
-
parts.append(f"{h_pad}Harness: <b>{agent_value}</b>{h_pad}<br>")
|
| 1122 |
-
|
| 1123 |
# Average Score
|
| 1124 |
parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
|
| 1125 |
|
|
@@ -1192,116 +1111,51 @@ def _plot_scatter_plotly(
|
|
| 1192 |
y_min = min_score - 5 if min_score > 5 else 0
|
| 1193 |
y_max = max_score + 5
|
| 1194 |
|
| 1195 |
-
# Cache base64-encoded logos across rows — every Claude model on the
|
| 1196 |
-
# Alternative Agents page points at the same assets/harness-claude-code.svg,
|
| 1197 |
-
# so decoding once per path is ~N× cheaper than once per point.
|
| 1198 |
-
_logo_cache: dict[str, str] = {}
|
| 1199 |
-
def _encode_logo(path: str) -> Optional[str]:
|
| 1200 |
-
if path in _logo_cache:
|
| 1201 |
-
return _logo_cache[path]
|
| 1202 |
-
if not os.path.exists(path):
|
| 1203 |
-
return None
|
| 1204 |
-
try:
|
| 1205 |
-
with open(path, "rb") as f:
|
| 1206 |
-
encoded = base64.b64encode(f.read()).decode("utf-8")
|
| 1207 |
-
except Exception as e:
|
| 1208 |
-
logger.warning(f"Could not load logo {path}: {e}")
|
| 1209 |
-
return None
|
| 1210 |
-
mime = "svg+xml" if path.lower().endswith(".svg") else "png"
|
| 1211 |
-
uri = f"data:image/{mime};base64,{encoded}"
|
| 1212 |
-
_logo_cache[path] = uri
|
| 1213 |
-
return uri
|
| 1214 |
-
|
| 1215 |
-
# Composite markers: on the Alternative Agents page the dataframe carries
|
| 1216 |
-
# an "Agent" column (Claude Code / Codex / Gemini CLI / OpenHands Sub-agents),
|
| 1217 |
-
# so a point for claude-sonnet-4-5 under Claude Code and under OpenHands
|
| 1218 |
-
# Sub-agents would otherwise share the exact same Anthropic logo marker
|
| 1219 |
-
# and be visually indistinguishable. When Agent is present, we stack
|
| 1220 |
-
# two logos at each point: model provider on top, harness on the bottom.
|
| 1221 |
-
# Canonical OpenHands pages drop the Agent column in view() (via the
|
| 1222 |
-
# has_mixed_agents check), so they fall through to the single-logo path
|
| 1223 |
-
# and render exactly as before.
|
| 1224 |
-
has_harness_column = (
|
| 1225 |
-
"Agent" in data_plot.columns
|
| 1226 |
-
and data_plot["Agent"].dropna().astype(str).str.strip().ne("").any()
|
| 1227 |
-
)
|
| 1228 |
-
|
| 1229 |
-
# Marker sizes. The composite variant fits two logos inside roughly the
|
| 1230 |
-
# same vertical footprint as a single marker, so each half is slightly
|
| 1231 |
-
# smaller and the two halves are offset symmetrically around the point's
|
| 1232 |
-
# true y-coordinate.
|
| 1233 |
-
SINGLE_SIZE_X, SINGLE_SIZE_Y = 0.04, 0.06
|
| 1234 |
-
STACKED_SIZE_X, STACKED_SIZE_Y = 0.035, 0.048
|
| 1235 |
-
STACKED_Y_OFFSET = 0.028 # half-separation between model (top) and harness (bottom)
|
| 1236 |
-
|
| 1237 |
for _, row in data_plot.iterrows():
|
| 1238 |
model_name = row.get('Language Model', '')
|
| 1239 |
openness = row.get('Openness', '')
|
| 1240 |
marker_info = get_marker_icon(model_name, openness, mark_by)
|
| 1241 |
-
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
-
|
| 1250 |
-
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
-
|
| 1256 |
-
|
| 1257 |
-
|
| 1258 |
-
|
| 1259 |
-
|
| 1260 |
-
|
| 1261 |
-
|
| 1262 |
-
|
| 1263 |
-
|
| 1264 |
-
|
| 1265 |
-
|
| 1266 |
-
|
| 1267 |
-
|
| 1268 |
-
|
| 1269 |
-
|
| 1270 |
-
|
| 1271 |
-
|
| 1272 |
-
|
| 1273 |
-
|
| 1274 |
-
|
| 1275 |
-
|
| 1276 |
-
|
| 1277 |
-
|
| 1278 |
-
|
| 1279 |
-
|
| 1280 |
-
|
| 1281 |
-
|
| 1282 |
-
layer="above",
|
| 1283 |
-
))
|
| 1284 |
-
layout_images.append(dict(
|
| 1285 |
-
source=harness_uri,
|
| 1286 |
-
xref="x domain", yref="y domain",
|
| 1287 |
-
x=domain_x, y=harness_y,
|
| 1288 |
-
sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
|
| 1289 |
-
xanchor="center", yanchor="middle",
|
| 1290 |
-
layer="above",
|
| 1291 |
-
))
|
| 1292 |
-
else:
|
| 1293 |
-
# Single marker (canonical OpenHands pages, or Alternative Agents
|
| 1294 |
-
# rows with an unknown harness name — the latter shouldn't happen
|
| 1295 |
-
# in practice since HARNESS_LOGO_PATHS covers every agent_name the
|
| 1296 |
-
# push-to-index script emits).
|
| 1297 |
-
layout_images.append(dict(
|
| 1298 |
-
source=model_logo_uri,
|
| 1299 |
-
xref="x domain", yref="y domain",
|
| 1300 |
-
x=domain_x, y=domain_y,
|
| 1301 |
-
sizex=SINGLE_SIZE_X, sizey=SINGLE_SIZE_Y,
|
| 1302 |
-
xanchor="center", yanchor="middle",
|
| 1303 |
-
layer="above",
|
| 1304 |
-
))
|
| 1305 |
|
| 1306 |
# --- Section 7: Add Model Name Labels to Frontier Points ---
|
| 1307 |
if frontier_rows:
|
|
@@ -1472,47 +1326,38 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
|
|
| 1472 |
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
| 1473 |
|
| 1474 |
|
| 1475 |
-
def _hidden_runtime_sort_key(runtime_value: float | int | None, score_value: float | int | None) -> str:
|
| 1476 |
-
"""Build a hidden prefix so Gradio's string-based runtime sorting behaves numerically."""
|
| 1477 |
-
if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
|
| 1478 |
-
return f"{float(runtime_value):020.6f}"
|
| 1479 |
-
if pd.notna(score_value):
|
| 1480 |
-
return "99999999999999999998"
|
| 1481 |
-
return "99999999999999999999"
|
| 1482 |
-
|
| 1483 |
-
|
| 1484 |
def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
|
| 1485 |
"""
|
| 1486 |
Applies custom formatting to a runtime column based on its corresponding score column.
|
| 1487 |
- If runtime is not null, formats as time with 's' suffix.
|
| 1488 |
- If runtime is null but score is not, it becomes "Missing".
|
| 1489 |
- If both runtime and score are null, it becomes "Not Submitted".
|
| 1490 |
-
- Adds a hidden, zero-padded numeric prefix so Gradio sorts the column numerically.
|
| 1491 |
Args:
|
| 1492 |
df: The DataFrame to modify.
|
| 1493 |
runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
|
| 1494 |
Returns:
|
| 1495 |
The DataFrame with the formatted runtime column.
|
| 1496 |
"""
|
|
|
|
| 1497 |
score_col_name = runtime_col_name.replace("Runtime", "Score")
|
| 1498 |
|
|
|
|
| 1499 |
if score_col_name not in df.columns:
|
| 1500 |
-
return df
|
| 1501 |
|
| 1502 |
def apply_formatting_logic(row):
|
| 1503 |
runtime_value = row[runtime_col_name]
|
| 1504 |
score_value = row[score_col_name]
|
| 1505 |
status_color = "#ec4899"
|
| 1506 |
-
sort_key = _hidden_runtime_sort_key(runtime_value, score_value)
|
| 1507 |
-
hidden_sort_prefix = f'<span style="display:none">{sort_key}</span>'
|
| 1508 |
|
| 1509 |
if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
|
| 1510 |
-
return f"{
|
| 1511 |
elif pd.notna(score_value):
|
| 1512 |
-
return f'
|
| 1513 |
else:
|
| 1514 |
-
return f'
|
| 1515 |
|
|
|
|
| 1516 |
df[runtime_col_name] = df.apply(apply_formatting_logic, axis=1)
|
| 1517 |
|
| 1518 |
return df
|
|
|
|
| 228 |
def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
|
| 229 |
"""
|
| 230 |
Gets the appropriate icon based on the mark_by selection.
|
| 231 |
+
|
| 232 |
Args:
|
| 233 |
model_name: The model name
|
| 234 |
openness: The openness value (open/closed)
|
| 235 |
mark_by: One of "Company", "Openness", or "Country"
|
| 236 |
+
|
| 237 |
Returns:
|
| 238 |
dict with 'path' and 'name' keys
|
| 239 |
"""
|
| 240 |
from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
|
| 241 |
+
|
| 242 |
if mark_by == MARK_BY_OPENNESS:
|
| 243 |
return get_openness_icon(openness)
|
| 244 |
elif mark_by == MARK_BY_COUNTRY:
|
|
|
|
| 247 |
return get_company_from_model(model_name)
|
| 248 |
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
# Standard layout configuration for all charts
|
| 251 |
STANDARD_LAYOUT = dict(
|
| 252 |
template="plotly_white",
|
|
|
|
| 655 |
# Case 1: Handle fixed, special-case mappings first.
|
| 656 |
fixed_mappings = {
|
| 657 |
'id': 'id',
|
|
|
|
| 658 |
'SDK version': 'SDK Version',
|
| 659 |
'Openhands version': 'SDK Version', # Legacy support
|
| 660 |
'Language model': 'Language Model',
|
|
|
|
| 815 |
df_view = df_sorted.copy()
|
| 816 |
|
| 817 |
# --- 3. Add Columns for Agent Openness ---
|
| 818 |
+
base_cols = ["id","Language Model","SDK Version","Source"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 819 |
new_cols = ["Openness"]
|
| 820 |
ending_cols = ["Date", "Logs", "Visualization"]
|
| 821 |
|
|
|
|
| 1018 |
"""
|
| 1019 |
Builds the complete HTML string for the plot's hover tooltip.
|
| 1020 |
Format: {lm_name} (SDK {version})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1021 |
Average Score: {score}
|
| 1022 |
Average Cost/Runtime: {value}
|
| 1023 |
Openness: {openness}
|
| 1024 |
"""
|
| 1025 |
h_pad = " "
|
| 1026 |
parts = ["<br>"]
|
| 1027 |
+
|
| 1028 |
# Get and clean the language model name
|
| 1029 |
llm_base_value = row.get('Language Model', '')
|
| 1030 |
llm_base_value = clean_llm_base_list(llm_base_value)
|
|
|
|
| 1032 |
lm_name = llm_base_value[0]
|
| 1033 |
else:
|
| 1034 |
lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
|
| 1035 |
+
|
| 1036 |
# Get SDK version
|
| 1037 |
sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
|
| 1038 |
+
|
| 1039 |
# Title line: {lm_name} (SDK {version})
|
| 1040 |
parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
|
| 1041 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1042 |
# Average Score
|
| 1043 |
parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
|
| 1044 |
|
|
|
|
| 1111 |
y_min = min_score - 5 if min_score > 5 else 0
|
| 1112 |
y_max = max_score + 5
|
| 1113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1114 |
for _, row in data_plot.iterrows():
|
| 1115 |
model_name = row.get('Language Model', '')
|
| 1116 |
openness = row.get('Openness', '')
|
| 1117 |
marker_info = get_marker_icon(model_name, openness, mark_by)
|
| 1118 |
+
logo_path = marker_info['path']
|
| 1119 |
+
|
| 1120 |
+
# Read the SVG file and encode as base64 data URI
|
| 1121 |
+
if os.path.exists(logo_path):
|
| 1122 |
+
try:
|
| 1123 |
+
with open(logo_path, 'rb') as f:
|
| 1124 |
+
encoded_logo = base64.b64encode(f.read()).decode('utf-8')
|
| 1125 |
+
logo_uri = f"data:image/svg+xml;base64,{encoded_logo}"
|
| 1126 |
+
|
| 1127 |
+
x_val = row[x_col_to_use]
|
| 1128 |
+
y_val = row[y_col_to_use]
|
| 1129 |
+
|
| 1130 |
+
# Convert to domain coordinates (0-1 range)
|
| 1131 |
+
# For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
|
| 1132 |
+
if x_val > 0:
|
| 1133 |
+
log_x = np.log10(x_val)
|
| 1134 |
+
domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
|
| 1135 |
+
else:
|
| 1136 |
+
domain_x = 0
|
| 1137 |
+
|
| 1138 |
+
# For linear y: domain_y = (y - y_min) / (y_max - y_min)
|
| 1139 |
+
domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
|
| 1140 |
+
|
| 1141 |
+
# Clamp to valid range
|
| 1142 |
+
domain_x = max(0, min(1, domain_x))
|
| 1143 |
+
domain_y = max(0, min(1, domain_y))
|
| 1144 |
+
|
| 1145 |
+
layout_images.append(dict(
|
| 1146 |
+
source=logo_uri,
|
| 1147 |
+
xref="x domain", # Use domain coordinates for log scale compatibility
|
| 1148 |
+
yref="y domain",
|
| 1149 |
+
x=domain_x,
|
| 1150 |
+
y=domain_y,
|
| 1151 |
+
sizex=0.04, # Size as fraction of plot width
|
| 1152 |
+
sizey=0.06, # Size as fraction of plot height
|
| 1153 |
+
xanchor="center",
|
| 1154 |
+
yanchor="middle",
|
| 1155 |
+
layer="above"
|
| 1156 |
+
))
|
| 1157 |
+
except Exception as e:
|
| 1158 |
+
logger.warning(f"Could not load logo {logo_path}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1159 |
|
| 1160 |
# --- Section 7: Add Model Name Labels to Frontier Points ---
|
| 1161 |
if frontier_rows:
|
|
|
|
| 1326 |
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
| 1327 |
|
| 1328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1329 |
def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
    """
    Applies custom formatting to a runtime column based on its corresponding score column.

    - If runtime is a non-null number, it is rendered as whole seconds with an 's' suffix.
    - If runtime is null (or not a number) but score is not null, it becomes "Missing".
    - If neither a usable runtime nor a score exists, it becomes "Not Submitted".

    The matching score column is found by replacing "Runtime" with "Score" in
    ``runtime_col_name``. The column is overwritten on ``df`` itself, and the
    same object is returned.

    Args:
        df: The DataFrame to modify.
        runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
    Returns:
        The DataFrame with the formatted runtime column, or the DataFrame
        unmodified if there is no matching score column.
    """
    # Local import so this function does not depend on the module's import block.
    import numbers

    # Find the corresponding score column by replacing "Runtime" with "Score"
    score_col_name = runtime_col_name.replace("Runtime", "Score")

    # Ensure the score column actually exists to avoid errors
    if score_col_name not in df.columns:
        return df

    # Both placeholders are constant, so build them once rather than per row.
    status_color = "#ec4899"
    missing_html = f'<span style="color: {status_color};">Missing</span>'
    not_submitted_html = f'<span style="color: {status_color};">Not Submitted</span>'

    def format_cell(runtime_value, score_value):
        # numbers.Real also accepts NumPy integer/float scalars, which a plain
        # isinstance(..., (int, float)) check rejects for integer dtypes.
        if isinstance(runtime_value, numbers.Real) and pd.notna(runtime_value):
            return f"{runtime_value:.0f}s"
        if pd.notna(score_value):
            return missing_html  # Score exists, but runtime is missing
        return not_submitted_html  # Neither score nor runtime exists

    # Pair the two columns directly instead of df.apply(axis=1): on an empty
    # frame apply() hands back a DataFrame, which cannot be assigned to one column.
    df[runtime_col_name] = [
        format_cell(runtime_value, score_value)
        for runtime_value, score_value in zip(df[runtime_col_name], df[score_col_name])
    ]

    return df
|
main_page.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import matplotlib
|
| 2 |
matplotlib.use('Agg')
|
| 3 |
import gradio as gr
|
| 4 |
-
import pandas as pd
|
| 5 |
|
| 6 |
|
| 7 |
from ui_components import (
|
|
@@ -27,32 +26,6 @@ from constants import MARK_BY_DEFAULT
|
|
| 27 |
CACHED_VIEWERS = {}
|
| 28 |
CACHED_TAG_MAPS = {}
|
| 29 |
|
| 30 |
-
|
| 31 |
-
def filter_complete_entries(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` keeping only rows that cover all five categories.

    The completeness test is chosen from the columns that are available, in
    this order of preference:

    1. all five per-category score columns present: keep rows where none is null;
    2. ``Categories Completed``: keep rows whose numeric value is at least 5
       (values that cannot be parsed as numbers are dropped);
    3. ``Categories Attempted``: keep rows equal to the string ``'5/5'``.

    An empty frame, or one with none of these columns, is copied unfiltered.
    """
    required_scores = [
        'Issue Resolution Score',
        'Frontend Score',
        'Greenfield Score',
        'Testing Score',
        'Information Gathering Score',
    ]

    if df.empty:
        row_mask = None
    elif not any(name not in df.columns for name in required_scores):
        row_mask = df[required_scores].notna().all(axis=1)
    elif 'Categories Completed' in df.columns:
        completed_counts = pd.to_numeric(df['Categories Completed'], errors='coerce')
        row_mask = completed_counts >= 5
    elif 'Categories Attempted' in df.columns:
        row_mask = df['Categories Attempted'] == '5/5'
    else:
        row_mask = None

    selected = df if row_mask is None else df[row_mask]
    return selected.copy()
|
| 54 |
-
|
| 55 |
-
|
| 56 |
def build_page():
|
| 57 |
with gr.Row(elem_id="intro-row"):
|
| 58 |
with gr.Column(scale=1):
|
|
@@ -65,91 +38,78 @@ def build_page():
|
|
| 65 |
|
| 66 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 67 |
if not test_df.empty:
|
| 68 |
-
|
|
|
|
| 69 |
full_df=test_df,
|
| 70 |
tag_map=test_tag_map,
|
| 71 |
category_name=CATEGORY_NAME,
|
| 72 |
split_name="test"
|
| 73 |
)
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
has_complete_entries = len(test_df_complete) > 0
|
| 77 |
-
|
| 78 |
if 'Openness' in test_df.columns:
|
| 79 |
test_df_open = test_df[test_df['Openness'].str.lower() == 'open'].copy()
|
| 80 |
else:
|
| 81 |
test_df_open = test_df.copy()
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
initial_df = test_df_complete if has_complete_entries else test_df
|
| 85 |
-
|
| 86 |
# --- Winners by Category Section ---
|
| 87 |
gr.Markdown("---")
|
| 88 |
gr.HTML('<h2>Winners by Category</h2>', elem_id="winners-header")
|
| 89 |
gr.Markdown("Top 5 performing systems in each benchmark category.")
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
| 96 |
# --- New Visualization Sections ---
|
| 97 |
gr.Markdown("---")
|
| 98 |
-
|
| 99 |
# Evolution Over Time Section
|
| 100 |
gr.HTML('<h2>Evolution Over Time</h2>', elem_id="evolution-header")
|
| 101 |
gr.Markdown("Track how model performance has improved over time based on release dates.")
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
)
|
| 107 |
-
|
| 108 |
gr.Markdown("---")
|
| 109 |
-
|
| 110 |
# Open Model Accuracy by Size Section (always shows open models only by design)
|
| 111 |
gr.HTML('<h2>Open Model Accuracy by Size</h2>', elem_id="size-accuracy-header")
|
| 112 |
gr.Markdown("Compare open-weights model performance against their parameter count.")
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
evolution_fig = create_evolution_over_time_chart(
|
| 127 |
-
size_fig = create_accuracy_by_size_chart(
|
| 128 |
-
|
| 129 |
return winners_html, evolution_fig, size_fig
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
show_open_only_input = show_open_only_checkbox if show_open_only_checkbox is not None else gr.State(value=False)
|
| 133 |
-
extra_section_inputs = [show_incomplete_input, show_open_only_input, mark_by_dropdown]
|
| 134 |
-
|
| 135 |
-
if show_incomplete_checkbox is not None:
|
| 136 |
-
show_incomplete_checkbox.change(
|
| 137 |
-
fn=update_extra_sections,
|
| 138 |
-
inputs=extra_section_inputs,
|
| 139 |
-
outputs=[winners_component, evolution_component, size_component]
|
| 140 |
-
)
|
| 141 |
-
|
| 142 |
if show_open_only_checkbox is not None:
|
| 143 |
show_open_only_checkbox.change(
|
| 144 |
fn=update_extra_sections,
|
| 145 |
-
inputs=
|
| 146 |
outputs=[winners_component, evolution_component, size_component]
|
| 147 |
)
|
| 148 |
-
|
| 149 |
if mark_by_dropdown is not None:
|
| 150 |
mark_by_dropdown.change(
|
| 151 |
fn=update_extra_sections,
|
| 152 |
-
inputs=
|
| 153 |
outputs=[winners_component, evolution_component, size_component]
|
| 154 |
)
|
| 155 |
|
|
|
|
| 1 |
import matplotlib
|
| 2 |
matplotlib.use('Agg')
|
| 3 |
import gradio as gr
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
from ui_components import (
|
|
|
|
| 26 |
CACHED_VIEWERS = {}
|
| 27 |
CACHED_TAG_MAPS = {}
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def build_page():
|
| 30 |
with gr.Row(elem_id="intro-row"):
|
| 31 |
with gr.Column(scale=1):
|
|
|
|
| 38 |
|
| 39 |
test_df, test_tag_map = get_full_leaderboard_data("test")
|
| 40 |
if not test_df.empty:
|
| 41 |
+
# Get the checkbox and dropdown returned from create_leaderboard_display
|
| 42 |
+
show_open_only_checkbox, mark_by_dropdown = create_leaderboard_display(
|
| 43 |
full_df=test_df,
|
| 44 |
tag_map=test_tag_map,
|
| 45 |
category_name=CATEGORY_NAME,
|
| 46 |
split_name="test"
|
| 47 |
)
|
| 48 |
+
|
| 49 |
+
# Prepare open-only filtered dataframe for Winners and Evolution
|
|
|
|
|
|
|
| 50 |
if 'Openness' in test_df.columns:
|
| 51 |
test_df_open = test_df[test_df['Openness'].str.lower() == 'open'].copy()
|
| 52 |
else:
|
| 53 |
test_df_open = test_df.copy()
|
| 54 |
+
|
|
|
|
|
|
|
|
|
|
| 55 |
# --- Winners by Category Section ---
|
| 56 |
gr.Markdown("---")
|
| 57 |
gr.HTML('<h2>Winners by Category</h2>', elem_id="winners-header")
|
| 58 |
gr.Markdown("Top 5 performing systems in each benchmark category.")
|
| 59 |
+
|
| 60 |
+
# Create both all and open-only versions of winners HTML
|
| 61 |
+
winners_html_all = create_winners_by_category_html(test_df, top_n=5)
|
| 62 |
+
winners_html_open = create_winners_by_category_html(test_df_open, top_n=5)
|
| 63 |
+
|
| 64 |
+
winners_component = gr.HTML(winners_html_all, elem_id="winners-by-category")
|
| 65 |
+
|
| 66 |
# --- New Visualization Sections ---
|
| 67 |
gr.Markdown("---")
|
| 68 |
+
|
| 69 |
# Evolution Over Time Section
|
| 70 |
gr.HTML('<h2>Evolution Over Time</h2>', elem_id="evolution-header")
|
| 71 |
gr.Markdown("Track how model performance has improved over time based on release dates.")
|
| 72 |
+
|
| 73 |
+
# Create initial evolution chart with default mark_by
|
| 74 |
+
evolution_fig_all = create_evolution_over_time_chart(test_df, MARK_BY_DEFAULT)
|
| 75 |
+
|
| 76 |
+
evolution_component = gr.Plot(value=evolution_fig_all, elem_id="evolution-chart")
|
| 77 |
+
|
| 78 |
gr.Markdown("---")
|
| 79 |
+
|
| 80 |
# Open Model Accuracy by Size Section (always shows open models only by design)
|
| 81 |
gr.HTML('<h2>Open Model Accuracy by Size</h2>', elem_id="size-accuracy-header")
|
| 82 |
gr.Markdown("Compare open-weights model performance against their parameter count.")
|
| 83 |
+
|
| 84 |
+
size_fig = create_accuracy_by_size_chart(test_df, MARK_BY_DEFAULT)
|
| 85 |
+
size_component = gr.Plot(value=size_fig, elem_id="size-accuracy-chart")
|
| 86 |
+
|
| 87 |
+
# Update function for Winners, Evolution, and Size charts based on filters
|
| 88 |
+
def update_extra_sections(show_open_only, mark_by):
|
| 89 |
+
# Select the appropriate dataframe based on open_only filter
|
| 90 |
+
df_to_use = test_df_open if show_open_only else test_df
|
| 91 |
+
|
| 92 |
+
# Winners HTML (not affected by mark_by, only open_only)
|
| 93 |
+
winners_html = winners_html_open if show_open_only else winners_html_all
|
| 94 |
+
|
| 95 |
+
# Regenerate charts with current mark_by setting
|
| 96 |
+
evolution_fig = create_evolution_over_time_chart(df_to_use, mark_by)
|
| 97 |
+
size_fig = create_accuracy_by_size_chart(test_df, mark_by) # Size chart always uses full df (filters internally)
|
| 98 |
+
|
| 99 |
return winners_html, evolution_fig, size_fig
|
| 100 |
+
|
| 101 |
+
# Connect both checkbox and dropdown to update all extra sections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
if show_open_only_checkbox is not None:
|
| 103 |
show_open_only_checkbox.change(
|
| 104 |
fn=update_extra_sections,
|
| 105 |
+
inputs=[show_open_only_checkbox, mark_by_dropdown],
|
| 106 |
outputs=[winners_component, evolution_component, size_component]
|
| 107 |
)
|
| 108 |
+
|
| 109 |
if mark_by_dropdown is not None:
|
| 110 |
mark_by_dropdown.change(
|
| 111 |
fn=update_extra_sections,
|
| 112 |
+
inputs=[show_open_only_checkbox if show_open_only_checkbox else gr.State(value=False), mark_by_dropdown],
|
| 113 |
outputs=[winners_component, evolution_component, size_component]
|
| 114 |
)
|
| 115 |
|
setup_data.py
CHANGED
|
@@ -70,39 +70,27 @@ def fetch_data_from_github():
|
|
| 70 |
|
| 71 |
# Look for data files in the cloned repository
|
| 72 |
results_source = clone_dir / "results"
|
| 73 |
-
|
| 74 |
if not results_source.exists():
|
| 75 |
print(f"Results directory not found in repository")
|
| 76 |
return False
|
| 77 |
-
|
| 78 |
# Check if there are any agent result directories
|
| 79 |
result_dirs = list(results_source.iterdir())
|
| 80 |
if not result_dirs:
|
| 81 |
print(f"No agent results found in {results_source}")
|
| 82 |
return False
|
| 83 |
-
|
| 84 |
print(f"Found {len(result_dirs)} agent result directories")
|
| 85 |
-
|
| 86 |
# Create target directory and copy the results structure
|
| 87 |
os.makedirs(target_dir.parent, exist_ok=True)
|
| 88 |
if target_dir.exists():
|
| 89 |
shutil.rmtree(target_dir)
|
| 90 |
-
|
| 91 |
# Copy the entire results directory
|
| 92 |
target_results = target_dir / "results"
|
| 93 |
shutil.copytree(results_source, target_results)
|
| 94 |
-
|
| 95 |
-
# Also copy alternative_agents/ if present, so the loader can pick up
|
| 96 |
-
# ACP runs (acp-claude, acp-codex, acp-gemini, openhands_subagents, ...)
|
| 97 |
-
# alongside the default OpenHands agent results.
|
| 98 |
-
alt_source = clone_dir / "alternative_agents"
|
| 99 |
-
if alt_source.exists():
|
| 100 |
-
alt_target = target_dir / "alternative_agents"
|
| 101 |
-
shutil.copytree(alt_source, alt_target)
|
| 102 |
-
agent_types = sorted(p.name for p in alt_source.iterdir() if p.is_dir())
|
| 103 |
-
print(f"Found alternative agent types: {agent_types}")
|
| 104 |
-
else:
|
| 105 |
-
print("No alternative_agents/ directory in repository (skipping)")
|
| 106 |
|
| 107 |
print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
|
| 108 |
|
|
|
|
| 70 |
|
| 71 |
# Look for data files in the cloned repository
|
| 72 |
results_source = clone_dir / "results"
|
| 73 |
+
|
| 74 |
if not results_source.exists():
|
| 75 |
print(f"Results directory not found in repository")
|
| 76 |
return False
|
| 77 |
+
|
| 78 |
# Check if there are any agent result directories
|
| 79 |
result_dirs = list(results_source.iterdir())
|
| 80 |
if not result_dirs:
|
| 81 |
print(f"No agent results found in {results_source}")
|
| 82 |
return False
|
| 83 |
+
|
| 84 |
print(f"Found {len(result_dirs)} agent result directories")
|
| 85 |
+
|
| 86 |
# Create target directory and copy the results structure
|
| 87 |
os.makedirs(target_dir.parent, exist_ok=True)
|
| 88 |
if target_dir.exists():
|
| 89 |
shutil.rmtree(target_dir)
|
| 90 |
+
|
| 91 |
# Copy the entire results directory
|
| 92 |
target_results = target_dir / "results"
|
| 93 |
shutil.copytree(results_source, target_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
|
| 96 |
|
simple_data_loader.py
CHANGED
|
@@ -96,43 +96,17 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
|
|
| 96 |
|
| 97 |
class SimpleLeaderboardViewer:
|
| 98 |
"""Simple replacement for agent-eval's LeaderboardViewer."""
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
AGENT_FILTER_ALTERNATIVE = "alternative"
|
| 102 |
-
|
| 103 |
-
def __init__(
|
| 104 |
-
self,
|
| 105 |
-
data_dir: str,
|
| 106 |
-
config: str,
|
| 107 |
-
split: str,
|
| 108 |
-
agent_filter: str = AGENT_FILTER_OPENHANDS,
|
| 109 |
-
):
|
| 110 |
"""
|
| 111 |
Args:
|
| 112 |
data_dir: Path to data directory
|
| 113 |
config: Config name (e.g., "1.0.0-dev1")
|
| 114 |
split: Split name (e.g., "validation" or "test")
|
| 115 |
-
agent_filter: Which submissions to include.
|
| 116 |
-
``"openhands"`` (default) loads only the default OpenHands
|
| 117 |
-
agent runs from ``results/{model}/`` — the canonical
|
| 118 |
-
leaderboard. ``"alternative"`` loads only third-party
|
| 119 |
-
harnesses (Claude Code / Codex / Gemini CLI / OpenHands
|
| 120 |
-
Sub-agents) from ``alternative_agents/{type}/{model}/``,
|
| 121 |
-
which power the standalone Alternative Agents page.
|
| 122 |
-
The two are kept on separate pages because their
|
| 123 |
-
cost/runtime numbers aren't apples-to-apples and mixing
|
| 124 |
-
them in one ranking would be misleading.
|
| 125 |
"""
|
| 126 |
-
if agent_filter not in (self.AGENT_FILTER_OPENHANDS, self.AGENT_FILTER_ALTERNATIVE):
|
| 127 |
-
raise ValueError(
|
| 128 |
-
f"agent_filter must be one of "
|
| 129 |
-
f"{{{self.AGENT_FILTER_OPENHANDS!r}, {self.AGENT_FILTER_ALTERNATIVE!r}}}, "
|
| 130 |
-
f"got {agent_filter!r}"
|
| 131 |
-
)
|
| 132 |
self.data_dir = Path(data_dir)
|
| 133 |
self.config = config
|
| 134 |
self.split = split
|
| 135 |
-
self.agent_filter = agent_filter
|
| 136 |
self.config_path = self.data_dir / config
|
| 137 |
|
| 138 |
# Benchmark to category mappings (single source of truth)
|
|
@@ -153,115 +127,55 @@ class SimpleLeaderboardViewer:
|
|
| 153 |
if benchmark not in self.tag_map[category]:
|
| 154 |
self.tag_map[category].append(benchmark)
|
| 155 |
|
| 156 |
-
# Default agent_name when metadata.json doesn't carry one. Matches the
|
| 157 |
-
# default-agent value used by push_to_index_from_archive.py so legacy
|
| 158 |
-
# entries (which omit the field) still group cleanly with new entries.
|
| 159 |
-
DEFAULT_AGENT_NAME = "OpenHands"
|
| 160 |
-
|
| 161 |
-
def _records_from_agent_dir(self, agent_dir: Path, default_agent_name: str | None = None) -> tuple[list[dict], list[str]]:
    """Turn one agent directory into a list of per-benchmark record dicts.

    The directory is read through ``load_and_validate_agent_data``. One
    record is produced per score entry, combining the directory's metadata
    with that entry's own fields.

    Args:
        agent_dir: Directory holding the agent's metadata and scores.
        default_agent_name: Display name to use when the metadata carries no
            ``agent_name``; ``self.DEFAULT_AGENT_NAME`` is the final fallback.

    Returns:
        ``(records, validation_errors)``. ``records`` is empty when metadata
        or scores could not be loaded, or when the metadata sets
        ``hide_from_leaderboard``.
    """
    metadata, scores, errors = load_and_validate_agent_data(agent_dir)

    if metadata is None or scores is None:
        return [], errors

    if metadata.get('hide_from_leaderboard', False):
        logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
        return [], errors

    # Name precedence: metadata value, then the caller's default, then the class default.
    display_name = metadata.get('agent_name')
    if not display_name:
        display_name = default_agent_name or self.DEFAULT_AGENT_NAME

    def build_record(entry) -> dict:
        # Key order is kept as-is: it determines column order downstream.
        return {
            'agent_name': display_name,
            'agent_version': metadata.get('agent_version', 'Unknown'),
            'llm_base': metadata.get('model', 'unknown'),
            'openness': metadata.get('openness', 'unknown'),
            # A per-entry submission time wins over the directory-level one.
            'submission_time': entry.get('submission_time', metadata.get('submission_time', '')),
            'release_date': metadata.get('release_date', ''),
            'parameter_count_b': metadata.get('parameter_count_b'),
            'active_parameter_count_b': metadata.get('active_parameter_count_b'),
            'score': entry.get('score'),
            'metric': entry.get('metric', 'unknown'),
            'cost_per_instance': entry.get('cost_per_instance'),
            'average_runtime': entry.get('average_runtime'),
            'tags': [entry.get('benchmark')],
            'full_archive': entry.get('full_archive', ''),
            'eval_visualization_page': entry.get('eval_visualization_page', ''),
        }

    return [build_record(entry) for entry in scores], errors
|
| 210 |
-
|
| 211 |
def _load_from_agent_dirs(self):
|
| 212 |
-
"""Load
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
``{config}/alternative_agents/{type}/{model}/`` (acp-claude,
|
| 219 |
-
acp-codex, acp-gemini, openhands_subagents, ...). The dedicated
|
| 220 |
-
Alternative Agents page uses this.
|
| 221 |
-
|
| 222 |
-
Returns ``None`` if no records were found (which makes the caller
|
| 223 |
-
render an empty-state placeholder).
|
| 224 |
-
"""
|
| 225 |
all_records = []
|
| 226 |
all_validation_errors = []
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
#
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
| 265 |
# Log validation errors if any
|
| 266 |
if all_validation_errors:
|
| 267 |
logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
|
|
@@ -269,10 +183,10 @@ class SimpleLeaderboardViewer:
|
|
| 269 |
logger.warning(f" - {error}")
|
| 270 |
if len(all_validation_errors) > 5:
|
| 271 |
logger.warning(f" ... and {len(all_validation_errors) - 5} more")
|
| 272 |
-
|
| 273 |
if not all_records:
|
| 274 |
-
return None #
|
| 275 |
-
|
| 276 |
return pd.DataFrame(all_records)
|
| 277 |
|
| 278 |
def _load(self):
|
|
@@ -292,36 +206,26 @@ class SimpleLeaderboardViewer:
|
|
| 292 |
# Group by agent (version + model combination) to aggregate results across datasets
|
| 293 |
transformed_records = []
|
| 294 |
|
| 295 |
-
# Create a unique identifier
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
# one row when both submit to the leaderboard.
|
| 299 |
-
df['agent_name'] = df['agent_name'].fillna(self.DEFAULT_AGENT_NAME)
|
| 300 |
-
df['agent_id'] = (
|
| 301 |
-
df['agent_name'].astype(str)
|
| 302 |
-
+ '_' + df['agent_version'].astype(str)
|
| 303 |
-
+ '_' + df['llm_base'].astype(str)
|
| 304 |
-
)
|
| 305 |
-
|
| 306 |
for agent_id in df['agent_id'].unique():
|
| 307 |
agent_records = df[df['agent_id'] == agent_id]
|
| 308 |
-
|
| 309 |
# Build a single record for this agent
|
| 310 |
first_record = agent_records.iloc[0]
|
| 311 |
agent_version = first_record['agent_version']
|
| 312 |
-
|
| 313 |
-
|
| 314 |
# Normalize openness to "open" or "closed"
|
| 315 |
from aliases import OPENNESS_MAPPING
|
| 316 |
raw_openness = first_record['openness']
|
| 317 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 318 |
-
|
| 319 |
# All 5 categories for the leaderboard
|
| 320 |
ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
|
| 321 |
-
|
| 322 |
record = {
|
| 323 |
# Core agent info - use final display names
|
| 324 |
-
'agent_name': agent_name, # Will become "Agent"
|
| 325 |
'SDK version': agent_version, # Will become "SDK Version"
|
| 326 |
'Language model': first_record['llm_base'], # Will become "Language Model"
|
| 327 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
|
@@ -331,7 +235,7 @@ class SimpleLeaderboardViewer:
|
|
| 331 |
'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
|
| 332 |
'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
|
| 333 |
# Additional columns expected by the transformer
|
| 334 |
-
# Use agent_id (
|
| 335 |
'id': agent_id,
|
| 336 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 337 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
|
|
|
| 96 |
|
| 97 |
class SimpleLeaderboardViewer:
|
| 98 |
"""Simple replacement for agent-eval's LeaderboardViewer."""
|
| 99 |
+
|
| 100 |
+
def __init__(self, data_dir: str, config: str, split: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
"""
|
| 102 |
Args:
|
| 103 |
data_dir: Path to data directory
|
| 104 |
config: Config name (e.g., "1.0.0-dev1")
|
| 105 |
split: Split name (e.g., "validation" or "test")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
self.data_dir = Path(data_dir)
|
| 108 |
self.config = config
|
| 109 |
self.split = split
|
|
|
|
| 110 |
self.config_path = self.data_dir / config
|
| 111 |
|
| 112 |
# Benchmark to category mappings (single source of truth)
|
|
|
|
| 127 |
if benchmark not in self.tag_map[category]:
|
| 128 |
self.tag_map[category].append(benchmark)
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
def _load_from_agent_dirs(self):
|
| 131 |
+
"""Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
|
| 132 |
+
results_dir = self.config_path / "results"
|
| 133 |
+
|
| 134 |
+
if not results_dir.exists():
|
| 135 |
+
return None # Fall back to old format
|
| 136 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
all_records = []
|
| 138 |
all_validation_errors = []
|
| 139 |
+
|
| 140 |
+
# Iterate through each agent directory
|
| 141 |
+
for agent_dir in results_dir.iterdir():
|
| 142 |
+
if not agent_dir.is_dir():
|
| 143 |
+
continue
|
| 144 |
+
|
| 145 |
+
# Load and validate using pydantic models
|
| 146 |
+
metadata, scores, errors = load_and_validate_agent_data(agent_dir)
|
| 147 |
+
|
| 148 |
+
if errors:
|
| 149 |
+
all_validation_errors.extend(errors)
|
| 150 |
+
|
| 151 |
+
if metadata is None or scores is None:
|
| 152 |
+
continue
|
| 153 |
+
|
| 154 |
+
# Skip entries that are hidden from the leaderboard
|
| 155 |
+
if metadata.get('hide_from_leaderboard', False):
|
| 156 |
+
logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
|
| 157 |
+
continue
|
| 158 |
+
|
| 159 |
+
# Create one record per benchmark (mimicking old JSONL format)
|
| 160 |
+
for score_entry in scores:
|
| 161 |
+
record = {
|
| 162 |
+
'agent_version': metadata.get('agent_version', 'Unknown'),
|
| 163 |
+
'llm_base': metadata.get('model', 'unknown'),
|
| 164 |
+
'openness': metadata.get('openness', 'unknown'),
|
| 165 |
+
'submission_time': metadata.get('submission_time', ''),
|
| 166 |
+
'release_date': metadata.get('release_date', ''), # Model release date
|
| 167 |
+
'parameter_count_b': metadata.get('parameter_count_b'), # Total params in billions
|
| 168 |
+
'active_parameter_count_b': metadata.get('active_parameter_count_b'), # Active params for MoE
|
| 169 |
+
'score': score_entry.get('score'),
|
| 170 |
+
'metric': score_entry.get('metric', 'unknown'),
|
| 171 |
+
'cost_per_instance': score_entry.get('cost_per_instance'),
|
| 172 |
+
'average_runtime': score_entry.get('average_runtime'),
|
| 173 |
+
'tags': [score_entry.get('benchmark')],
|
| 174 |
+
'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
|
| 175 |
+
'eval_visualization_page': score_entry.get('eval_visualization_page', ''), # Laminar visualization URL
|
| 176 |
+
}
|
| 177 |
+
all_records.append(record)
|
| 178 |
+
|
| 179 |
# Log validation errors if any
|
| 180 |
if all_validation_errors:
|
| 181 |
logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
|
|
|
|
| 183 |
logger.warning(f" - {error}")
|
| 184 |
if len(all_validation_errors) > 5:
|
| 185 |
logger.warning(f" ... and {len(all_validation_errors) - 5} more")
|
| 186 |
+
|
| 187 |
if not all_records:
|
| 188 |
+
return None # Fall back to old format
|
| 189 |
+
|
| 190 |
return pd.DataFrame(all_records)
|
| 191 |
|
| 192 |
def _load(self):
|
|
|
|
| 206 |
# Group by agent (version + model combination) to aggregate results across datasets
|
| 207 |
transformed_records = []
|
| 208 |
|
| 209 |
+
# Create a unique identifier for each agent (version + model)
|
| 210 |
+
df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
|
| 211 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
for agent_id in df['agent_id'].unique():
|
| 213 |
agent_records = df[df['agent_id'] == agent_id]
|
| 214 |
+
|
| 215 |
# Build a single record for this agent
|
| 216 |
first_record = agent_records.iloc[0]
|
| 217 |
agent_version = first_record['agent_version']
|
| 218 |
+
|
|
|
|
| 219 |
# Normalize openness to "open" or "closed"
|
| 220 |
from aliases import OPENNESS_MAPPING
|
| 221 |
raw_openness = first_record['openness']
|
| 222 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 223 |
+
|
| 224 |
# All 5 categories for the leaderboard
|
| 225 |
ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
|
| 226 |
+
|
| 227 |
record = {
|
| 228 |
# Core agent info - use final display names
|
|
|
|
| 229 |
'SDK version': agent_version, # Will become "SDK Version"
|
| 230 |
'Language model': first_record['llm_base'], # Will become "Language Model"
|
| 231 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
|
|
|
| 235 |
'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
|
| 236 |
'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
|
| 237 |
# Additional columns expected by the transformer
|
| 238 |
+
# Use agent_id (version_model) as unique identifier for Pareto frontier calculation
|
| 239 |
'id': agent_id,
|
| 240 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 241 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
tests/test_runtime_sorting.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
|
| 3 |
-
from leaderboard_transformer import format_runtime_column
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def test_runtime_strings_sort_numerically_in_ascending_order():
    """Formatted runtime cells sort by duration, with the placeholder cells last."""
    frame = pd.DataFrame(
        {
            "Average Score": [0.8, 0.8, 0.8, 0.8, None],
            "Average Runtime": [1323.0, 372.0, 410.0, None, None],
        }
    )

    cells = format_runtime_column(frame.copy(), "Average Runtime")["Average Runtime"].tolist()

    # Row positions in the expected ascending order: 372, 410, 1323, then the
    # row with a score but no runtime, then the row with neither.
    expected_order = [cells[position] for position in (1, 2, 0, 3, 4)]
    assert sorted(cells) == expected_order
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def test_runtime_formatting_preserves_visible_labels():
    """Each formatted cell ends with its visible label; the numeric cell also carries hidden markup."""
    frame = pd.DataFrame(
        {
            "Average Score": [0.8, 0.8, None],
            "Average Runtime": [45.2, None, None],
        }
    )

    result = format_runtime_column(frame.copy(), "Average Runtime")
    numeric_cell, missing_cell, not_submitted_cell = result["Average Runtime"].tolist()

    assert numeric_cell.endswith("45s")
    assert missing_cell.endswith("Missing</span>")
    assert not_submitted_cell.endswith("Not Submitted</span>")
    # The numeric cell is expected to embed an element styled 'display:none'.
    assert 'display:none' in numeric_cell
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ui_components.py
CHANGED
|
@@ -508,36 +508,28 @@ class DummyViewer:
|
|
| 508 |
# The _load method returns the error DataFrame and an empty tag map
|
| 509 |
return self._error_df, {}
|
| 510 |
|
| 511 |
-
def get_leaderboard_viewer_instance(
|
| 512 |
-
split: str,
|
| 513 |
-
agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
|
| 514 |
-
):
|
| 515 |
"""
|
| 516 |
-
Fetches the LeaderboardViewer for a
|
| 517 |
-
|
| 518 |
-
both axes so the OpenHands and Alternative Agents pages don't fight
|
| 519 |
-
over a single slot. On error, returns a stable DummyViewer.
|
| 520 |
"""
|
| 521 |
global CACHED_VIEWERS, CACHED_TAG_MAPS
|
| 522 |
|
| 523 |
-
cache_key = (split, agent_filter)
|
| 524 |
-
|
| 525 |
with _cache_lock:
|
| 526 |
-
if
|
| 527 |
# Cache hit: return the cached viewer and tag map
|
| 528 |
-
return CACHED_VIEWERS[
|
| 529 |
|
| 530 |
# --- Cache miss: try to load data from the source ---
|
| 531 |
try:
|
| 532 |
# First try to load from extracted data directory (local mock data)
|
| 533 |
data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
|
| 534 |
-
|
| 535 |
-
print(f"Loading data for split '{split}'
|
| 536 |
viewer = SimpleLeaderboardViewer(
|
| 537 |
data_dir=data_dir,
|
| 538 |
config=CONFIG_NAME,
|
| 539 |
-
split=split
|
| 540 |
-
agent_filter=agent_filter,
|
| 541 |
)
|
| 542 |
|
| 543 |
# Simplify tag map creation
|
|
@@ -545,14 +537,14 @@ def get_leaderboard_viewer_instance(
|
|
| 545 |
|
| 546 |
# Cache the results for next time (thread-safe)
|
| 547 |
with _cache_lock:
|
| 548 |
-
CACHED_VIEWERS[
|
| 549 |
-
CACHED_TAG_MAPS[
|
| 550 |
|
| 551 |
return viewer, pretty_tag_map
|
| 552 |
|
| 553 |
except Exception as e:
|
| 554 |
# On ANY error, create a consistent error message and cache a DummyViewer
|
| 555 |
-
error_message = f"Error loading data for split '{split}'
|
| 556 |
print(format_error(error_message))
|
| 557 |
|
| 558 |
dummy_df = pd.DataFrame({"Message": [error_message]})
|
|
@@ -561,8 +553,8 @@ def get_leaderboard_viewer_instance(
|
|
| 561 |
|
| 562 |
# Cache the dummy objects so we don't try to fetch again on this run
|
| 563 |
with _cache_lock:
|
| 564 |
-
CACHED_VIEWERS[
|
| 565 |
-
CACHED_TAG_MAPS[
|
| 566 |
|
| 567 |
return dummy_viewer, dummy_tag_map
|
| 568 |
|
|
@@ -1040,8 +1032,8 @@ def create_leaderboard_display(
|
|
| 1040 |
outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
|
| 1041 |
)
|
| 1042 |
|
| 1043 |
-
# Return the
|
| 1044 |
-
return
|
| 1045 |
|
| 1046 |
# # --- Detailed Benchmark Display ---
|
| 1047 |
def create_benchmark_details_display(
|
|
@@ -1276,17 +1268,12 @@ def create_benchmark_details_display(
|
|
| 1276 |
legend_markdown = create_legend_markdown(benchmark_name)
|
| 1277 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
| 1278 |
|
| 1279 |
-
def get_full_leaderboard_data(
|
| 1280 |
-
split: str,
|
| 1281 |
-
agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
|
| 1282 |
-
) -> tuple[pd.DataFrame, dict]:
|
| 1283 |
"""
|
| 1284 |
-
Loads and transforms the complete dataset for a
|
| 1285 |
-
|
| 1286 |
-
that don't pass it stay on the canonical leaderboard. The Alternative
|
| 1287 |
-
Agents page passes ``"alternative"`` to get the third-party harnesses.
|
| 1288 |
"""
|
| 1289 |
-
viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split
|
| 1290 |
|
| 1291 |
if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
|
| 1292 |
raw_df, _ = viewer_or_data._load()
|
|
|
|
| 508 |
# The _load method returns the error DataFrame and an empty tag map
|
| 509 |
return self._error_df, {}
|
| 510 |
|
| 511 |
+
def get_leaderboard_viewer_instance(split: str):
|
|
|
|
|
|
|
|
|
|
| 512 |
"""
|
| 513 |
+
Fetches the LeaderboardViewer for a split, using a thread-safe cache to avoid
|
| 514 |
+
re-downloading data. On error, returns a stable DummyViewer object.
|
|
|
|
|
|
|
| 515 |
"""
|
| 516 |
global CACHED_VIEWERS, CACHED_TAG_MAPS
|
| 517 |
|
|
|
|
|
|
|
| 518 |
with _cache_lock:
|
| 519 |
+
if split in CACHED_VIEWERS:
|
| 520 |
# Cache hit: return the cached viewer and tag map
|
| 521 |
+
return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
|
| 522 |
|
| 523 |
# --- Cache miss: try to load data from the source ---
|
| 524 |
try:
|
| 525 |
# First try to load from extracted data directory (local mock data)
|
| 526 |
data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
|
| 527 |
+
|
| 528 |
+
print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
|
| 529 |
viewer = SimpleLeaderboardViewer(
|
| 530 |
data_dir=data_dir,
|
| 531 |
config=CONFIG_NAME,
|
| 532 |
+
split=split
|
|
|
|
| 533 |
)
|
| 534 |
|
| 535 |
# Simplify tag map creation
|
|
|
|
| 537 |
|
| 538 |
# Cache the results for next time (thread-safe)
|
| 539 |
with _cache_lock:
|
| 540 |
+
CACHED_VIEWERS[split] = viewer
|
| 541 |
+
CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
|
| 542 |
|
| 543 |
return viewer, pretty_tag_map
|
| 544 |
|
| 545 |
except Exception as e:
|
| 546 |
# On ANY error, create a consistent error message and cache a DummyViewer
|
| 547 |
+
error_message = f"Error loading data for split '{split}': {e}"
|
| 548 |
print(format_error(error_message))
|
| 549 |
|
| 550 |
dummy_df = pd.DataFrame({"Message": [error_message]})
|
|
|
|
| 553 |
|
| 554 |
# Cache the dummy objects so we don't try to fetch again on this run
|
| 555 |
with _cache_lock:
|
| 556 |
+
CACHED_VIEWERS[split] = dummy_viewer
|
| 557 |
+
CACHED_TAG_MAPS[split] = dummy_tag_map
|
| 558 |
|
| 559 |
return dummy_viewer, dummy_tag_map
|
| 560 |
|
|
|
|
| 1032 |
outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
|
| 1033 |
)
|
| 1034 |
|
| 1035 |
+
# Return the show_open_only_checkbox and mark_by_dropdown so they can be used to update other sections
|
| 1036 |
+
return show_open_only_checkbox, mark_by_dropdown
|
| 1037 |
|
| 1038 |
# # --- Detailed Benchmark Display ---
|
| 1039 |
def create_benchmark_details_display(
|
|
|
|
| 1268 |
legend_markdown = create_legend_markdown(benchmark_name)
|
| 1269 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
| 1270 |
|
| 1271 |
+
def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
|
|
|
|
|
|
|
|
|
| 1272 |
"""
|
| 1273 |
+
Loads and transforms the complete dataset for a given split.
|
| 1274 |
+
This function handles caching and returns the final "pretty" DataFrame and tag map.
|
|
|
|
|
|
|
| 1275 |
"""
|
| 1276 |
+
viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
|
| 1277 |
|
| 1278 |
if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
|
| 1279 |
raw_df, _ = viewer_or_data._load()
|