.gitattributes CHANGED
@@ -1,2 +1 @@
1
 
2
- docs/screenshots/alternative-agents.png filter=lfs diff=lfs merge=lfs -text
 
1
 
 
alternative_agents_page.py DELETED
@@ -1,103 +0,0 @@
1
- """Alternative Agents leaderboard page.
2
-
3
- The canonical OpenHands Index leaderboard (Home + the per-category pages)
4
- ranks default OpenHands agent runs from ``results/{model}/`` in the
5
- openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
6
- Gemini CLI, OpenHands Sub-agents, ...) live under
7
- ``alternative_agents/{type}/{model}/`` and aren't directly comparable to
8
- default OpenHands runs (different scaffolds, different cost/runtime
9
- characteristics), so they get their own standalone page instead of being
10
- mixed into the same ranking.
11
-
12
- This page is intentionally a single Overall view (no per-category
13
- subpages) — the alternative-agents dataset is small (one row per
14
- harness × model) and the goal is "show me all the alternatives at a
15
- glance", not "drill into Issue Resolution for Codex".
16
-
17
- To make same-model comparisons easier, the page also appends canonical
18
- OpenHands rows for any language model that appears in the alternative
19
- agent dataset. The match is exact, so ``Gemini-3-Pro`` and
20
- ``Gemini-3.1-Pro`` remain distinct entries.
21
- """
22
- import matplotlib
23
- matplotlib.use('Agg')
24
- import pandas as pd
25
- import gradio as gr
26
-
27
- from simple_data_loader import SimpleLeaderboardViewer
28
- from ui_components import (
29
- create_leaderboard_display,
30
- get_full_leaderboard_data,
31
- )
32
-
33
-
34
- ALTERNATIVE_AGENTS_INTRO = """
35
- <div id="alternative-agents-intro">
36
- <h2>Alternative Agents</h2>
37
- <p>
38
- Third-party agent harnesses running the OpenHands Index benchmarks.
39
- To make direct comparisons easier, this page also includes the
40
- canonical OpenHands row whenever the exact same language model appears
41
- under an alternative harness. Cost and runtime numbers still come from
42
- each harness's own instrumentation and aren't directly comparable
43
- across harnesses.
44
- </p>
45
- </div>
46
- """
47
-
48
-
49
- def _append_openhands_shared_models(
50
- alternative_df: pd.DataFrame,
51
- split: str,
52
- ) -> pd.DataFrame:
53
- if alternative_df.empty or "Language Model" not in alternative_df.columns:
54
- return alternative_df
55
-
56
- openhands_df, _ = get_full_leaderboard_data(
57
- split,
58
- agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
59
- )
60
- if openhands_df.empty or "Language Model" not in openhands_df.columns:
61
- return alternative_df
62
-
63
- alternative_models = set(
64
- alternative_df["Language Model"].dropna().astype(str).str.strip()
65
- )
66
- if not alternative_models:
67
- return alternative_df
68
-
69
- openhands_shared_df = openhands_df[
70
- openhands_df["Language Model"].astype(str).str.strip().isin(alternative_models)
71
- ].copy()
72
- if openhands_shared_df.empty:
73
- return alternative_df
74
-
75
- return pd.concat([alternative_df, openhands_shared_df], ignore_index=True, sort=False)
76
-
77
-
78
- def build_page():
79
- gr.HTML(ALTERNATIVE_AGENTS_INTRO)
80
-
81
- gr.Markdown("---")
82
-
83
- test_df, test_tag_map = get_full_leaderboard_data(
84
- "test",
85
- agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
86
- )
87
-
88
- if test_df.empty:
89
- gr.Markdown(
90
- "No alternative agent submissions yet. New runs land in "
91
- "`alternative_agents/{type}/{model}/` in "
92
- "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
93
- )
94
- return
95
-
96
- test_df = _append_openhands_shared_models(test_df, split="test")
97
-
98
- create_leaderboard_display(
99
- full_df=test_df,
100
- tag_map=test_tag_map,
101
- category_name="Overall",
102
- split_name="test",
103
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -35,7 +35,6 @@ from app_creation import build_page as build_app_creation_page
35
  from frontend_development import build_page as build_frontend_page
36
  from test_generation import build_page as build_test_generation_page
37
  from information_gathering import build_page as build_information_gathering_page
38
- from alternative_agents_page import build_page as build_alternative_agents_page
39
  from about import build_page as build_about_page
40
 
41
  logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
@@ -374,46 +373,20 @@ with demo.route("Testing", "/testing"):
374
  with demo.route("Information Gathering", "/information-gathering"):
375
  build_information_gathering_page()
376
 
377
- with demo.route("Alternative Agents", "/alternative-agents"):
378
- build_alternative_agents_page()
379
-
380
  with demo.route("About", "/about"):
381
  build_about_page()
382
 
383
  logger.info("All routes configured")
384
 
385
  # Mount the REST API on /api
386
- from fastapi import FastAPI, Request
387
- from fastapi.responses import RedirectResponse
388
- from starlette.middleware.base import BaseHTTPMiddleware
389
  from api import api_app
390
 
391
-
392
- class RootRedirectMiddleware(BaseHTTPMiddleware):
393
- """Middleware to redirect root path "/" to "/home".
394
-
395
- This fixes the 307 trailing slash redirect issue (Gradio bug #11071) that
396
- occurs when Gradio is mounted at "/" - FastAPI's default behavior redirects
397
- "/" to "//", which breaks routing on HuggingFace Spaces.
398
-
399
- See: https://github.com/gradio-app/gradio/issues/11071
400
- """
401
- async def dispatch(self, request: Request, call_next):
402
- if request.url.path == "/":
403
- return RedirectResponse(url="/home", status_code=302)
404
- return await call_next(request)
405
-
406
-
407
- # Create a parent FastAPI app with redirect_slashes=False to prevent
408
- # automatic trailing slash redirects that cause issues with Gradio
409
- root_app = FastAPI(redirect_slashes=False)
410
-
411
- # Add middleware to handle root path redirect to /home
412
- root_app.add_middleware(RootRedirectMiddleware)
413
-
414
  root_app.mount("/api", api_app)
415
 
416
- # Mount Gradio app at root path
417
  app = gr.mount_gradio_app(root_app, demo, path="/")
418
  logger.info("REST API mounted at /api, Gradio app mounted at /")
419
 
 
35
  from frontend_development import build_page as build_frontend_page
36
  from test_generation import build_page as build_test_generation_page
37
  from information_gathering import build_page as build_information_gathering_page
 
38
  from about import build_page as build_about_page
39
 
40
  logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
 
373
  with demo.route("Information Gathering", "/information-gathering"):
374
  build_information_gathering_page()
375
 
 
 
 
376
  with demo.route("About", "/about"):
377
  build_about_page()
378
 
379
  logger.info("All routes configured")
380
 
381
  # Mount the REST API on /api
382
+ from fastapi import FastAPI
 
 
383
  from api import api_app
384
 
385
+ # Create a parent FastAPI app that will host both the API and Gradio
386
+ root_app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  root_app.mount("/api", api_app)
388
 
389
+ # Mount Gradio app - root redirect is handled by the proxy
390
  app = gr.mount_gradio_app(root_app, demo, path="/")
391
  logger.info("REST API mounted at /api, Gradio app mounted at /")
392
 
assets/harnesses/README.md DELETED
@@ -1,59 +0,0 @@
1
- # Agent harness logos
2
-
3
- This folder holds the **bottom half** of the composite scatter markers used
4
- on the [Alternative Agents](../../alternative_agents_page.py) page. Each
5
- point on that scatter stacks two logos: the model provider on top (from
6
- `assets/logo-*.svg`) and the harness on the bottom (from this folder).
7
-
8
- ## Expected filenames
9
-
10
- The scatter code looks up a logo by the exact `agent_name` string that the
11
- `push-to-index` workflow writes into the index repo's `metadata.json`, then
12
- maps it through `HARNESS_LOGO_STEMS` in `leaderboard_transformer.py`. Keep
13
- these filenames in sync with that map.
14
-
15
- | `agent_name` (in index repo) | File in this folder |
16
- | --- | --- |
17
- | `Claude Code` | `claude-code.svg` or `claude-code.png` |
18
- | `Codex` | `codex-cli.svg` or `codex-cli.png` |
19
- | `Gemini CLI` | `gemini-cli.svg` or `gemini-cli.png` |
20
- | `OpenHands` | `openhands.svg` or `openhands.png` |
21
- | `OpenHands Sub-agents` | `openhands.svg` or `openhands.png` (shared with `OpenHands`) |
22
-
23
- Both `.svg` and `.png` are accepted — the resolver tries `.svg` first, then
24
- `.png`. **Prefer SVG when possible**: the HuggingFace Space rejects new
25
- binary files on plain `git push` and routes PNGs through Xet, so an SVG is
26
- one less thing to set up.
27
-
28
- ## When a file is missing
29
-
30
- The scatter falls back to a single marker (just the model provider logo) —
31
- exactly the same rendering path the canonical OpenHands pages use. Nothing
32
- crashes and nothing prints a warning in normal operation. This means you
33
- can roll out logos one harness at a time without waiting for all four.
34
-
35
- ## Sizing and shape
36
-
37
- - Square canvas. The composite marker is drawn at a fixed aspect ratio, so
38
- a non-square logo will get squished.
39
- - Any SVG `viewBox` works — the renderer base64-encodes the file as-is and
40
- Plotly scales it to the marker's `sizex` / `sizey`. Around `80×80` to
41
- `256×256` is a good source size.
42
- - Leave some internal padding (≈10%) so the logo doesn't touch the marker
43
- edge when two are stacked.
44
- - No background is required, but a rounded-square coloured tile reads well
45
- at small sizes because it gives each harness a distinct silhouette even
46
- when the inner glyph isn't fully legible. Look at the existing
47
- `assets/logo-*.svg` files for the canonical model provider logos if you
48
- want a visual reference for sizing.
49
-
50
- ## Adding a new harness
51
-
52
- 1. Decide on the exact `agent_name` that the push-to-index workflow writes
53
- for the new harness (see `AGENT_NAME_BY_TYPE` in
54
- `OpenHands/evaluation/push-to-index-job/scripts/push_to_index_from_archive.py`).
55
- 2. Add an entry to `HARNESS_LOGO_STEMS` in
56
- [`leaderboard_transformer.py`](../../leaderboard_transformer.py) that
57
- maps the display name to a stem.
58
- 3. Drop `{stem}.svg` (or `.png`) into this folder.
59
- 4. Reload the app and look at `/alternative-agents`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/harnesses/claude-code.svg DELETED
assets/harnesses/codex-cli.svg DELETED
assets/harnesses/gemini-cli.svg DELETED
assets/harnesses/openhands.svg DELETED
docs/screenshots/alternative-agents.png DELETED

Git LFS Details

  • SHA256: 99766c7d2c11a6f90f24a5f0effbae74a8aa33096b89ff1c4fcfb238fe06a2f5
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB
leaderboard_transformer.py CHANGED
@@ -228,17 +228,17 @@ def get_country_from_model(model_name: str) -> dict:
228
  def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
229
  """
230
  Gets the appropriate icon based on the mark_by selection.
231
-
232
  Args:
233
  model_name: The model name
234
  openness: The openness value (open/closed)
235
  mark_by: One of "Company", "Openness", or "Country"
236
-
237
  Returns:
238
  dict with 'path' and 'name' keys
239
  """
240
  from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
241
-
242
  if mark_by == MARK_BY_OPENNESS:
243
  return get_openness_icon(openness)
244
  elif mark_by == MARK_BY_COUNTRY:
@@ -247,59 +247,6 @@ def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
247
  return get_company_from_model(model_name)
248
 
249
 
250
- # Map the agent_name stored in the index repo's metadata.json to a file stem
251
- # inside assets/harnesses/. Kept in sync with AGENT_NAME_BY_TYPE in
252
- # OpenHands/evaluation push_to_index_from_archive.py — if a new ACP harness
253
- # lands there, add the corresponding display name and a matching stem here.
254
- #
255
- # The scatter plot looks for {stem}.svg first, then {stem}.png in
256
- # assets/harnesses/. This repo intentionally ships only a README in that
257
- # folder: drop the logo files in by hand (SVG preferred, PNG works too via
258
- # HF Xet) and they'll be picked up on the next app restart. If the file is
259
- # missing, get_harness_icon() returns None and the scatter falls back to the
260
- # single-marker path — same rendering the canonical OpenHands pages use —
261
- # so logos can be added one harness at a time without breaking anything.
262
- HARNESS_LOGO_STEMS: dict[str, str] = {
263
- "Claude Code": "claude-code",
264
- "Codex": "codex-cli",
265
- "Gemini CLI": "gemini-cli",
266
- "OpenHands": "openhands",
267
- "OpenHands Sub-agents": "openhands",
268
- }
269
- HARNESS_LOGO_DIR = "assets/harnesses"
270
- HARNESS_LOGO_EXTENSIONS = ("svg", "png")
271
-
272
-
273
- def get_harness_icon(agent_name: Optional[str]) -> Optional[dict]:
274
- """Return {'path', 'name'} for the harness logo, or None if not usable.
275
-
276
- Consumed by the Alternative Agents scatter plot to draw a composite
277
- marker (model provider on top, harness on bottom). Returns None in any
278
- of three cases, all of which make the caller skip the harness layer:
279
-
280
- - ``agent_name`` is empty or missing from the dataframe row.
281
- - ``agent_name`` isn't in ``HARNESS_LOGO_STEMS`` (new harness that
282
- hasn't been registered yet — register it and drop in a logo).
283
- - The logo file for that stem doesn't exist in ``assets/harnesses/``
284
- yet (the repo ships only the README).
285
-
286
- That third case is the important one: it lets the Alternative Agents
287
- page work immediately after checkout even when the harness logo files
288
- haven't been dropped in. The corresponding points just render like a
289
- canonical-page marker (model logo only) until the file is added.
290
- """
291
- if not agent_name:
292
- return None
293
- stem = HARNESS_LOGO_STEMS.get(str(agent_name).strip())
294
- if stem is None:
295
- return None
296
- for ext in HARNESS_LOGO_EXTENSIONS:
297
- path = f"{HARNESS_LOGO_DIR}/{stem}.{ext}"
298
- if os.path.exists(path):
299
- return {"path": path, "name": agent_name}
300
- return None
301
-
302
-
303
  # Standard layout configuration for all charts
304
  STANDARD_LAYOUT = dict(
305
  template="plotly_white",
@@ -708,7 +655,6 @@ def _pretty_column_name(raw_col: str) -> str:
708
  # Case 1: Handle fixed, special-case mappings first.
709
  fixed_mappings = {
710
  'id': 'id',
711
- 'agent_name': 'Agent',
712
  'SDK version': 'SDK Version',
713
  'Openhands version': 'SDK Version', # Legacy support
714
  'Language model': 'Language Model',
@@ -869,21 +815,7 @@ class DataTransformer:
869
  df_view = df_sorted.copy()
870
 
871
  # --- 3. Add Columns for Agent Openness ---
872
- # Only include the "Agent" column when the dataframe actually has
873
- # more than one distinct agent. On the canonical OpenHands pages
874
- # every row says "OpenHands", so adding the column is just noise;
875
- # on the Alternative Agents page rows differ (Claude Code / Codex
876
- # / Gemini CLI / OpenHands Sub-agents), so the column carries
877
- # signal and disambiguates same-model rows from different
878
- # harnesses.
879
- has_mixed_agents = (
880
- "Agent" in df_view.columns
881
- and df_view["Agent"].dropna().nunique() > 1
882
- )
883
- if has_mixed_agents:
884
- base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
885
- else:
886
- base_cols = ["id", "Language Model", "SDK Version", "Source"]
887
  new_cols = ["Openness"]
888
  ending_cols = ["Date", "Logs", "Visualization"]
889
 
@@ -1086,18 +1018,13 @@ def _plot_scatter_plotly(
1086
  """
1087
  Builds the complete HTML string for the plot's hover tooltip.
1088
  Format: {lm_name} (SDK {version})
1089
- Harness: {agent} (only when the row carries an Agent —
1090
- Alternative Agents page only; the
1091
- canonical OpenHands pages drop the
1092
- Agent column in view() so this line
1093
- is skipped there)
1094
  Average Score: {score}
1095
  Average Cost/Runtime: {value}
1096
  Openness: {openness}
1097
  """
1098
  h_pad = " "
1099
  parts = ["<br>"]
1100
-
1101
  # Get and clean the language model name
1102
  llm_base_value = row.get('Language Model', '')
1103
  llm_base_value = clean_llm_base_list(llm_base_value)
@@ -1105,21 +1032,13 @@ def _plot_scatter_plotly(
1105
  lm_name = llm_base_value[0]
1106
  else:
1107
  lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
1108
-
1109
  # Get SDK version
1110
  sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
1111
-
1112
  # Title line: {lm_name} (SDK {version})
1113
  parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
1114
-
1115
- # Harness line — only on pages where the Agent column is present
1116
- # (Alternative Agents). Without this, two rows for the same LM run
1117
- # under different harnesses (e.g. Claude Code vs OpenHands Sub-agents
1118
- # on claude-sonnet-4-5) are indistinguishable on hover.
1119
- agent_value = row.get('Agent')
1120
- if agent_value is not None and pd.notna(agent_value) and str(agent_value).strip():
1121
- parts.append(f"{h_pad}Harness: <b>{agent_value}</b>{h_pad}<br>")
1122
-
1123
  # Average Score
1124
  parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
1125
 
@@ -1192,116 +1111,51 @@ def _plot_scatter_plotly(
1192
  y_min = min_score - 5 if min_score > 5 else 0
1193
  y_max = max_score + 5
1194
 
1195
- # Cache base64-encoded logos across rows — every Claude model on the
1196
- # Alternative Agents page points at the same assets/harnesses/claude-code.svg,
1197
- # so encoding once per path is ~N× cheaper than once per point.
1198
- _logo_cache: dict[str, str] = {}
1199
- def _encode_logo(path: str) -> Optional[str]:
1200
- if path in _logo_cache:
1201
- return _logo_cache[path]
1202
- if not os.path.exists(path):
1203
- return None
1204
- try:
1205
- with open(path, "rb") as f:
1206
- encoded = base64.b64encode(f.read()).decode("utf-8")
1207
- except Exception as e:
1208
- logger.warning(f"Could not load logo {path}: {e}")
1209
- return None
1210
- mime = "svg+xml" if path.lower().endswith(".svg") else "png"
1211
- uri = f"data:image/{mime};base64,{encoded}"
1212
- _logo_cache[path] = uri
1213
- return uri
1214
-
1215
- # Composite markers: on the Alternative Agents page the dataframe carries
1216
- # an "Agent" column (Claude Code / Codex / Gemini CLI / OpenHands Sub-agents),
1217
- # so a point for claude-sonnet-4-5 under Claude Code and under OpenHands
1218
- # Sub-agents would otherwise share the exact same Anthropic logo marker
1219
- # and be visually indistinguishable. When Agent is present, we stack
1220
- # two logos at each point: model provider on top, harness on the bottom.
1221
- # Canonical OpenHands pages drop the Agent column in view() (via the
1222
- # has_mixed_agents check), so they fall through to the single-logo path
1223
- # and render exactly as before.
1224
- has_harness_column = (
1225
- "Agent" in data_plot.columns
1226
- and data_plot["Agent"].dropna().astype(str).str.strip().ne("").any()
1227
- )
1228
-
1229
- # Marker sizes. The composite variant fits two logos inside roughly the
1230
- # same vertical footprint as a single marker, so each half is slightly
1231
- # smaller and the two halves are offset symmetrically around the point's
1232
- # true y-coordinate.
1233
- SINGLE_SIZE_X, SINGLE_SIZE_Y = 0.04, 0.06
1234
- STACKED_SIZE_X, STACKED_SIZE_Y = 0.035, 0.048
1235
- STACKED_Y_OFFSET = 0.028 # half-separation between model (top) and harness (bottom)
1236
-
1237
  for _, row in data_plot.iterrows():
1238
  model_name = row.get('Language Model', '')
1239
  openness = row.get('Openness', '')
1240
  marker_info = get_marker_icon(model_name, openness, mark_by)
1241
- model_logo_uri = _encode_logo(marker_info['path'])
1242
- if model_logo_uri is None:
1243
- continue
1244
-
1245
- # Harness (only meaningful when the dataframe carries an Agent column).
1246
- harness_uri = None
1247
- if has_harness_column:
1248
- harness_info = get_harness_icon(row.get("Agent"))
1249
- if harness_info is not None:
1250
- harness_uri = _encode_logo(harness_info["path"])
1251
-
1252
- x_val = row[x_col_to_use]
1253
- y_val = row[y_col_to_use]
1254
-
1255
- # Convert to domain coordinates (0-1 range)
1256
- # For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
1257
- if x_val > 0:
1258
- log_x = np.log10(x_val)
1259
- domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
1260
- else:
1261
- domain_x = 0
1262
-
1263
- # For linear y: domain_y = (y - y_min) / (y_max - y_min)
1264
- domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
1265
-
1266
- # Clamp to valid range
1267
- domain_x = max(0, min(1, domain_x))
1268
- domain_y = max(0, min(1, domain_y))
1269
-
1270
- if harness_uri is not None:
1271
- # Composite: stack model on top, harness on bottom, clamping
1272
- # each half to the plot area so markers near the edges don't
1273
- # drift off-canvas.
1274
- model_y = min(1, domain_y + STACKED_Y_OFFSET)
1275
- harness_y = max(0, domain_y - STACKED_Y_OFFSET)
1276
- layout_images.append(dict(
1277
- source=model_logo_uri,
1278
- xref="x domain", yref="y domain",
1279
- x=domain_x, y=model_y,
1280
- sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
1281
- xanchor="center", yanchor="middle",
1282
- layer="above",
1283
- ))
1284
- layout_images.append(dict(
1285
- source=harness_uri,
1286
- xref="x domain", yref="y domain",
1287
- x=domain_x, y=harness_y,
1288
- sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
1289
- xanchor="center", yanchor="middle",
1290
- layer="above",
1291
- ))
1292
- else:
1293
- # Single marker (canonical OpenHands pages, or Alternative Agents
1294
- # rows with an unknown harness name — the latter shouldn't happen
1295
- # in practice since HARNESS_LOGO_STEMS covers every agent_name the
1296
- # push-to-index script emits).
1297
- layout_images.append(dict(
1298
- source=model_logo_uri,
1299
- xref="x domain", yref="y domain",
1300
- x=domain_x, y=domain_y,
1301
- sizex=SINGLE_SIZE_X, sizey=SINGLE_SIZE_Y,
1302
- xanchor="center", yanchor="middle",
1303
- layer="above",
1304
- ))
1305
 
1306
  # --- Section 7: Add Model Name Labels to Frontier Points ---
1307
  if frontier_rows:
@@ -1472,47 +1326,38 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
1472
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
1473
 
1474
 
1475
- def _hidden_runtime_sort_key(runtime_value: float | int | None, score_value: float | int | None) -> str:
1476
- """Build a hidden prefix so Gradio's string-based runtime sorting behaves numerically."""
1477
- if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
1478
- return f"{float(runtime_value):020.6f}"
1479
- if pd.notna(score_value):
1480
- return "99999999999999999998"
1481
- return "99999999999999999999"
1482
-
1483
-
1484
  def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
1485
  """
1486
  Applies custom formatting to a runtime column based on its corresponding score column.
1487
  - If runtime is not null, formats as time with 's' suffix.
1488
  - If runtime is null but score is not, it becomes "Missing".
1489
  - If both runtime and score are null, it becomes "Not Submitted".
1490
- - Adds a hidden, zero-padded numeric prefix so Gradio sorts the column numerically.
1491
  Args:
1492
  df: The DataFrame to modify.
1493
  runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
1494
  Returns:
1495
  The DataFrame with the formatted runtime column.
1496
  """
 
1497
  score_col_name = runtime_col_name.replace("Runtime", "Score")
1498
 
 
1499
  if score_col_name not in df.columns:
1500
- return df
1501
 
1502
  def apply_formatting_logic(row):
1503
  runtime_value = row[runtime_col_name]
1504
  score_value = row[score_col_name]
1505
  status_color = "#ec4899"
1506
- sort_key = _hidden_runtime_sort_key(runtime_value, score_value)
1507
- hidden_sort_prefix = f'<span style="display:none">{sort_key}</span>'
1508
 
1509
  if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
1510
- return f"{hidden_sort_prefix}{runtime_value:.0f}s"
1511
  elif pd.notna(score_value):
1512
- return f'{hidden_sort_prefix}<span style="color: {status_color};">Missing</span>'
1513
  else:
1514
- return f'{hidden_sort_prefix}<span style="color: {status_color};">Not Submitted</span>'
1515
 
 
1516
  df[runtime_col_name] = df.apply(apply_formatting_logic, axis=1)
1517
 
1518
  return df
 
228
  def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
229
  """
230
  Gets the appropriate icon based on the mark_by selection.
231
+
232
  Args:
233
  model_name: The model name
234
  openness: The openness value (open/closed)
235
  mark_by: One of "Company", "Openness", or "Country"
236
+
237
  Returns:
238
  dict with 'path' and 'name' keys
239
  """
240
  from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
241
+
242
  if mark_by == MARK_BY_OPENNESS:
243
  return get_openness_icon(openness)
244
  elif mark_by == MARK_BY_COUNTRY:
 
247
  return get_company_from_model(model_name)
248
 
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  # Standard layout configuration for all charts
251
  STANDARD_LAYOUT = dict(
252
  template="plotly_white",
 
655
  # Case 1: Handle fixed, special-case mappings first.
656
  fixed_mappings = {
657
  'id': 'id',
 
658
  'SDK version': 'SDK Version',
659
  'Openhands version': 'SDK Version', # Legacy support
660
  'Language model': 'Language Model',
 
815
  df_view = df_sorted.copy()
816
 
817
  # --- 3. Add Columns for Agent Openness ---
818
+ base_cols = ["id","Language Model","SDK Version","Source"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
819
  new_cols = ["Openness"]
820
  ending_cols = ["Date", "Logs", "Visualization"]
821
 
 
1018
  """
1019
  Builds the complete HTML string for the plot's hover tooltip.
1020
  Format: {lm_name} (SDK {version})
 
 
 
 
 
1021
  Average Score: {score}
1022
  Average Cost/Runtime: {value}
1023
  Openness: {openness}
1024
  """
1025
  h_pad = " "
1026
  parts = ["<br>"]
1027
+
1028
  # Get and clean the language model name
1029
  llm_base_value = row.get('Language Model', '')
1030
  llm_base_value = clean_llm_base_list(llm_base_value)
 
1032
  lm_name = llm_base_value[0]
1033
  else:
1034
  lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
1035
+
1036
  # Get SDK version
1037
  sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
1038
+
1039
  # Title line: {lm_name} (SDK {version})
1040
  parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
1041
+
 
 
 
 
 
 
 
 
1042
  # Average Score
1043
  parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
1044
 
 
1111
  y_min = min_score - 5 if min_score > 5 else 0
1112
  y_max = max_score + 5
1113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1114
  for _, row in data_plot.iterrows():
1115
  model_name = row.get('Language Model', '')
1116
  openness = row.get('Openness', '')
1117
  marker_info = get_marker_icon(model_name, openness, mark_by)
1118
+ logo_path = marker_info['path']
1119
+
1120
+ # Read the SVG file and encode as base64 data URI
1121
+ if os.path.exists(logo_path):
1122
+ try:
1123
+ with open(logo_path, 'rb') as f:
1124
+ encoded_logo = base64.b64encode(f.read()).decode('utf-8')
1125
+ logo_uri = f"data:image/svg+xml;base64,{encoded_logo}"
1126
+
1127
+ x_val = row[x_col_to_use]
1128
+ y_val = row[y_col_to_use]
1129
+
1130
+ # Convert to domain coordinates (0-1 range)
1131
+ # For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
1132
+ if x_val > 0:
1133
+ log_x = np.log10(x_val)
1134
+ domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
1135
+ else:
1136
+ domain_x = 0
1137
+
1138
+ # For linear y: domain_y = (y - y_min) / (y_max - y_min)
1139
+ domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
1140
+
1141
+ # Clamp to valid range
1142
+ domain_x = max(0, min(1, domain_x))
1143
+ domain_y = max(0, min(1, domain_y))
1144
+
1145
+ layout_images.append(dict(
1146
+ source=logo_uri,
1147
+ xref="x domain", # Use domain coordinates for log scale compatibility
1148
+ yref="y domain",
1149
+ x=domain_x,
1150
+ y=domain_y,
1151
+ sizex=0.04, # Size as fraction of plot width
1152
+ sizey=0.06, # Size as fraction of plot height
1153
+ xanchor="center",
1154
+ yanchor="middle",
1155
+ layer="above"
1156
+ ))
1157
+ except Exception as e:
1158
+ logger.warning(f"Could not load logo {logo_path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1159
 
1160
  # --- Section 7: Add Model Name Labels to Frontier Points ---
1161
  if frontier_rows:
 
1326
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
1327
 
1328
 
 
 
 
 
 
 
 
 
 
1329
  def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
1330
  """
1331
  Applies custom formatting to a runtime column based on its corresponding score column.
1332
  - If runtime is not null, formats as time with 's' suffix.
1333
  - If runtime is null but score is not, it becomes "Missing".
1334
  - If both runtime and score are null, it becomes "Not Submitted".
 
1335
  Args:
1336
  df: The DataFrame to modify.
1337
  runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
1338
  Returns:
1339
  The DataFrame with the formatted runtime column.
1340
  """
1341
+ # Find the corresponding score column by replacing "Runtime" with "Score"
1342
  score_col_name = runtime_col_name.replace("Runtime", "Score")
1343
 
1344
+ # Ensure the score column actually exists to avoid errors
1345
  if score_col_name not in df.columns:
1346
+ return df # Return the DataFrame unmodified if there's no matching score
1347
 
1348
  def apply_formatting_logic(row):
1349
  runtime_value = row[runtime_col_name]
1350
  score_value = row[score_col_name]
1351
  status_color = "#ec4899"
 
 
1352
 
1353
  if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
1354
+ return f"{runtime_value:.0f}s"
1355
  elif pd.notna(score_value):
1356
+ return f'<span style="color: {status_color};">Missing</span>' # Score exists, but runtime is missing
1357
  else:
1358
+ return f'<span style="color: {status_color};">Not Submitted</span>' # Neither score nor runtime exists
1359
 
1360
+ # Apply the logic to the specified runtime column and update the DataFrame
1361
  df[runtime_col_name] = df.apply(apply_formatting_logic, axis=1)
1362
 
1363
  return df
main_page.py CHANGED
@@ -1,7 +1,6 @@
1
  import matplotlib
2
  matplotlib.use('Agg')
3
  import gradio as gr
4
- import pandas as pd
5
 
6
 
7
  from ui_components import (
@@ -27,32 +26,6 @@ from constants import MARK_BY_DEFAULT
27
  CACHED_VIEWERS = {}
28
  CACHED_TAG_MAPS = {}
29
 
30
-
31
- def filter_complete_entries(df: pd.DataFrame) -> pd.DataFrame:
32
- if df.empty:
33
- return df.copy()
34
-
35
- category_score_columns = [
36
- 'Issue Resolution Score',
37
- 'Frontend Score',
38
- 'Greenfield Score',
39
- 'Testing Score',
40
- 'Information Gathering Score',
41
- ]
42
-
43
- if all(column in df.columns for column in category_score_columns):
44
- return df[df[category_score_columns].notna().all(axis=1)].copy()
45
-
46
- if 'Categories Completed' in df.columns:
47
- categories_completed = pd.to_numeric(df['Categories Completed'], errors='coerce')
48
- return df[categories_completed >= 5].copy()
49
-
50
- if 'Categories Attempted' in df.columns:
51
- return df[df['Categories Attempted'] == '5/5'].copy()
52
-
53
- return df.copy()
54
-
55
-
56
  def build_page():
57
  with gr.Row(elem_id="intro-row"):
58
  with gr.Column(scale=1):
@@ -65,91 +38,78 @@ def build_page():
65
 
66
  test_df, test_tag_map = get_full_leaderboard_data("test")
67
  if not test_df.empty:
68
- show_incomplete_checkbox, show_open_only_checkbox, mark_by_dropdown = create_leaderboard_display(
 
69
  full_df=test_df,
70
  tag_map=test_tag_map,
71
  category_name=CATEGORY_NAME,
72
  split_name="test"
73
  )
74
-
75
- test_df_complete = filter_complete_entries(test_df)
76
- has_complete_entries = len(test_df_complete) > 0
77
-
78
  if 'Openness' in test_df.columns:
79
  test_df_open = test_df[test_df['Openness'].str.lower() == 'open'].copy()
80
  else:
81
  test_df_open = test_df.copy()
82
- test_df_complete_open = filter_complete_entries(test_df_open)
83
-
84
- initial_df = test_df_complete if has_complete_entries else test_df
85
-
86
  # --- Winners by Category Section ---
87
  gr.Markdown("---")
88
  gr.HTML('<h2>Winners by Category</h2>', elem_id="winners-header")
89
  gr.Markdown("Top 5 performing systems in each benchmark category.")
90
-
91
- winners_component = gr.HTML(
92
- create_winners_by_category_html(initial_df, top_n=5),
93
- elem_id="winners-by-category",
94
- )
95
-
 
96
  # --- New Visualization Sections ---
97
  gr.Markdown("---")
98
-
99
  # Evolution Over Time Section
100
  gr.HTML('<h2>Evolution Over Time</h2>', elem_id="evolution-header")
101
  gr.Markdown("Track how model performance has improved over time based on release dates.")
102
-
103
- evolution_component = gr.Plot(
104
- value=create_evolution_over_time_chart(initial_df, MARK_BY_DEFAULT),
105
- elem_id="evolution-chart",
106
- )
107
-
108
  gr.Markdown("---")
109
-
110
  # Open Model Accuracy by Size Section (always shows open models only by design)
111
  gr.HTML('<h2>Open Model Accuracy by Size</h2>', elem_id="size-accuracy-header")
112
  gr.Markdown("Compare open-weights model performance against their parameter count.")
113
-
114
- size_component = gr.Plot(
115
- value=create_accuracy_by_size_chart(initial_df, MARK_BY_DEFAULT),
116
- elem_id="size-accuracy-chart",
117
- )
118
-
119
- def update_extra_sections(show_incomplete, show_open_only, mark_by):
120
- include_incomplete = show_incomplete or not has_complete_entries
121
- base_df = test_df if include_incomplete else test_df_complete
122
- base_df_open = test_df_open if include_incomplete else test_df_complete_open
123
- winners_df = base_df_open if show_open_only else base_df
124
-
125
- winners_html = create_winners_by_category_html(winners_df, top_n=5)
126
- evolution_fig = create_evolution_over_time_chart(winners_df, mark_by)
127
- size_fig = create_accuracy_by_size_chart(base_df, mark_by)
128
-
129
  return winners_html, evolution_fig, size_fig
130
-
131
- show_incomplete_input = show_incomplete_checkbox if show_incomplete_checkbox is not None else gr.State(value=True)
132
- show_open_only_input = show_open_only_checkbox if show_open_only_checkbox is not None else gr.State(value=False)
133
- extra_section_inputs = [show_incomplete_input, show_open_only_input, mark_by_dropdown]
134
-
135
- if show_incomplete_checkbox is not None:
136
- show_incomplete_checkbox.change(
137
- fn=update_extra_sections,
138
- inputs=extra_section_inputs,
139
- outputs=[winners_component, evolution_component, size_component]
140
- )
141
-
142
  if show_open_only_checkbox is not None:
143
  show_open_only_checkbox.change(
144
  fn=update_extra_sections,
145
- inputs=extra_section_inputs,
146
  outputs=[winners_component, evolution_component, size_component]
147
  )
148
-
149
  if mark_by_dropdown is not None:
150
  mark_by_dropdown.change(
151
  fn=update_extra_sections,
152
- inputs=extra_section_inputs,
153
  outputs=[winners_component, evolution_component, size_component]
154
  )
155
 
 
1
  import matplotlib
2
  matplotlib.use('Agg')
3
  import gradio as gr
 
4
 
5
 
6
  from ui_components import (
 
26
  CACHED_VIEWERS = {}
27
  CACHED_TAG_MAPS = {}
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def build_page():
30
  with gr.Row(elem_id="intro-row"):
31
  with gr.Column(scale=1):
 
38
 
39
  test_df, test_tag_map = get_full_leaderboard_data("test")
40
  if not test_df.empty:
41
+ # Get the checkbox and dropdown returned from create_leaderboard_display
42
+ show_open_only_checkbox, mark_by_dropdown = create_leaderboard_display(
43
  full_df=test_df,
44
  tag_map=test_tag_map,
45
  category_name=CATEGORY_NAME,
46
  split_name="test"
47
  )
48
+
49
+ # Prepare open-only filtered dataframe for Winners and Evolution
 
 
50
  if 'Openness' in test_df.columns:
51
  test_df_open = test_df[test_df['Openness'].str.lower() == 'open'].copy()
52
  else:
53
  test_df_open = test_df.copy()
54
+
 
 
 
55
  # --- Winners by Category Section ---
56
  gr.Markdown("---")
57
  gr.HTML('<h2>Winners by Category</h2>', elem_id="winners-header")
58
  gr.Markdown("Top 5 performing systems in each benchmark category.")
59
+
60
+ # Create both all and open-only versions of winners HTML
61
+ winners_html_all = create_winners_by_category_html(test_df, top_n=5)
62
+ winners_html_open = create_winners_by_category_html(test_df_open, top_n=5)
63
+
64
+ winners_component = gr.HTML(winners_html_all, elem_id="winners-by-category")
65
+
66
  # --- New Visualization Sections ---
67
  gr.Markdown("---")
68
+
69
  # Evolution Over Time Section
70
  gr.HTML('<h2>Evolution Over Time</h2>', elem_id="evolution-header")
71
  gr.Markdown("Track how model performance has improved over time based on release dates.")
72
+
73
+ # Create initial evolution chart with default mark_by
74
+ evolution_fig_all = create_evolution_over_time_chart(test_df, MARK_BY_DEFAULT)
75
+
76
+ evolution_component = gr.Plot(value=evolution_fig_all, elem_id="evolution-chart")
77
+
78
  gr.Markdown("---")
79
+
80
  # Open Model Accuracy by Size Section (always shows open models only by design)
81
  gr.HTML('<h2>Open Model Accuracy by Size</h2>', elem_id="size-accuracy-header")
82
  gr.Markdown("Compare open-weights model performance against their parameter count.")
83
+
84
+ size_fig = create_accuracy_by_size_chart(test_df, MARK_BY_DEFAULT)
85
+ size_component = gr.Plot(value=size_fig, elem_id="size-accuracy-chart")
86
+
87
+ # Update function for Winners, Evolution, and Size charts based on filters
88
+ def update_extra_sections(show_open_only, mark_by):
89
+ # Select the appropriate dataframe based on open_only filter
90
+ df_to_use = test_df_open if show_open_only else test_df
91
+
92
+ # Winners HTML (not affected by mark_by, only open_only)
93
+ winners_html = winners_html_open if show_open_only else winners_html_all
94
+
95
+ # Regenerate charts with current mark_by setting
96
+ evolution_fig = create_evolution_over_time_chart(df_to_use, mark_by)
97
+ size_fig = create_accuracy_by_size_chart(test_df, mark_by) # Size chart always uses full df (filters internally)
98
+
99
  return winners_html, evolution_fig, size_fig
100
+
101
+ # Connect both checkbox and dropdown to update all extra sections
 
 
 
 
 
 
 
 
 
 
102
  if show_open_only_checkbox is not None:
103
  show_open_only_checkbox.change(
104
  fn=update_extra_sections,
105
+ inputs=[show_open_only_checkbox, mark_by_dropdown],
106
  outputs=[winners_component, evolution_component, size_component]
107
  )
108
+
109
  if mark_by_dropdown is not None:
110
  mark_by_dropdown.change(
111
  fn=update_extra_sections,
112
+ inputs=[show_open_only_checkbox if show_open_only_checkbox else gr.State(value=False), mark_by_dropdown],
113
  outputs=[winners_component, evolution_component, size_component]
114
  )
115
 
setup_data.py CHANGED
@@ -70,39 +70,27 @@ def fetch_data_from_github():
70
 
71
  # Look for data files in the cloned repository
72
  results_source = clone_dir / "results"
73
-
74
  if not results_source.exists():
75
  print(f"Results directory not found in repository")
76
  return False
77
-
78
  # Check if there are any agent result directories
79
  result_dirs = list(results_source.iterdir())
80
  if not result_dirs:
81
  print(f"No agent results found in {results_source}")
82
  return False
83
-
84
  print(f"Found {len(result_dirs)} agent result directories")
85
-
86
  # Create target directory and copy the results structure
87
  os.makedirs(target_dir.parent, exist_ok=True)
88
  if target_dir.exists():
89
  shutil.rmtree(target_dir)
90
-
91
  # Copy the entire results directory
92
  target_results = target_dir / "results"
93
  shutil.copytree(results_source, target_results)
94
-
95
- # Also copy alternative_agents/ if present, so the loader can pick up
96
- # ACP runs (acp-claude, acp-codex, acp-gemini, openhands_subagents, ...)
97
- # alongside the default OpenHands agent results.
98
- alt_source = clone_dir / "alternative_agents"
99
- if alt_source.exists():
100
- alt_target = target_dir / "alternative_agents"
101
- shutil.copytree(alt_source, alt_target)
102
- agent_types = sorted(p.name for p in alt_source.iterdir() if p.is_dir())
103
- print(f"Found alternative agent types: {agent_types}")
104
- else:
105
- print("No alternative_agents/ directory in repository (skipping)")
106
 
107
  print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
108
 
 
70
 
71
  # Look for data files in the cloned repository
72
  results_source = clone_dir / "results"
73
+
74
  if not results_source.exists():
75
  print(f"Results directory not found in repository")
76
  return False
77
+
78
  # Check if there are any agent result directories
79
  result_dirs = list(results_source.iterdir())
80
  if not result_dirs:
81
  print(f"No agent results found in {results_source}")
82
  return False
83
+
84
  print(f"Found {len(result_dirs)} agent result directories")
85
+
86
  # Create target directory and copy the results structure
87
  os.makedirs(target_dir.parent, exist_ok=True)
88
  if target_dir.exists():
89
  shutil.rmtree(target_dir)
90
+
91
  # Copy the entire results directory
92
  target_results = target_dir / "results"
93
  shutil.copytree(results_source, target_results)
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
96
 
simple_data_loader.py CHANGED
@@ -96,43 +96,17 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
96
 
97
  class SimpleLeaderboardViewer:
98
  """Simple replacement for agent-eval's LeaderboardViewer."""
99
-
100
- AGENT_FILTER_OPENHANDS = "openhands"
101
- AGENT_FILTER_ALTERNATIVE = "alternative"
102
-
103
- def __init__(
104
- self,
105
- data_dir: str,
106
- config: str,
107
- split: str,
108
- agent_filter: str = AGENT_FILTER_OPENHANDS,
109
- ):
110
  """
111
  Args:
112
  data_dir: Path to data directory
113
  config: Config name (e.g., "1.0.0-dev1")
114
  split: Split name (e.g., "validation" or "test")
115
- agent_filter: Which submissions to include.
116
- ``"openhands"`` (default) loads only the default OpenHands
117
- agent runs from ``results/{model}/`` — the canonical
118
- leaderboard. ``"alternative"`` loads only third-party
119
- harnesses (Claude Code / Codex / Gemini CLI / OpenHands
120
- Sub-agents) from ``alternative_agents/{type}/{model}/``,
121
- which power the standalone Alternative Agents page.
122
- The two are kept on separate pages because their
123
- cost/runtime numbers aren't apples-to-apples and mixing
124
- them in one ranking would be misleading.
125
  """
126
- if agent_filter not in (self.AGENT_FILTER_OPENHANDS, self.AGENT_FILTER_ALTERNATIVE):
127
- raise ValueError(
128
- f"agent_filter must be one of "
129
- f"{{{self.AGENT_FILTER_OPENHANDS!r}, {self.AGENT_FILTER_ALTERNATIVE!r}}}, "
130
- f"got {agent_filter!r}"
131
- )
132
  self.data_dir = Path(data_dir)
133
  self.config = config
134
  self.split = split
135
- self.agent_filter = agent_filter
136
  self.config_path = self.data_dir / config
137
 
138
  # Benchmark to category mappings (single source of truth)
@@ -153,115 +127,55 @@ class SimpleLeaderboardViewer:
153
  if benchmark not in self.tag_map[category]:
154
  self.tag_map[category].append(benchmark)
155
 
156
- # Default agent_name when metadata.json doesn't carry one. Matches the
157
- # default-agent value used by push_to_index_from_archive.py so legacy
158
- # entries (which omit the field) still group cleanly with new entries.
159
- DEFAULT_AGENT_NAME = "OpenHands"
160
-
161
- def _records_from_agent_dir(self, agent_dir: Path, default_agent_name: str | None = None) -> tuple[list[dict], list[str]]:
162
- """Build per-benchmark records from a single agent directory.
163
-
164
- Shared by ``_load_from_agent_dirs`` (default OpenHands results) and
165
- ``_load_from_alternative_agents_dirs`` (acp-claude / acp-codex / etc.).
166
- Returns ``(records, validation_errors)``. Returns an empty list of
167
- records when the directory has no scores or is hidden from the
168
- leaderboard.
169
- """
170
- records: list[dict] = []
171
- metadata, scores, errors = load_and_validate_agent_data(agent_dir)
172
-
173
- if metadata is None or scores is None:
174
- return records, errors
175
-
176
- if metadata.get('hide_from_leaderboard', False):
177
- logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
178
- return records, errors
179
-
180
- # Resolve the agent display name. Prefer the value stamped into
181
- # metadata.json by push-to-index; fall back to the directory's
182
- # default (e.g. "Claude Code" for acp-claude/) and finally to
183
- # "OpenHands" for legacy results/ entries that predate the field.
184
- agent_name = (
185
- metadata.get('agent_name')
186
- or default_agent_name
187
- or self.DEFAULT_AGENT_NAME
188
- )
189
-
190
- for score_entry in scores:
191
- record = {
192
- 'agent_name': agent_name,
193
- 'agent_version': metadata.get('agent_version', 'Unknown'),
194
- 'llm_base': metadata.get('model', 'unknown'),
195
- 'openness': metadata.get('openness', 'unknown'),
196
- 'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
197
- 'release_date': metadata.get('release_date', ''),
198
- 'parameter_count_b': metadata.get('parameter_count_b'),
199
- 'active_parameter_count_b': metadata.get('active_parameter_count_b'),
200
- 'score': score_entry.get('score'),
201
- 'metric': score_entry.get('metric', 'unknown'),
202
- 'cost_per_instance': score_entry.get('cost_per_instance'),
203
- 'average_runtime': score_entry.get('average_runtime'),
204
- 'tags': [score_entry.get('benchmark')],
205
- 'full_archive': score_entry.get('full_archive', ''),
206
- 'eval_visualization_page': score_entry.get('eval_visualization_page', ''),
207
- }
208
- records.append(record)
209
- return records, errors
210
-
211
  def _load_from_agent_dirs(self):
212
- """Load agent records based on ``self.agent_filter``.
213
-
214
- - ``"openhands"`` (default): only ``{config}/results/{model}/``,
215
- which is the canonical OpenHands leaderboard. The Home page and
216
- the per-category subpages use this.
217
- - ``"alternative"``: only
218
- ``{config}/alternative_agents/{type}/{model}/`` (acp-claude,
219
- acp-codex, acp-gemini, openhands_subagents, ...). The dedicated
220
- Alternative Agents page uses this.
221
-
222
- Returns ``None`` if no records were found (which makes the caller
223
- render an empty-state placeholder).
224
- """
225
  all_records = []
226
  all_validation_errors = []
227
-
228
- if self.agent_filter == self.AGENT_FILTER_OPENHANDS:
229
- # Default OpenHands agent results
230
- results_dir = self.config_path / "results"
231
- if results_dir.exists():
232
- for agent_dir in results_dir.iterdir():
233
- if not agent_dir.is_dir():
234
- continue
235
- records, errors = self._records_from_agent_dir(agent_dir)
236
- all_records.extend(records)
237
- all_validation_errors.extend(errors)
238
- else:
239
- # Alternative agents (one subdirectory per agent_type, then per model)
240
- # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE
241
- # map in OpenHands/evaluation push_to_index_from_archive.py — keeping
242
- # it in sync ensures rows are labelled the same way the index repo
243
- # records them.
244
- agent_type_default_name = {
245
- 'acp-claude': 'Claude Code',
246
- 'acp-codex': 'Codex',
247
- 'acp-gemini': 'Gemini CLI',
248
- 'openhands_subagents': 'OpenHands Sub-agents',
249
- }
250
- alt_dir = self.config_path / "alternative_agents"
251
- if alt_dir.exists():
252
- for type_dir in alt_dir.iterdir():
253
- if not type_dir.is_dir():
254
- continue
255
- default_name = agent_type_default_name.get(type_dir.name)
256
- for agent_dir in type_dir.iterdir():
257
- if not agent_dir.is_dir():
258
- continue
259
- records, errors = self._records_from_agent_dir(
260
- agent_dir, default_agent_name=default_name
261
- )
262
- all_records.extend(records)
263
- all_validation_errors.extend(errors)
264
-
 
 
265
  # Log validation errors if any
266
  if all_validation_errors:
267
  logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
@@ -269,10 +183,10 @@ class SimpleLeaderboardViewer:
269
  logger.warning(f" - {error}")
270
  if len(all_validation_errors) > 5:
271
  logger.warning(f" ... and {len(all_validation_errors) - 5} more")
272
-
273
  if not all_records:
274
- return None # Caller will render empty-state placeholder
275
-
276
  return pd.DataFrame(all_records)
277
 
278
  def _load(self):
@@ -292,36 +206,26 @@ class SimpleLeaderboardViewer:
292
  # Group by agent (version + model combination) to aggregate results across datasets
293
  transformed_records = []
294
 
295
- # Create a unique identifier per (agent_name, agent_version, model)
296
- # tuple. Including agent_name keeps an OpenHands run and a Claude
297
- # Code run on the same SDK version + model from collapsing into
298
- # one row when both submit to the leaderboard.
299
- df['agent_name'] = df['agent_name'].fillna(self.DEFAULT_AGENT_NAME)
300
- df['agent_id'] = (
301
- df['agent_name'].astype(str)
302
- + '_' + df['agent_version'].astype(str)
303
- + '_' + df['llm_base'].astype(str)
304
- )
305
-
306
  for agent_id in df['agent_id'].unique():
307
  agent_records = df[df['agent_id'] == agent_id]
308
-
309
  # Build a single record for this agent
310
  first_record = agent_records.iloc[0]
311
  agent_version = first_record['agent_version']
312
- agent_name = first_record['agent_name']
313
-
314
  # Normalize openness to "open" or "closed"
315
  from aliases import OPENNESS_MAPPING
316
  raw_openness = first_record['openness']
317
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
318
-
319
  # All 5 categories for the leaderboard
320
  ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
321
-
322
  record = {
323
  # Core agent info - use final display names
324
- 'agent_name': agent_name, # Will become "Agent"
325
  'SDK version': agent_version, # Will become "SDK Version"
326
  'Language model': first_record['llm_base'], # Will become "Language Model"
327
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
@@ -331,7 +235,7 @@ class SimpleLeaderboardViewer:
331
  'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
332
  'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
333
  # Additional columns expected by the transformer
334
- # Use agent_id (name_version_model) as unique identifier for Pareto frontier calculation
335
  'id': agent_id,
336
  'source': first_record.get('source', ''), # Will become "Source"
337
  'logs': first_record.get('logs', ''), # Will become "Logs"
 
96
 
97
  class SimpleLeaderboardViewer:
98
  """Simple replacement for agent-eval's LeaderboardViewer."""
99
+
100
+ def __init__(self, data_dir: str, config: str, split: str):
 
 
 
 
 
 
 
 
 
101
  """
102
  Args:
103
  data_dir: Path to data directory
104
  config: Config name (e.g., "1.0.0-dev1")
105
  split: Split name (e.g., "validation" or "test")
 
 
 
 
 
 
 
 
 
 
106
  """
 
 
 
 
 
 
107
  self.data_dir = Path(data_dir)
108
  self.config = config
109
  self.split = split
 
110
  self.config_path = self.data_dir / config
111
 
112
  # Benchmark to category mappings (single source of truth)
 
127
  if benchmark not in self.tag_map[category]:
128
  self.tag_map[category].append(benchmark)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def _load_from_agent_dirs(self):
131
+ """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
132
+ results_dir = self.config_path / "results"
133
+
134
+ if not results_dir.exists():
135
+ return None # Fall back to old format
136
+
 
 
 
 
 
 
 
137
  all_records = []
138
  all_validation_errors = []
139
+
140
+ # Iterate through each agent directory
141
+ for agent_dir in results_dir.iterdir():
142
+ if not agent_dir.is_dir():
143
+ continue
144
+
145
+ # Load and validate using pydantic models
146
+ metadata, scores, errors = load_and_validate_agent_data(agent_dir)
147
+
148
+ if errors:
149
+ all_validation_errors.extend(errors)
150
+
151
+ if metadata is None or scores is None:
152
+ continue
153
+
154
+ # Skip entries that are hidden from the leaderboard
155
+ if metadata.get('hide_from_leaderboard', False):
156
+ logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
157
+ continue
158
+
159
+ # Create one record per benchmark (mimicking old JSONL format)
160
+ for score_entry in scores:
161
+ record = {
162
+ 'agent_version': metadata.get('agent_version', 'Unknown'),
163
+ 'llm_base': metadata.get('model', 'unknown'),
164
+ 'openness': metadata.get('openness', 'unknown'),
165
+ 'submission_time': metadata.get('submission_time', ''),
166
+ 'release_date': metadata.get('release_date', ''), # Model release date
167
+ 'parameter_count_b': metadata.get('parameter_count_b'), # Total params in billions
168
+ 'active_parameter_count_b': metadata.get('active_parameter_count_b'), # Active params for MoE
169
+ 'score': score_entry.get('score'),
170
+ 'metric': score_entry.get('metric', 'unknown'),
171
+ 'cost_per_instance': score_entry.get('cost_per_instance'),
172
+ 'average_runtime': score_entry.get('average_runtime'),
173
+ 'tags': [score_entry.get('benchmark')],
174
+ 'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
175
+ 'eval_visualization_page': score_entry.get('eval_visualization_page', ''), # Laminar visualization URL
176
+ }
177
+ all_records.append(record)
178
+
179
  # Log validation errors if any
180
  if all_validation_errors:
181
  logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
 
183
  logger.warning(f" - {error}")
184
  if len(all_validation_errors) > 5:
185
  logger.warning(f" ... and {len(all_validation_errors) - 5} more")
186
+
187
  if not all_records:
188
+ return None # Fall back to old format
189
+
190
  return pd.DataFrame(all_records)
191
 
192
  def _load(self):
 
206
  # Group by agent (version + model combination) to aggregate results across datasets
207
  transformed_records = []
208
 
209
+ # Create a unique identifier for each agent (version + model)
210
+ df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
211
+
 
 
 
 
 
 
 
 
212
  for agent_id in df['agent_id'].unique():
213
  agent_records = df[df['agent_id'] == agent_id]
214
+
215
  # Build a single record for this agent
216
  first_record = agent_records.iloc[0]
217
  agent_version = first_record['agent_version']
218
+
 
219
  # Normalize openness to "open" or "closed"
220
  from aliases import OPENNESS_MAPPING
221
  raw_openness = first_record['openness']
222
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
223
+
224
  # All 5 categories for the leaderboard
225
  ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
226
+
227
  record = {
228
  # Core agent info - use final display names
 
229
  'SDK version': agent_version, # Will become "SDK Version"
230
  'Language model': first_record['llm_base'], # Will become "Language Model"
231
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
 
235
  'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
236
  'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
237
  # Additional columns expected by the transformer
238
+ # Use agent_id (version_model) as unique identifier for Pareto frontier calculation
239
  'id': agent_id,
240
  'source': first_record.get('source', ''), # Will become "Source"
241
  'logs': first_record.get('logs', ''), # Will become "Logs"
tests/test_runtime_sorting.py DELETED
@@ -1,40 +0,0 @@
1
- import pandas as pd
2
-
3
- from leaderboard_transformer import format_runtime_column
4
-
5
-
6
- def test_runtime_strings_sort_numerically_in_ascending_order():
7
- df = pd.DataFrame(
8
- {
9
- "Average Score": [0.8, 0.8, 0.8, 0.8, None],
10
- "Average Runtime": [1323.0, 372.0, 410.0, None, None],
11
- }
12
- )
13
-
14
- formatted = format_runtime_column(df.copy(), "Average Runtime")
15
- runtimes = formatted["Average Runtime"].tolist()
16
-
17
- assert sorted(runtimes) == [
18
- runtimes[1],
19
- runtimes[2],
20
- runtimes[0],
21
- runtimes[3],
22
- runtimes[4],
23
- ]
24
-
25
-
26
- def test_runtime_formatting_preserves_visible_labels():
27
- df = pd.DataFrame(
28
- {
29
- "Average Score": [0.8, 0.8, None],
30
- "Average Runtime": [45.2, None, None],
31
- }
32
- )
33
-
34
- formatted = format_runtime_column(df.copy(), "Average Runtime")
35
- values = formatted["Average Runtime"].tolist()
36
-
37
- assert values[0].endswith("45s")
38
- assert values[1].endswith("Missing</span>")
39
- assert values[2].endswith("Not Submitted</span>")
40
- assert 'display:none' in values[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui_components.py CHANGED
@@ -508,36 +508,28 @@ class DummyViewer:
508
  # The _load method returns the error DataFrame and an empty tag map
509
  return self._error_df, {}
510
 
511
- def get_leaderboard_viewer_instance(
512
- split: str,
513
- agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
514
- ):
515
  """
516
- Fetches the LeaderboardViewer for a (split, agent_filter) pair, using a
517
- thread-safe cache to avoid re-downloading data. The cache is keyed on
518
- both axes so the OpenHands and Alternative Agents pages don't fight
519
- over a single slot. On error, returns a stable DummyViewer.
520
  """
521
  global CACHED_VIEWERS, CACHED_TAG_MAPS
522
 
523
- cache_key = (split, agent_filter)
524
-
525
  with _cache_lock:
526
- if cache_key in CACHED_VIEWERS:
527
  # Cache hit: return the cached viewer and tag map
528
- return CACHED_VIEWERS[cache_key], CACHED_TAG_MAPS.get(cache_key, {"Overall": []})
529
 
530
  # --- Cache miss: try to load data from the source ---
531
  try:
532
  # First try to load from extracted data directory (local mock data)
533
  data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
534
-
535
- print(f"Loading data for split '{split}' (agent_filter={agent_filter}) from: {data_dir}/{CONFIG_NAME}")
536
  viewer = SimpleLeaderboardViewer(
537
  data_dir=data_dir,
538
  config=CONFIG_NAME,
539
- split=split,
540
- agent_filter=agent_filter,
541
  )
542
 
543
  # Simplify tag map creation
@@ -545,14 +537,14 @@ def get_leaderboard_viewer_instance(
545
 
546
  # Cache the results for next time (thread-safe)
547
  with _cache_lock:
548
- CACHED_VIEWERS[cache_key] = viewer
549
- CACHED_TAG_MAPS[cache_key] = pretty_tag_map # Cache the pretty map directly
550
 
551
  return viewer, pretty_tag_map
552
 
553
  except Exception as e:
554
  # On ANY error, create a consistent error message and cache a DummyViewer
555
- error_message = f"Error loading data for split '{split}' (agent_filter={agent_filter}): {e}"
556
  print(format_error(error_message))
557
 
558
  dummy_df = pd.DataFrame({"Message": [error_message]})
@@ -561,8 +553,8 @@ def get_leaderboard_viewer_instance(
561
 
562
  # Cache the dummy objects so we don't try to fetch again on this run
563
  with _cache_lock:
564
- CACHED_VIEWERS[cache_key] = dummy_viewer
565
- CACHED_TAG_MAPS[cache_key] = dummy_tag_map
566
 
567
  return dummy_viewer, dummy_tag_map
568
 
@@ -1040,8 +1032,8 @@ def create_leaderboard_display(
1040
  outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
1041
  )
1042
 
1043
- # Return the filter controls so they can be used to update other sections
1044
- return show_incomplete_checkbox, show_open_only_checkbox, mark_by_dropdown
1045
 
1046
  # # --- Detailed Benchmark Display ---
1047
  def create_benchmark_details_display(
@@ -1276,17 +1268,12 @@ def create_benchmark_details_display(
1276
  legend_markdown = create_legend_markdown(benchmark_name)
1277
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
1278
 
1279
- def get_full_leaderboard_data(
1280
- split: str,
1281
- agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
1282
- ) -> tuple[pd.DataFrame, dict]:
1283
  """
1284
- Loads and transforms the complete dataset for a (split, agent_filter)
1285
- pair. ``agent_filter`` defaults to ``"openhands"`` so existing pages
1286
- that don't pass it stay on the canonical leaderboard. The Alternative
1287
- Agents page passes ``"alternative"`` to get the third-party harnesses.
1288
  """
1289
- viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split, agent_filter=agent_filter)
1290
 
1291
  if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
1292
  raw_df, _ = viewer_or_data._load()
 
508
  # The _load method returns the error DataFrame and an empty tag map
509
  return self._error_df, {}
510
 
511
+ def get_leaderboard_viewer_instance(split: str):
 
 
 
512
  """
513
+ Fetches the LeaderboardViewer for a split, using a thread-safe cache to avoid
514
+ re-downloading data. On error, returns a stable DummyViewer object.
 
 
515
  """
516
  global CACHED_VIEWERS, CACHED_TAG_MAPS
517
 
 
 
518
  with _cache_lock:
519
+ if split in CACHED_VIEWERS:
520
  # Cache hit: return the cached viewer and tag map
521
+ return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
522
 
523
  # --- Cache miss: try to load data from the source ---
524
  try:
525
  # First try to load from extracted data directory (local mock data)
526
  data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
527
+
528
+ print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
529
  viewer = SimpleLeaderboardViewer(
530
  data_dir=data_dir,
531
  config=CONFIG_NAME,
532
+ split=split
 
533
  )
534
 
535
  # Simplify tag map creation
 
537
 
538
  # Cache the results for next time (thread-safe)
539
  with _cache_lock:
540
+ CACHED_VIEWERS[split] = viewer
541
+ CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
542
 
543
  return viewer, pretty_tag_map
544
 
545
  except Exception as e:
546
  # On ANY error, create a consistent error message and cache a DummyViewer
547
+ error_message = f"Error loading data for split '{split}': {e}"
548
  print(format_error(error_message))
549
 
550
  dummy_df = pd.DataFrame({"Message": [error_message]})
 
553
 
554
  # Cache the dummy objects so we don't try to fetch again on this run
555
  with _cache_lock:
556
+ CACHED_VIEWERS[split] = dummy_viewer
557
+ CACHED_TAG_MAPS[split] = dummy_tag_map
558
 
559
  return dummy_viewer, dummy_tag_map
560
 
 
1032
  outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
1033
  )
1034
 
1035
+ # Return the show_open_only_checkbox and mark_by_dropdown so they can be used to update other sections
1036
+ return show_open_only_checkbox, mark_by_dropdown
1037
 
1038
  # # --- Detailed Benchmark Display ---
1039
  def create_benchmark_details_display(
 
1268
  legend_markdown = create_legend_markdown(benchmark_name)
1269
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
1270
 
1271
+ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
 
 
 
1272
  """
1273
+ Loads and transforms the complete dataset for a given split.
1274
+ This function handles caching and returns the final "pretty" DataFrame and tag map.
 
 
1275
  """
1276
+ viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
1277
 
1278
  if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
1279
  raw_df, _ = viewer_or_data._load()