Spaces:

OpenHands
/

openhands-index

Running

App Files Community

Empty PR for testing

by gneubig - opened Feb 11

base: refs/heads/main

←

from: refs/pr/9

Discussion Files changed

+184

-737

Files changed (15) hide show

.gitattributes +0 -1
alternative_agents_page.py +0 -103
app.py +4 -31
assets/harnesses/README.md +0 -59
assets/harnesses/claude-code.svg +0 -1
assets/harnesses/codex-cli.svg +0 -1
assets/harnesses/gemini-cli.svg +0 -1
assets/harnesses/openhands.svg +0 -1
docs/screenshots/alternative-agents.png +0 -3
leaderboard_transformer.py +56 -211
main_page.py +41 -81
setup_data.py +5 -17
simple_data_loader.py +59 -155
tests/test_runtime_sorting.py +0 -40
ui_components.py +19 -32

.gitattributes CHANGED Viewed

	@@ -1,2 +1 @@
1
2	- docs/screenshots/alternative-agents.png filter=lfs diff=lfs merge=lfs -text


1

alternative_agents_page.py DELETED Viewed

@@ -1,103 +0,0 @@
-"""Alternative Agents leaderboard page.
-The canonical OpenHands Index leaderboard (Home + the per-category pages)
-ranks default OpenHands agent runs from ``results/{model}/`` in the
-openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
-Gemini CLI, OpenHands Sub-agents, ...) live under
-``alternative_agents/{type}/{model}/`` and aren't directly comparable to
-default OpenHands runs (different scaffolds, different cost/runtime
-characteristics), so they get their own standalone page instead of being
-mixed into the same ranking.
-This page is intentionally a single Overall view (no per-category
-subpages) — the alternative-agents dataset is small (one row per
-harness × model) and the goal is "show me all the alternatives at a
-glance", not "drill into Issue Resolution for Codex".
-To make same-model comparisons easier, the page also appends canonical
-OpenHands rows for any language model that appears in the alternative
-agent dataset. The match is exact, so ``Gemini-3-Pro`` and
-``Gemini-3.1-Pro`` remain distinct entries.
-"""
-import matplotlib
-matplotlib.use('Agg')
-import pandas as pd
-import gradio as gr
-from simple_data_loader import SimpleLeaderboardViewer
-from ui_components import (
-    create_leaderboard_display,
-    get_full_leaderboard_data,
-)
-ALTERNATIVE_AGENTS_INTRO = """
-<div id="alternative-agents-intro">
-  <h2>Alternative Agents</h2>
-  <p>
-    Third-party agent harnesses running the OpenHands Index benchmarks.
-    To make direct comparisons easier, this page also includes the
-    canonical OpenHands row whenever the exact same language model appears
-    under an alternative harness. Cost and runtime numbers still come from
-    each harness's own instrumentation and aren't directly comparable
-    across harnesses.
-  </p>
-</div>
-"""
-def _append_openhands_shared_models(
-    alternative_df: pd.DataFrame,
-    split: str,
-) -> pd.DataFrame:
-    if alternative_df.empty or "Language Model" not in alternative_df.columns:
-        return alternative_df
-    openhands_df, _ = get_full_leaderboard_data(
-        split,
-        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
-    )
-    if openhands_df.empty or "Language Model" not in openhands_df.columns:
-        return alternative_df
-    alternative_models = set(
-        alternative_df["Language Model"].dropna().astype(str).str.strip()
-    )
-    if not alternative_models:
-        return alternative_df
-    openhands_shared_df = openhands_df[
-        openhands_df["Language Model"].astype(str).str.strip().isin(alternative_models)
-    ].copy()
-    if openhands_shared_df.empty:
-        return alternative_df
-    return pd.concat([alternative_df, openhands_shared_df], ignore_index=True, sort=False)
-def build_page():
-    gr.HTML(ALTERNATIVE_AGENTS_INTRO)
-    gr.Markdown("---")
-    test_df, test_tag_map = get_full_leaderboard_data(
-        "test",
-        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
-    )
-    if test_df.empty:
-        gr.Markdown(
-            "No alternative agent submissions yet. New runs land in "
-            "`alternative_agents/{type}/{model}/` in "
-            "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
-        )
-        return
-    test_df = _append_openhands_shared_models(test_df, split="test")
-    create_leaderboard_display(
-        full_df=test_df,
-        tag_map=test_tag_map,
-        category_name="Overall",
-        split_name="test",
-    )

app.py CHANGED Viewed

@@ -35,7 +35,6 @@ from app_creation import build_page as build_app_creation_page
 from frontend_development import build_page as build_frontend_page
 from test_generation import build_page as build_test_generation_page
 from information_gathering import build_page as build_information_gathering_page
-from alternative_agents_page import build_page as build_alternative_agents_page
 from about import build_page as build_about_page
 logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
@@ -374,46 +373,20 @@ with demo.route("Testing", "/testing"):
 with demo.route("Information Gathering", "/information-gathering"):
     build_information_gathering_page()
-with demo.route("Alternative Agents", "/alternative-agents"):
-    build_alternative_agents_page()
 with demo.route("About", "/about"):
     build_about_page()
 logger.info("All routes configured")
 # Mount the REST API on /api
-from fastapi import FastAPI, Request
-from fastapi.responses import RedirectResponse
-from starlette.middleware.base import BaseHTTPMiddleware
 from api import api_app
-class RootRedirectMiddleware(BaseHTTPMiddleware):
-    """Middleware to redirect root path "/" to "/home".
-    This fixes the 307 trailing slash redirect issue (Gradio bug #11071) that
-    occurs when Gradio is mounted at "/" - FastAPI's default behavior redirects
-    "/" to "//", which breaks routing on HuggingFace Spaces.
-    See: https://github.com/gradio-app/gradio/issues/11071
-    """
-    async def dispatch(self, request: Request, call_next):
-        if request.url.path == "/":
-            return RedirectResponse(url="/home", status_code=302)
-        return await call_next(request)
-# Create a parent FastAPI app with redirect_slashes=False to prevent
-# automatic trailing slash redirects that cause issues with Gradio
-root_app = FastAPI(redirect_slashes=False)
-# Add middleware to handle root path redirect to /home
-root_app.add_middleware(RootRedirectMiddleware)
 root_app.mount("/api", api_app)
-# Mount Gradio app at root path
 app = gr.mount_gradio_app(root_app, demo, path="/")
 logger.info("REST API mounted at /api, Gradio app mounted at /")

 from frontend_development import build_page as build_frontend_page
 from test_generation import build_page as build_test_generation_page
 from information_gathering import build_page as build_information_gathering_page
 from about import build_page as build_about_page
 logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
 with demo.route("Information Gathering", "/information-gathering"):
     build_information_gathering_page()
 with demo.route("About", "/about"):
     build_about_page()
 logger.info("All routes configured")
 # Mount the REST API on /api
+from fastapi import FastAPI
 from api import api_app
+# Create a parent FastAPI app that will host both the API and Gradio
+root_app = FastAPI()
 root_app.mount("/api", api_app)
+# Mount Gradio app - root redirect is handled by the proxy
 app = gr.mount_gradio_app(root_app, demo, path="/")
 logger.info("REST API mounted at /api, Gradio app mounted at /")

assets/harnesses/README.md DELETED Viewed

@@ -1,59 +0,0 @@
-# Agent harness logos
-This folder holds the **bottom half** of the composite scatter markers used
-on the [Alternative Agents](../../alternative_agents_page.py) page. Each
-point on that scatter stacks two logos: the model provider on top (from
-`assets/logo-*.svg`) and the harness on the bottom (from this folder).
-## Expected filenames
-The scatter code looks up a logo by the exact `agent_name` string that the
-`push-to-index` workflow writes into the index repo's `metadata.json`, then
-maps it through `HARNESS_LOGO_STEMS` in `leaderboard_transformer.py`. Keep
-these filenames in sync with that map.
-| `agent_name` (in index repo) | File in this folder |
-| --- | --- |
-| `Claude Code`          | `claude-code.svg`  or `claude-code.png` |
-| `Codex`                | `codex-cli.svg`    or `codex-cli.png`   |
-| `Gemini CLI`           | `gemini-cli.svg`   or `gemini-cli.png`  |
-| `OpenHands`            | `openhands.svg`    or `openhands.png`   |
-| `OpenHands Sub-agents` | `openhands.svg`    or `openhands.png`   (shared with `OpenHands`) |
-Both `.svg` and `.png` are accepted — the resolver tries `.svg` first, then
-`.png`. **Prefer SVG when possible**: the HuggingFace Space rejects new
-binary files on plain `git push` and routes PNGs through Xet, so an SVG is
-one less thing to set up.
-## When a file is missing
-The scatter falls back to a single marker (just the model provider logo) —
-exactly the same rendering path the canonical OpenHands pages use. Nothing
-crashes and nothing prints a warning in normal operation. This means you
-can roll out logos one harness at a time without waiting for all four.
-## Sizing and shape
-- Square canvas. The composite marker is drawn at a fixed aspect ratio, so
-  a non-square logo will get squished.
-- Any SVG `viewBox` works — the renderer base64-encodes the file as-is and
-  Plotly scales it to the marker's `sizex` / `sizey`. Around `80×80` to
-  `256×256` is a good source size.
-- Leave some internal padding (≈10%) so the logo doesn't touch the marker
-  edge when two are stacked.
-- No background is required, but a rounded-square coloured tile reads well
-  at small sizes because it gives each harness a distinct silhouette even
-  when the inner glyph isn't fully legible. Look at the existing
-  `assets/logo-*.svg` files for the canonical model provider logos if you
-  want a visual reference for sizing.
-## Adding a new harness
-1. Decide on the exact `agent_name` that the push-to-index workflow writes
-   for the new harness (see `AGENT_NAME_BY_TYPE` in
-   `OpenHands/evaluation/push-to-index-job/scripts/push_to_index_from_archive.py`).
-2. Add an entry to `HARNESS_LOGO_STEMS` in
-   [`leaderboard_transformer.py`](../../leaderboard_transformer.py) that
-   maps the display name to a stem.
-3. Drop `{stem}.svg` (or `.png`) into this folder.
-4. Reload the app and look at `/alternative-agents`.

assets/harnesses/claude-code.svg DELETED Viewed

assets/harnesses/codex-cli.svg DELETED Viewed

assets/harnesses/gemini-cli.svg DELETED Viewed

assets/harnesses/openhands.svg DELETED Viewed

docs/screenshots/alternative-agents.png DELETED Viewed

Git LFS Details

SHA256: 99766c7d2c11a6f90f24a5f0effbae74a8aa33096b89ff1c4fcfb238fe06a2f5
Pointer size: 131 Bytes
Size of remote file: 104 kB

leaderboard_transformer.py CHANGED Viewed

@@ -228,17 +228,17 @@ def get_country_from_model(model_name: str) -> dict:
 def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
     """
     Gets the appropriate icon based on the mark_by selection.
     Args:
         model_name: The model name
         openness: The openness value (open/closed)
         mark_by: One of "Company", "Openness", or "Country"
     Returns:
         dict with 'path' and 'name' keys
     """
     from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
     if mark_by == MARK_BY_OPENNESS:
         return get_openness_icon(openness)
     elif mark_by == MARK_BY_COUNTRY:
@@ -247,59 +247,6 @@ def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
         return get_company_from_model(model_name)
-# Map the agent_name stored in the index repo's metadata.json to a file stem
-# inside assets/harnesses/. Kept in sync with AGENT_NAME_BY_TYPE in
-# OpenHands/evaluation push_to_index_from_archive.py — if a new ACP harness
-# lands there, add the corresponding display name and a matching stem here.
-#
-# The scatter plot looks for {stem}.svg first, then {stem}.png in
-# assets/harnesses/. This repo intentionally ships only a README in that
-# folder: drop the logo files in by hand (SVG preferred, PNG works too via
-# HF Xet) and they'll be picked up on the next app restart. If the file is
-# missing, get_harness_icon() returns None and the scatter falls back to the
-# single-marker path — same rendering the canonical OpenHands pages use —
-# so logos can be added one harness at a time without breaking anything.
-HARNESS_LOGO_STEMS: dict[str, str] = {
-    "Claude Code":          "claude-code",
-    "Codex":                "codex-cli",
-    "Gemini CLI":           "gemini-cli",
-    "OpenHands":            "openhands",
-    "OpenHands Sub-agents": "openhands",
-}
-HARNESS_LOGO_DIR = "assets/harnesses"
-HARNESS_LOGO_EXTENSIONS = ("svg", "png")
-def get_harness_icon(agent_name: Optional[str]) -> Optional[dict]:
-    """Return {'path', 'name'} for the harness logo, or None if not usable.
-    Consumed by the Alternative Agents scatter plot to draw a composite
-    marker (model provider on top, harness on bottom). Returns None in any
-    of three cases, all of which make the caller skip the harness layer:
-    - ``agent_name`` is empty or missing from the dataframe row.
-    - ``agent_name`` isn't in ``HARNESS_LOGO_STEMS`` (new harness that
-      hasn't been registered yet — register it and drop in a logo).
-    - The logo file for that stem doesn't exist in ``assets/harnesses/``
-      yet (the repo ships only the README).
-    That third case is the important one: it lets the Alternative Agents
-    page work immediately after checkout even when the harness logo files
-    haven't been dropped in. The corresponding points just render like a
-    canonical-page marker (model logo only) until the file is added.
-    """
-    if not agent_name:
-        return None
-    stem = HARNESS_LOGO_STEMS.get(str(agent_name).strip())
-    if stem is None:
-        return None
-    for ext in HARNESS_LOGO_EXTENSIONS:
-        path = f"{HARNESS_LOGO_DIR}/{stem}.{ext}"
-        if os.path.exists(path):
-            return {"path": path, "name": agent_name}
-    return None
 # Standard layout configuration for all charts
 STANDARD_LAYOUT = dict(
     template="plotly_white",
@@ -708,7 +655,6 @@ def _pretty_column_name(raw_col: str) -> str:
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
-        'agent_name': 'Agent',
         'SDK version': 'SDK Version',
         'Openhands version': 'SDK Version',  # Legacy support
         'Language model': 'Language Model',
@@ -869,21 +815,7 @@ class DataTransformer:
         df_view = df_sorted.copy()
         # --- 3. Add Columns for Agent Openness ---
-        # Only include the "Agent" column when the dataframe actually has
-        # more than one distinct agent. On the canonical OpenHands pages
-        # every row says "OpenHands", so adding the column is just noise;
-        # on the Alternative Agents page rows differ (Claude Code / Codex
-        # / Gemini CLI / OpenHands Sub-agents), so the column carries
-        # signal and disambiguates same-model rows from different
-        # harnesses.
-        has_mixed_agents = (
-            "Agent" in df_view.columns
-            and df_view["Agent"].dropna().nunique() > 1
-        )
-        if has_mixed_agents:
-            base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
-        else:
-            base_cols = ["id", "Language Model", "SDK Version", "Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs", "Visualization"]
@@ -1086,18 +1018,13 @@ def _plot_scatter_plotly(
         """
         Builds the complete HTML string for the plot's hover tooltip.
         Format: {lm_name} (SDK {version})
-                Harness: {agent}        (only when the row carries an Agent —
-                                         Alternative Agents page only; the
-                                         canonical OpenHands pages drop the
-                                         Agent column in view() so this line
-                                         is skipped there)
                 Average Score: {score}
                 Average Cost/Runtime: {value}
                 Openness: {openness}
         """
         h_pad = "   "
         parts = ["<br>"]
         # Get and clean the language model name
         llm_base_value = row.get('Language Model', '')
         llm_base_value = clean_llm_base_list(llm_base_value)
@@ -1105,21 +1032,13 @@ def _plot_scatter_plotly(
             lm_name = llm_base_value[0]
         else:
             lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
         # Get SDK version
         sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
         # Title line: {lm_name} (SDK {version})
         parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
-        # Harness line — only on pages where the Agent column is present
-        # (Alternative Agents). Without this, two rows for the same LM run
-        # under different harnesses (e.g. Claude Code vs OpenHands Sub-agents
-        # on claude-sonnet-4-5) are indistinguishable on hover.
-        agent_value = row.get('Agent')
-        if agent_value is not None and pd.notna(agent_value) and str(agent_value).strip():
-            parts.append(f"{h_pad}Harness: <b>{agent_value}</b>{h_pad}<br>")
         # Average Score
         parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
@@ -1192,116 +1111,51 @@ def _plot_scatter_plotly(
     y_min = min_score - 5 if min_score > 5 else 0
     y_max = max_score + 5
-    # Cache base64-encoded logos across rows — every Claude model on the
-    # Alternative Agents page points at the same assets/harnesses/claude-code.svg,
-    # so encoding once per path is ~N× cheaper than once per point.
-    _logo_cache: dict[str, str] = {}
-    def _encode_logo(path: str) -> Optional[str]:
-        if path in _logo_cache:
-            return _logo_cache[path]
-        if not os.path.exists(path):
-            return None
-        try:
-            with open(path, "rb") as f:
-                encoded = base64.b64encode(f.read()).decode("utf-8")
-        except Exception as e:
-            logger.warning(f"Could not load logo {path}: {e}")
-            return None
-        mime = "svg+xml" if path.lower().endswith(".svg") else "png"
-        uri = f"data:image/{mime};base64,{encoded}"
-        _logo_cache[path] = uri
-        return uri
-    # Composite markers: on the Alternative Agents page the dataframe carries
-    # an "Agent" column (Claude Code / Codex / Gemini CLI / OpenHands Sub-agents),
-    # so a point for claude-sonnet-4-5 under Claude Code and under OpenHands
-    # Sub-agents would otherwise share the exact same Anthropic logo marker
-    # and be visually indistinguishable. When Agent is present, we stack
-    # two logos at each point: model provider on top, harness on the bottom.
-    # Canonical OpenHands pages drop the Agent column in view() (via the
-    # has_mixed_agents check), so they fall through to the single-logo path
-    # and render exactly as before.
-    has_harness_column = (
-        "Agent" in data_plot.columns
-        and data_plot["Agent"].dropna().astype(str).str.strip().ne("").any()
-    )
-    # Marker sizes. The composite variant fits two logos inside roughly the
-    # same vertical footprint as a single marker, so each half is slightly
-    # smaller and the two halves are offset symmetrically around the point's
-    # true y-coordinate.
-    SINGLE_SIZE_X, SINGLE_SIZE_Y = 0.04, 0.06
-    STACKED_SIZE_X, STACKED_SIZE_Y = 0.035, 0.048
-    STACKED_Y_OFFSET = 0.028  # half-separation between model (top) and harness (bottom)
     for _, row in data_plot.iterrows():
         model_name = row.get('Language Model', '')
         openness = row.get('Openness', '')
         marker_info = get_marker_icon(model_name, openness, mark_by)
-        model_logo_uri = _encode_logo(marker_info['path'])
-        if model_logo_uri is None:
-            continue
-        # Harness (only meaningful when the dataframe carries an Agent column).
-        harness_uri = None
-        if has_harness_column:
-            harness_info = get_harness_icon(row.get("Agent"))
-            if harness_info is not None:
-                harness_uri = _encode_logo(harness_info["path"])
-        x_val = row[x_col_to_use]
-        y_val = row[y_col_to_use]
-        # Convert to domain coordinates (0-1 range)
-        # For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
-        if x_val > 0:
-            log_x = np.log10(x_val)
-            domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
-        else:
-            domain_x = 0
-        # For linear y: domain_y = (y - y_min) / (y_max - y_min)
-        domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
-        # Clamp to valid range
-        domain_x = max(0, min(1, domain_x))
-        domain_y = max(0, min(1, domain_y))
-        if harness_uri is not None:
-            # Composite: stack model on top, harness on bottom, clamping
-            # each half to the plot area so markers near the edges don't
-            # drift off-canvas.
-            model_y = min(1, domain_y + STACKED_Y_OFFSET)
-            harness_y = max(0, domain_y - STACKED_Y_OFFSET)
-            layout_images.append(dict(
-                source=model_logo_uri,
-                xref="x domain", yref="y domain",
-                x=domain_x, y=model_y,
-                sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
-                xanchor="center", yanchor="middle",
-                layer="above",
-            ))
-            layout_images.append(dict(
-                source=harness_uri,
-                xref="x domain", yref="y domain",
-                x=domain_x, y=harness_y,
-                sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
-                xanchor="center", yanchor="middle",
-                layer="above",
-            ))
-        else:
-            # Single marker (canonical OpenHands pages, or Alternative Agents
-            # rows with an unknown harness name — the latter shouldn't happen
-            # in practice since HARNESS_LOGO_STEMS covers every agent_name the
-            # push-to-index script emits).
-            layout_images.append(dict(
-                source=model_logo_uri,
-                xref="x domain", yref="y domain",
-                x=domain_x, y=domain_y,
-                sizex=SINGLE_SIZE_X, sizey=SINGLE_SIZE_Y,
-                xanchor="center", yanchor="middle",
-                layer="above",
-            ))
     # --- Section 7: Add Model Name Labels to Frontier Points ---
     if frontier_rows:
@@ -1472,47 +1326,38 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
     return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
-def _hidden_runtime_sort_key(runtime_value: float | int | None, score_value: float | int | None) -> str:
-    """Build a hidden prefix so Gradio's string-based runtime sorting behaves numerically."""
-    if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
-        return f"{float(runtime_value):020.6f}"
-    if pd.notna(score_value):
-        return "99999999999999999998"
-    return "99999999999999999999"
 def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
     """
     Applies custom formatting to a runtime column based on its corresponding score column.
     - If runtime is not null, formats as time with 's' suffix.
     - If runtime is null but score is not, it becomes "Missing".
     - If both runtime and score are null, it becomes "Not Submitted".
-    - Adds a hidden, zero-padded numeric prefix so Gradio sorts the column numerically.
     Args:
         df: The DataFrame to modify.
         runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
     Returns:
         The DataFrame with the formatted runtime column.
     """
     score_col_name = runtime_col_name.replace("Runtime", "Score")
     if score_col_name not in df.columns:
-        return df
     def apply_formatting_logic(row):
         runtime_value = row[runtime_col_name]
         score_value = row[score_col_name]
         status_color = "#ec4899"
-        sort_key = _hidden_runtime_sort_key(runtime_value, score_value)
-        hidden_sort_prefix = f'<span style="display:none">{sort_key}</span>'
         if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
-            return f"{hidden_sort_prefix}{runtime_value:.0f}s"
         elif pd.notna(score_value):
-            return f'{hidden_sort_prefix}<span style="color: {status_color};">Missing</span>'
         else:
-            return f'{hidden_sort_prefix}<span style="color: {status_color};">Not Submitted</span>'
     df[runtime_col_name] = df.apply(apply_formatting_logic, axis=1)
     return df

 def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
     """
     Gets the appropriate icon based on the mark_by selection.
     Args:
         model_name: The model name
         openness: The openness value (open/closed)
         mark_by: One of "Company", "Openness", or "Country"
     Returns:
         dict with 'path' and 'name' keys
     """
     from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
     if mark_by == MARK_BY_OPENNESS:
         return get_openness_icon(openness)
     elif mark_by == MARK_BY_COUNTRY:
         return get_company_from_model(model_name)
 # Standard layout configuration for all charts
 STANDARD_LAYOUT = dict(
     template="plotly_white",
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
         'SDK version': 'SDK Version',
         'Openhands version': 'SDK Version',  # Legacy support
         'Language model': 'Language Model',
         df_view = df_sorted.copy()
         # --- 3. Add Columns for Agent Openness ---
+        base_cols = ["id","Language Model","SDK Version","Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs", "Visualization"]
         """
         Builds the complete HTML string for the plot's hover tooltip.
         Format: {lm_name} (SDK {version})
                 Average Score: {score}
                 Average Cost/Runtime: {value}
                 Openness: {openness}
         """
         h_pad = "   "
         parts = ["<br>"]
         # Get and clean the language model name
         llm_base_value = row.get('Language Model', '')
         llm_base_value = clean_llm_base_list(llm_base_value)
             lm_name = llm_base_value[0]
         else:
             lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
         # Get SDK version
         sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
         # Title line: {lm_name} (SDK {version})
         parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
         # Average Score
         parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
     y_min = min_score - 5 if min_score > 5 else 0
     y_max = max_score + 5
     for _, row in data_plot.iterrows():
         model_name = row.get('Language Model', '')
         openness = row.get('Openness', '')
         marker_info = get_marker_icon(model_name, openness, mark_by)
+        logo_path = marker_info['path']
+        # Read the SVG file and encode as base64 data URI
+        if os.path.exists(logo_path):
+            try:
+                with open(logo_path, 'rb') as f:
+                    encoded_logo = base64.b64encode(f.read()).decode('utf-8')
+                    logo_uri = f"data:image/svg+xml;base64,{encoded_logo}"
+                    x_val = row[x_col_to_use]
+                    y_val = row[y_col_to_use]
+                    # Convert to domain coordinates (0-1 range)
+                    # For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
+                    if x_val > 0:
+                        log_x = np.log10(x_val)
+                        domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
+                    else:
+                        domain_x = 0
+                    # For linear y: domain_y = (y - y_min) / (y_max - y_min)
+                    domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
+                    # Clamp to valid range
+                    domain_x = max(0, min(1, domain_x))
+                    domain_y = max(0, min(1, domain_y))
+                    layout_images.append(dict(
+                        source=logo_uri,
+                        xref="x domain",  # Use domain coordinates for log scale compatibility
+                        yref="y domain",
+                        x=domain_x,
+                        y=domain_y,
+                        sizex=0.04,  # Size as fraction of plot width
+                        sizey=0.06,  # Size as fraction of plot height
+                        xanchor="center",
+                        yanchor="middle",
+                        layer="above"
+                    ))
+            except Exception as e:
+                logger.warning(f"Could not load logo {logo_path}: {e}")
     # --- Section 7: Add Model Name Labels to Frontier Points ---
     if frontier_rows:
     return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
 def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
     """
     Applies custom formatting to a runtime column based on its corresponding score column.
     - If runtime is not null, formats as time with 's' suffix.
     - If runtime is null but score is not, it becomes "Missing".
     - If both runtime and score are null, it becomes "Not Submitted".
     Args:
         df: The DataFrame to modify.
         runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
     Returns:
         The DataFrame with the formatted runtime column.
     """
+    # Find the corresponding score column by replacing "Runtime" with "Score"
     score_col_name = runtime_col_name.replace("Runtime", "Score")
+    # Ensure the score column actually exists to avoid errors
     if score_col_name not in df.columns:
+        return df  # Return the DataFrame unmodified if there's no matching score
     def apply_formatting_logic(row):
         runtime_value = row[runtime_col_name]
         score_value = row[score_col_name]
         status_color = "#ec4899"
         if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
+            return f"{runtime_value:.0f}s"
         elif pd.notna(score_value):
+            return f'<span style="color: {status_color};">Missing</span>'  # Score exists, but runtime is missing
         else:
+            return f'<span style="color: {status_color};">Not Submitted</span>'  # Neither score nor runtime exists
+    # Apply the logic to the specified runtime column and update the DataFrame
     df[runtime_col_name] = df.apply(apply_formatting_logic, axis=1)
     return df

main_page.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import matplotlib
 matplotlib.use('Agg')
 import gradio as gr
-import pandas as pd
 from ui_components import (
@@ -27,32 +26,6 @@ from constants import MARK_BY_DEFAULT
 CACHED_VIEWERS = {}
 CACHED_TAG_MAPS = {}
-def filter_complete_entries(df: pd.DataFrame) -> pd.DataFrame:
-    if df.empty:
-        return df.copy()
-    category_score_columns = [
-        'Issue Resolution Score',
-        'Frontend Score',
-        'Greenfield Score',
-        'Testing Score',
-        'Information Gathering Score',
-    ]
-    if all(column in df.columns for column in category_score_columns):
-        return df[df[category_score_columns].notna().all(axis=1)].copy()
-    if 'Categories Completed' in df.columns:
-        categories_completed = pd.to_numeric(df['Categories Completed'], errors='coerce')
-        return df[categories_completed >= 5].copy()
-    if 'Categories Attempted' in df.columns:
-        return df[df['Categories Attempted'] == '5/5'].copy()
-    return df.copy()
 def build_page():
     with gr.Row(elem_id="intro-row"):
         with gr.Column(scale=1):
@@ -65,91 +38,78 @@ def build_page():
     test_df, test_tag_map = get_full_leaderboard_data("test")
     if not test_df.empty:
-        show_incomplete_checkbox, show_open_only_checkbox, mark_by_dropdown = create_leaderboard_display(
             full_df=test_df,
             tag_map=test_tag_map,
             category_name=CATEGORY_NAME,
             split_name="test"
         )
-        test_df_complete = filter_complete_entries(test_df)
-        has_complete_entries = len(test_df_complete) > 0
         if 'Openness' in test_df.columns:
             test_df_open = test_df[test_df['Openness'].str.lower() == 'open'].copy()
         else:
             test_df_open = test_df.copy()
-        test_df_complete_open = filter_complete_entries(test_df_open)
-        initial_df = test_df_complete if has_complete_entries else test_df
         # --- Winners by Category Section ---
         gr.Markdown("---")
         gr.HTML('<h2>Winners by Category</h2>', elem_id="winners-header")
         gr.Markdown("Top 5 performing systems in each benchmark category.")
-        winners_component = gr.HTML(
-            create_winners_by_category_html(initial_df, top_n=5),
-            elem_id="winners-by-category",
-        )
         # --- New Visualization Sections ---
         gr.Markdown("---")
         # Evolution Over Time Section
         gr.HTML('<h2>Evolution Over Time</h2>', elem_id="evolution-header")
         gr.Markdown("Track how model performance has improved over time based on release dates.")
-        evolution_component = gr.Plot(
-            value=create_evolution_over_time_chart(initial_df, MARK_BY_DEFAULT),
-            elem_id="evolution-chart",
-        )
         gr.Markdown("---")
         # Open Model Accuracy by Size Section (always shows open models only by design)
         gr.HTML('<h2>Open Model Accuracy by Size</h2>', elem_id="size-accuracy-header")
         gr.Markdown("Compare open-weights model performance against their parameter count.")
-        size_component = gr.Plot(
-            value=create_accuracy_by_size_chart(initial_df, MARK_BY_DEFAULT),
-            elem_id="size-accuracy-chart",
-        )
-        def update_extra_sections(show_incomplete, show_open_only, mark_by):
-            include_incomplete = show_incomplete or not has_complete_entries
-            base_df = test_df if include_incomplete else test_df_complete
-            base_df_open = test_df_open if include_incomplete else test_df_complete_open
-            winners_df = base_df_open if show_open_only else base_df
-            winners_html = create_winners_by_category_html(winners_df, top_n=5)
-            evolution_fig = create_evolution_over_time_chart(winners_df, mark_by)
-            size_fig = create_accuracy_by_size_chart(base_df, mark_by)
             return winners_html, evolution_fig, size_fig
-        show_incomplete_input = show_incomplete_checkbox if show_incomplete_checkbox is not None else gr.State(value=True)
-        show_open_only_input = show_open_only_checkbox if show_open_only_checkbox is not None else gr.State(value=False)
-        extra_section_inputs = [show_incomplete_input, show_open_only_input, mark_by_dropdown]
-        if show_incomplete_checkbox is not None:
-            show_incomplete_checkbox.change(
-                fn=update_extra_sections,
-                inputs=extra_section_inputs,
-                outputs=[winners_component, evolution_component, size_component]
-            )
         if show_open_only_checkbox is not None:
             show_open_only_checkbox.change(
                 fn=update_extra_sections,
-                inputs=extra_section_inputs,
                 outputs=[winners_component, evolution_component, size_component]
             )
         if mark_by_dropdown is not None:
             mark_by_dropdown.change(
                 fn=update_extra_sections,
-                inputs=extra_section_inputs,
                 outputs=[winners_component, evolution_component, size_component]
             )

 import matplotlib
 matplotlib.use('Agg')
 import gradio as gr
 from ui_components import (
 CACHED_VIEWERS = {}
 CACHED_TAG_MAPS = {}
 def build_page():
     with gr.Row(elem_id="intro-row"):
         with gr.Column(scale=1):
     test_df, test_tag_map = get_full_leaderboard_data("test")
     if not test_df.empty:
+        # Get the checkbox and dropdown returned from create_leaderboard_display
+        show_open_only_checkbox, mark_by_dropdown = create_leaderboard_display(
             full_df=test_df,
             tag_map=test_tag_map,
             category_name=CATEGORY_NAME,
             split_name="test"
         )
+        # Prepare open-only filtered dataframe for Winners and Evolution
         if 'Openness' in test_df.columns:
             test_df_open = test_df[test_df['Openness'].str.lower() == 'open'].copy()
         else:
             test_df_open = test_df.copy()
         # --- Winners by Category Section ---
         gr.Markdown("---")
         gr.HTML('<h2>Winners by Category</h2>', elem_id="winners-header")
         gr.Markdown("Top 5 performing systems in each benchmark category.")
+        # Create both all and open-only versions of winners HTML
+        winners_html_all = create_winners_by_category_html(test_df, top_n=5)
+        winners_html_open = create_winners_by_category_html(test_df_open, top_n=5)
+        winners_component = gr.HTML(winners_html_all, elem_id="winners-by-category")
         # --- New Visualization Sections ---
         gr.Markdown("---")
         # Evolution Over Time Section
         gr.HTML('<h2>Evolution Over Time</h2>', elem_id="evolution-header")
         gr.Markdown("Track how model performance has improved over time based on release dates.")
+        # Create initial evolution chart with default mark_by
+        evolution_fig_all = create_evolution_over_time_chart(test_df, MARK_BY_DEFAULT)
+        evolution_component = gr.Plot(value=evolution_fig_all, elem_id="evolution-chart")
         gr.Markdown("---")
         # Open Model Accuracy by Size Section (always shows open models only by design)
         gr.HTML('<h2>Open Model Accuracy by Size</h2>', elem_id="size-accuracy-header")
         gr.Markdown("Compare open-weights model performance against their parameter count.")
+        size_fig = create_accuracy_by_size_chart(test_df, MARK_BY_DEFAULT)
+        size_component = gr.Plot(value=size_fig, elem_id="size-accuracy-chart")
+        # Update function for Winners, Evolution, and Size charts based on filters
+        def update_extra_sections(show_open_only, mark_by):
+            # Select the appropriate dataframe based on open_only filter
+            df_to_use = test_df_open if show_open_only else test_df
+            # Winners HTML (not affected by mark_by, only open_only)
+            winners_html = winners_html_open if show_open_only else winners_html_all
+            # Regenerate charts with current mark_by setting
+            evolution_fig = create_evolution_over_time_chart(df_to_use, mark_by)
+            size_fig = create_accuracy_by_size_chart(test_df, mark_by)  # Size chart always uses full df (filters internally)
             return winners_html, evolution_fig, size_fig
+        # Connect both checkbox and dropdown to update all extra sections
         if show_open_only_checkbox is not None:
             show_open_only_checkbox.change(
                 fn=update_extra_sections,
+                inputs=[show_open_only_checkbox, mark_by_dropdown],
                 outputs=[winners_component, evolution_component, size_component]
             )
         if mark_by_dropdown is not None:
             mark_by_dropdown.change(
                 fn=update_extra_sections,
+                inputs=[show_open_only_checkbox if show_open_only_checkbox else gr.State(value=False), mark_by_dropdown],
                 outputs=[winners_component, evolution_component, size_component]
             )

setup_data.py CHANGED Viewed

@@ -70,39 +70,27 @@ def fetch_data_from_github():
         # Look for data files in the cloned repository
         results_source = clone_dir / "results"
         if not results_source.exists():
             print(f"Results directory not found in repository")
             return False
         # Check if there are any agent result directories
         result_dirs = list(results_source.iterdir())
         if not result_dirs:
             print(f"No agent results found in {results_source}")
             return False
         print(f"Found {len(result_dirs)} agent result directories")
         # Create target directory and copy the results structure
         os.makedirs(target_dir.parent, exist_ok=True)
         if target_dir.exists():
             shutil.rmtree(target_dir)
         # Copy the entire results directory
         target_results = target_dir / "results"
         shutil.copytree(results_source, target_results)
-        # Also copy alternative_agents/ if present, so the loader can pick up
-        # ACP runs (acp-claude, acp-codex, acp-gemini, openhands_subagents, ...)
-        # alongside the default OpenHands agent results.
-        alt_source = clone_dir / "alternative_agents"
-        if alt_source.exists():
-            alt_target = target_dir / "alternative_agents"
-            shutil.copytree(alt_source, alt_target)
-            agent_types = sorted(p.name for p in alt_source.iterdir() if p.is_dir())
-            print(f"Found alternative agent types: {agent_types}")
-        else:
-            print("No alternative_agents/ directory in repository (skipping)")
         print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")

         # Look for data files in the cloned repository
         results_source = clone_dir / "results"
         if not results_source.exists():
             print(f"Results directory not found in repository")
             return False
         # Check if there are any agent result directories
         result_dirs = list(results_source.iterdir())
         if not result_dirs:
             print(f"No agent results found in {results_source}")
             return False
         print(f"Found {len(result_dirs)} agent result directories")
         # Create target directory and copy the results structure
         os.makedirs(target_dir.parent, exist_ok=True)
         if target_dir.exists():
             shutil.rmtree(target_dir)
         # Copy the entire results directory
         target_results = target_dir / "results"
         shutil.copytree(results_source, target_results)
         print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")

simple_data_loader.py CHANGED Viewed

@@ -96,43 +96,17 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
 class SimpleLeaderboardViewer:
     """Simple replacement for agent-eval's LeaderboardViewer."""
-    AGENT_FILTER_OPENHANDS = "openhands"
-    AGENT_FILTER_ALTERNATIVE = "alternative"
-    def __init__(
-        self,
-        data_dir: str,
-        config: str,
-        split: str,
-        agent_filter: str = AGENT_FILTER_OPENHANDS,
-    ):
         """
         Args:
             data_dir: Path to data directory
             config: Config name (e.g., "1.0.0-dev1")
             split: Split name (e.g., "validation" or "test")
-            agent_filter: Which submissions to include.
-                ``"openhands"`` (default) loads only the default OpenHands
-                agent runs from ``results/{model}/`` — the canonical
-                leaderboard. ``"alternative"`` loads only third-party
-                harnesses (Claude Code / Codex / Gemini CLI / OpenHands
-                Sub-agents) from ``alternative_agents/{type}/{model}/``,
-                which power the standalone Alternative Agents page.
-                The two are kept on separate pages because their
-                cost/runtime numbers aren't apples-to-apples and mixing
-                them in one ranking would be misleading.
         """
-        if agent_filter not in (self.AGENT_FILTER_OPENHANDS, self.AGENT_FILTER_ALTERNATIVE):
-            raise ValueError(
-                f"agent_filter must be one of "
-                f"{{{self.AGENT_FILTER_OPENHANDS!r}, {self.AGENT_FILTER_ALTERNATIVE!r}}}, "
-                f"got {agent_filter!r}"
-            )
         self.data_dir = Path(data_dir)
         self.config = config
         self.split = split
-        self.agent_filter = agent_filter
         self.config_path = self.data_dir / config
         # Benchmark to category mappings (single source of truth)
@@ -153,115 +127,55 @@ class SimpleLeaderboardViewer:
                 if benchmark not in self.tag_map[category]:
                     self.tag_map[category].append(benchmark)
-    # Default agent_name when metadata.json doesn't carry one. Matches the
-    # default-agent value used by push_to_index_from_archive.py so legacy
-    # entries (which omit the field) still group cleanly with new entries.
-    DEFAULT_AGENT_NAME = "OpenHands"
-    def _records_from_agent_dir(self, agent_dir: Path, default_agent_name: str | None = None) -> tuple[list[dict], list[str]]:
-        """Build per-benchmark records from a single agent directory.
-        Shared by ``_load_from_agent_dirs`` (default OpenHands results) and
-        ``_load_from_alternative_agents_dirs`` (acp-claude / acp-codex / etc.).
-        Returns ``(records, validation_errors)``. Returns an empty list of
-        records when the directory has no scores or is hidden from the
-        leaderboard.
-        """
-        records: list[dict] = []
-        metadata, scores, errors = load_and_validate_agent_data(agent_dir)
-        if metadata is None or scores is None:
-            return records, errors
-        if metadata.get('hide_from_leaderboard', False):
-            logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
-            return records, errors
-        # Resolve the agent display name. Prefer the value stamped into
-        # metadata.json by push-to-index; fall back to the directory's
-        # default (e.g. "Claude Code" for acp-claude/) and finally to
-        # "OpenHands" for legacy results/ entries that predate the field.
-        agent_name = (
-            metadata.get('agent_name')
-            or default_agent_name
-            or self.DEFAULT_AGENT_NAME
-        )
-        for score_entry in scores:
-            record = {
-                'agent_name': agent_name,
-                'agent_version': metadata.get('agent_version', 'Unknown'),
-                'llm_base': metadata.get('model', 'unknown'),
-                'openness': metadata.get('openness', 'unknown'),
-                'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
-                'release_date': metadata.get('release_date', ''),
-                'parameter_count_b': metadata.get('parameter_count_b'),
-                'active_parameter_count_b': metadata.get('active_parameter_count_b'),
-                'score': score_entry.get('score'),
-                'metric': score_entry.get('metric', 'unknown'),
-                'cost_per_instance': score_entry.get('cost_per_instance'),
-                'average_runtime': score_entry.get('average_runtime'),
-                'tags': [score_entry.get('benchmark')],
-                'full_archive': score_entry.get('full_archive', ''),
-                'eval_visualization_page': score_entry.get('eval_visualization_page', ''),
-            }
-            records.append(record)
-        return records, errors
     def _load_from_agent_dirs(self):
-        """Load agent records based on ``self.agent_filter``.
-        - ``"openhands"`` (default): only ``{config}/results/{model}/``,
-          which is the canonical OpenHands leaderboard. The Home page and
-          the per-category subpages use this.
-        - ``"alternative"``: only
-          ``{config}/alternative_agents/{type}/{model}/`` (acp-claude,
-          acp-codex, acp-gemini, openhands_subagents, ...). The dedicated
-          Alternative Agents page uses this.
-        Returns ``None`` if no records were found (which makes the caller
-        render an empty-state placeholder).
-        """
         all_records = []
         all_validation_errors = []
-        if self.agent_filter == self.AGENT_FILTER_OPENHANDS:
-            # Default OpenHands agent results
-            results_dir = self.config_path / "results"
-            if results_dir.exists():
-                for agent_dir in results_dir.iterdir():
-                    if not agent_dir.is_dir():
-                        continue
-                    records, errors = self._records_from_agent_dir(agent_dir)
-                    all_records.extend(records)
-                    all_validation_errors.extend(errors)
-        else:
-            # Alternative agents (one subdirectory per agent_type, then per model)
-            # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE
-            # map in OpenHands/evaluation push_to_index_from_archive.py — keeping
-            # it in sync ensures rows are labelled the same way the index repo
-            # records them.
-            agent_type_default_name = {
-                'acp-claude': 'Claude Code',
-                'acp-codex': 'Codex',
-                'acp-gemini': 'Gemini CLI',
-                'openhands_subagents': 'OpenHands Sub-agents',
-            }
-            alt_dir = self.config_path / "alternative_agents"
-            if alt_dir.exists():
-                for type_dir in alt_dir.iterdir():
-                    if not type_dir.is_dir():
-                        continue
-                    default_name = agent_type_default_name.get(type_dir.name)
-                    for agent_dir in type_dir.iterdir():
-                        if not agent_dir.is_dir():
-                            continue
-                        records, errors = self._records_from_agent_dir(
-                            agent_dir, default_agent_name=default_name
-                        )
-                        all_records.extend(records)
-                        all_validation_errors.extend(errors)
         # Log validation errors if any
         if all_validation_errors:
             logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
@@ -269,10 +183,10 @@ class SimpleLeaderboardViewer:
                 logger.warning(f"  - {error}")
             if len(all_validation_errors) > 5:
                 logger.warning(f"  ... and {len(all_validation_errors) - 5} more")
         if not all_records:
-            return None  # Caller will render empty-state placeholder
         return pd.DataFrame(all_records)
     def _load(self):
@@ -292,36 +206,26 @@ class SimpleLeaderboardViewer:
             # Group by agent (version + model combination) to aggregate results across datasets
             transformed_records = []
-            # Create a unique identifier per (agent_name, agent_version, model)
-            # tuple. Including agent_name keeps an OpenHands run and a Claude
-            # Code run on the same SDK version + model from collapsing into
-            # one row when both submit to the leaderboard.
-            df['agent_name'] = df['agent_name'].fillna(self.DEFAULT_AGENT_NAME)
-            df['agent_id'] = (
-                df['agent_name'].astype(str)
-                + '_' + df['agent_version'].astype(str)
-                + '_' + df['llm_base'].astype(str)
-            )
             for agent_id in df['agent_id'].unique():
                 agent_records = df[df['agent_id'] == agent_id]
                 # Build a single record for this agent
                 first_record = agent_records.iloc[0]
                 agent_version = first_record['agent_version']
-                agent_name = first_record['agent_name']
                 # Normalize openness to "open" or "closed"
                 from aliases import OPENNESS_MAPPING
                 raw_openness = first_record['openness']
                 normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
                 # All 5 categories for the leaderboard
                 ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
                 record = {
                     # Core agent info - use final display names
-                    'agent_name': agent_name,  # Will become "Agent"
                     'SDK version': agent_version,  # Will become "SDK Version"
                     'Language model': first_record['llm_base'],  # Will become "Language Model"
                     'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
@@ -331,7 +235,7 @@ class SimpleLeaderboardViewer:
                     'parameter_count_b': first_record.get('parameter_count_b'),  # Total params in billions
                     'active_parameter_count_b': first_record.get('active_parameter_count_b'),  # Active params for MoE
                     # Additional columns expected by the transformer
-                    # Use agent_id (name_version_model) as unique identifier for Pareto frontier calculation
                     'id': agent_id,
                     'source': first_record.get('source', ''),  # Will become "Source"
                     'logs': first_record.get('logs', ''),  # Will become "Logs"

 class SimpleLeaderboardViewer:
     """Simple replacement for agent-eval's LeaderboardViewer."""
+    def __init__(self, data_dir: str, config: str, split: str):
         """
         Args:
             data_dir: Path to data directory
             config: Config name (e.g., "1.0.0-dev1")
             split: Split name (e.g., "validation" or "test")
         """
         self.data_dir = Path(data_dir)
         self.config = config
         self.split = split
         self.config_path = self.data_dir / config
         # Benchmark to category mappings (single source of truth)
                 if benchmark not in self.tag_map[category]:
                     self.tag_map[category].append(benchmark)
     def _load_from_agent_dirs(self):
+        """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
+        results_dir = self.config_path / "results"
+        if not results_dir.exists():
+            return None  # Fall back to old format
         all_records = []
         all_validation_errors = []
+        # Iterate through each agent directory
+        for agent_dir in results_dir.iterdir():
+            if not agent_dir.is_dir():
+                continue
+            # Load and validate using pydantic models
+            metadata, scores, errors = load_and_validate_agent_data(agent_dir)
+            if errors:
+                all_validation_errors.extend(errors)
+            if metadata is None or scores is None:
+                continue
+            # Skip entries that are hidden from the leaderboard
+            if metadata.get('hide_from_leaderboard', False):
+                logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
+                continue
+            # Create one record per benchmark (mimicking old JSONL format)
+            for score_entry in scores:
+                record = {
+                    'agent_version': metadata.get('agent_version', 'Unknown'),
+                    'llm_base': metadata.get('model', 'unknown'),
+                    'openness': metadata.get('openness', 'unknown'),
+                    'submission_time': metadata.get('submission_time', ''),
+                    'release_date': metadata.get('release_date', ''),  # Model release date
+                    'parameter_count_b': metadata.get('parameter_count_b'),  # Total params in billions
+                    'active_parameter_count_b': metadata.get('active_parameter_count_b'),  # Active params for MoE
+                    'score': score_entry.get('score'),
+                    'metric': score_entry.get('metric', 'unknown'),
+                    'cost_per_instance': score_entry.get('cost_per_instance'),
+                    'average_runtime': score_entry.get('average_runtime'),
+                    'tags': [score_entry.get('benchmark')],
+                    'full_archive': score_entry.get('full_archive', ''),  # Download URL for trajectories
+                    'eval_visualization_page': score_entry.get('eval_visualization_page', ''),  # Laminar visualization URL
+                }
+                all_records.append(record)
         # Log validation errors if any
         if all_validation_errors:
             logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
                 logger.warning(f"  - {error}")
             if len(all_validation_errors) > 5:
                 logger.warning(f"  ... and {len(all_validation_errors) - 5} more")
         if not all_records:
+            return None  # Fall back to old format
         return pd.DataFrame(all_records)
     def _load(self):
             # Group by agent (version + model combination) to aggregate results across datasets
             transformed_records = []
+            # Create a unique identifier for each agent (version + model)
+            df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
             for agent_id in df['agent_id'].unique():
                 agent_records = df[df['agent_id'] == agent_id]
                 # Build a single record for this agent
                 first_record = agent_records.iloc[0]
                 agent_version = first_record['agent_version']
                 # Normalize openness to "open" or "closed"
                 from aliases import OPENNESS_MAPPING
                 raw_openness = first_record['openness']
                 normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
                 # All 5 categories for the leaderboard
                 ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
                 record = {
                     # Core agent info - use final display names
                     'SDK version': agent_version,  # Will become "SDK Version"
                     'Language model': first_record['llm_base'],  # Will become "Language Model"
                     'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
                     'parameter_count_b': first_record.get('parameter_count_b'),  # Total params in billions
                     'active_parameter_count_b': first_record.get('active_parameter_count_b'),  # Active params for MoE
                     # Additional columns expected by the transformer
+                    # Use agent_id (version_model) as unique identifier for Pareto frontier calculation
                     'id': agent_id,
                     'source': first_record.get('source', ''),  # Will become "Source"
                     'logs': first_record.get('logs', ''),  # Will become "Logs"

tests/test_runtime_sorting.py DELETED Viewed

@@ -1,40 +0,0 @@
-import pandas as pd
-from leaderboard_transformer import format_runtime_column
-def test_runtime_strings_sort_numerically_in_ascending_order():
-    df = pd.DataFrame(
-        {
-            "Average Score": [0.8, 0.8, 0.8, 0.8, None],
-            "Average Runtime": [1323.0, 372.0, 410.0, None, None],
-        }
-    )
-    formatted = format_runtime_column(df.copy(), "Average Runtime")
-    runtimes = formatted["Average Runtime"].tolist()
-    assert sorted(runtimes) == [
-        runtimes[1],
-        runtimes[2],
-        runtimes[0],
-        runtimes[3],
-        runtimes[4],
-    ]
-def test_runtime_formatting_preserves_visible_labels():
-    df = pd.DataFrame(
-        {
-            "Average Score": [0.8, 0.8, None],
-            "Average Runtime": [45.2, None, None],
-        }
-    )
-    formatted = format_runtime_column(df.copy(), "Average Runtime")
-    values = formatted["Average Runtime"].tolist()
-    assert values[0].endswith("45s")
-    assert values[1].endswith("Missing</span>")
-    assert values[2].endswith("Not Submitted</span>")
-    assert 'display:none' in values[0]

ui_components.py CHANGED Viewed

@@ -508,36 +508,28 @@ class DummyViewer:
         # The _load method returns the error DataFrame and an empty tag map
         return self._error_df, {}
-def get_leaderboard_viewer_instance(
-    split: str,
-    agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
-):
     """
-    Fetches the LeaderboardViewer for a (split, agent_filter) pair, using a
-    thread-safe cache to avoid re-downloading data. The cache is keyed on
-    both axes so the OpenHands and Alternative Agents pages don't fight
-    over a single slot. On error, returns a stable DummyViewer.
     """
     global CACHED_VIEWERS, CACHED_TAG_MAPS
-    cache_key = (split, agent_filter)
     with _cache_lock:
-        if cache_key in CACHED_VIEWERS:
             # Cache hit: return the cached viewer and tag map
-            return CACHED_VIEWERS[cache_key], CACHED_TAG_MAPS.get(cache_key, {"Overall": []})
     # --- Cache miss: try to load data from the source ---
     try:
         # First try to load from extracted data directory (local mock data)
         data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
-        print(f"Loading data for split '{split}' (agent_filter={agent_filter}) from: {data_dir}/{CONFIG_NAME}")
         viewer = SimpleLeaderboardViewer(
             data_dir=data_dir,
             config=CONFIG_NAME,
-            split=split,
-            agent_filter=agent_filter,
         )
         # Simplify tag map creation
@@ -545,14 +537,14 @@ def get_leaderboard_viewer_instance(
         # Cache the results for next time (thread-safe)
         with _cache_lock:
-            CACHED_VIEWERS[cache_key] = viewer
-            CACHED_TAG_MAPS[cache_key] = pretty_tag_map  # Cache the pretty map directly
         return viewer, pretty_tag_map
     except Exception as e:
         # On ANY error, create a consistent error message and cache a DummyViewer
-        error_message = f"Error loading data for split '{split}' (agent_filter={agent_filter}): {e}"
         print(format_error(error_message))
         dummy_df = pd.DataFrame({"Message": [error_message]})
@@ -561,8 +553,8 @@ def get_leaderboard_viewer_instance(
         # Cache the dummy objects so we don't try to fetch again on this run
         with _cache_lock:
-            CACHED_VIEWERS[cache_key] = dummy_viewer
-            CACHED_TAG_MAPS[cache_key] = dummy_tag_map
         return dummy_viewer, dummy_tag_map
@@ -1040,8 +1032,8 @@ def create_leaderboard_display(
                 outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
             )
-    # Return the filter controls so they can be used to update other sections
-    return show_incomplete_checkbox, show_open_only_checkbox, mark_by_dropdown
 # # --- Detailed Benchmark Display ---
 def create_benchmark_details_display(
@@ -1276,17 +1268,12 @@ def create_benchmark_details_display(
             legend_markdown = create_legend_markdown(benchmark_name)
             gr.HTML(value=legend_markdown, elem_id="legend-markdown")
-def get_full_leaderboard_data(
-    split: str,
-    agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
-) -> tuple[pd.DataFrame, dict]:
     """
-    Loads and transforms the complete dataset for a (split, agent_filter)
-    pair. ``agent_filter`` defaults to ``"openhands"`` so existing pages
-    that don't pass it stay on the canonical leaderboard. The Alternative
-    Agents page passes ``"alternative"`` to get the third-party harnesses.
     """
-    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split, agent_filter=agent_filter)
     if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
         raw_df, _ = viewer_or_data._load()

         # The _load method returns the error DataFrame and an empty tag map
         return self._error_df, {}
+def get_leaderboard_viewer_instance(split: str):
     """
+    Fetches the LeaderboardViewer for a split, using a thread-safe cache to avoid
+    re-downloading data. On error, returns a stable DummyViewer object.
     """
     global CACHED_VIEWERS, CACHED_TAG_MAPS
     with _cache_lock:
+        if split in CACHED_VIEWERS:
             # Cache hit: return the cached viewer and tag map
+            return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
     # --- Cache miss: try to load data from the source ---
     try:
         # First try to load from extracted data directory (local mock data)
         data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
+        print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
         viewer = SimpleLeaderboardViewer(
             data_dir=data_dir,
             config=CONFIG_NAME,
+            split=split
         )
         # Simplify tag map creation
         # Cache the results for next time (thread-safe)
         with _cache_lock:
+            CACHED_VIEWERS[split] = viewer
+            CACHED_TAG_MAPS[split] = pretty_tag_map  # Cache the pretty map directly
         return viewer, pretty_tag_map
     except Exception as e:
         # On ANY error, create a consistent error message and cache a DummyViewer
+        error_message = f"Error loading data for split '{split}': {e}"
         print(format_error(error_message))
         dummy_df = pd.DataFrame({"Message": [error_message]})
         # Cache the dummy objects so we don't try to fetch again on this run
         with _cache_lock:
+            CACHED_VIEWERS[split] = dummy_viewer
+            CACHED_TAG_MAPS[split] = dummy_tag_map
         return dummy_viewer, dummy_tag_map
                 outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
             )
+    # Return the show_open_only_checkbox and mark_by_dropdown so they can be used to update other sections
+    return show_open_only_checkbox, mark_by_dropdown
 # # --- Detailed Benchmark Display ---
 def create_benchmark_details_display(
             legend_markdown = create_legend_markdown(benchmark_name)
             gr.HTML(value=legend_markdown, elem_id="legend-markdown")
+def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     """
+    Loads and transforms the complete dataset for a given split.
+    This function handles caching and returns the final "pretty" DataFrame and tag map.
     """
+    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
     if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
         raw_df, _ = viewer_or_data._load()