| import gradio as gr |
| import pandas as pd |
| import plotly.graph_objects as go |
| import os |
| import base64 |
|
|
| from agenteval.leaderboard.view import LeaderboardViewer |
| from huggingface_hub import HfApi |
|
|
| import aliases |
| from leaderboard_transformer import ( |
| DataTransformer, |
| transform_raw_dataframe, |
| create_pretty_tag_map, |
| INFORMAL_TO_FORMAL_NAME_MAP, |
| _plot_scatter_plotly, |
| format_cost_column, |
| format_score_column, |
| get_pareto_df, |
| clean_llm_base_list, |
| ) |
| from config import ( |
| CONFIG_NAME, |
| EXTRACTED_DATA_DIR, |
| IS_INTERNAL, |
| RESULTS_DATASET, |
| ) |
| from content import ( |
| create_gradio_anchor_id, |
| format_error, |
| get_benchmark_description, |
| hf_uri_to_web_url, |
| hyperlink, |
| SCATTER_DISCLAIMER, |
| ) |
|
|
# Hugging Face Hub client, created once when the module is imported.
api = HfApi()
# Create the extracted-data directory if it is missing; an existing
# directory is not an error because of exist_ok=True.
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
| |
# Path of the combined icon for every (openness, tool usage) pair.
# Outer key: canonical openness name; inner key: canonical tool-usage name.
COMBINED_ICON_MAP = {
    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-ow-standard.svg",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-ow-equivalent.svg",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-ow-custom.svg",
    },
    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-standard.svg",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-equivalent.svg",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-custom.svg",
    },
    aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/api-standard.svg",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/api-equivalent.svg",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/api-custom.svg",
    },
    aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/c-standard.svg",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/c-equivalent.svg",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/c-custom.svg",
    }
}

# Step 1: inside every openness entry, register each tool-usage alias under
# the same icon path as its canonical tool-usage name.
for openness in COMBINED_ICON_MAP:
    for canonical_tool_usage, tool_usage_aliases in aliases.TOOL_USAGE_ALIASES.items():
        for tool_usage_alias in tool_usage_aliases:
            COMBINED_ICON_MAP[openness][tool_usage_alias] = COMBINED_ICON_MAP[openness][canonical_tool_usage]

# Step 2: register each openness alias. The alias key points at the very same
# inner dict object as the canonical key, so it already carries the tool-usage
# aliases added in step 1 (this is why step 1 must run first).
for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
    for openness_alias in openness_aliases:
        COMBINED_ICON_MAP[openness_alias] = COMBINED_ICON_MAP[canonical_openness]
|
|
|
|
# Legend data for the openness levels: canonical name -> icon file path and
# the short description shown next to it in the plot legend.
OPENNESS_SVG_MAP = {
    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {
        "path": "assets/ellipse-pink.svg",
        "description": "Code and models are open"
    },
    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
        "path": "assets/ellipse-coral.svg",
        "description": "Code is open but uses closed-weight models"
    },
    aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
        "path": "assets/ellipse-yellow.svg",
        "description": "No access to code; API access only"
    },
    aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
        "path": "assets/ellipse-white.svg",
        "description": "No access to code or API; UI access only"
    },
}
# Legend data for the tool-usage levels, with the same structure as
# OPENNESS_SVG_MAP (canonical name -> {"path", "description"}).
TOOLING_SVG_MAP = {
    aliases.CANONICAL_TOOL_USAGE_STANDARD: {
        "path": "assets/five-point-star.svg",
        "description": "Uses only tools explicitly provided in state.tools"
    },
    aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: {
        "path": "assets/four-point-star.svg",
        "description": "Custom tools for accessing an equivalent underlying environment"
    },
    aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: {
        "path": "assets/three-point-star.svg",
        # The description names the two other levels via their canonical constants.
        "description": f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}"
    },
}
|
|
def get_svg_as_data_uri(path: str) -> str:
    """Read an SVG file and return it as a base64-encoded data URI.

    Args:
        path: Filesystem path of the SVG file.

    Returns:
        A ``data:image/svg+xml;base64,...`` string, or an empty string when
        the file is missing or cannot be read. A warning is printed in that
        case; no exception is raised, so a bad asset cannot break the import
        of this module.
    """
    try:
        with open(path, "rb") as svg_file:
            encoded_svg = base64.b64encode(svg_file.read()).decode("utf-8")
    except FileNotFoundError:
        print(f"Warning: SVG file not found at {path}")
        return ""
    except OSError as err:
        # Covers unreadable paths such as directories or permission errors.
        print(f"Warning: could not read SVG file at {path}: {err}")
        return ""
    return f"data:image/svg+xml;base64,{encoded_svg}"
|
|
| |
# Same two-level layout as COMBINED_ICON_MAP (including the alias keys added
# to it above), but each icon path is replaced by its data URI. The files are
# read once here, at import time, so table rendering only does dict lookups.
PRELOADED_URI_MAP = {
    openness: {
        tooling: get_svg_as_data_uri(path)
        for tooling, path in tooling_map.items()
    }
    for openness, tooling_map in COMBINED_ICON_MAP.items()
}
|
|
def get_combined_icon_html(row, uri_map):
    """
    Return the HTML <img> tag for a row's combined openness/tooling icon.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            'Openness' and 'Agent Tooling' values.
        uri_map: Two-level dict, openness -> tooling -> image data URI.

    Returns:
        An <img> tag whose alt/title describe both values, or an empty string
        when no icon is registered for the pair (the same convention
        create_svg_html uses for a missing icon), so the cell never contains
        an image with an empty src.
    """
    import html  # local import: only needed for attribute escaping

    openness_val = row['Openness']
    tooling_val = row['Agent Tooling']
    uri = uri_map.get(openness_val, {}).get(tooling_val, "")
    if not uri:
        return ""

    # The values come from submitted data; escape them so quotes or angle
    # brackets cannot break out of the alt/title attributes.
    tooltip = html.escape(f"Openness: {openness_val}, Tooling: {tooling_val}", quote=True)
    return f'<img src="{uri}" alt="{tooltip}" title="{tooltip}" style="width:24px; height:24px;">'
|
|
def create_svg_html(value, svg_map):
    """
    Build a bare <img> tag (icon only, no label) for ``value``.

    ``svg_map`` maps a value either directly to an SVG path or to a dict
    holding the path under the "path" key. An empty string is returned when
    the value is missing/NaN, is not a key of ``svg_map``, or its SVG could
    not be loaded. The markup is kept minimal so it works in gr.DataFrame.
    """
    if pd.isna(value) or value not in svg_map:
        return ""

    entry = svg_map[value]
    svg_path = entry["path"] if isinstance(entry, dict) else entry

    data_uri = get_svg_as_data_uri(svg_path)
    if not data_uri:
        return ""
    return f'<img src="{data_uri}" style="width: 16px; height: 16px; vertical-align: middle;" alt="{value}" title="{value}">'
|
|
def build_openness_tooltip_content() -> str:
    """
    Build the HTML for the Agent Openness tooltip: an info icon wrapping a
    card with one legend entry (icon, name, description) per openness level
    in OPENNESS_SVG_MAP.
    """
    description_by_level = {
        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: "Both code and ML models are open",
        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: "Code is open but uses an ML model with closed-weights",
        aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "No access to code; API access only",
        aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "No access to code or API; UI access only",
    }

    def render_item(level, svg_info):
        # Levels without an entry above get an empty description.
        icon_uri = get_svg_as_data_uri(svg_info["path"])
        blurb = description_by_level.get(level, "")
        return f"""
        <div class="tooltip-legend-item">
            <img src="{icon_uri}" alt="{level}">
            <div>
                <strong>{level}</strong>
                <span>{blurb}</span>
            </div>
        </div>
        """

    joined_items = "".join(
        render_item(level, svg_info) for level, svg_info in OPENNESS_SVG_MAP.items()
    )

    return f"""<span class="tooltip-icon-legend">
        ⓘ
        <span class="tooltip-card">
            <h3>Agent Openness</h3>
            <p class="tooltip-description">Indicates how transparent and reproducible an agent is.</p>
            <div class="tooltip-items-container">{joined_items}</div>
        </span>
    </span>"""
|
|
|
|
def build_pareto_tooltip_content() -> str:
    """
    Build the inner HTML of the Pareto tooltip card: a heading, a short
    explanation of the Pareto frontier, and the trophy icon that marks
    agents on it.
    """
    icon_src = get_svg_as_data_uri("assets/trophy.svg")
    trophy_icon_html = (
        f'<img src="{icon_src}" style="width: 25px; height: 25px; vertical-align: middle;">'
    )
    card_html = f"""
        <h3>On Pareto Frontier</h3>
        <p class="tooltip-description">The Pareto frontier represents the best balance between score and cost.</p>
        <p class="tooltip-description">Agents on the frontier either:</p>
        <ul class="tooltip-sub-list">
            <li>Offer the lowest cost for a given performance, or</li>
            <li>Deliver the best performance at a given cost.</li>
        </ul>
        <div class="tooltip-description" style="margin-top: 12px; display: flex; align-items: center;">
            <span>These agents are marked with this icon:</span>
            <span>{trophy_icon_html}</span>
        </div>
    """
    return card_html
|
|
def build_tooling_tooltip_content() -> str:
    """
    Build the HTML for the Agent Tooling tooltip: an info icon wrapping a
    card with one legend entry per tool-usage level in TOOLING_SVG_MAP.
    The custom-interface entry additionally carries a bullet list of the
    constraints that apply to it.
    """
    description_by_level = {
        aliases.CANONICAL_TOOL_USAGE_STANDARD: "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).",
        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "Custom tools for accessing an equivalent underlying environment:",
        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}",
    }
    custom_interface_sub_list = """
        <ul class="tooltip-sub-list">
            <li>Literature tasks: Information access is limited to date-restricted usage of the Asta MCP tools.</li>
            <li>Code tasks: Code execution is limited to an iPython shell in a machine environment initialized with the standard Asta sandbox Dockerfile (or equivalent).</li>
        </ul>
    """

    def render_item(level, svg_info):
        icon_uri = get_svg_as_data_uri(svg_info["path"])
        blurb = description_by_level.get(level, "")
        # Only the custom-interface level gets the extra bullet list.
        if level == aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE:
            sub_list_html = custom_interface_sub_list
        else:
            sub_list_html = ""
        return f"""
        <div class="tooltip-legend-item">
            <img src="{icon_uri}" alt="{level}">
            <div>
                <strong>{level}</strong>
                <span>{blurb}</span>
                {sub_list_html}
            </div>
        </div>
        """

    joined_items = "".join(
        render_item(level, svg_info) for level, svg_info in TOOLING_SVG_MAP.items()
    )

    return f"""<span class="tooltip-icon-legend">
        ⓘ
        <span class="tooltip-card">
            <h3>Agent Tooling</h3>
            <p class="tooltip-description">Describes the tool usage and execution environment of the agent during evaluation.</p>
            <div class="tooltip-items-container">{joined_items}</div>
        </span>
    </span>"""
|
|
|
|
def build_descriptions_tooltip_content(table) -> str:
    """
    Build the inner HTML of the Column Descriptions tooltip card.

    Args:
        table: Name of the table the tooltip belongs to. "Overall" selects
            the aggregate layout, one of the four category names selects the
            category layout, and any other value is treated as the name of a
            single benchmark.

    Returns:
        One ``<div class="tooltip-description-item">`` per described column,
        separated by newlines.
    """
    # Rows shared by every layout.
    leading_rows = [
        ("Agent", "Name of the evaluated agent."),
        ("Submitter", "Organization or individual who submitted the agent for evaluation."),
        ("Models Used", "Model(s) used by the agent. Hover over ⓘ to view all."),
    ]
    logs_row = ("Logs", "View evaluation run logs (e.g., outputs, traces).")

    if table == "Overall":
        rows = leading_rows + [
            ("Overall Score", "Macro-average of the four category-level average scores. Each category contributes equally."),
            ("Overall Cost", "Macro-average cost per problem across all categories, in USD. Each category contributes equally."),
        ]
        # (column label prefix, category name used in the sentence)
        categories = [
            ("Literature Understanding", "Literature Understanding"),
            ("Code Execution", "Code & Execution"),
            ("Data Analysis", "Data Analysis"),
            ("End-to-End Discovery", "End-to-End Discovery"),
        ]
        for label, category in categories:
            rows.append((f"{label} Score", f"Macro-average score across {category} benchmarks."))
            rows.append((f"{label} Cost", f"Macro-average cost per problem (USD) across {category} benchmarks."))
        rows.append(("Categories Attempted", "Number of core categories with at least one benchmark attempted (out of 4)."))
    elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]:
        rows = leading_rows + [
            (f"{table} Score", f"Macro-average score across {table} benchmarks."),
            (f"{table} Cost", f"Macro-average cost per problem (USD) across {table} benchmarks."),
            ("Benchmark Score", "Average (mean) score on the benchmark."),
            ("Benchmark Cost", "Average (mean) cost per problem (USD) on the benchmark."),
            ("Benchmarks Attempted", "Number of benchmarks attempted in this category (e.g., 3/5)."),
        ]
    else:
        # Anything else is an individual benchmark table.
        rows = leading_rows + [
            ("Benchmark Attempted", "Indicates whether the agent attempted this benchmark."),
            (f"{table} Score", "Score achieved by the agent on this benchmark."),
            (f"{table} Cost", "Cost incurred by the agent to solve this benchmark (in USD)."),
        ]
    rows.append(logs_row)

    items = "\n".join(
        f'<div class="tooltip-description-item"><b>{label}:</b> {text}</div>'
        for label, text in rows
    )
    return f"\n{items}\n"
|
|
| |
def _build_table_legend_items(svg_map: dict) -> list[str]:
    """Return one inline icon-plus-name HTML snippet per entry of ``svg_map``."""
    items = []
    for name, info in svg_map.items():
        uri = get_svg_as_data_uri(info["path"])
        items.append(
            f'<div style="display: flex; align-items: center; white-space: nowrap;">'
            f'<img src="{uri}" alt="{name}" title="{name}" style="width:16px; height:16px; margin-right: 4px; flex-shrink: 0;">'
            f'<span>{name}</span>'
            f'</div>'
        )
    return items


# Legend snippets shown under the tables. These were previously computed
# twice; the first, icon-only result was always overwritten, so only the
# icon-plus-name version is built now.
openness_html_items = _build_table_legend_items(OPENNESS_SVG_MAP)
openness_html = " ".join(openness_html_items)

tooling_html_items = _build_table_legend_items(TOOLING_SVG_MAP)
tooling_html = " ".join(tooling_html_items)

# Tooltip cards are static, so they are rendered once at import time and
# reused by create_legend_markdown.
pareto_tooltip_content = build_pareto_tooltip_content()
openness_tooltip_content = build_openness_tooltip_content()
tooling_tooltip_content = build_tooling_tooltip_content()
|
|
def create_legend_markdown(which_table: str) -> str:
    """
    Assemble the full legend HTML displayed with a leaderboard table.

    The legend has four sections: Pareto, Agent Openness, Agent Tooling and
    Column Descriptions. Only the last one depends on ``which_table``; the
    others reuse module-level HTML fragments built at import time.
    """
    column_help = build_descriptions_tooltip_content(which_table)
    trophy_src = get_svg_as_data_uri("assets/trophy.svg")
    return f"""
    <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">

        <div> <!-- Container for the Pareto section -->
            <b>Pareto</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">{pareto_tooltip_content}</span>
            </span>
            <div class="table-legend-item">
                <img src="{trophy_src}" alt="On Frontier" style="width:20px; height:20px; margin-right: 4px; flex-shrink: 0;">
                <span>On frontier</span>
            </div>
        </div>

        <div> <!-- Container for the Openness section -->
            <b>Agent Openness</b>
            {openness_tooltip_content}
            <div class="table-legend-item">{openness_html}</div>
        </div>

        <div> <!-- Container for the Tooling section -->
            <b>Agent Tooling</b>
            {tooling_tooltip_content}
            <div class="table-legend-item">{tooling_html}</div>
        </div>

        <div><!-- Container for the Column Descriptions section -->
            <b>Column Descriptions</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">
                    <h3>Column Descriptions</h3>
                    <div class="tooltip-items-container">{column_help}</div>
                </span>
            </span>
        </div>
    </div>
    """
|
|
| |
def _build_plot_legend_items(svg_map: dict, img_class: str) -> list[str]:
    """
    Return one plot-legend entry (icon, name, description) per item of
    ``svg_map``. Entries whose SVG could not be loaded are skipped.
    """
    items = []
    for name, info in svg_map.items():
        uri = get_svg_as_data_uri(info["path"])
        if not uri:
            continue
        items.append(
            f'<div class="plot-legend-item">'
            f'<img class="{img_class}" src="{uri}" alt="{name}" title="{name}">'
            f'<div class="plot-legend-item-text">'
            f'<div>'
            f'<span>{name}</span>'
            f'</div>'
            f'<span class="description">{info["description"]}</span>'
            f'</div>'
            f'</div>'
        )
    return items


openness_legend_items = _build_plot_legend_items(OPENNESS_SVG_MAP, "plot-legend-item-svg")
# Tooling icons carry an extra CSS class on top of the shared one.
tooling_legend_items = _build_plot_legend_items(
    TOOLING_SVG_MAP, "plot-legend-item-svg plot-legend-tooling-svg"
)

# Static legend rendered next to every scatter plot.
plot_legend_html = f"""
    <div class="plot-legend-container">
        <div id="plot-legend-logo">
            <img src="{get_svg_as_data_uri("assets/logo.svg")}">
        </div>
        <div style="margin-bottom: 16px;">
            <span class="plot-legend-category-heading">Pareto</span>
            <div style="margin-top: 8px;">
                <div class="plot-legend-item">
                    <img id="plot-legend-item-pareto-svg" class="plot-legend-item-svg" src="{get_svg_as_data_uri("assets/pareto.svg")}">
                    <span>On frontier</span>
                </div>
            </div>
        </div>
        <div style="margin-bottom: 16px;">
            <span class="plot-legend-category-heading">Agent Openness</span>
            <div style="margin-top: 8px;">
                {''.join(openness_legend_items)}
            </div>
        </div>
        <div>
            <span class="plot-legend-category-heading">Agent Tooling</span>
            <div style="margin-top: 8px;">
                {''.join(tooling_legend_items)}
            </div>
        </div>
    </div>
"""
|
|
| |
# Per-split caches filled by get_leaderboard_viewer_instance.
# CACHED_VIEWERS: split name -> viewer object (real viewer, or DummyViewer after a load error).
CACHED_VIEWERS = {}
# CACHED_TAG_MAPS: split name -> tag map returned together with that viewer.
CACHED_TAG_MAPS = {}
|
|
|
|
class DummyViewer:
    """
    Stand-in viewer that is cached when loading the real data fails.

    It exposes the same ``_load()`` entry point that callers use on the real
    LeaderboardViewer, so downstream code can treat both the same way.
    """

    def __init__(self, error_df):
        # DataFrame describing the failure; handed back unchanged by _load().
        self._error_df = error_df

    def _load(self):
        """Return the stored error DataFrame together with an empty dict."""
        empty_extra = {}
        return (self._error_df, empty_extra)
|
|
def get_leaderboard_viewer_instance(split: str):
    """
    Return ``(viewer, tag_map)`` for ``split``, loading the data at most once.

    The first call for a split builds a LeaderboardViewer and its pretty tag
    map and stores both in the module-level caches. If anything raises while
    doing so, the error is printed and a DummyViewer (holding a one-row
    "Message" DataFrame) plus a default tag map are cached and returned
    instead, so later calls for that split return the same objects.
    """
    if split in CACHED_VIEWERS:
        # Cache hit: reuse whatever was stored, including an error placeholder.
        return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})

    try:
        print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}")
        result_viewer = LeaderboardViewer(
            repo_id=RESULTS_DATASET,
            config=CONFIG_NAME,
            split=split,
            is_internal=IS_INTERNAL
        )
        result_tag_map = create_pretty_tag_map(result_viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
    except Exception as e:
        # Any failure is turned into a cached placeholder rather than raised.
        error_message = f"Error loading data for split '{split}': {e}"
        print(format_error(error_message))
        result_viewer = DummyViewer(pd.DataFrame({"Message": [error_message]}))
        result_tag_map = {"Overall": []}

    CACHED_VIEWERS[split] = result_viewer
    CACHED_TAG_MAPS[split] = result_tag_map
    return result_viewer, result_tag_map
|
|
|
|
def create_leaderboard_display(
    full_df: pd.DataFrame,
    tag_map: dict,
    category_name: str,
    split_name: str
):
    """
    Render the scatter plot, the results table and the legend for one
    category (e.g. "Overall" or "Literature Understanding").

    Must be called inside an active Gradio layout context, because it
    creates components as a side effect.

    Args:
        full_df: Pre-loaded DataFrame for the whole split.
        tag_map: Tag map passed to DataTransformer together with ``full_df``.
        category_name: Tag whose view is rendered; also selects the column
            descriptions shown in the legend.
        split_name: Not used by this function; kept for caller compatibility.

    Returns:
        Tuple ``(plot_component, dataframe_component)``.
    """
    transformer = DataTransformer(full_df, tag_map)
    df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
    pareto_df = get_pareto_df(df_view)

    # Mark rows on the Pareto frontier with a trophy icon.
    trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
    trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
    # A set gives constant-time membership checks instead of scanning a list
    # once per table row.
    if not pareto_df.empty and 'id' in pareto_df.columns:
        pareto_agent_ids = set(pareto_df['id'].tolist())
    else:
        pareto_agent_ids = set()
    df_view['Pareto'] = df_view['id'].map(
        lambda agent_id: trophy_icon_html if agent_id in pareto_agent_ids else ''
    )

    # Combined openness/tooling icon for each row.
    df_view['Icon'] = df_view.apply(
        lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
        axis=1
    )

    # Format all cost columns first, then all score columns (original order kept).
    for col in df_view.columns:
        if "Cost" in col:
            df_view = format_cost_column(df_view, col)

    for col in df_view.columns:
        if "Score" in col:
            df_view = format_score_column(df_view, col)
    scatter_plot = plots_dict.get('scatter_plot', go.Figure())

    df_view['Models Used'] = df_view['Models Used'].apply(clean_llm_base_list)
    df_view['Models Used'] = df_view['Models Used'].apply(format_llm_base_with_html)

    # Append the source link (when present and non-empty) to the agent name.
    if 'Source' in df_view.columns:
        df_view['Agent'] = df_view.apply(
            lambda row: f"{row['Agent']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['Agent'],
            axis=1
        )

    # Move 'Pareto' and 'Icon' to the front, in that order.
    all_cols = df_view.columns.tolist()
    all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
    all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
    df_view = df_view[all_cols]

    # Helper columns that were only needed to build the icons and links.
    columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
    df_view = df_view.drop(columns=columns_to_drop, errors='ignore')

    # Headers and datatypes are derived before the icon columns lose their names.
    df_headers = df_view.columns.tolist()
    df_datatypes = []
    for col in df_headers:
        if col == "Logs" or "Cost" in col or "Score" in col:
            df_datatypes.append("markdown")
        elif col in ["Agent", "Icon", "Models Used", "Pareto"]:
            df_datatypes.append("html")
        else:
            df_datatypes.append("str")

    # The two icon columns are given empty header names.
    header_rename_map = {
        "Pareto": "",
        "Icon": "",
    }
    df_view = df_view.rename(columns=header_rename_map)

    # Widths: five fixed leading columns, one 90px slot per remaining
    # score/cost column, then three fixed trailing columns.
    fixed_start_widths = [40, 40, 200, 100, 200]
    num_score_cost_cols = sum(
        1
        for col in df_headers[len(fixed_start_widths):]
        if "Score" in col or "Cost" in col
    )
    dynamic_widths = [90] * num_score_cost_cols
    fixed_end_widths = [90, 100, 50]
    final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths

    with gr.Row():
        with gr.Column(scale=3):
            plot_component = gr.Plot(
                value=scatter_plot,
                show_label=False,
            )
            gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
        with gr.Column(scale=1):
            gr.HTML(value=plot_legend_html)

    with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
        dataframe_component = gr.DataFrame(
            headers=df_headers,
            value=df_view,
            datatype=df_datatypes,
            interactive=False,
            wrap=True,
            column_widths=final_column_widths,
            elem_classes=["wrap-header-df"],
            show_search="search",
            elem_id="main-leaderboard"
        )
        legend_markdown = create_legend_markdown(category_name)
        gr.HTML(value=legend_markdown, elem_id="legend-markdown")

    return plot_component, dataframe_component
|
|
| |
def create_benchmark_details_display(
    full_df: pd.DataFrame,
    tag_map: dict,
    category_name: str,
    validation: bool = False,
):
    """
    Render a detailed section for every benchmark of a category.

    For each benchmark this creates a title with description, a scatter
    plot, and a table restricted to that benchmark's score and cost.
    Must be called inside an active Gradio layout context.

    Args:
        full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
        tag_map (dict): The "pretty" tag map used to find the list of benchmarks.
        category_name (str): The main category to display details for
            (e.g., "Literature Understanding").
        validation (bool): Forwarded to create_gradio_anchor_id and
            get_benchmark_description.

    Returns:
        None. Components are created as a side effect.
    """
    benchmark_names = tag_map.get(category_name, [])

    if not benchmark_names:
        gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
        return

    gr.HTML(f'<h2 class="benchmark-main-subtitle">{category_name} Detailed Benchmark Results</h2>')
    gr.Markdown("---")

    # The trophy icon is the same for every benchmark, so load it once.
    trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
    trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'

    for benchmark_name in benchmark_names:
        anchor_id = create_gradio_anchor_id(benchmark_name, validation)
        gr.HTML(
            f"""
            <h3 class="benchmark-title" id="{anchor_id}">{benchmark_name} Leaderboard <a href="#{anchor_id}" class="header-link-icon">🔗</a></h3>
            <div class="benchmark-description">{get_benchmark_description(benchmark_name, validation)}</div>
            <button onclick="scroll_to_element('page-content-wrapper')" class="primary-link-button">Return to the aggregate {category_name} leaderboard</button>
            """
        )

        benchmark_score_col = f"{benchmark_name} Score"
        benchmark_cost_col = f"{benchmark_name} Cost"

        # Columns wanted for this benchmark; only those present in full_df are used.
        table_cols = ['Agent', 'Source', 'Openness', 'Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col, 'Logs', 'id', 'Models Used']
        existing_table_cols = [col for col in table_cols if col in full_df.columns]

        if benchmark_score_col not in existing_table_cols:
            gr.Markdown(f"Score data for {benchmark_name} not available.")
            continue

        benchmark_table_df = full_df[existing_table_cols].copy()
        pareto_df = get_pareto_df(benchmark_table_df)

        # Set membership avoids scanning a list once per table row.
        if not pareto_df.empty and 'id' in pareto_df.columns:
            pareto_agent_ids = set(pareto_df['id'].tolist())
        else:
            pareto_agent_ids = set()
        benchmark_table_df['Pareto'] = benchmark_table_df['id'].map(
            lambda agent_id: trophy_icon_html if agent_id in pareto_agent_ids else ''
        )

        benchmark_table_df['Icon'] = benchmark_table_df.apply(
            lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
            axis=1
        )

        benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
        benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(format_llm_base_with_html)

        # Append the source link to the agent name. NaN is truthy, so it must
        # be excluded explicitly or the cell would read "<agent> nan"
        # (same rule as in create_leaderboard_display).
        if 'Source' in benchmark_table_df.columns:
            benchmark_table_df['Agent'] = benchmark_table_df.apply(
                lambda row: f"{row['Agent']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['Agent'],
                axis=1
            )

        def check_benchmark_status(row):
            """Return a status emoji: both values present, only one, or none."""
            has_score = pd.notna(row.get(benchmark_score_col))
            has_cost = pd.notna(row.get(benchmark_cost_col))
            if has_score and has_cost:
                return "✅"
            if has_score or has_cost:
                return "⚠️"
            return "🚫 "

        benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)

        # Sort on the raw values before they are turned into display strings.
        if benchmark_score_col in benchmark_table_df.columns:
            benchmark_table_df = benchmark_table_df.sort_values(
                by=benchmark_score_col, ascending=False, na_position='last'
            )

        benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
        benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
        desired_cols_in_order = [
            'Pareto',
            'Icon',
            'Agent',
            'Submitter',
            'Models Used',
            'Attempted Benchmark',
            benchmark_score_col,
            benchmark_cost_col,
            'Date',
            'Logs'
        ]
        # Any column that is still missing is added as all-NA so the layout is stable.
        for col in desired_cols_in_order:
            if col not in benchmark_table_df.columns:
                benchmark_table_df[col] = pd.NA
        benchmark_table_df = benchmark_table_df[desired_cols_in_order]

        # Shorten the benchmark-specific headers. `columns=` is required:
        # a bare mapping is applied to the index, which made this rename a
        # silent no-op before.
        benchmark_table_df = benchmark_table_df.rename(columns={
            benchmark_score_col: 'Score',
            benchmark_cost_col: 'Cost',
        })

        df_headers = benchmark_table_df.columns.tolist()
        df_datatypes = []
        for col in df_headers:
            if "Logs" in col or "Cost" in col or "Score" in col:
                df_datatypes.append("markdown")
            elif col in ["Agent", "Icon", "Models Used", "Pareto"]:
                df_datatypes.append("html")
            else:
                df_datatypes.append("str")

        # The two icon columns are given empty header names.
        header_rename_map = {
            "Pareto": "",
            "Icon": "",
        }
        benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)

        # The plot is built from the full dataframe and the original column names.
        benchmark_plot = _plot_scatter_plotly(
            data=full_df,
            x=benchmark_cost_col,
            y=benchmark_score_col,
            agent_col="Agent",
            name=benchmark_name
        )
        with gr.Row():
            with gr.Column(scale=3):
                gr.Plot(value=benchmark_plot, show_label=False)
                gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
            with gr.Column(scale=1):
                gr.HTML(value=plot_legend_html)

        with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
            gr.DataFrame(
                headers=df_headers,
                value=benchmark_table_df,
                datatype=df_datatypes,
                interactive=False,
                wrap=True,
                column_widths=[40, 40, 200, 150, 175, 85, 100, 100, 80, 40],
                show_search="search",
                elem_classes=["wrap-header-df"]
            )
            legend_markdown = create_legend_markdown(benchmark_name)
            gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
|
def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
    """
    Load the data for ``split`` and return ``(pretty_df, pretty_tag_map)``.

    The viewer comes from get_leaderboard_viewer_instance (which caches per
    split). Its raw DataFrame is transformed, and the "Logs" and "Source"
    columns, when present, are converted to link markup. An empty DataFrame
    and an empty dict are returned when the viewer is of an unexpected type
    or the loaded data is empty.
    """
    def log_uri_to_link(raw_uri):
        # Empty/missing values and URIs that cannot be converted give "".
        if pd.isna(raw_uri) or raw_uri == "":
            return ""
        web_url = hf_uri_to_web_url(str(raw_uri))
        return hyperlink(web_url, "🔗") if web_url else ""

    def source_url_to_link(raw_url):
        if pd.isna(raw_url) or raw_url == "":
            return ""
        return hyperlink(str(raw_url), "🔗")

    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)

    if not isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)):
        return pd.DataFrame(), {}

    raw_df, _ = viewer_or_data._load()
    if raw_df.empty:
        return pd.DataFrame(), {}

    pretty_df = transform_raw_dataframe(raw_df)
    # NOTE(review): the tag map returned above has already been passed through
    # create_pretty_tag_map once; confirm that applying it again is intended.
    pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP)

    if "Logs" in pretty_df.columns:
        pretty_df["Logs"] = pretty_df["Logs"].apply(log_uri_to_link)

    if "Source" in pretty_df.columns:
        pretty_df["Source"] = pretty_df["Source"].apply(source_url_to_link)

    return pretty_df, pretty_tag_map
def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML:
    """
    Build the sub-navigation bar for a category as one gr.HTML component.

    The bar holds one button per benchmark of ``category_name``; each button
    calls ``scroll_to_element`` with the anchor id produced by
    create_gradio_anchor_id. An empty gr.HTML is returned when the category
    has no benchmarks.
    """
    benchmark_names = tag_map.get(category_name, [])
    if not benchmark_names:
        return gr.HTML()

    def render_button(label):
        anchor = create_gradio_anchor_id(label, validation)
        return f"""
        <button
            class="primary-link-button"
            onclick="scroll_to_element('{anchor}')"
        >
            {label}
        </button>
        """

    html_buttons = [render_button(label) for label in benchmark_names]

    full_html = f"""
    <div class="sub-nav-bar-container">
        <span class="sub-nav-label">Benchmarks in this category:</span>
        {' | '.join(html_buttons)}
    </div>
    """
    return gr.HTML(full_html)
|
|
def format_llm_base_with_html(value):
    """
    Format a 'Models Used' cell value.

    Args:
        value: Usually a list of model names; any other type is passed through.

    Returns:
        - For a list with more than one element: an HTML <span> showing the
          first element and the count of the others, with the full list
          (one name per line) in its ``data-tooltip`` attribute.
        - For a single-element list: that element.
        - Otherwise (including an empty list): ``value`` unchanged.
    """
    import html  # local import: only needed for attribute escaping

    if isinstance(value, list):
        if len(value) > 1:
            # Escape the names so a quote or angle bracket in one of them
            # cannot terminate the attribute early.
            tooltip_text = html.escape("\n".join(map(str, value)), quote=True)
            return f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{value[0]} (+ {len(value) - 1}) ⓘ</span>'
        if len(value) == 1:
            return value[0]

    return value
|
|