| """ |
| GuardBench Leaderboard Application |
| """ |
|
|
| import os |
| import json |
| import tempfile |
| import logging |
| import gradio as gr |
| import pandas as pd |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from apscheduler.schedulers.background import BackgroundScheduler |
| import numpy as np |
| from gradio.themes.utils import fonts, colors |
|
|
| from src.about import ( |
| CITATION_BUTTON_LABEL, |
| CITATION_BUTTON_TEXT, |
| EVALUATION_QUEUE_TEXT, |
| INTRODUCTION_TEXT, |
| LLM_BENCHMARKS_TEXT, |
| TITLE, |
| ) |
| from src.display.css_html_js import custom_css |
| from src.display.utils import ( |
| GUARDBENCH_COLUMN, |
| DISPLAY_COLS, |
| METRIC_COLS, |
| HIDDEN_COLS, |
| NEVER_HIDDEN_COLS, |
| CATEGORIES, |
| TEST_TYPES, |
| ModelType, |
| Precision, |
| WeightType, |
| GuardModelType, |
| get_all_column_choices, |
| get_default_visible_columns, |
| ) |
| from src.display.formatting import styled_message, styled_error, styled_warning |
| from src.envs import ( |
| ADMIN_USERNAME, |
| ADMIN_PASSWORD, |
| RESULTS_DATASET_ID, |
| SUBMITTER_TOKEN, |
| TOKEN, |
| DATA_PATH |
| ) |
| from src.populate import get_leaderboard_df, get_category_leaderboard_df |
| from src.submission.submit import process_submission |
|
|
| |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
| logger = logging.getLogger(__name__) |
|
|
| |
| os.makedirs(DATA_PATH, exist_ok=True) |
|
|
| |
| BENCHMARK_VERSIONS = ["v0"] |
| CURRENT_VERSION = "v0" |
|
|
| |
| try: |
| logger.info("Initializing leaderboard data...") |
| LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION) |
| logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries") |
| except Exception as e: |
| logger.error(f"Error loading leaderboard data: {e}") |
| LEADERBOARD_DF = pd.DataFrame() |
|
|
| custom_theme = gr.themes.Default( |
| primary_hue=colors.slate, |
| secondary_hue=colors.slate, |
| neutral_hue=colors.neutral, |
| font=(fonts.GoogleFont("Inter"), "sans-serif") |
| ).set( |
| |
| body_background_fill="#0f0f10", |
| body_background_fill_dark="#0f0f10", |
| body_text_color="#f4f4f5", |
| body_text_color_subdued="#a1a1aa", |
| block_background_fill="#1e1e1e", |
| block_border_color="#333333", |
| block_shadow="none", |
| |
| button_primary_background_fill="#121212", |
| button_primary_text_color="#f4f4f5", |
| button_primary_border_color="#333333", |
| button_secondary_background_fill="#f4f4f5", |
| button_secondary_text_color="#0f0f10", |
| button_secondary_border_color="#f4f4f5", |
| input_background_fill="#1e1e1e", |
| input_border_color="#333333", |
| input_placeholder_color="#71717a", |
| table_border_color="#333333", |
| table_even_background_fill="#2d2d2d", |
| table_odd_background_fill="#1e1e1e", |
| table_text_color="#f4f4f5", |
| link_text_color="#ffffff", |
| border_color_primary="#333333", |
| background_fill_secondary="#333333", |
| color_accent="#f4f4f5", |
| border_color_accent="#333333", |
| button_primary_background_fill_hover="#424242", |
| block_title_text_color="#f4f4f5", |
| accordion_text_color="#f4f4f5", |
| panel_background_fill="#1e1e1e", |
| panel_border_color="#333333", |
| |
| background_fill_primary="#0f0f10", |
| background_fill_primary_dark="#0f0f10", |
| background_fill_secondary_dark="#333333", |
| border_color_primary_dark="#333333", |
| border_color_accent_dark="#333333", |
| border_color_accent_subdued="#424242", |
| border_color_accent_subdued_dark="#424242", |
| color_accent_soft="#a1a1aa", |
| color_accent_soft_dark="#a1a1aa", |
| |
| input_background_fill_dark="#1e1e1e", |
| input_background_fill_focus="#424242", |
| input_background_fill_focus_dark="#424242", |
| input_background_fill_hover="#2d2d2d", |
| input_background_fill_hover_dark="#2d2d2d", |
| input_border_color_dark="#333333", |
| input_border_color_focus="#f4f4f5", |
| input_border_color_focus_dark="#f4f4f5", |
| input_border_color_hover="#424242", |
| input_border_color_hover_dark="#424242", |
| input_placeholder_color_dark="#71717a", |
| |
| table_even_background_fill_dark="#2d2d2d", |
| table_odd_background_fill_dark="#1e1e1e", |
| |
| body_text_color_dark="#f4f4f5", |
| body_text_color_subdued_dark="#a1a1aa", |
| block_title_text_color_dark="#f4f4f5", |
| accordion_text_color_dark="#f4f4f5", |
| table_text_color_dark="#f4f4f5", |
| |
| panel_background_fill_dark="#1e1e1e", |
| panel_border_color_dark="#333333", |
| block_background_fill_dark="#1e1e1e", |
| block_border_color_dark="#333333", |
| ) |
|
|
| |
| def update_column_choices(df): |
| """Update column choices based on what's actually in the dataframe""" |
| if df is None or df.empty: |
| return get_all_column_choices() |
|
|
| |
| existing_columns = list(df.columns) |
|
|
| |
| all_columns = get_all_column_choices() |
|
|
| |
| valid_columns = [(col_name, display_name) for col_name, display_name in all_columns |
| if col_name in existing_columns] |
|
|
| |
| if not valid_columns: |
| return get_all_column_choices() |
|
|
| return valid_columns |
|
|
| |
| def get_initial_columns(): |
| """Get initial columns to show in the dropdown""" |
| try: |
| |
| available_cols = list(LEADERBOARD_DF.columns) |
| logger.info(f"Available columns in LEADERBOARD_DF: {available_cols}") |
|
|
| |
| if not available_cols: |
| return get_default_visible_columns() |
|
|
| |
| valid_defaults = [col for col in get_default_visible_columns() if col in available_cols] |
|
|
| |
| if not valid_defaults: |
| return available_cols |
|
|
| return valid_defaults |
| except Exception as e: |
| logger.error(f"Error getting initial columns: {e}") |
| return get_default_visible_columns() |
|
|
| def init_leaderboard(dataframe, visible_columns=None): |
| """ |
| Initialize a standard Gradio Dataframe component for the leaderboard. |
| """ |
| if dataframe is None or dataframe.empty: |
| |
| columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS] |
| dataframe = pd.DataFrame(columns=columns) |
| logger.warning("Initializing empty leaderboard") |
|
|
| |
|
|
| |
| display_column_names = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS] |
| hidden_column_names = [getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS] |
|
|
| |
| always_visible = [getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS] |
|
|
| |
| if visible_columns is None: |
| |
| visible_columns = [col for col in display_column_names if col not in hidden_column_names] |
|
|
| |
| for col in always_visible: |
| if col not in visible_columns and col in dataframe.columns: |
| visible_columns.append(col) |
|
|
| |
| visible_columns = [col for col in visible_columns if col in dataframe.columns] |
|
|
| |
| |
| type_mapping = { |
| 'text': 'str', |
| 'number': 'number', |
| 'bool': 'bool', |
| 'date': 'date', |
| 'markdown': 'markdown', |
| 'html': 'html', |
| 'image': 'image' |
| } |
|
|
| |
| datatypes = [] |
| for col in visible_columns: |
| |
| col_type = None |
| for display_col in DISPLAY_COLS: |
| if getattr(GUARDBENCH_COLUMN, display_col).name == col: |
| orig_type = getattr(GUARDBENCH_COLUMN, display_col).type |
| |
| col_type = type_mapping.get(orig_type, 'str') |
| break |
|
|
| |
| if col_type is None: |
| col_type = 'str' |
|
|
| datatypes.append(col_type) |
|
|
| |
| if 'search_dummy' not in dataframe.columns: |
| dataframe['search_dummy'] = dataframe.apply( |
| lambda row: ' '.join(str(val) for val in row.values if pd.notna(val)), |
| axis=1 |
| ) |
|
|
| |
| visible_columns.remove('model_name') |
| visible_columns = ['model_name'] + visible_columns |
| display_df = dataframe[visible_columns].copy() |
|
|
| |
| |
| |
|
|
| |
| numeric_cols = display_df.select_dtypes(include=np.number).columns |
| for col in numeric_cols: |
| |
| if not pd.api.types.is_integer_dtype(display_df[col]): |
| |
| display_df[col] = display_df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else None) |
|
|
| return gr.Dataframe( |
| value=display_df, |
| headers=visible_columns, |
| datatype=datatypes, |
| interactive=False, |
| wrap=True, |
| elem_id="leaderboard-table", |
| row_count=len(display_df) |
| ) |
|
|
|
|
| def search_filter_leaderboard(df, search_query="", model_types=None, version=CURRENT_VERSION): |
| """ |
| Filter the leaderboard based on search query and model types. |
| """ |
| if df is None or df.empty: |
| return df |
|
|
| filtered_df = df.copy() |
|
|
| |
| if 'search_dummy' not in filtered_df.columns: |
| filtered_df['search_dummy'] = filtered_df.apply( |
| lambda row: ' '.join(str(val) for val in row.values if pd.notna(val)), |
| axis=1 |
| ) |
|
|
| |
| if model_types and len(model_types) > 0: |
| filtered_df = filtered_df[filtered_df[GUARDBENCH_COLUMN.model_type.name].isin(model_types)] |
|
|
| |
| if search_query: |
| search_terms = [term.strip() for term in search_query.split(";") if term.strip()] |
| if search_terms: |
| combined_mask = None |
| for term in search_terms: |
| mask = filtered_df['search_dummy'].str.contains(term, case=False, na=False) |
| if combined_mask is None: |
| combined_mask = mask |
| else: |
| combined_mask = combined_mask | mask |
|
|
| if combined_mask is not None: |
| filtered_df = filtered_df[combined_mask] |
|
|
| |
| visible_columns = [col for col in filtered_df.columns if col != 'search_dummy'] |
| return filtered_df[visible_columns] |
|
|
|
|
| def refresh_data_with_filters(version=CURRENT_VERSION, search_query="", model_types=None, selected_columns=None): |
| """ |
| Refresh the leaderboard data and update all components with filtering. |
| Ensures we handle cases where dataframes might have limited columns. |
| """ |
| try: |
| logger.info(f"Performing refresh of leaderboard data with filters...") |
| |
| main_df = get_leaderboard_df(version=version) |
| category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES] |
| selected_columns = [x.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("_recall", "_recall_binary").replace("_precision", "_precision_binary") for x in selected_columns] |
|
|
| |
| logger.info(f"Main dataframe columns: {list(main_df.columns)}") |
|
|
| |
| filtered_main_df = search_filter_leaderboard(main_df, search_query, model_types, version) |
| filtered_category_dfs = [ |
| search_filter_leaderboard(df, search_query, model_types, version) |
| for df in category_dfs |
| ] |
|
|
| |
| available_columns = list(filtered_main_df.columns) |
|
|
| |
| if selected_columns: |
| |
| internal_selected_columns = [x.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("_recall", "_recall_binary").replace("_precision", "_precision_binary") for x in selected_columns] |
| valid_selected_columns = [col for col in internal_selected_columns if col in available_columns] |
| if not valid_selected_columns and 'model_name' in available_columns: |
| |
| valid_selected_columns = ['model_name'] + [col for col in get_default_visible_columns() if col in available_columns] |
| else: |
| |
| valid_selected_columns = [col for col in get_default_visible_columns() if col in available_columns] |
|
|
|
|
| |
| main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns) |
|
|
| |
| category_dataframes = [] |
| for df in filtered_category_dfs: |
| df_columns = list(df.columns) |
| df_valid_columns = [col for col in valid_selected_columns if col in df_columns] |
| if not df_valid_columns and 'model_name' in df_columns: |
| df_valid_columns = ['model_name'] + get_default_visible_columns() |
| category_dataframes.append(init_leaderboard(df, df_valid_columns)) |
|
|
| return main_dataframe, *category_dataframes |
|
|
| except Exception as e: |
| logger.error(f"Error in refresh with filters: {e}") |
| |
| return leaderboard, *[tab.children[0] for tab in category_tabs.children[1:len(CATEGORIES)+1]] |
|
|
|
|
| def submit_results( |
| model_name: str, |
| base_model: str, |
| revision: str, |
| precision: str, |
| weight_type: str, |
| model_type: str, |
| submission_file: tempfile._TemporaryFileWrapper, |
| version: str, |
| guard_model_type: GuardModelType |
| ): |
| """ |
| Handle submission of results with model metadata. |
| """ |
| if submission_file is None: |
| return styled_error("No submission file provided") |
|
|
| if not model_name: |
| return styled_error("Model name is required") |
|
|
| if not model_type: |
| return styled_error("Please select a model type") |
|
|
| file_path = submission_file.name |
| logger.info(f"Received submission for model {model_name}: {file_path}") |
|
|
| |
| metadata = { |
| "model_name": model_name, |
| "base_model": base_model, |
| "revision": revision if revision else "main", |
| "precision": precision, |
| "weight_type": weight_type, |
| "model_type": model_type, |
| "version": version, |
| "guard_model_type": guard_model_type |
| } |
|
|
| |
| result = process_submission(file_path, metadata, version=version) |
|
|
| |
| global LEADERBOARD_DF |
| try: |
| logger.info(f"Refreshing leaderboard data after submission for version {version}...") |
| LEADERBOARD_DF = get_leaderboard_df(version=version) |
| logger.info("Refreshed leaderboard data after submission") |
| except Exception as e: |
| logger.error(f"Error refreshing leaderboard data: {e}") |
|
|
| return result |
|
|
|
|
| def refresh_data(version=CURRENT_VERSION): |
| """ |
| Refresh the leaderboard data and update all components. |
| """ |
| try: |
| logger.info(f"Performing scheduled refresh of leaderboard data...") |
| |
| main_df = get_leaderboard_df(version=version) |
| category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES] |
|
|
| |
| return main_df, *category_dfs |
|
|
| except Exception as e: |
| logger.error(f"Error in scheduled refresh: {e}") |
| return None, *[None for _ in CATEGORIES] |
|
|
|
|
| def update_leaderboards(version): |
| """ |
| Update all leaderboard components with data for the selected version. |
| """ |
| try: |
| new_df = get_leaderboard_df(version=version) |
| category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES] |
| return new_df, *category_dfs |
| except Exception as e: |
| logger.error(f"Error updating leaderboards for version {version}: {e}") |
| return None, *[None for _ in CATEGORIES] |
|
|
|
|
| def create_performance_plot(selected_models, category, metric="f1_binary", version=CURRENT_VERSION): |
| """ |
| Create a radar plot comparing model performance for selected models. |
| """ |
| if category == "Overall Performance": |
| df = get_leaderboard_df(version=version) |
| else: |
| df = get_category_leaderboard_df(category, version=version) |
|
|
| if df.empty: |
| return go.Figure() |
|
|
| |
| df = df[df['model_name'].isin(selected_models)] |
|
|
| |
| metric_cols = [col for col in df.columns if metric in col] |
|
|
| |
| fig = go.Figure() |
|
|
| |
| colors = ['#8FCCCC', '#C2A4B6', '#98B4A6', '#B68F7C'] |
|
|
| |
| for idx, model in enumerate(selected_models): |
| model_data = df[df['model_name'] == model] |
| if not model_data.empty: |
| values = model_data[metric_cols].values[0].tolist() |
| |
| values = values + [values[0]] |
|
|
| |
| categories = [col.replace(f'_{metric}', '') for col in metric_cols] |
| |
| categories = categories + [categories[0]] |
|
|
| fig.add_trace(go.Scatterpolar( |
| r=values, |
| theta=categories, |
| name=model, |
| line_color=colors[idx % len(colors)], |
| fill='toself' |
| )) |
|
|
| |
| fig.update_layout( |
| paper_bgcolor='#000000', |
| plot_bgcolor='#000000', |
| font={'color': '#ffffff'}, |
| title={ |
| 'text': f'{category} - {metric.upper()} Score Comparison', |
| 'font': {'color': '#ffffff', 'size': 24} |
| }, |
| polar=dict( |
| bgcolor='#000000', |
| radialaxis=dict( |
| visible=True, |
| range=[0, 1], |
| gridcolor='#333333', |
| linecolor='#333333', |
| tickfont={'color': '#ffffff'}, |
| ), |
| angularaxis=dict( |
| gridcolor='#333333', |
| linecolor='#333333', |
| tickfont={'color': '#ffffff'}, |
| ) |
| ), |
| height=600, |
| showlegend=True, |
| legend=dict( |
| yanchor="top", |
| y=0.99, |
| xanchor="right", |
| x=0.99, |
| bgcolor='rgba(0,0,0,0.5)', |
| font={'color': '#ffffff'} |
| ) |
| ) |
|
|
| return fig |
|
|
|
|
| def update_model_choices(version): |
| """ |
| Update the list of available models for the given version. |
| """ |
| df = get_leaderboard_df(version=version) |
| if df.empty: |
| return [] |
| return sorted(df['model_name'].unique().tolist()) |
|
|
|
|
| def update_visualization(selected_models, selected_category, selected_metric, version): |
| """ |
| Update the visualization based on user selections. |
| """ |
| if not selected_models: |
| return go.Figure() |
| return create_performance_plot(selected_models, selected_category, selected_metric, version) |
|
|
|
|
| |
| demo = gr.Blocks(css=custom_css, theme=custom_theme) |
|
|
| with demo: |
| gr.HTML(TITLE) |
| |
|
|
| with gr.Row(): |
| tabs = gr.Tabs(elem_classes="tab-buttons") |
|
|
| with tabs: |
| with gr.TabItem("Leaderboard", elem_id="guardbench-leaderboard-tab", id=0): |
| with gr.Row(): |
| version_selector = gr.Dropdown( |
| choices=BENCHMARK_VERSIONS, |
| label="Benchmark Version", |
| value=CURRENT_VERSION, |
| interactive=True, |
| elem_classes="version-selector", |
| scale=1, |
| visible=False |
| ) |
|
|
| with gr.Row(): |
| search_input = gr.Textbox( |
| placeholder="Search models (separate queries with ;)...", |
| label="Search", |
| elem_id="search-bar", |
| scale=2 |
| ) |
| model_type_filter = gr.Dropdown( |
| choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], |
| label="Filter by Model Type", |
| multiselect=True, |
| value=[], |
| interactive=True, |
| scale=1 |
| ) |
| column_selector = gr.Dropdown( |
| choices=get_all_column_choices(), |
| label="Customize Columns", |
| multiselect=True, |
| value=get_initial_columns(), |
| interactive=True, |
| scale=1 |
| ) |
| with gr.Row(): |
| refresh_button = gr.Button("Refresh", scale=0) |
|
|
| |
| with gr.Tabs(elem_classes="category-tabs") as category_tabs: |
| |
| with gr.TabItem("Overall Performance", elem_id="overall-tab"): |
| leaderboard = init_leaderboard(LEADERBOARD_DF) |
|
|
| |
| for category in CATEGORIES: |
| with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"): |
| category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION) |
| category_leaderboard = init_leaderboard(category_df) |
|
|
| |
| def update_with_search_filters(version=CURRENT_VERSION, search_query="", model_types=None, selected_columns=None): |
| """ |
| Update the leaderboards with search and filter settings. |
| """ |
| return refresh_data_with_filters(version, search_query, model_types, selected_columns) |
|
|
| |
| refresh_button.click( |
| fn=refresh_data_with_filters, |
| inputs=[version_selector, search_input, model_type_filter, column_selector], |
| outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] |
| ) |
|
|
| |
| search_input.change( |
| fn=refresh_data_with_filters, |
| inputs=[version_selector, search_input, model_type_filter, column_selector], |
| outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] |
| ) |
|
|
| |
| model_type_filter.change( |
| fn=refresh_data_with_filters, |
| inputs=[version_selector, search_input, model_type_filter, column_selector], |
| outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] |
| ) |
|
|
| |
| version_selector.change( |
| fn=refresh_data_with_filters, |
| inputs=[version_selector, search_input, model_type_filter, column_selector], |
| outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] |
| ) |
|
|
| |
| def update_columns(selected_columns): |
| """ |
| Update all leaderboards to show the selected columns. |
| Ensures all selected columns are preserved in the update. |
| |
| """ |
|
|
| try: |
| logger.info(f"Updating columns to show: {selected_columns}") |
|
|
| |
| if not selected_columns or len(selected_columns) == 0: |
| selected_columns = get_default_visible_columns() |
| logger.info(f"No columns selected, using defaults: {selected_columns}") |
|
|
| |
| internal_selected_columns = [x.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("_recall", "_recall_binary").replace("_precision", "_precision_binary") for x in selected_columns] |
|
|
|
|
| |
| main_df = get_leaderboard_df(version=version_selector.value) |
|
|
| |
| category_dfs = [get_category_leaderboard_df(category, version=version_selector.value) |
| for category in CATEGORIES] |
|
|
| |
| logger.info(f"Main dataframe columns: {list(main_df.columns)}") |
| logger.info(f"Selected columns (internal): {internal_selected_columns}") |
|
|
| |
| if 'model_name' in main_df.columns and 'model_name' not in internal_selected_columns: |
| internal_selected_columns = ['model_name'] + internal_selected_columns |
|
|
| |
| |
| main_leaderboard = init_leaderboard(main_df, internal_selected_columns) |
|
|
| |
| |
| category_leaderboards = [] |
| for df in category_dfs: |
| |
| |
| category_leaderboards.append(init_leaderboard(df, internal_selected_columns)) |
|
|
| return main_leaderboard, *category_leaderboards |
|
|
| except Exception as e: |
| logger.error(f"Error updating columns: {e}") |
| import traceback |
| logger.error(traceback.format_exc()) |
| return leaderboard, *[tab.children[0] for tab in category_tabs.children[1:len(CATEGORIES)+1]] |
|
|
| |
| column_selector.change( |
| fn=update_columns, |
| inputs=[column_selector], |
| outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] |
| ) |
|
|
| with gr.TabItem("Visualize", elem_id="guardbench-viz-tab", id=1): |
| with gr.Row(): |
| with gr.Column(): |
| viz_version_selector = gr.Dropdown( |
| choices=BENCHMARK_VERSIONS, |
| label="Benchmark Version", |
| value=CURRENT_VERSION, |
| interactive=True, |
| visible=False |
| ) |
| model_selector = gr.Dropdown( |
| choices=update_model_choices(CURRENT_VERSION), |
| label="Select Models to Compare", |
| multiselect=True, |
| interactive=True |
| ) |
| with gr.Column(): |
| |
| viz_categories = ["Overall Performance"] + CATEGORIES |
| category_selector = gr.Dropdown( |
| choices=viz_categories, |
| label="Select Category", |
| value=viz_categories[0], |
| interactive=True |
| ) |
| metric_selector = gr.Dropdown( |
| choices=["f1_binary", "precision_binary", "recall_binary"], |
| label="Select Metric", |
| value="f1_binary", |
| interactive=True |
| ) |
|
|
| plot_output = gr.Plot() |
|
|
| |
| for control in [viz_version_selector, model_selector, category_selector, metric_selector]: |
| control.change( |
| fn=update_visualization, |
| inputs=[model_selector, category_selector, metric_selector, viz_version_selector], |
| outputs=plot_output |
| ) |
|
|
| |
| viz_version_selector.change( |
| fn=update_model_choices, |
| inputs=[viz_version_selector], |
| outputs=[model_selector] |
| ) |
|
|
| with gr.TabItem("About", elem_id="guardbench-about-tab", id=2): |
| gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") |
|
|
| with gr.TabItem("Submit", elem_id="guardbench-submit-tab", id=3): |
| gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") |
|
|
| with gr.Row(): |
| with gr.Column(scale=3): |
| gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text") |
| with gr.Column(scale=1): |
| |
| submission_version_selector = gr.Dropdown( |
| choices=BENCHMARK_VERSIONS, |
| label="Benchmark Version", |
| value=CURRENT_VERSION, |
| interactive=True, |
| elem_classes="version-selector", |
| visible=False |
| ) |
|
|
| with gr.Row(): |
| with gr.Column(): |
| model_name_textbox = gr.Textbox(label="Model name") |
| revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") |
| model_type = gr.Dropdown( |
| choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], |
| label="Model type", |
| multiselect=False, |
| value=None, |
| interactive=True, |
| ) |
| guard_model_type = gr.Dropdown( |
| choices=[t.name for t in GuardModelType], |
| label="Guard model type", |
| multiselect=False, |
| value=GuardModelType.LLM_REGEXP.name, |
| interactive=True, |
| ) |
|
|
| with gr.Column(): |
| precision = gr.Dropdown( |
| choices=[i.name for i in Precision if i != Precision.Unknown], |
| label="Precision", |
| multiselect=False, |
| value="float16", |
| interactive=True, |
| ) |
| weight_type = gr.Dropdown( |
| choices=[i.name for i in WeightType], |
| label="Weights type", |
| multiselect=False, |
| value="Original", |
| interactive=True, |
| ) |
| base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") |
|
|
| with gr.Row(): |
| file_input = gr.File( |
| label="Upload JSONL Results File", |
| file_types=[".jsonl"] |
| ) |
|
|
| submit_button = gr.Button("Submit Results") |
| result_output = gr.Markdown() |
|
|
| submit_button.click( |
| fn=submit_results, |
| inputs=[ |
| model_name_textbox, |
| base_model_name_textbox, |
| revision_name_textbox, |
| precision, |
| weight_type, |
| model_type, |
| file_input, |
| submission_version_selector, |
| guard_model_type |
| ], |
| outputs=result_output |
| ) |
|
|
| |
| version_selector.change( |
| fn=update_leaderboards, |
| inputs=[version_selector], |
| outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] |
| ) |
|
|
|
|
| |
| scheduler = BackgroundScheduler() |
| scheduler.add_job(refresh_data, 'interval', minutes=30) |
| scheduler.start() |
|
|
| |
| if __name__ == "__main__": |
|
|
| demo.launch() |
|
|
|
|
|
|