"""Streamlit visualizer for the evaluation model outputs.

Run the following command to start the visualizer:
    streamlit run 0_π_OpenHands_Benchmark.py --server.port 8501 --server.address 0.0.0.0
NOTE: Run this command from the root of the repository.
"""

import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

from utils import load_filepaths, filter_dataframe
from utils.swe_bench import get_resolved_stats_from_filepath

# Page title, followed by a collapsible listing of every discovered result file.
st.write("# π OpenHands Evaluation Benchmark")
filepaths = load_filepaths()

# Writing through the expander object renders inside it, same as `with` notation.
filepaths_expander = st.expander("Show filepaths")
filepaths_expander.write(filepaths)
# ---- SWE-Bench Lite: collect the no-hint runs and their resolved statistics ----
st.write("## SWE-Bench Lite")
st.write("All results are obtained *without hints*.")

# The benchmark identifier appears with both underscore and hyphen spellings.
filepaths = filepaths[filepaths['benchmark'].isin(['swe_bench_lite', 'swe-bench-lite'])]

# Substring match on the note. Casting to str and passing na=False means a
# missing or non-string note simply does not match instead of raising.
_no_hint_mask = filepaths['note'].astype(str).str.contains('no-hint', regex=False, na=False)
swe_bench_results = filepaths[_no_hint_mask]

if swe_bench_results.empty:
    # With no rows, the stats expansion below produces no columns and the
    # column selection would fail with a KeyError; end the page run cleanly.
    st.warning("No SWE-Bench Lite results without hints were found.")
    st.stop()

# Expand the per-file statistics into columns placed next to the run metadata.
# NOTE(review): assumes get_resolved_stats_from_filepath returns a dict-like
# record per filepath -- confirm against utils.swe_bench.
resolved_stats = (
    swe_bench_results['filepath']
    .apply(get_resolved_stats_from_filepath)
    .apply(pd.Series)
)
swe_bench_results = pd.concat([swe_bench_results, resolved_stats], axis=1)

# Keep only the columns shown on the page, in display order. This explicit
# selection already discards filepath / eval_output_dir / agent_class /
# benchmark, so no separate drop() is needed.
swe_bench_results = swe_bench_results[[
    'agent_name', 'note',
    'model_name',
    'success_rate', 'n_solved', 'n_error', 'n_stuck_in_loop',
    'total', 'total_cost',
    'max_iterations', 'git_commit', 'start_time',
]]
# Blank out error / stuck-in-loop / cost figures for CodeActAgent runs whose
# note mentions v1.0 or v1.3.
# NOTE(review): presumably those older runs did not record these statistics
# reliably -- confirm the rationale.
_legacy_note = swe_bench_results['note'].apply(
    lambda note: any(tag in note for tag in ('v1.0', 'v1.3'))
)
_codeact_agent = swe_bench_results['agent_name'].apply(
    lambda name: 'CodeActAgent' in name
)
_below_v1_5_mask = _legacy_note & _codeact_agent

for _column in ('n_error', 'n_stuck_in_loop', 'total_cost'):
    swe_bench_results.loc[_below_v1_5_mask, _column] = np.nan
# Rank runs best-first, format the numeric columns for display, then let the
# user filter the table interactively before it is rendered.
swe_bench_results = swe_bench_results.sort_values(by='success_rate', ascending=False)

# Convert the fraction to a percentage. Scale first and round last: rounding
# and then multiplying reintroduces float noise
# (round(0.57, 4) * 100 == 56.99999999999999).
swe_bench_results['success_rate'] = swe_bench_results['success_rate'].apply(
    lambda rate: round(rate * 100, 2)
)

# Thousands-separated, zero-decimal strings for the count columns.
swe_bench_results['total'] = swe_bench_results['total'].map('{:,.0f}'.format)
swe_bench_results['max_iterations'] = swe_bench_results['max_iterations'].map('{:,.0f}'.format)

swe_bench_results = filter_dataframe(swe_bench_results)
st.dataframe(swe_bench_results, use_container_width=True)
# ---- Horizontal bar chart: one bar per experiment, ordered by success rate ----
st.write("### Success Rate")

# Build a readable experiment label "<agent> (<note>) + <model>". assign()
# returns a new frame, so the frame handed back by filter_dataframe is never
# mutated in place.
swe_bench_results = swe_bench_results.assign(
    exp_name=(
        swe_bench_results['agent_name']
        + ' (' + swe_bench_results['note'] + ')'
        + ' + ' + swe_bench_results['model_name']
    )
).sort_values(by='success_rate', ascending=False)

chart = (
    alt.Chart(swe_bench_results)
    .mark_bar()
    .encode(
        # success_rate was scaled to a percentage earlier in the script, so the
        # axis is labelled in percent, matching the cost scatter plot's axis.
        x=alt.X(
            'success_rate', type='quantitative', title='Success Rate (%)',
        ),
        y=alt.Y(
            'exp_name', type='nominal', sort='-x',
            # Experiment labels are long; raise the limit so they are not truncated.
            axis=alt.Axis(labelLimit=800),
            title=None,
        ),
        color=alt.Color('success_rate', type='quantitative', scale=alt.Scale(scheme='viridis')),
    )
)
st.altair_chart(chart, use_container_width=True)
# ---- Scatter plot: success rate against average cost per instance ----
st.write("### Success Rate vs. Average Cost")

# 'total' was formatted as a thousands-separated string for the table, so a
# missing value is the text 'nan' and dropna() cannot catch it. Parse it back
# to a number, coercing anything unparsable to NaN instead of raising.
_n_instances = pd.to_numeric(
    swe_bench_results['total'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
# Mask non-positive totals so a zero total yields NaN rather than an infinite
# average cost.
_n_instances = _n_instances.where(_n_instances > 0)

swe_bench_results = swe_bench_results.assign(
    avg_cost=swe_bench_results['total_cost'] / _n_instances
)

# Keep only runs with a positive cost and a positive success rate. NaN compares
# False, so rows with a missing total or total_cost are dropped here as well.
swe_bench_results = swe_bench_results[
    (swe_bench_results['avg_cost'] > 0) & (swe_bench_results['success_rate'] > 0)
]

chart = (
    alt.Chart(swe_bench_results)
    .mark_circle(size=250)
    .encode(
        x=alt.X('avg_cost', title='Average Cost (USD per instance)'),
        y=alt.Y('success_rate', title='Success Rate (%)'),
        color=alt.Color('model_name', legend=alt.Legend(title="Model", labelLimit=200)),
        tooltip=['agent_name', 'note', 'model_name', 'success_rate', 'avg_cost'],
    )
)
st.altair_chart(chart, use_container_width=True)