| import gradio as gr |
| import pandas as pd |
| import numpy as np |
|
|
| |
# Maps the file-extension codes embedded in dataset names to the
# human-readable programming-language names shown in the table headers.
extension_to_language = dict(
    clj="Clojure",
    cpp="C++",
    cs="C#",
    d="D",
    dart="Dart",
    elixir="Elixir",
    go="Go",
    hs="Haskell",
    java="Java",
    jl="Julia",
    js="JavaScript",
    lua="Lua",
    ml="OCaml",
    php="PHP",
    pl="Perl",
    r="R",
    rb="Ruby",
    rkt="Racket",
    rs="Rust",
    scala="Scala",
    sh="Shell",
    swift="Swift",
    ts="TypeScript",
)
|
|
| |
| df = pd.read_csv('passk.csv') |
|
|
| |
def extract_info(dataset):
    """Split a dataset identifier into its language code and model name.

    The identifier is dash-separated with the language code in the second
    field; the model name spans everything between that and the last two
    fields (it may itself contain dashes, hence the re-join).

    Returns a pandas Series with 'Language' and 'Model' entries, so it can
    populate two DataFrame columns at once via ``.apply``.
    """
    fields = dataset.split('-')
    return pd.Series({
        'Language': fields[1],
        'Model': '-'.join(fields[2:-2]),
    })
|
|
| |
| df[['Language', 'Model']] = df['Dataset'].apply(extract_info) |
|
|
| |
# Maps raw model identifiers (as parsed from dataset names) to display
# names; models missing here fall back to their raw identifier.
model_to_friendly = {
    "starcoder2_15b": "StarCoder2-15B",
    "deepseekcoder_v2lite_base": "DeepSeekCoder2-Lite-Base"
}
|
|
| |
def get_friendly_name(model):
    """Return the display name for *model*, or the raw id if unmapped."""
    try:
        return model_to_friendly[model]
    except KeyError:
        return model
|
|
| |
# Reshape to one row per model and one column per language code, with the
# pass@k 'Estimate' as the cell value (NaN where a combination is missing).
pivot = df.pivot(index='Model', columns='Language', values='Estimate')

# Stable, sorted axis orders used by the UI below.
languages = sorted(pivot.columns)
models = sorted(pivot.index)
|
|
| |
def update_table(selected_languages):
    """Build the display DataFrame for the given language codes.

    Parameters
    ----------
    selected_languages : list of str
        Extension codes (keys of ``extension_to_language``) to show as
        columns, in display order.

    Returns
    -------
    pandas.DataFrame
        A 'Model' column of friendly names followed by one column per
        selected language; scores formatted to two decimals, missing
        entries shown as "-".
    """
    if not selected_languages:
        # Nothing selected: show only the model names.
        return pd.DataFrame({'Model': [get_friendly_name(model) for model in models]})

    def _format_cell(x):
        # Preserves the original semantics: NaN -> "-", numbers -> "0.00"
        # formatting, anything else passes through untouched.
        if pd.isna(x):
            return "-"
        if isinstance(x, (int, float)):
            return f"{x:.2f}"
        return x

    # Column-wise Series.map instead of DataFrame.applymap, which is
    # deprecated since pandas 2.1.
    display_data = pivot[selected_languages].apply(lambda col: col.map(_format_cell))

    # Prepend friendly model names, then drop the 'Model' index so the
    # table renders with a plain positional index.
    display_data.insert(0, 'Model', [get_friendly_name(model) for model in display_data.index])
    display_data = display_data.reset_index(drop=True)

    # Show human-readable language names in the column headers.
    display_data.columns = ['Model'] + [extension_to_language.get(lang, lang) for lang in selected_languages]

    return display_data
|
|
| |
def get_initial_table():
    """Initial table contents: every language column, in sorted order."""
    return update_table(languages)
|
|
| |
# Assemble the Gradio UI: header text, a language filter, and the results
# table, wired so the table refreshes when the selection changes.
with gr.Blocks() as app:
    gr.Markdown("""
    # MultiPL-E Results

    [MultiPL-E](https://huggingface.co/datasets/nuprl/MultiPL-E) is a dataset for
    evaluating large language models for code generation that supports several
    programming languages. It takes the OpenAI HumanEval and the Mostly Basic
    Python Programs (MBPP) benchmarks and uses little compilers to translate them
    to other languages. It is easy to add support for new languages and benchmarks.

    This table shows how some recent Code LLMs perform on MultiPL-HumanEval.

    We use the MultiPL-E 3.0 problems, which incorporates several fixes and
    supports several new programming languages.

    """)

    with gr.Row():
        # Checkboxes labeled "Language (ext)"; all languages selected by default.
        language_checkboxes = gr.CheckboxGroup(
            choices=[f"{extension_to_language[lang]} ({lang})" for lang in languages],
            label="Select Languages",
            value=[f"{extension_to_language[lang]} ({lang})" for lang in languages]
        )

    # Results table; initial value is computed lazily from get_initial_table.
    table = gr.Dataframe(
        value=get_initial_table,
        headers=['Model'] + [extension_to_language.get(lang, lang) for lang in languages],
        type="pandas"
    )

    def update_table_wrapper(selected_languages):
        # Checkbox values look like "C++ (cpp)"; recover the extension code
        # from inside the trailing parentheses before filtering columns.
        selected_codes = [lang.split('(')[-1].strip(')') for lang in selected_languages]
        return update_table(selected_codes)

    # Rebuild the table whenever the language selection changes.
    language_checkboxes.change(update_table_wrapper, inputs=[language_checkboxes], outputs=[table])
|
|
| |
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    app.launch()