GenerativeReasoningBenchmark

Running

App Files Files Community

GenerativeReasoningBenchmark / app.py

zhwang4ai

fix

c8f6405 about 1 year ago

raw

history blame contribute delete

9.82 kB

	import json
	from pathlib import Path

	import gradio as gr
	import pandas as pd

	from texts import TITLE, DESCRIPTION, ABOUT
	from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data
	from display import custom_css
	# Not referenced anywhere in the visible source; presumably benchmark names
	# to leave out of the tables -- TODO confirm before relying on it.
	BENCHMARKS_TO_SKIP: list[str] = []

	# Hex colour per "Model Type" label. Only "Pretrained" and "RL" are produced
	# by map_model_type; "Finetuned" has no producer in the visible source.
	# A "DPO" entry ("#75809c") is currently disabled.
	color_map: dict[str, str] = {
	    "Pretrained": "#7497db",
	    "RL": "#E8ECF2",
	    "Finetuned": "#ffcd75",
	}

	# Raw model id (as it appears in the data index) -> "vendor/name" label
	# shown in the "Model" column. Ids missing from this table are displayed
	# unchanged by map_model_name.
	model_name_map: dict[str, str] = {
	    # Qwen
	    "qwen2.5-3b-instruct": "Qwen/Qwen2.5-3B-Instruct",
	    "qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",
	    "qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",
	    "qwen2.5-32b-instruct": "Qwen/Qwen2.5-32B-Instruct",
	    "qwen2.5-72b-instruct": "Qwen/Qwen2.5-72B-Instruct",
	    # Llama
	    "llama-3.1-8b-instruct": "Meta-Llama/Llama-3.1-8B-Instruct",
	    "llama-3.1-70b-instruct": "Meta-Llama/Llama-3.1-70B-Instruct",
	    "llama-3.2-3b-instruct": "Meta-Llama/Llama-3.2-3B-Instruct",
	    "llama-3.3-70b-instruct": "Meta-Llama/Llama-3.3-70B-Instruct",
	    # Mistral / Gemma
	    "mistral-large-instruct-2411": "Mistral/Mistral-Large-2411",
	    "gemma-2-27b-it": "google/gemma-2-27b-it",
	    "gemma-2-9b-it": "google/gemma-2-9b-it",
	    # DeepSeek / QwQ / Yi
	    "deepseek-v3": "deepseek-ai/DeepSeek-V3",
	    "deepseek-r1": "deepseek-ai/DeepSeek-R1",
	    "qwq-32b": "Qwen/QwQ-32B",
	    "yi-lightning": "Yi/Yi-Lightning",
	    # OpenAI
	    "gpt-3.5-turbo": "openai/gpt-3.5-turbo",
	    "gpt-4o": "openai/gpt-4o",
	    "gpt-4o-mini": "openai/gpt-4o-mini",
	    "o1-mini": "openai/o1-mini",
	    # Anthropic
	    "claude-3.5-haiku": "anthropic/claude-3.5-haiku",
	    "claude-3.5-sonnet": "anthropic/claude-3.5-sonnet",
	}

	def map_model_name(model_id):
	    """Return the display label for a raw model id.

	    Args:
	        model_id: Key to look up in ``model_name_map``.

	    Returns:
	        The mapped label, or ``model_id`` itself when the id has no entry.
	    """
	    # Single lookup with the id as its own fallback.
	    return model_name_map.get(model_id, model_id)

	def model_hyperlink(link, model_name):
	    """Format a model name as a Markdown hyperlink.

	    Args:
	        link: Target URL.
	        model_name: Text to display for the link.

	    Returns:
	        The string ``[model_name](link)``. This is Markdown, not an HTML
	        anchor; neither argument is escaped.
	    """
	    return f"[{model_name}]({link})"

	def make_clickable_model(model_name):
	    """Return a Markdown link for ``model_name`` under huggingface.co.

	    Args:
	        model_name: Path appended verbatim to ``https://huggingface.co/``
	            and also used as the link text.

	    Returns:
	        The result of ``model_hyperlink`` for that URL and name.
	    """
	    link = f"https://huggingface.co/{model_name}"
	    return model_hyperlink(link, model_name)

	# Raw model ids (the values found in the data index) labelled as "RL".
	rl_models = ['deepseek-r1', 'o1-mini']


	def map_model_type(model_name):
	    """Return the "Model Type" label for a raw model id.

	    Args:
	        model_name: Raw model id, compared against ``rl_models``.

	    Returns:
	        ``"RL"`` when the id is listed in ``rl_models``, otherwise
	        ``"Pretrained"``.
	    """
	    return "RL" if model_name in rl_models else "Pretrained"


	def prep_leaderboard_df():
	    """Build the table shown on the main leaderboard tab.

	    Loads the average, hard and easy frames, concatenates them column-wise
	    in the order easy, hard, average, then prepends a "Model" column
	    (display label from ``map_model_name``) and a "Model Type" column
	    (from ``map_model_type``), both derived from the frame's index.

	    Returns:
	        The combined DataFrame rounded to two decimals.
	    """
	    average_df = load_average_data()
	    hard_df = load_hard_data()
	    easy_df = load_easy_data()
	    df = pd.concat([easy_df, hard_df, average_df], axis=1)

	    # The index holds the raw model ids; the two label columns go first.
	    df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
	    df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])

	    # Turning the "Model" column into clickable links is currently disabled:
	    # df['Model'] = df['Model'].apply(make_clickable_model)
	    return df.round(2)

	def prep_detailed_success_rate_df():
	    """Build the table shown on the "Success Rates - Detailed" tab.

	    Loads the detailed success-rate frame and prepends a "Model" column
	    (display label from ``map_model_name``) and a "Model Type" column
	    (from ``map_model_type``), both derived from the frame's index.

	    Returns:
	        The DataFrame rounded to two decimals.
	    """
	    df = load_detailed_success_rate_data()
	    # A transpose (models as rows, metrics as columns) is currently disabled:
	    # df = df.T
	    # NOTE(review): insert() modifies the loaded frame in place; this assumes
	    # the loader returns a fresh frame on each call -- confirm.
	    df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
	    df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
	    return df.round(2)

	def prep_detailed_action_counts_df():
	    """Build the table shown on the "Action Counts - Detailed" tab.

	    Loads the detailed action-count frame and prepends a "Model" column
	    (display label from ``map_model_name``) and a "Model Type" column
	    (from ``map_model_type``), both derived from the frame's index.

	    Returns:
	        The DataFrame rounded to two decimals.
	    """
	    df = load_detailed_action_counts_data()
	    # A transpose (models as rows, metrics as columns) is currently disabled:
	    # df = df.T
	    # NOTE(review): insert() modifies the loaded frame in place; this assumes
	    # the loader returns a fresh frame on each call -- confirm.
	    df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
	    df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
	    return df.round(2)

	# Build the three tables once, when the module is loaded. The UI below uses
	# them as the initial table values, and the filter callbacks read the two
	# detailed frames as module-level globals.
	leaderboard_df = prep_leaderboard_df()
	detailed_success_rate_df = prep_detailed_success_rate_df()
	detailed_action_counts_df = prep_detailed_action_counts_df()

	def _filter_and_sort(df, cols, search_query):
	    """Apply the model search and the column selection to one detailed table.

	    Args:
	        df: Source table; must contain a "Model" column. It is never
	            modified; row/column selections produce new frames.
	        cols: Column names ticked in the checkbox group (may be empty).
	        search_query: Text from the search box. Several terms may be given
	            separated by ";"; a row is kept when its "Model" value contains
	            any term. Matching is case-insensitive and literal, so
	            characters such as "." or "(" have no special meaning.

	    Returns:
	        The filtered table. With ``cols`` given it holds only "Model" plus
	        those columns, drops rows that have no value in any of them, is
	        sorted by them in descending order, and the selected columns are
	        converted to strings.
	    """
	    raw_terms = search_query.split(";") if search_query else []
	    terms = [term.strip().lower() for term in raw_terms]
	    terms = [term for term in terms if term]  # ignore empty pieces such as "a;;b"

	    if terms:
	        names = df["Model"].str.lower()
	        keep = pd.Series(False, index=df.index)
	        for term in terms:
	            # regex=False: the search box carries user text, not a pattern.
	            keep |= names.str.contains(term, regex=False, na=False)
	        df = df[keep]

	    if not cols:
	        # Row filtering can leave a column with no values at all; hide such
	        # columns. Skipped when nothing matched, because dropna would then
	        # remove every column and the table would lose its headers.
	        if terms and not df.empty:
	            df = df.dropna(how="all", axis=1)
	        return df

	    cols = list(cols)
	    # Explicit selection: always "Model" followed by the ticked columns.
	    df = df[["Model"] + cols].copy()
	    # Drop rows that have no value in any of the selected columns.
	    df = df.dropna(how="all", axis=0, subset=cols)
	    if df.empty:
	        return df

	    # Sort numerically (non-numeric cells become NaN and go last), then
	    # convert the selected columns to strings as the table expects.
	    df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")
	    df = df.sort_values(by=cols, ascending=False, na_position="last")
	    df[cols] = df[cols].astype(str)
	    return df


	def filter_and_search_success_rate(cols: list[str], search_query: str, agg: str = ""):
	    """Gradio callback: filter and sort the detailed success-rate table.

	    Args:
	        cols: Selected column names from the checkbox group.
	        search_query: Search text; ";" separates alternative terms.
	        agg: Unused. Optional because the UI wires only two inputs to this
	            callback; kept so existing three-argument callers still work.

	    Returns:
	        The filtered view of ``detailed_success_rate_df``.
	    """
	    return _filter_and_sort(detailed_success_rate_df, cols, search_query)


	def filter_and_search_action_counts(cols: list[str], search_query: str, agg: str = ""):
	    """Gradio callback: filter and sort the detailed action-count table.

	    Args:
	        cols: Selected column names from the checkbox group.
	        search_query: Search text; ";" separates alternative terms.
	        agg: Unused. Optional because the UI wires only two inputs to this
	            callback; kept so existing three-argument callers still work.

	    Returns:
	        The filtered view of ``detailed_action_counts_df``.
	    """
	    return _filter_and_sort(detailed_action_counts_df, cols, search_query)


	# Assemble the Gradio UI: a title and description, four tabs (main
	# leaderboard, two detailed tables with search and column filters, About)
	# and a collapsible citation box, then start the app.
	# NOTE(review): the nesting of the `with` blocks was reconstructed from the
	# statements themselves; where the placement of a widget was not forced by
	# syntax it is marked below -- confirm against the intended layout.
	demo = gr.Blocks(css=custom_css)

	with demo:
	    gr.HTML(TITLE)
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown(DESCRIPTION, elem_classes="markdown-text")

	    # NOTE(review): Tabs placed directly under the Blocks root, as a sibling
	    # of the description row.
	    with gr.Tabs(elem_classes="tab-buttons") as tabs:
	        # Main leaderboard: a static table without search or column filter.
	        with gr.TabItem("🏆 Leaderboard"):
	            with gr.Row():
	                with gr.Group():
	                    leaderboard_table = gr.Dataframe(
	                        value=leaderboard_df,
	                        wrap=True,
	                        # Fixed widths for "Model" / "Model Type"; the other
	                        # columns grow with the length of their header.
	                        column_widths=[250, 120] + [(60 + len(c)) for c in leaderboard_df.columns[2:]],
	                    )

	        with gr.TabItem("Success Rates - Detailed"):
	            with gr.Column():
	                with gr.Row():
	                    search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)

	                with gr.Row():
	                    cols_bar = gr.CheckboxGroup(
	                        # Everything after "Model" / "Model Type", minus "Average".
	                        choices=[c for c in detailed_success_rate_df.columns[2:] if c != "Average"],
	                        show_label=False,
	                    )
	                # NOTE(review): table placed below the two rows inside the column.
	                detailed_success_rate_table = gr.Dataframe(
	                    value=detailed_success_rate_df,
	                    wrap=True,
	                    column_widths=[350, 120] + [(150 + len(c)) for c in detailed_success_rate_df.columns[2:]],
	                )
	            # Refresh the table when the column selection changes or a search is submitted.
	            cols_bar.change(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])
	            search_bar.submit(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])

	        with gr.TabItem("Action Counts - Detailed"):
	            with gr.Column():
	                with gr.Row():
	                    search_bar_1 = gr.Textbox(placeholder="Search for your model...", show_label=False)

	                with gr.Row():
	                    cols_bar_1 = gr.CheckboxGroup(
	                        # Everything after "Model" / "Model Type", minus "Average".
	                        choices=[c for c in detailed_action_counts_df.columns[2:] if c != "Average"],
	                        show_label=False,
	                    )
	                # NOTE(review): table placed below the two rows inside the column.
	                detailed_action_counts_table = gr.Dataframe(
	                    value=detailed_action_counts_df,
	                    wrap=True,
	                    column_widths=[350, 120] + [(100 + len(c)) for c in detailed_action_counts_df.columns[2:]],
	                )
	            # Refresh the table when the column selection changes or a search is submitted.
	            cols_bar_1.change(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])
	            search_bar_1.submit(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])

	        with gr.TabItem("About"):
	            gr.Markdown(ABOUT)

	    # The lines of the citation literal are kept exactly as they appear in
	    # the file, because their leading whitespace is part of the displayed text.
	    with gr.Row():
	        with gr.Accordion("📚 Citation", open=False):
	            citation_button = gr.Textbox(
	                value=r"""@article{lin2025generative,
	title={Generative Evaluation of Complex Reasoning in Large Language Models},
	author={Lin, Haowei and Wang, Xiangyu and Yan, Ruilin and Huang, Baizhou and Ye, Haotian and Zhu, Jianhua and Wang, Zihao and Zou, James and Ma, Jianzhu and Liang, Yitao},
	journal={arXiv preprint arXiv:2504.02810},
	year={2025}
	}""",
	                lines=7,
	                label="Copy the following to cite these results.",
	                elem_id="citation-button",
	                show_copy_button=True,
	            )

	demo.launch()