Spaces:

Genentech
/

compbiobench-leaderboard-v1

Running

Surag

cache + load latest version

54d61b3 11 days ago

15.4 kB

	import json
	import os
	from datetime import datetime, timezone
	from io import BytesIO

	import gradio as gr
	import pandas as pd
	from huggingface_hub import HfApi, list_repo_files, hf_hub_download

	# Hub access token read from the environment (None when HF_TOKEN is unset).
	TOKEN = os.environ.get("HF_TOKEN")
	API = HfApi(token=TOKEN)

	# Hub repositories used by this app. The submissions and results repos are
	# accessed with repo_type="dataset"; DATA_REPO is only interpolated into the
	# dataset links shown in the page text.
	SUBMISSIONS_REPO = "Genentech/compbiobench-submissions-v1"
	RESULTS_REPO = "Genentech/compbiobench-results-v1"
	DATA_REPO = "Genentech/compbiobench-data-v1"

	# Maximum submissions per user per UTC calendar day (enforced in check_rate_limit).
	DAILY_SUBMISSION_LIMIT = 2
	# Usernames that bypass the daily limit.
	RATE_LIMIT_EXEMPT_USERS = {"surag-genentech"}

	# Memo for load_results_cached(): the results repo's last_modified value seen at
	# the last load, plus the two leaderboard tables built from it.
	_results_cache = {
	"last_modified": None,
	"by_model": pd.DataFrame(),
	"all_subs": pd.DataFrame(),
	}

	# Benchmark question list shipped next to this file; every column is read as str.
	QUESTIONS_TSV = os.path.join(os.path.dirname(__file__), "compbiobench.v1.tsv")
	_questions_df = pd.read_csv(QUESTIONS_TSV, sep="\t", dtype=str)
	# Distinct question IDs a submission must cover, and how many there are.
	EXPECTED_QUESTION_IDS = set(_questions_df["question_id"].values)
	TOTAL_QUESTIONS = len(EXPECTED_QUESTION_IDS)


	def load_results_cached():
	    """Return the leaderboard tables, reloading only when the results repo changed.

	    The results repo's ``last_modified`` value is compared with the one stored
	    in ``_results_cache``; on a match the cached tables are returned, otherwise
	    ``load_results()`` is called again.

	    Returns:
	        A ``(by_model, all_subs)`` tuple of DataFrames as produced by
	        ``load_results()``.
	    """
	    try:
	        info = API.repo_info(RESULTS_REPO, repo_type="dataset", token=TOKEN)
	        last_modified = info.last_modified
	    except Exception:
	        # Without a timestamp the cache cannot be validated, so force a reload.
	        last_modified = None

	    if last_modified is not None and last_modified == _results_cache["last_modified"]:
	        return _results_cache["by_model"], _results_cache["all_subs"]

	    by_model, all_subs = load_results()
	    if all_subs.empty:
	        # load_results() returns empty frames both for "no results" and for a
	        # failed Hub call. Memoising that under the current timestamp would
	        # keep the leaderboard blank until the repo changes again, so an
	        # empty result is returned but never cached.
	        return by_model, all_subs

	    _results_cache["last_modified"] = last_modified
	    _results_cache["by_model"] = by_model
	    _results_cache["all_subs"] = all_subs
	    return by_model, all_subs


	def load_excluded_submissions():
	    """Return the set of submission IDs listed in the results repo's ``excluded.tsv``.

	    IDs are taken from the ``submission_id`` column and converted to ``str``.
	    Any failure (download, parsing, missing column) yields an empty set.
	    """
	    excluded_ids = set()
	    try:
	        local_tsv = hf_hub_download(
	            repo_id=RESULTS_REPO,
	            filename="excluded.tsv",
	            repo_type="dataset",
	            token=TOKEN,
	        )
	        table = pd.read_csv(local_tsv, sep="\t")
	        id_column = table["submission_id"].astype(str)
	        excluded_ids.update(id_column.values)
	    except Exception:
	        # Best effort: with no readable exclusion list nothing is excluded.
	        return set()
	    return excluded_ids


	def load_results():
	    """Build the leaderboard tables from the JSON files in the results repo.

	    Every ``results/*.json`` file whose submission ID is not listed in
	    ``excluded.tsv`` is downloaded and turned into one row. Files that cannot
	    be downloaded or parsed are skipped.

	    Returns:
	        A ``(by_model, all_subs)`` tuple of DataFrames. ``all_subs`` has one
	        row per submission, ranked by accuracy. ``by_model`` aggregates
	        submissions per (agent harness, model, submitter), ranked by average
	        accuracy. Both are empty when the repo cannot be listed or no result
	        could be loaded.
	    """
	    try:
	        files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)
	    except Exception:
	        return pd.DataFrame(), pd.DataFrame()

	    result_files = [f for f in files if f.startswith("results/") and f.endswith(".json")]
	    if not result_files:
	        return pd.DataFrame(), pd.DataFrame()

	    excluded = load_excluded_submissions()

	    rows = []
	    for result_file in result_files:
	        # Strip only the leading directory and trailing extension; str.replace
	        # would also remove these substrings from the middle of an ID and make
	        # it mismatch the entries in excluded.tsv.
	        submission_id = result_file.removeprefix("results/").removesuffix(".json")
	        if submission_id in excluded:
	            continue
	        try:
	            path = hf_hub_download(
	                repo_id=RESULTS_REPO, filename=result_file,
	                repo_type="dataset", token=TOKEN,
	            )
	            with open(path, encoding="utf-8") as f:
	                rows.append(_result_row(json.load(f)))
	        except Exception:
	            # Deliberate best effort: one unreadable or incomplete result file
	            # must not take the whole leaderboard down.
	            continue

	    if not rows:
	        return pd.DataFrame(), pd.DataFrame()

	    all_df = _ranked(pd.DataFrame(rows), "Accuracy (%)")

	    by_model = all_df.groupby(["Agent Harness", "Model", "Submitter"]).agg(
	        **{
	            "Organisation": ("Organisation", "first"),
	            "Avg Accuracy (%)": ("Accuracy (%)", "mean"),
	            "Submissions": ("Model", "count"),
	            "Best (%)": ("Accuracy (%)", "max"),
	            # "Submitted" is a "YYYY-MM-DD HH:MM" string, so max() is the latest.
	            "Latest Submission": ("Submitted", "max"),
	        }
	    ).reset_index()
	    by_model["Avg Accuracy (%)"] = by_model["Avg Accuracy (%)"].round(1)
	    by_model["Best (%)"] = by_model["Best (%)"].round(1)
	    by_model = _ranked(by_model, "Avg Accuracy (%)")

	    return by_model, all_df


	def _result_row(data):
	    """Convert one parsed results JSON object into a leaderboard row dict."""
	    return {
	        "Agent Harness": data.get("agent_harness", ""),
	        "Model": data.get("model", data.get("model_name", "")),
	        "Submitter": data.get("submitter", ""),
	        "Organisation": data.get("organisation", ""),
	        "Accuracy (%)": round(data["accuracy"] * 100, 1),
	        "Correct": data["num_correct"],
	        "Total": data["total_questions"],
	        # Keep "YYYY-MM-DDTHH:MM" and show it with a space instead of the "T".
	        "Submitted": data["submission_time"][:16].replace("T", " "),
	    }


	def _ranked(df, sort_column):
	    """Sort ``df`` by ``sort_column`` descending and prepend a 1-based "Rank" column."""
	    ranked = df.sort_values(sort_column, ascending=False).reset_index(drop=True)
	    ranked.index = ranked.index + 1
	    ranked.index.name = "Rank"
	    return ranked.reset_index()


	def validate_submission(file_path):
	    """Parse a predictions TSV and check that it covers exactly the expected questions.

	    Args:
	        file_path: Location of the tab-separated predictions file.

	    Returns:
	        ``(df, None)`` with the parsed DataFrame (all columns as ``str``) when
	        the file is valid, otherwise ``(None, message)`` describing the first
	        problem found.
	    """
	    try:
	        table = pd.read_csv(file_path, sep="\t", dtype=str)
	    except Exception as exc:
	        return None, f"Could not parse file as TSV: {exc}"

	    required_columns = ("question_id", "answer")
	    if not all(column in table.columns for column in required_columns):
	        return None, "File must have 'question_id' and 'answer' columns."

	    def preview(ids):
	        # First five IDs in sorted order, with an ellipsis when more exist.
	        shown = ", ".join(sorted(ids)[:5])
	        return shown + ("..." if len(ids) > 5 else "")

	    seen_ids = set(table["question_id"].astype(str).values)

	    absent = EXPECTED_QUESTION_IDS - seen_ids
	    if absent:
	        return None, f"Missing {len(absent)} question(s): {preview(absent)}"

	    unknown = seen_ids - EXPECTED_QUESTION_IDS
	    if unknown:
	        return None, f"Unknown question ID(s): {preview(unknown)}"

	    # The ID sets match, so a differing row count means repeated IDs.
	    row_count = len(table)
	    if row_count != TOTAL_QUESTIONS:
	        return None, f"Expected {TOTAL_QUESTIONS} answers, got {row_count}."

	    return table, None


	def check_rate_limit(username):
	    """Return True when ``username`` may still submit today, False once the limit is reached.

	    Exempt users are always allowed. Otherwise every ``metadata/*.json`` file in
	    the submissions repo is read, and those whose ``submitter`` equals
	    ``username`` and whose ``submission_time`` starts with today's UTC date are
	    counted. The check fails open: if the repo cannot be listed the user is
	    allowed, and unreadable metadata files are ignored.
	    """
	    if username in RATE_LIMIT_EXEMPT_USERS:
	        return True

	    try:
	        repo_files = list_repo_files(SUBMISSIONS_REPO, repo_type="dataset", token=TOKEN)
	    except Exception:
	        return True

	    day_prefix = datetime.now(timezone.utc).strftime("%Y-%m-%d")

	    submissions_today = 0
	    for name in repo_files:
	        if not (name.startswith("metadata/") and name.endswith(".json")):
	            continue
	        try:
	            local_path = hf_hub_download(
	                repo_id=SUBMISSIONS_REPO,
	                filename=name,
	                repo_type="dataset",
	                token=TOKEN,
	            )
	            with open(local_path) as handle:
	                meta = json.load(handle)
	            if meta.get("submitter") == username and meta.get("submission_time", "").startswith(day_prefix):
	                submissions_today += 1
	        except Exception:
	            continue

	    return submissions_today < DAILY_SUBMISSION_LIMIT


	def submit_predictions(agent_harness, model, organisation, contact_email, file, profile: gr.OAuthProfile | None):
	    """Validate a submission form and upload it to the submissions repo.

	    Args:
	        agent_harness: Name of the agent harness (required).
	        model: Name of the model (required).
	        organisation: Optional organisation name.
	        contact_email: Optional contact address; when given it must contain "@".
	        file: Path of the uploaded predictions TSV, or None when nothing was
	            uploaded.
	        profile: Profile of the signed-in Hugging Face user, or None when the
	            user is not signed in.

	    Returns:
	        A message string for display: the reason the submission was rejected,
	        or a success confirmation.

	    Raises:
	        Exceptions raised while reading ``file`` or by ``API.upload_file`` are
	        not caught here and propagate to the caller.
	    """
	    if profile is None:
	        return "Please sign in with your Hugging Face account before submitting."

	    # Normalise the text fields first so that whitespace-only input is treated
	    # the same as an empty field, including the optional email.
	    agent_harness = (agent_harness or "").strip()
	    model = (model or "").strip()
	    organisation = (organisation or "").strip()
	    contact_email = (contact_email or "").strip()

	    if not agent_harness:
	        return "Please enter an agent harness name."
	    if not model:
	        return "Please enter a model name."
	    if contact_email and "@" not in contact_email:
	        return "Please enter a valid email address."
	    if file is None:
	        return "Please upload a predictions TSV file."

	    submitter = profile.username

	    # Validate the local file before the rate-limit check: the latter downloads
	    # metadata from the Hub, which is wasted work for a file that is rejected.
	    df, error = validate_submission(file)
	    if error:
	        return f"Validation failed: {error}"

	    if not check_rate_limit(submitter):
	        return f"You have reached the daily submission limit ({DAILY_SUBMISSION_LIMIT} per day). Please try again tomorrow."

	    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
	    # "/" would create sub-directories in the repo path; spaces are replaced
	    # too, and ":" is swapped out of the timestamp part below.
	    safe_harness = agent_harness.replace("/", "_").replace(" ", "_")
	    safe_model = model.replace("/", "_").replace(" ", "_")
	    submission_id = f"{safe_harness}_{safe_model}_{timestamp.replace(':', '-')}"

	    metadata = {
	        "agent_harness": agent_harness,
	        "model": model,
	        "organisation": organisation,
	        "contact_email": contact_email,
	        "submitter": submitter,
	        "submission_time": timestamp,
	        "status": "pending",
	    }
	    metadata_bytes = json.dumps(metadata, indent=2).encode()
	    # NOTE(review): metadata and predictions go up as two separate commits, so
	    # a failure of the second upload leaves metadata without predictions.
	    API.upload_file(
	        path_or_fileobj=BytesIO(metadata_bytes),
	        path_in_repo=f"metadata/{submission_id}.json",
	        repo_id=SUBMISSIONS_REPO,
	        repo_type="dataset",
	        token=TOKEN,
	        commit_message=f"Add submission: {agent_harness} + {model}",
	    )

	    with open(file, "rb") as f:
	        predictions_bytes = f.read()
	    API.upload_file(
	        path_or_fileobj=BytesIO(predictions_bytes),
	        path_in_repo=f"predictions/{submission_id}.tsv",
	        repo_id=SUBMISSIONS_REPO,
	        repo_type="dataset",
	        token=TOKEN,
	        commit_message=f"Add predictions: {agent_harness} + {model}",
	    )

	    try:
	        API.restart_space("Genentech/compbiobench-evaluator-v1", token=TOKEN)
	    except Exception:
	        # Best effort: the submission is already stored, so a failed restart of
	        # the evaluator Space must not turn it into an error for the user.
	        pass

	    return f"Submitted successfully! Your submission '{agent_harness} + {model}' is being evaluated. Results will appear on the leaderboard shortly."


	# Markdown heading shown at the top of the page (passed to gr.Markdown).
	TITLE = "# CompBioBench v1 Leaderboard"

	# Markdown introduction shown under the title; DATA_REPO is interpolated into
	# the dataset link.
	INTRODUCTION = f"""
	We introduce CompBioBench v1, a benchmark of 100 diverse tasks for evaluating agentic systems in computational biology.
	Unlike mathematics and programming, which more readily admit systematic verification, biological data are inherently
	noisy and open to interpretation. To enable objective evaluation without reducing tasks to prescriptive checklists,
	we propose a new benchmark-construction strategy based on synthetic/augmented data and metadata scrambling/scrubbing
	of real datasets to create challenging problems with a single ground-truth answer that require multi-step reasoning,
	tool use, bespoke code, and interaction with real-world external resources. The benchmark spans genomics,
	transcriptomics, epigenomics, single-cell analysis, human genetics, and machine learning workflows. Questions are
	curated by domain experts to cover a broad range of skills with varying difficulty.

	For more information, see the [paper](https://www.biorxiv.org/content/10.64898/2026.04.06.716850v1). Questions and input data are available [here](https://huggingface.co/datasets/{DATA_REPO}).

	Check the Instructions tab for submission details.

	Submissions from the original authors are the ones submitted by `surag-genentech`.
	"""

	# Markdown body of the "Instructions" tab. This is an f-string: DATA_REPO and
	# TOTAL_QUESTIONS are interpolated, doubled braces ({{...}}) render as literal
	# {placeholders} of the prompt template, and "\\t" renders as a literal "\t".
	ABOUT_TEXT = f"""
	## Getting Started

	1. Download questions and input files from the [dataset]({f"https://huggingface.co/datasets/{DATA_REPO}"})
	2. Run your agent on all 100 questions
	3. Submit your answers as a TSV file with columns `question_id` and `answer`

	### Running the Benchmark

	Each question should be run in an isolated environment (e.g., a separate conda clone) to prevent
	cross-contamination between questions. In our experiments, we created a base conda environment with
	Python 3.11, standard scientific and bioinformatics tooling, and cloned it per question. You can see
	our code [here](https://github.com/Genentech/compbiobench-runner/tree/dc350ed). Users are free to choose their own isolation strategy.

	For each question, input files should be placed in an isolated workspace directory, and the agent
	should be launched with that workspace as its working directory. Web access should be enabled.

	We recommend running the full benchmark 3 times and submitting each run separately. The leaderboard
	automatically averages scores across multiple submissions of the same agent harness + model combination.

	### Submission Format

	Upload a tab-separated file with exactly {TOTAL_QUESTIONS+1} rows (header + one per question):

	```
	question_id\\tanswer
	ensembl-grab-q1\\tENSG00000000001;ENSG00000000002
	bam-ops-q1\\t42
	...
	```

	### Evaluation

	Answers are evaluated by exact string match (whitespace-stripped). Your accuracy is the fraction of
	questions answered correctly out of {TOTAL_QUESTIONS}.

	### Prompt Template

	The following is the prompt template we used in our experiments:

	```
	QUESTION: {{question}}

	FILES: {{workspace_file_paths}}

	Note: All files are located in your current working directory (workspace).

	INSTRUCTIONS:
	- You have {{timeout_minutes}} minutes to complete this task
	- Do not read/access any other files outside the workspace.
	- Get any files or tools you need from the internet.
	- You are free to modify the current conda environment as needed.
	- Keep all scripts and intermediate data in the workspace only.

	OUTPUT CONTRACT (IMPORTANT):
	- Your final response will be graded by exact string match.
	- Return EXACTLY ONE LINE containing ONLY the final answer
	in the format required by the question.
	- Do NOT include any explanation, reasoning, labels, prefixes,
	markdown, code fences, citations, or extra whitespace lines.
	- Any extra text before or after the answer is incorrect.
	- Before sending your final response, verify it is exactly one
	line and nothing else.
	```
	"""

	# Page layout: title and introduction, then the "Leaderboard", "Instructions"
	# and "Submit" tabs, followed by the app launch.
	# NOTE(review): the indentation of this section was lost in this copy; the
	# nesting of the `with` blocks must be restored from the original file.
	with gr.Blocks(title="CompBioBench v1 Leaderboard") as demo:
	gr.Markdown(TITLE)
	gr.Markdown(INTRODUCTION)

	with gr.Tabs():
	with gr.TabItem("Leaderboard"):
	with gr.Tabs():
	with gr.TabItem("By Model"):
	# Nine widths, matching the nine columns of the by-model table built in
	# load_results() (Rank plus eight data columns).
	by_model_table = gr.Dataframe(
	interactive=False,
	column_widths=["5%", "20%", "20%", "10%", "15%", "15%", "5%", "8%", "15%"],
	)
	with gr.TabItem("All Submissions"):
	# Nine widths, matching the nine columns of the per-submission table.
	all_subs_table = gr.Dataframe(
	interactive=False,
	column_widths=["5%", "20%", "20%", "10%", "15%", "10%", "5%", "5%", "20%"],
	)

	refresh_btn = gr.Button("Refresh")

	# Shared by the Refresh button and the initial page load. An empty table is
	# sent to the component as None instead of an empty DataFrame.
	def refresh():
	by_model, all_subs = load_results_cached()
	return (
	by_model if not by_model.empty else None,
	all_subs if not all_subs.empty else None,
	)

	refresh_btn.click(fn=refresh, outputs=[by_model_table, all_subs_table])

	with gr.TabItem("Instructions"):
	gr.Markdown(ABOUT_TEXT)

	with gr.TabItem("Submit"):
	gr.Markdown("## Submit Your Predictions")
	gr.Markdown(
	"Sign in with your Hugging Face account, then fill in the details and upload a TSV file with "
	f"`question_id` and `answer` columns containing answers to all {TOTAL_QUESTIONS} questions. "
	f"Limit: {DAILY_SUBMISSION_LIMIT} submissions per day. "
	"We recommend submitting 3 independent runs; the leaderboard averages scores automatically."
	)
	gr.LoginButton()
	with gr.Row():
	with gr.Column():
	harness_input = gr.Textbox(label="Agent Harness", placeholder="e.g. OpenHands, Claude Code, custom")
	model_input = gr.Textbox(label="Model", placeholder="e.g. GPT-4o, Claude 3.5 Sonnet")
	with gr.Column():
	org_input = gr.Textbox(label="Organisation (optional)", placeholder="e.g. Genentech")
	email_input = gr.Textbox(label="Contact Email (private, used only to contact you regarding your submission)", placeholder="your@email.com")
	file_input = gr.File(label="Predictions TSV (.tsv file)")
	submit_btn = gr.Button("Submit")
	result_output = gr.Markdown()

	# Only the five form fields are listed as inputs; submit_predictions also
	# takes `profile`. NOTE(review): presumably Gradio supplies that argument
	# from the login session based on the gr.OAuthProfile annotation - confirm.
	submit_btn.click(
	fn=submit_predictions,
	inputs=[harness_input, model_input, org_input, email_input, file_input],
	outputs=result_output,
	)

	# Fill both leaderboard tables when the page is first loaded.
	demo.load(fn=refresh, outputs=[by_model_table, all_subs_table])

	# Enable the request queue with a default concurrency limit of 10 and start the app.
	demo.queue(default_concurrency_limit=10).launch()