import json
import logging
import os
from datetime import datetime, timezone
from io import BytesIO

import gradio as gr
import pandas as pd
from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download, list_repo_files
|
|
# Hugging Face access token taken from the environment (None when unset).
TOKEN = os.getenv("HF_TOKEN")
# Hub client shared by the repo-info lookup, the uploads and the Space restart.
API = HfApi(token=TOKEN)

# Dataset repositories: submissions are uploaded to the first, scored results
# are read from the second, and the third is linked from the page text.
SUBMISSIONS_REPO = "Genentech/compbiobench-submissions-v1"
RESULTS_REPO = "Genentech/compbiobench-results-v1"
DATA_REPO = "Genentech/compbiobench-data-v1"

# Per-user cap applied by check_rate_limit(), and the accounts that bypass it.
DAILY_SUBMISSION_LIMIT = 2
RATE_LIMIT_EXEMPT_USERS = {"surag-genentech"}

# In-process cache of the two leaderboard tables, keyed on the results repo's
# last-modified stamp (see load_results_cached()).
_results_cache = dict(
    last_modified=None,
    by_model=pd.DataFrame(),
    all_subs=pd.DataFrame(),
)
|
|
# The question sheet sits next to this module; its "question_id" column
# defines the set of IDs every submission must cover.
_MODULE_DIR = os.path.dirname(__file__)
QUESTIONS_TSV = os.path.join(_MODULE_DIR, "compbiobench.v1.tsv")
_questions_df = pd.read_csv(QUESTIONS_TSV, sep="\t", dtype=str)
EXPECTED_QUESTION_IDS = set(_questions_df["question_id"].tolist())
TOTAL_QUESTIONS = len(EXPECTED_QUESTION_IDS)
|
|
|
|
def load_results_cached():
    """Return the leaderboard tables, reloading only when the results repo changed.

    The results repo's ``last_modified`` stamp is compared with the one stored
    in ``_results_cache``; on a match the cached frames are returned without
    touching the result files.  Otherwise ``load_results()`` is called and the
    cache is refreshed.

    Returns:
        tuple: ``(by_model, all_subs)`` DataFrames as produced by
        ``load_results()`` (both empty when nothing could be loaded).
    """
    try:
        info = API.repo_info(RESULTS_REPO, repo_type="dataset", token=TOKEN)
        last_modified = info.last_modified
    except Exception:
        # Best effort: without a stamp we cannot trust the cache, so fall
        # through to a full reload below.
        logging.getLogger(__name__).warning(
            "Could not read repo info for %s", RESULTS_REPO, exc_info=True
        )
        last_modified = None

    if last_modified is not None and last_modified == _results_cache["last_modified"]:
        return _results_cache["by_model"], _results_cache["all_subs"]

    by_model, all_subs = load_results()

    # load_results() also returns empty frames when the Hub request itself
    # fails.  Never pin an empty result to this revision: leaving the stamp
    # unset forces a retry on the next call instead of showing a blank
    # leaderboard until the repo happens to change again.
    _results_cache["last_modified"] = None if all_subs.empty else last_modified
    _results_cache["by_model"] = by_model
    _results_cache["all_subs"] = all_subs
    return by_model, all_subs
|
|
|
|
def load_excluded_submissions():
    """Return the set of submission IDs listed in ``excluded.tsv`` of the results repo.

    The file is read as text (``dtype=str``, matching the other TSV reads in
    this module) so IDs are never coerced to numbers; blank cells are dropped
    and surrounding whitespace is stripped so hand-edited rows still match.

    Returns:
        set[str]: IDs to hide from the leaderboard.  Empty when the file is
        absent, unreadable, or lacks a ``submission_id`` column.
    """
    try:
        path = hf_hub_download(
            repo_id=RESULTS_REPO, filename="excluded.tsv",
            repo_type="dataset", token=TOKEN,
        )
        df = pd.read_csv(path, sep="\t", dtype=str)
        ids = df["submission_id"].dropna().str.strip()
        return {submission_id for submission_id in ids if submission_id}
    except Exception:
        # Deliberately best effort: any failure here means "exclude nothing".
        return set()
|
|
|
|
def _rank_by(df, column):
    """Return *df* sorted by *column* (highest first) with a leading 1-based "Rank" column."""
    # A stable sort keeps tied rows in their incoming order, so ties do not
    # reshuffle between refreshes.
    ranked = df.sort_values(column, ascending=False, kind="mergesort").reset_index(drop=True)
    ranked.index = ranked.index + 1
    ranked.index.name = "Rank"
    return ranked.reset_index()


def load_results():
    """Build the two leaderboard tables from the JSON files in the results repo.

    Every ``results/<submission_id>.json`` file whose ID is not returned by
    ``load_excluded_submissions()`` is downloaded and turned into one row.
    Files that cannot be downloaded, parsed, or that lack a required key
    (``accuracy``, ``num_correct``, ``total_questions``, ``submission_time``)
    are skipped with a logged warning.

    Returns:
        tuple: ``(by_model, all_subs)``.  ``all_subs`` has one ranked row per
        submission; ``by_model`` aggregates rows sharing the same agent
        harness, model and submitter, ranked by average accuracy.  Both are
        empty DataFrames when the repo cannot be listed or no usable result
        exists.
    """
    log = logging.getLogger(__name__)

    try:
        files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)
    except Exception:
        log.warning("Could not list files in %s", RESULTS_REPO, exc_info=True)
        return pd.DataFrame(), pd.DataFrame()

    result_files = [f for f in files if f.startswith("results/") and f.endswith(".json")]
    if not result_files:
        return pd.DataFrame(), pd.DataFrame()

    excluded = load_excluded_submissions()

    all_results = []
    for rf in result_files:
        # Strip only the leading folder and trailing extension; str.replace
        # would also eat those substrings from the middle of an ID.
        submission_id = rf.removeprefix("results/").removesuffix(".json")
        if submission_id in excluded:
            continue
        try:
            path = hf_hub_download(
                repo_id=RESULTS_REPO, filename=rf,
                repo_type="dataset", token=TOKEN,
            )
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
            all_results.append({
                # "or ''" also covers JSON null: a None grouping key would be
                # dropped by groupby and vanish from the by-model table.
                "Agent Harness": data.get("agent_harness") or "",
                "Model": data.get("model") or data.get("model_name") or "",
                "Submitter": data.get("submitter") or "",
                "Organisation": data.get("organisation") or "",
                "Accuracy (%)": round(data["accuracy"] * 100, 1),
                "Correct": data["num_correct"],
                "Total": data["total_questions"],
                # "YYYY-MM-DDTHH:MM..." -> "YYYY-MM-DD HH:MM"
                "Submitted": data["submission_time"][:16].replace("T", " "),
            })
        except Exception:
            # One malformed file must not take the whole leaderboard down.
            log.warning("Skipping unreadable result file %s", rf, exc_info=True)
            continue

    if not all_results:
        return pd.DataFrame(), pd.DataFrame()

    all_df = _rank_by(pd.DataFrame(all_results), "Accuracy (%)")

    by_model = all_df.groupby(["Agent Harness", "Model", "Submitter"]).agg(
        **{
            "Organisation": ("Organisation", "first"),
            "Avg Accuracy (%)": ("Accuracy (%)", "mean"),
            "Submissions": ("Model", "count"),
            "Best (%)": ("Accuracy (%)", "max"),
            # Timestamps are "YYYY-MM-DD HH:MM" strings, so max() is the latest.
            "Latest Submission": ("Submitted", "max"),
        }
    ).reset_index()
    by_model["Avg Accuracy (%)"] = by_model["Avg Accuracy (%)"].round(1)
    by_model["Best (%)"] = by_model["Best (%)"].round(1)
    by_model = _rank_by(by_model, "Avg Accuracy (%)")

    return by_model, all_df
|
|
|
|
def validate_submission(file_path, expected_ids=None):
    """Check that a predictions TSV answers every benchmark question exactly once.

    Args:
        file_path: Path of the uploaded tab-separated file.
        expected_ids: Optional iterable of the question IDs the file must
            contain.  Defaults to the module-level ``EXPECTED_QUESTION_IDS``.

    Returns:
        tuple: ``(df, None)`` with the parsed DataFrame (all columns read as
        strings) when the file is valid, otherwise ``(None, message)`` where
        ``message`` describes the first problem found.
    """
    try:
        df = pd.read_csv(file_path, sep="\t", dtype=str)
    except Exception as e:
        return None, f"Could not parse file as TSV: {e}"

    if "question_id" not in df.columns or "answer" not in df.columns:
        return None, "File must have 'question_id' and 'answer' columns."

    expected = EXPECTED_QUESTION_IDS if expected_ids is None else set(expected_ids)

    def preview(ids):
        # Show at most five IDs so the message stays readable.
        return f"{', '.join(sorted(ids)[:5])}{'...' if len(ids) > 5 else ''}"

    id_column = df["question_id"].astype(str)
    submitted_ids = set(id_column.values)
    missing = expected - submitted_ids
    extra = submitted_ids - expected

    if missing:
        return None, f"Missing {len(missing)} question(s): {preview(missing)}"
    if extra:
        return None, f"Unknown question ID(s): {preview(extra)}"

    # With no missing and no unknown IDs, a wrong row count can only come
    # from repeated IDs -- name them instead of reporting a bare count.
    duplicated = set(id_column[id_column.duplicated()])
    if duplicated:
        return None, f"Duplicate question ID(s): {preview(duplicated)}"

    return df, None
|
|
|
|
def check_rate_limit(username):
    """Return True when *username* may submit now, False once today's quota is used.

    Users in ``RATE_LIMIT_EXEMPT_USERS`` are always allowed.  For everyone
    else, the metadata files in the submissions repo are scanned and those
    whose ``submitter`` equals *username* and whose ``submission_time`` falls
    on the current UTC date are counted against ``DAILY_SUBMISSION_LIMIT``.

    The check fails open: if the repo cannot be listed, or a metadata file
    cannot be read, the submission is not blocked on that account.

    Args:
        username: Hugging Face username of the submitter.

    Returns:
        bool: True if another submission is allowed.
    """
    if username in RATE_LIMIT_EXEMPT_USERS:
        return True

    log = logging.getLogger(__name__)
    try:
        files = list_repo_files(SUBMISSIONS_REPO, repo_type="dataset", token=TOKEN)
    except Exception:
        log.warning("Could not list %s; skipping rate limit", SUBMISSIONS_REPO, exc_info=True)
        return True

    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    # submit_predictions() names each metadata file
    # "<harness>_<model>_<UTC timestamp>.json" and stores that same timestamp
    # as "submission_time", so only files carrying today's date can count.
    # Filtering on the name avoids downloading every past submission.
    todays_marker = f"_{today}T"
    metadata_files = [
        f for f in files
        if f.startswith("metadata/") and f.endswith(".json") and todays_marker in f
    ]

    count_today = 0
    for mf in metadata_files:
        try:
            path = hf_hub_download(
                repo_id=SUBMISSIONS_REPO, filename=mf,
                repo_type="dataset", token=TOKEN,
            )
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
            if data.get("submitter") == username and data.get("submission_time", "").startswith(today):
                count_today += 1
        except Exception:
            log.warning("Skipping unreadable metadata file %s", mf, exc_info=True)
            continue
        # Stop as soon as the quota is known to be exhausted.
        if count_today >= DAILY_SUBMISSION_LIMIT:
            return False

    return count_today < DAILY_SUBMISSION_LIMIT
|
|
|
|
def submit_predictions(agent_harness, model, organisation, contact_email, file, profile: gr.OAuthProfile | None):
    """Validate a submission form and upload it to the submissions repo.

    Steps: check the form fields, enforce the daily rate limit, validate the
    TSV, then upload ``metadata/<id>.json`` and ``predictions/<id>.tsv`` in a
    single commit and ask the evaluator Space to restart.

    Args:
        agent_harness: Name of the agent harness (required).
        model: Name of the model (required).
        organisation: Optional organisation name.
        contact_email: Optional contact address; must contain "@" if given.
        file: Path of the uploaded predictions TSV, or None.
        profile: OAuth profile of the signed-in user, or None when signed out.

    Returns:
        str: A message for the user -- either the reason the submission was
        rejected or a confirmation that it was accepted.
    """
    if profile is None:
        return "Please sign in with your Hugging Face account before submitting."

    # Normalise first so whitespace-only fields count as empty everywhere.
    agent_harness = (agent_harness or "").strip()
    model = (model or "").strip()
    organisation = (organisation or "").strip()
    contact_email = (contact_email or "").strip()

    if not agent_harness:
        return "Please enter an agent harness name."
    if not model:
        return "Please enter a model name."
    if contact_email and "@" not in contact_email:
        return "Please enter a valid email address."
    if file is None:
        return "Please upload a predictions TSV file."

    submitter = profile.username

    if not check_rate_limit(submitter):
        return f"You have reached the daily submission limit ({DAILY_SUBMISSION_LIMIT} per day). Please try again tomorrow."

    df, error = validate_submission(file)
    if error:
        return f"Validation failed: {error}"

    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    safe_harness = agent_harness.replace("/", "_").replace(" ", "_")
    safe_model = model.replace("/", "_").replace(" ", "_")
    # Colons are replaced so the timestamp is usable inside a file name.
    submission_id = f"{safe_harness}_{safe_model}_{timestamp.replace(':', '-')}"

    metadata = {
        "agent_harness": agent_harness,
        "model": model,
        "organisation": organisation,
        "contact_email": contact_email,
        "submitter": submitter,
        "submission_time": timestamp,
        "status": "pending",
    }
    metadata_bytes = json.dumps(metadata, indent=2).encode()

    log = logging.getLogger(__name__)
    try:
        with open(file, "rb") as f:
            predictions_bytes = f.read()
        # One commit for both files: a failure can no longer leave metadata
        # behind without predictions (which would still count against the
        # submitter's daily quota).
        API.create_commit(
            repo_id=SUBMISSIONS_REPO,
            repo_type="dataset",
            token=TOKEN,
            operations=[
                CommitOperationAdd(
                    path_in_repo=f"metadata/{submission_id}.json",
                    path_or_fileobj=metadata_bytes,
                ),
                CommitOperationAdd(
                    path_in_repo=f"predictions/{submission_id}.tsv",
                    path_or_fileobj=predictions_bytes,
                ),
            ],
            commit_message=f"Add submission: {agent_harness} + {model}",
        )
    except Exception:
        log.exception("Upload failed for submission %s", submission_id)
        return "Upload failed, so your submission was not recorded. Please try again later."

    try:
        API.restart_space("Genentech/compbiobench-evaluator-v1", token=TOKEN)
    except Exception:
        # The files are already stored; a failed restart must not fail the
        # submission, but it should be visible in the logs.
        log.warning("Could not restart the evaluator Space", exc_info=True)

    return f"Submitted successfully! Your submission '{agent_harness} + {model}' is being evaluated. Results will appear on the leaderboard shortly."
|
|
|
|
# Markdown heading rendered at the top of the page.
TITLE = "# CompBioBench v1 Leaderboard"


# Markdown shown directly under the title; the dataset link is built from DATA_REPO.
INTRODUCTION = f"""
We introduce CompBioBench v1, a benchmark of 100 diverse tasks for evaluating agentic systems in computational biology.
Unlike mathematics and programming, which more readily admit systematic verification, biological data are inherently
noisy and open to interpretation. To enable objective evaluation without reducing tasks to prescriptive checklists,
we propose a new benchmark-construction strategy based on synthetic/augmented data and metadata scrambling/scrubbing
of real datasets to create challenging problems with a single ground-truth answer that require multi-step reasoning,
tool use, bespoke code, and interaction with real-world external resources. The benchmark spans genomics,
transcriptomics, epigenomics, single-cell analysis, human genetics, and machine learning workflows. Questions are
curated by domain experts to cover a broad range of skills with varying difficulty.

For more information, see the [paper](https://www.biorxiv.org/content/10.64898/2026.04.06.716850v1). Questions and input data are available [here](https://huggingface.co/datasets/{DATA_REPO}).

Check the Instructions tab for submission details.

Submissions from the original authors are the ones submitted by `surag-genentech`.
"""
|
|
# Markdown for the "Instructions" tab.  This is an f-string: the dataset link
# and every question count are interpolated from DATA_REPO / TOTAL_QUESTIONS,
# "\\t" renders as a literal "\t" in the example, and "{{...}}" renders as
# single-brace placeholders in the prompt template.
ABOUT_TEXT = f"""
## Getting Started

1. Download questions and input files from the [dataset](https://huggingface.co/datasets/{DATA_REPO})
2. Run your agent on all {TOTAL_QUESTIONS} questions
3. Submit your answers as a TSV file with columns `question_id` and `answer`

### Running the Benchmark

Each question should be run in an isolated environment (e.g., a separate conda clone) to prevent
cross-contamination between questions. In our experiments, we created a base conda environment with
Python 3.11, standard scientific and bioinformatics tooling, and cloned it per question. You can see
our code [here](https://github.com/Genentech/compbiobench-runner/tree/dc350ed). Users are free to choose their own isolation strategy.

For each question, input files should be placed in an isolated workspace directory, and the agent
should be launched with that workspace as its working directory. Web access should be enabled.

We recommend running the full benchmark 3 times and submitting each run separately. The leaderboard
automatically averages scores across multiple submissions of the same agent harness + model combination.

### Submission Format

Upload a tab-separated file with exactly {TOTAL_QUESTIONS + 1} rows (header + one per question):

```
question_id\\tanswer
ensembl-grab-q1\\tENSG00000000001;ENSG00000000002
bam-ops-q1\\t42
...
```

### Evaluation

Answers are evaluated by **exact string match** (whitespace-stripped). Your accuracy is the fraction of
questions answered correctly out of {TOTAL_QUESTIONS}.

### Prompt Template

The following is the prompt template we used in our experiments:

```
QUESTION: {{question}}

FILES: {{workspace_file_paths}}

Note: All files are located in your current working directory (workspace).

INSTRUCTIONS:
- You have {{timeout_minutes}} minutes to complete this task
- Do not read/access any other files outside the workspace.
- Get any files or tools you need from the internet.
- You are free to modify the current conda environment as needed.
- Keep all scripts and intermediate data in the workspace only.

OUTPUT CONTRACT (IMPORTANT):
- Your final response will be graded by exact string match.
- Return EXACTLY ONE LINE containing **ONLY** the final answer
in the format required by the question.
- Do NOT include any explanation, reasoning, labels, prefixes,
markdown, code fences, citations, or extra whitespace lines.
- Any extra text before or after the answer is incorrect.
- Before sending your final response, verify it is exactly one
line and nothing else.
```
"""
|
|
# Page layout: title and introduction, then three tabs (Leaderboard,
# Instructions, Submit).
# NOTE(review): the nesting below is reconstructed from a source whose
# indentation was lost -- confirm the placement of the Refresh button, the
# file input and the event bindings against the deployed app.
with gr.Blocks(title="CompBioBench v1 Leaderboard") as demo:
    gr.Markdown(TITLE)
    gr.Markdown(INTRODUCTION)

    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            # Two read-only views of the same results: aggregated per
            # harness/model/submitter, and one row per submission.
            with gr.Tabs():
                with gr.TabItem("By Model"):
                    by_model_table = gr.Dataframe(
                        interactive=False,
                        column_widths=["5%", "20%", "20%", "10%", "15%", "15%", "5%", "8%", "15%"],
                    )
                with gr.TabItem("All Submissions"):
                    all_subs_table = gr.Dataframe(
                        interactive=False,
                        column_widths=["5%", "20%", "20%", "10%", "15%", "10%", "5%", "5%", "20%"],
                    )

            refresh_btn = gr.Button("Refresh")

            def refresh():
                """Return both leaderboard tables, substituting None for an empty one."""
                by_model, all_subs = load_results_cached()
                return (
                    by_model if not by_model.empty else None,
                    all_subs if not all_subs.empty else None,
                )

            refresh_btn.click(fn=refresh, outputs=[by_model_table, all_subs_table])

        with gr.TabItem("Instructions"):
            gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Submit"):
            gr.Markdown("## Submit Your Predictions")
            gr.Markdown(
                "Sign in with your Hugging Face account, then fill in the details and upload a TSV file with "
                f"`question_id` and `answer` columns containing answers to all {TOTAL_QUESTIONS} questions. "
                f"Limit: {DAILY_SUBMISSION_LIMIT} submissions per day. "
                "We recommend submitting 3 independent runs; the leaderboard averages scores automatically."
            )
            gr.LoginButton()
            with gr.Row():
                with gr.Column():
                    harness_input = gr.Textbox(label="Agent Harness", placeholder="e.g. OpenHands, Claude Code, custom")
                    model_input = gr.Textbox(label="Model", placeholder="e.g. GPT-4o, Claude 3.5 Sonnet")
                with gr.Column():
                    org_input = gr.Textbox(label="Organisation (optional)", placeholder="e.g. Genentech")
                    email_input = gr.Textbox(label="Contact Email (private, used only to contact you regarding your submission)", placeholder="your@email.com")
            file_input = gr.File(label="Predictions TSV (.tsv file)")
            submit_btn = gr.Button("Submit")
            result_output = gr.Markdown()

            # Only the five form fields are wired as inputs; the `profile`
            # parameter of submit_predictions is presumably filled in by
            # Gradio from the login session -- confirm.
            submit_btn.click(
                fn=submit_predictions,
                inputs=[harness_input, model_input, org_input, email_input, file_input],
                outputs=result_output,
            )

    # Fill both tables via refresh() through the Blocks load event.
    demo.load(fn=refresh, outputs=[by_model_table, all_subs_table])

# Enable the request queue with a default concurrency limit of 10, then start the app.
demo.queue(default_concurrency_limit=10).launch()
|
|