# Surag
# cache + load latest version
# 54d61b3
import json
import os
from datetime import datetime, timezone
from io import BytesIO
import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, list_repo_files, hf_hub_download
# Hub credentials and client. TOKEN is None when HF_TOKEN is not set.
TOKEN = os.getenv("HF_TOKEN")
API = HfApi(token=TOKEN)

# Dataset repositories the app reads from / writes to.
SUBMISSIONS_REPO = "Genentech/compbiobench-submissions-v1"
RESULTS_REPO = "Genentech/compbiobench-results-v1"
DATA_REPO = "Genentech/compbiobench-data-v1"

# Per-user submission quota and the accounts that bypass it.
DAILY_SUBMISSION_LIMIT = 2
RATE_LIMIT_EXEMPT_USERS = {"surag-genentech"}

# In-memory cache of the leaderboard tables, keyed by the results repo's
# last-modified value (see load_results_cached).
_results_cache = dict(
    last_modified=None,
    by_model=pd.DataFrame(),
    all_subs=pd.DataFrame(),
)

# The benchmark questions ship next to this module; every column is read as
# text and the set of question IDs defines what a valid submission contains.
QUESTIONS_TSV = os.path.join(os.path.dirname(__file__), "compbiobench.v1.tsv")
_questions_df = pd.read_csv(QUESTIONS_TSV, sep="\t", dtype=str)
EXPECTED_QUESTION_IDS = set(_questions_df["question_id"])
TOTAL_QUESTIONS = len(EXPECTED_QUESTION_IDS)
def load_results_cached():
    """Return the leaderboard tables, reloading only when the results repo changed.

    The results repo's ``last_modified`` value is used as the cache key. When
    it matches the cached key the stored tables are returned without touching
    the result files.

    ``load_results()`` signals a failed load by returning empty frames, so an
    empty result is never memoised: caching it would pin an empty leaderboard
    to the current ``last_modified`` until the repo changes again. If the Hub
    cannot be reached at all and a previous load succeeded, the last good
    tables are served instead of replacing them with empty ones.

    Returns:
        tuple: ``(by_model, all_subs)`` DataFrames (both may be empty).
    """
    try:
        info = API.repo_info(RESULTS_REPO, repo_type="dataset", token=TOKEN)
        last_modified = info.last_modified
    except Exception:
        # No freshness key available; fall through to a reload attempt.
        last_modified = None

    cached_by_model = _results_cache["by_model"]
    cached_all_subs = _results_cache["all_subs"]

    if last_modified is not None and last_modified == _results_cache["last_modified"]:
        return cached_by_model, cached_all_subs

    by_model, all_subs = load_results()

    if all_subs.empty:
        if last_modified is None and not cached_all_subs.empty:
            # Hub unreachable and the reload produced nothing: keep showing
            # the last tables that loaded successfully.
            return cached_by_model, cached_all_subs
        # Leave the cache untouched so the next call retries the load.
        return by_model, all_subs

    _results_cache["last_modified"] = last_modified
    _results_cache["by_model"] = by_model
    _results_cache["all_subs"] = all_subs
    return by_model, all_subs
def load_excluded_submissions():
    """Return the submission IDs listed in ``excluded.tsv`` of the results repo.

    The file is downloaded from ``RESULTS_REPO`` and its ``submission_id``
    column is returned as a set of strings. Any failure (download, parsing,
    missing column) yields an empty set, i.e. nothing is excluded.
    """
    excluded_ids = set()
    try:
        local_path = hf_hub_download(
            repo_id=RESULTS_REPO,
            filename="excluded.tsv",
            repo_type="dataset",
            token=TOKEN,
        )
        table = pd.read_csv(local_path, sep="\t")
        id_column = table["submission_id"].astype(str)
        excluded_ids = set(id_column.values)
    except Exception:
        excluded_ids = set()
    return excluded_ids
def _result_row(data):
    """Map one parsed result JSON document to a row of the submissions table."""
    return {
        # ``or ""`` also covers JSON null: a missing value in a grouping key
        # would make groupby drop the submission from the per-model table.
        "Agent Harness": data.get("agent_harness") or "",
        "Model": data.get("model") or data.get("model_name") or "",
        "Submitter": data.get("submitter") or "",
        "Organisation": data.get("organisation") or "",
        "Accuracy (%)": round(data["accuracy"] * 100, 1),
        "Correct": data["num_correct"],
        "Total": data["total_questions"],
        # "YYYY-MM-DDTHH:MM..." -> "YYYY-MM-DD HH:MM"
        "Submitted": data["submission_time"][:16].replace("T", " "),
    }


def _with_rank(frame, score_column):
    """Sort *frame* by *score_column* (descending) and prepend a 1-based ``Rank`` column."""
    ranked = frame.sort_values(score_column, ascending=False).reset_index(drop=True)
    ranked.index = ranked.index + 1
    ranked.index.name = "Rank"
    return ranked.reset_index()


def load_results():
    """Build the leaderboard tables from the JSON files in the results repo.

    Every ``results/*.json`` file in ``RESULTS_REPO`` whose submission ID is
    not in the exclusion list is downloaded and turned into one row. Files
    that cannot be downloaded or parsed, or that lack a required field, are
    skipped.

    Returns:
        tuple: ``(by_model, all_subs)``. ``all_subs`` has one row per
        submission, ranked by accuracy. ``by_model`` aggregates submissions
        sharing agent harness, model and submitter, ranked by average
        accuracy. Both are empty DataFrames when the repo cannot be listed or
        no usable result exists.
    """
    try:
        files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)
    except Exception:
        return pd.DataFrame(), pd.DataFrame()

    result_files = [f for f in files if f.startswith("results/") and f.endswith(".json")]
    if not result_files:
        return pd.DataFrame(), pd.DataFrame()

    excluded = load_excluded_submissions()
    all_results = []
    for rf in result_files:
        # Strip only the leading directory and trailing extension; str.replace
        # would also remove these substrings from the middle of an ID.
        submission_id = rf.removeprefix("results/").removesuffix(".json")
        if submission_id in excluded:
            continue
        try:
            path = hf_hub_download(
                repo_id=RESULTS_REPO, filename=rf,
                repo_type="dataset", token=TOKEN,
            )
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
            all_results.append(_result_row(data))
        except Exception:
            # Best effort: one unreadable or incomplete result file must not
            # take the whole leaderboard down.
            continue

    if not all_results:
        return pd.DataFrame(), pd.DataFrame()

    all_df = _with_rank(pd.DataFrame(all_results), "Accuracy (%)")

    by_model = all_df.groupby(["Agent Harness", "Model", "Submitter"]).agg(
        **{
            "Organisation": ("Organisation", "first"),
            "Avg Accuracy (%)": ("Accuracy (%)", "mean"),
            "Submissions": ("Model", "count"),
            "Best (%)": ("Accuracy (%)", "max"),
            "Latest Submission": ("Submitted", "max"),
        }
    ).reset_index()
    by_model["Avg Accuracy (%)"] = by_model["Avg Accuracy (%)"].round(1)
    by_model["Best (%)"] = by_model["Best (%)"].round(1)
    by_model = _with_rank(by_model, "Avg Accuracy (%)")
    return by_model, all_df
def validate_submission(file_path):
    """Parse a predictions TSV and check it against the expected question set.

    Args:
        file_path: Path of the uploaded tab-separated file.

    Returns:
        tuple: ``(df, None)`` when the file is valid, otherwise
        ``(None, message)`` where *message* describes the first problem found:
        unparsable file, missing columns, missing question IDs, unknown
        question IDs, or a row count different from ``TOTAL_QUESTIONS``.
    """
    try:
        submission = pd.read_csv(file_path, sep="\t", dtype=str)
    except Exception as exc:
        return None, f"Could not parse file as TSV: {exc}"

    required_columns = ("question_id", "answer")
    if not all(column in submission.columns for column in required_columns):
        return None, "File must have 'question_id' and 'answer' columns."

    submitted_ids = set(submission["question_id"].astype(str).values)

    missing = EXPECTED_QUESTION_IDS - submitted_ids
    extra = submitted_ids - EXPECTED_QUESTION_IDS

    if missing:
        # Show at most five IDs so the message stays readable.
        preview = ", ".join(sorted(missing)[:5])
        suffix = "..." if len(missing) > 5 else ""
        return None, f"Missing {len(missing)} question(s): {preview}{suffix}"

    if extra:
        preview = ", ".join(sorted(extra)[:5])
        suffix = "..." if len(extra) > 5 else ""
        return None, f"Unknown question ID(s): {preview}{suffix}"

    row_count = len(submission)
    if row_count != TOTAL_QUESTIONS:
        return None, f"Expected {TOTAL_QUESTIONS} answers, got {row_count}."

    return submission, None
def check_rate_limit(username):
    """Return True when *username* may still submit today (UTC).

    Users in ``RATE_LIMIT_EXEMPT_USERS`` are always allowed. Otherwise the
    ``metadata/*.json`` files of ``SUBMISSIONS_REPO`` are inspected and those
    whose ``submitter`` equals *username* and whose ``submission_time`` starts
    with today's UTC date are counted. The scan stops as soon as
    ``DAILY_SUBMISSION_LIMIT`` matches are found, so no further metadata files
    are downloaded once the answer is known.

    If the repository cannot be listed the check allows the submission;
    metadata files that cannot be downloaded or parsed are ignored.
    """
    if username in RATE_LIMIT_EXEMPT_USERS:
        return True

    try:
        files = list_repo_files(SUBMISSIONS_REPO, repo_type="dataset", token=TOKEN)
    except Exception:
        # Listing failed: allow the submission rather than block the user.
        return True

    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    metadata_files = [f for f in files if f.startswith("metadata/") and f.endswith(".json")]

    count_today = 0
    for mf in metadata_files:
        try:
            path = hf_hub_download(
                repo_id=SUBMISSIONS_REPO, filename=mf,
                repo_type="dataset", token=TOKEN,
            )
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
            if data.get("submitter") == username and data.get("submission_time", "").startswith(today):
                count_today += 1
        except Exception:
            # Unreadable or malformed metadata does not count against the user.
            continue
        if count_today >= DAILY_SUBMISSION_LIMIT:
            # Limit reached: the remaining files cannot change the outcome.
            return False

    return count_today < DAILY_SUBMISSION_LIMIT
def submit_predictions(agent_harness, model, organisation, contact_email, file, profile: gr.OAuthProfile | None):
    """Validate a submission form and store it in the submissions repo.

    Args:
        agent_harness: Name of the agent harness (required).
        model: Name of the model (required).
        organisation: Submitter's organisation (may be empty).
        contact_email: Contact address (may be empty; must contain "@" if given).
        file: Path of the uploaded predictions TSV, or None when nothing was uploaded.
        profile: OAuth profile of the signed-in user, or None when not signed in.

    Returns:
        str: A message for the user, either the reason the submission was
        rejected or a confirmation that it was stored.

    Text inputs are stripped before they are validated, so whitespace-only
    values are treated as empty. The metadata record and the predictions file
    are written in a single commit: either both reach the repo or neither
    does, so a failed upload cannot leave a "pending" metadata record without
    predictions (which would also count toward the daily limit).
    """
    # Function-local import: this name is not part of the module's import list.
    from huggingface_hub import CommitOperationAdd

    if profile is None:
        return "Please sign in with your Hugging Face account before submitting."

    agent_harness = (agent_harness or "").strip()
    model = (model or "").strip()
    organisation = (organisation or "").strip()
    contact_email = (contact_email or "").strip()

    if not agent_harness:
        return "Please enter an agent harness name."
    if not model:
        return "Please enter a model name."
    if contact_email and "@" not in contact_email:
        return "Please enter a valid email address."
    if file is None:
        return "Please upload a predictions TSV file."

    submitter = profile.username
    if not check_rate_limit(submitter):
        return f"You have reached the daily submission limit ({DAILY_SUBMISSION_LIMIT} per day). Please try again tomorrow."

    # Only the verdict is needed; the file is uploaded as the original bytes.
    _, error = validate_submission(file)
    if error:
        return f"Validation failed: {error}"

    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    safe_harness = agent_harness.replace("/", "_").replace(" ", "_")
    safe_model = model.replace("/", "_").replace(" ", "_")
    submission_id = f"{safe_harness}_{safe_model}_{timestamp.replace(':', '-')}"

    metadata = {
        "agent_harness": agent_harness,
        "model": model,
        "organisation": organisation,
        "contact_email": contact_email,
        "submitter": submitter,
        "submission_time": timestamp,
        "status": "pending",
    }
    metadata_bytes = json.dumps(metadata, indent=2).encode()

    # Read the predictions before anything is written to the Hub.
    with open(file, "rb") as f:
        predictions_bytes = f.read()

    API.create_commit(
        repo_id=SUBMISSIONS_REPO,
        repo_type="dataset",
        token=TOKEN,
        operations=[
            CommitOperationAdd(
                path_in_repo=f"metadata/{submission_id}.json",
                path_or_fileobj=metadata_bytes,
            ),
            CommitOperationAdd(
                path_in_repo=f"predictions/{submission_id}.tsv",
                path_or_fileobj=predictions_bytes,
            ),
        ],
        commit_message=f"Add submission: {agent_harness} + {model}",
    )

    try:
        API.restart_space("Genentech/compbiobench-evaluator-v1", token=TOKEN)
    except Exception:
        # Best effort: the submission is already stored, so a failed restart
        # request must not be reported to the user as a failed submission.
        pass

    return f"Submitted successfully! Your submission '{agent_harness} + {model}' is being evaluated. Results will appear on the leaderboard shortly."
# Markdown heading rendered at the top of the page.
TITLE = "# CompBioBench v1 Leaderboard"
# Introductory Markdown shown under the title. It is an f-string so the
# dataset link can interpolate DATA_REPO.
INTRODUCTION = f"""
We introduce CompBioBench v1, a benchmark of 100 diverse tasks for evaluating agentic systems in computational biology.
Unlike mathematics and programming, which more readily admit systematic verification, biological data are inherently
noisy and open to interpretation. To enable objective evaluation without reducing tasks to prescriptive checklists,
we propose a new benchmark-construction strategy based on synthetic/augmented data and metadata scrambling/scrubbing
of real datasets to create challenging problems with a single ground-truth answer that require multi-step reasoning,
tool use, bespoke code, and interaction with real-world external resources. The benchmark spans genomics,
transcriptomics, epigenomics, single-cell analysis, human genetics, and machine learning workflows. Questions are
curated by domain experts to cover a broad range of skills with varying difficulty.
For more information, see the [paper](https://www.biorxiv.org/content/10.64898/2026.04.06.716850v1). Questions and input data are available [here](https://huggingface.co/datasets/{DATA_REPO}).
Check the Instructions tab for submission details.
Submissions from the original authors are the ones submitted by `surag-genentech`.
"""
# Markdown for the "Instructions" tab. Also an f-string: single braces
# interpolate DATA_REPO and TOTAL_QUESTIONS, doubled braces render as literal
# braces in the prompt template, and "\\t" renders as a literal backslash-t.
ABOUT_TEXT = f"""
## Getting Started
1. Download questions and input files from the [dataset]({f"https://huggingface.co/datasets/{DATA_REPO}"})
2. Run your agent on all 100 questions
3. Submit your answers as a TSV file with columns `question_id` and `answer`
### Running the Benchmark
Each question should be run in an isolated environment (e.g., a separate conda clone) to prevent
cross-contamination between questions. In our experiments, we created a base conda environment with
Python 3.11, standard scientific and bioinformatics tooling, and cloned it per question. You can see
our code [here](https://github.com/Genentech/compbiobench-runner/tree/dc350ed). Users are free to choose their own isolation strategy.
For each question, input files should be placed in an isolated workspace directory, and the agent
should be launched with that workspace as its working directory. Web access should be enabled.
We recommend running the full benchmark 3 times and submitting each run separately. The leaderboard
automatically averages scores across multiple submissions of the same agent harness + model combination.
### Submission Format
Upload a tab-separated file with exactly {TOTAL_QUESTIONS+1} rows (header + one per question):
```
question_id\\tanswer
ensembl-grab-q1\\tENSG00000000001;ENSG00000000002
bam-ops-q1\\t42
...
```
### Evaluation
Answers are evaluated by **exact string match** (whitespace-stripped). Your accuracy is the fraction of
questions answered correctly out of {TOTAL_QUESTIONS}.
### Prompt Template
The following is the prompt template we used in our experiments:
```
QUESTION: {{question}}
FILES: {{workspace_file_paths}}
Note: All files are located in your current working directory (workspace).
INSTRUCTIONS:
- You have {{timeout_minutes}} minutes to complete this task
- Do not read/access any other files outside the workspace.
- Get any files or tools you need from the internet.
- You are free to modify the current conda environment as needed.
- Keep all scripts and intermediate data in the workspace only.
OUTPUT CONTRACT (IMPORTANT):
- Your final response will be graded by exact string match.
- Return EXACTLY ONE LINE containing **ONLY** the final answer
in the format required by the question.
- Do NOT include any explanation, reasoning, labels, prefixes,
markdown, code fences, citations, or extra whitespace lines.
- Any extra text before or after the answer is incorrect.
- Before sending your final response, verify it is exactly one
line and nothing else.
```
"""
# Build the UI: a page header followed by three tabs (Leaderboard,
# Instructions, Submit), then start the app.
with gr.Blocks(title="CompBioBench v1 Leaderboard") as demo:
    gr.Markdown(TITLE)
    gr.Markdown(INTRODUCTION)
    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            # Two read-only views of the results: aggregated and per submission.
            with gr.Tabs():
                with gr.TabItem("By Model"):
                    by_model_table = gr.Dataframe(
                        interactive=False,
                        column_widths=["5%", "20%", "20%", "10%", "15%", "15%", "5%", "8%", "15%"],
                    )
                with gr.TabItem("All Submissions"):
                    all_subs_table = gr.Dataframe(
                        interactive=False,
                        column_widths=["5%", "20%", "20%", "10%", "15%", "10%", "5%", "5%", "20%"],
                    )
            refresh_btn = gr.Button("Refresh")
            def refresh():
                """Return both leaderboard tables, substituting None for an empty table."""
                by_model, all_subs = load_results_cached()
                return (
                    by_model if not by_model.empty else None,
                    all_subs if not all_subs.empty else None,
                )
            refresh_btn.click(fn=refresh, outputs=[by_model_table, all_subs_table])
        with gr.TabItem("Instructions"):
            gr.Markdown(ABOUT_TEXT)
        with gr.TabItem("Submit"):
            gr.Markdown("## Submit Your Predictions")
            gr.Markdown(
                "Sign in with your Hugging Face account, then fill in the details and upload a TSV file with "
                f"`question_id` and `answer` columns containing answers to all {TOTAL_QUESTIONS} questions. "
                f"Limit: {DAILY_SUBMISSION_LIMIT} submissions per day. "
                "We recommend submitting 3 independent runs; the leaderboard averages scores automatically."
            )
            gr.LoginButton()
            with gr.Row():
                with gr.Column():
                    harness_input = gr.Textbox(label="Agent Harness", placeholder="e.g. OpenHands, Claude Code, custom")
                    model_input = gr.Textbox(label="Model", placeholder="e.g. GPT-4o, Claude 3.5 Sonnet")
                with gr.Column():
                    org_input = gr.Textbox(label="Organisation (optional)", placeholder="e.g. Genentech")
                    email_input = gr.Textbox(label="Contact Email (private, used only to contact you regarding your submission)", placeholder="your@email.com")
            file_input = gr.File(label="Predictions TSV (.tsv file)")
            submit_btn = gr.Button("Submit")
            result_output = gr.Markdown()
            # The five inputs map to the first five parameters of
            # submit_predictions; its `profile` parameter is not listed here.
            submit_btn.click(
                fn=submit_predictions,
                inputs=[harness_input, model_input, org_input, email_input, file_input],
                outputs=result_output,
            )
    # Fill both tables when the page is loaded.
    demo.load(fn=refresh, outputs=[by_model_table, all_subs_table])
# Enable the request queue (default concurrency limit of 10) and start the app.
demo.queue(default_concurrency_limit=10).launch()