Upload RPC-Bench Space

- .gitignore +4 -0
- README.md +57 -7
- app.py +274 -0
- build_seed_leaderboard.py +156 -0
- constants.py +62 -0
- eval.py +264 -0
- leaderboard_seed.csv +29 -0
- requirements.txt +7 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+index.html
+__pycache__/
+.cache/
+*.pyc
README.md
CHANGED
@@ -1,13 +1,63 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: RPC-Bench Leaderboard
+emoji: 📊
+colorFrom: indigo
+colorTo: purple
 sdk: gradio
-sdk_version:
-python_version: '3.13'
+sdk_version: 4.44.1
 app_file: app.py
 pinned: false
+license: mit
 ---
 
-
+<p align="center">
+  🌐 <a href="https://rpc-bench.github.io/" target="_blank">Project Page</a> •
+  💻 <a href="https://github.com/RPC-Bench/PRC-Bench" target="_blank">GitHub</a> •
+  📖 <a href="https://arxiv.org/abs/2601.14289" target="_blank">Paper</a> •
+  🤗 <a href="https://huggingface.co" target="_blank">Hugging Face</a> •
+  🧭 <a href="https://community.modelscope.cn/" target="_blank">ModelScope</a>
+</p>
+
+# RPC-Bench Leaderboard
+
+RPC-Bench is a benchmark for research paper comprehension. This Space provides two functions:
+
+- a public leaderboard for published submissions
+- a submission entry for uploading new evaluation files
+
+## Expected repository layout
+
+The Space is designed to work with a separate submission dataset repository.
+
+```text
+space/
+├── app.py
+├── constants.py
+├── eval.py
+├── requirements.txt
+└── benchmark/
+    ├── dev.json
+    └── test.json
+```
+
+If `benchmark/dev.json` and `benchmark/test.json` are not bundled in the Space repo, set `RPC_BENCH_GOLD_DIR` (or the per-split overrides `RPC_BENCH_GOLD_DEV` / `RPC_BENCH_GOLD_TEST`) through Space secrets / variables.
+
+The static leaderboard seed is stored in `leaderboard_seed.csv`. `index.html` is only used locally to generate that CSV and should not be uploaded to the Space repository.
+
+## Submission format
+
+Uploaded files should be JSONL with one answer per line:
+
+```json
+{"id":"...", "part_idx":1, "question":"...", "gen_answer":"...", "category":"..."}
+```
+
+## Required environment variables
+
+- `HF_TOKEN`: token for cloning and pushing the submission repository
+- `SUBMISSION_REPO_ID`: dataset repo used to store leaderboard results
+- `RPC_BENCH_GOLD_DIR`: optional directory containing `dev.json` and `test.json`
+- `OPENAI_API_KEY`: optional; required if you want the Space to run LLM-based judging inline
+- `OPENAI_BASE_URL`: optional, for OpenAI-compatible endpoints
+
+The Space can still accept uploads when the judge variables are missing, but evaluation will be marked as pending.
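A submission file must list answers in the same paper and `qa_pairs` order as the gold split, since `eval.py` zips the two sequences. A minimal sketch of building such a file (hypothetical paths; `generate` is a placeholder for your model call, and the gold split is assumed to be JSON-lines, which is how `eval.py` reads it):

```python
# Sketch: build a submission JSONL from a gold split file.
# Hypothetical paths; generate() stands in for model inference.
import json

def generate(question: str) -> str:
    return "..."  # your model call here

with open("benchmark/dev.json", encoding="utf-8") as fin, \
     open("my_model_dev.jsonl", "w", encoding="utf-8") as fout:
    for line in fin:
        if not line.strip():
            continue
        paper = json.loads(line)
        # part_idx is 1-based within each paper, matching eval.py's ordering check
        for idx, qa in enumerate(paper.get("qa_pairs", []), start=1):
            fout.write(json.dumps({
                "id": paper["id"],
                "part_idx": idx,
                "question": qa["question"],
                "gen_answer": generate(qa["question"]),
                "category": qa["category"],
            }, ensure_ascii=False) + "\n")
```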
app.py
ADDED
@@ -0,0 +1,274 @@
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import tempfile
+import traceback
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import List
+
+import gradio as gr
+import pandas as pd
+from huggingface_hub import Repository
+
+from constants import (
+    ALL_COLUMNS,
+    CITATION,
+    EXTERNAL_LINKS,
+    GOLD_PATHS,
+    HF_TOKEN,
+    INTRODUCTION,
+    MODEL_COLUMNS,
+    SCORE_COLUMNS,
+    SEED_LEADERBOARD_PATH,
+    SPACE_SUBTITLE,
+    SPACE_TITLE,
+    SUBMISSION_CSV_PATH,
+    SUBMISSION_REPO_ID,
+    SUBMISSION_REPO_TYPE,
+    SUBMIT_GUIDANCE,
+)
+from eval import evaluate_submission
+
+
+def _empty_leaderboard() -> pd.DataFrame:
+    return pd.DataFrame(columns=ALL_COLUMNS)
+
+
+def _normalize_leaderboard_df(df: pd.DataFrame) -> pd.DataFrame:
+    for col in SCORE_COLUMNS:
+        if col in df.columns:
+            df[col] = pd.to_numeric(df[col], errors="coerce")
+    return df
+
+
+def _seed_leaderboard() -> pd.DataFrame:
+    if not SEED_LEADERBOARD_PATH.exists():
+        return _empty_leaderboard()
+
+    df = pd.read_csv(SEED_LEADERBOARD_PATH)
+    for col in ALL_COLUMNS:
+        if col not in df.columns:
+            df[col] = ""
+    return _normalize_leaderboard_df(df[ALL_COLUMNS])
+
+
+def _clone_submission_repo() -> tuple[Repository | None, Path]:
+    if not SUBMISSION_REPO_ID:
+        return None, Path(".")
+
+    local_dir = Path(tempfile.mkdtemp(prefix="rpc_bench_submission_"))
+    repo = Repository(
+        local_dir=str(local_dir),
+        clone_from=SUBMISSION_REPO_ID,
+        repo_type=SUBMISSION_REPO_TYPE,
+        use_auth_token=HF_TOKEN,
+    )
+    repo.git_pull()
+    return repo, local_dir
+
+
+def _load_leaderboard() -> pd.DataFrame:
+    try:
+        seed_df = _seed_leaderboard()
+        repo, local_dir = _clone_submission_repo()
+        if repo is None:
+            return seed_df.sort_values(by=["Info"], ascending=False, na_position="last")
+
+        csv_path = local_dir / SUBMISSION_CSV_PATH
+        if not csv_path.exists():
+            return seed_df.sort_values(by=["Info"], ascending=False, na_position="last")
+
+        df = pd.read_csv(csv_path)
+        for col in ALL_COLUMNS:
+            if col not in df.columns:
+                df[col] = ""
+        merged = pd.concat([seed_df, _normalize_leaderboard_df(df[ALL_COLUMNS])], ignore_index=True)
+        return merged.sort_values(by=["Info"], ascending=False, na_position="last")
+    except Exception:
+        print(traceback.format_exc())
+        return _seed_leaderboard().sort_values(by=["Info"], ascending=False, na_position="last")
+
+
+def _validate_submission_file(file_path: str) -> tuple[bool, str, List[dict]]:
+    path = Path(file_path)
+    if not path.exists():
+        return False, "Uploaded file does not exist.", []
+    if path.suffix.lower() not in {".jsonl", ".json"}:
+        return False, "Submission file must be JSONL or JSON.", []
+
+    rows: List[dict] = []
+    try:
+        if path.suffix.lower() == ".json":
+            loaded = json.loads(path.read_text(encoding="utf-8"))
+            if not isinstance(loaded, list):
+                return False, "JSON submissions must be a list of records.", []
+            rows = loaded
+        else:
+            with path.open("r", encoding="utf-8") as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    rows.append(json.loads(line))
+    except Exception as exc:
+        return False, f"Failed to parse submission file: {exc}", []
+
+    required = {"id", "part_idx", "question", "gen_answer", "category"}
+    for idx, row in enumerate(rows, start=1):
+        missing = required - set(row.keys())
+        if missing:
+            return False, f"Row {idx} is missing fields: {sorted(missing)}", []
+    return True, "Submission format is valid.", rows
+
+
+def _append_submission_record(local_dir: Path, leaderboard: pd.DataFrame, row: dict) -> pd.DataFrame:
+    csv_path = local_dir / SUBMISSION_CSV_PATH
+    merged = pd.concat([leaderboard, pd.DataFrame([row])], ignore_index=True)
+    merged = merged.reindex(columns=ALL_COLUMNS)
+    merged.to_csv(csv_path, index=False)
+    return merged
+
+
+def submit_prediction(
+    input_file,
+    model_name: str,
+    organization: str,
+    revision: str,
+    model_link: str,
+    input_config: str,
+    split: str,
+):
+    if input_file is None:
+        return "Error: please upload a prediction file.", gr.update(value=_load_leaderboard())
+
+    path = input_file if isinstance(input_file, str) else getattr(input_file, "name", None)
+    if not path:
+        return "Error: could not access the uploaded file.", gr.update(value=_load_leaderboard())
+
+    ok, message, _ = _validate_submission_file(path)
+    if not ok:
+        return f"Error: {message}", gr.update(value=_load_leaderboard())
+
+    try:
+        repo, local_dir = _clone_submission_repo()
+        leaderboard = _load_leaderboard()
+
+        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
+        display_name = revision.strip() or model_name.strip()
+        if model_link.strip() and "](" not in display_name:
+            display_name = f"[{display_name}]({model_link.strip()})"
+
+        status = "pending"
+        score_row = {k: "" for k in SCORE_COLUMNS}
+        split_path = GOLD_PATHS.get(split.lower())
+
+        if os.environ.get("OPENAI_API_KEY") and split_path and split_path.exists():
+            eval_dir = local_dir / ".eval" if repo is not None else Path(tempfile.mkdtemp(prefix="rpc_bench_eval_"))
+            try:
+                score_row = evaluate_submission(split_path, path, eval_dir)
+                status = "scored"
+            except Exception:
+                print(traceback.format_exc())
+                status = "uploaded, evaluation failed"
+        else:
+            status = "uploaded, evaluation pending"
+
+        record = {
+            "Model": display_name,
+            "Organization": organization.strip(),
+            "Input Config": input_config.strip().upper(),
+            "Date": now,
+            "Status": status,
+            **{k: score_row.get(k, "") for k in SCORE_COLUMNS},
+        }
+
+        if repo is None:
+            return (
+                "Submission accepted, but no submission repository is configured. "
+                "Set `SUBMISSION_REPO_ID` to enable persistent leaderboard updates.",
+                gr.update(value=_load_leaderboard()),
+            )
+
+        submissions_dir = local_dir / "submissions"
+        submissions_dir.mkdir(parents=True, exist_ok=True)
+        stored_name = f"{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}_{Path(path).name}"
+        shutil.copy2(path, submissions_dir / stored_name)
+
+        updated_leaderboard = _append_submission_record(local_dir, leaderboard, record)
+        repo.push_to_hub()
+
+        return f"OK: {message}. Status: {status}", gr.update(value=updated_leaderboard)
+    except Exception as exc:
+        print(traceback.format_exc())
+        return f"Error: {exc}", gr.update(value=_load_leaderboard())
+
+
+def refresh_leaderboard():
+    return gr.update(value=_load_leaderboard())
+
+
+with gr.Blocks(title=SPACE_TITLE) as demo:
+    gr.Markdown(EXTERNAL_LINKS)
+    gr.Markdown(f"# {SPACE_TITLE}")
+    gr.Markdown(SPACE_SUBTITLE)
+    gr.Markdown(INTRODUCTION)
+
+    with gr.Tabs():
+        with gr.TabItem("🏅 Leaderboard"):
+            with gr.Row():
+                refresh_btn = gr.Button("Refresh")
+            leaderboard = gr.Dataframe(
+                value=_load_leaderboard(),
+                headers=ALL_COLUMNS,
+                datatype=["markdown", "str", "str", "str", "str", "number", "number", "number", "number", "number"],
+                interactive=False,
+                wrap=True,
+            )
+            refresh_btn.click(fn=refresh_leaderboard, inputs=None, outputs=leaderboard)
+
+        with gr.TabItem("📝 Submit"):
+            gr.Markdown(SUBMIT_GUIDANCE)
+            with gr.Row():
+                with gr.Column():
+                    model_name = gr.Textbox(label="Model name", placeholder="Your model name")
+                    organization = gr.Textbox(label="Organization", placeholder="Your lab, company, or team name")
+                    revision = gr.Textbox(label="Revision name", placeholder="Optional revision label")
+                with gr.Column():
+                    model_link = gr.Textbox(label="Model link", placeholder="https://huggingface.co/...")
+                    input_config = gr.Dropdown(
+                        choices=["TEXT", "VISUAL"],
+                        value="TEXT",
+                        label="Input config",
+                        interactive=True,
+                    )
+                    split = gr.Dropdown(
+                        choices=["test", "dev"],
+                        value="test",
+                        label="Evaluation split",
+                        interactive=True,
+                    )
+
+            input_file = gr.File(label="Upload prediction file", file_count="single", type="filepath")
+            submit_btn = gr.Button("Submit and evaluate")
+            submit_result = gr.Markdown()
+
+            submit_btn.click(
+                fn=submit_prediction,
+                inputs=[input_file, model_name, organization, revision, model_link, input_config, split],
+                outputs=[submit_result, leaderboard],
+            )
+
+        with gr.TabItem("ℹ️ About"):
+            gr.Markdown("## Citation")
+            gr.Code(CITATION, language="bibtex")
+
+            gr.Markdown(
+                "If you want inline evaluation, configure `OPENAI_API_KEY` and `OPENAI_BASE_URL` in the Space secrets."
+            )
+
+
+if __name__ == "__main__":
+    demo.launch()
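The validator above can be exercised locally without any Hub or judge configuration, since `_validate_submission_file` only checks parseability and required fields. A quick sketch (hypothetical file name; importing `app` builds the UI but does not launch it):

```python
# Sketch: smoke-test the submission validator (hypothetical file name).
import json
from app import _validate_submission_file

with open("sample.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps({
        "id": "paper-001", "part_idx": 1, "question": "Q?",
        "gen_answer": "A.", "category": "Claim_Verification",
    }) + "\n")

ok, message, rows = _validate_submission_file("sample.jsonl")
print(ok, message, len(rows))  # True Submission format is valid. 1
```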
build_seed_leaderboard.py
ADDED
@@ -0,0 +1,156 @@
+from __future__ import annotations
+
+import csv
+import re
+from html.parser import HTMLParser
+from pathlib import Path
+
+
+SPACE_DIR = Path(__file__).resolve().parent
+INDEX_HTML = SPACE_DIR / "index.html"
+OUTPUT_CSV = SPACE_DIR / "leaderboard_seed.csv"
+
+
+class ResultsTableParser(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__()
+        self.in_results_table = False
+        self.in_tbody = False
+        self.in_tr = False
+        self.in_td = False
+        self.in_a = False
+        self.in_p = False
+        self.current_href = ""
+        self.current_cell_parts: list[str] = []
+        self.current_row: list[dict] = []
+        self.rows: list[list[dict]] = []
+
+    def handle_starttag(self, tag, attrs):
+        attrs_dict = dict(attrs)
+        if tag == "table" and attrs_dict.get("id") == "results":
+            self.in_results_table = True
+        elif self.in_results_table and tag == "tbody":
+            self.in_tbody = True
+        elif self.in_tbody and tag == "tr":
+            self.in_tr = True
+            self.current_row = []
+        elif self.in_tr and tag == "td":
+            self.in_td = True
+            self.current_cell_parts = []
+            self.current_href = ""
+        elif self.in_td and tag == "a":
+            self.in_a = True
+            self.current_href = attrs_dict.get("href", "").strip()
+        elif self.in_td and tag == "p":
+            self.in_p = True
+        elif self.in_td and tag == "br":
+            self.current_cell_parts.append(" ")
+
+    def handle_endtag(self, tag):
+        if tag == "table" and self.in_results_table:
+            self.in_results_table = False
+        elif tag == "tbody" and self.in_tbody:
+            self.in_tbody = False
+        elif tag == "tr" and self.in_tr:
+            self.in_tr = False
+            if self.current_row:
+                self.rows.append(self.current_row)
+        elif tag == "td" and self.in_td:
+            text = re.sub(r"\s+", " ", "".join(self.current_cell_parts)).strip()
+            self.current_row.append({"text": text, "href": self.current_href})
+            self.in_td = False
+            self.in_a = False
+            self.in_p = False
+            self.current_cell_parts = []
+            self.current_href = ""
+        elif tag == "a":
+            self.in_a = False
+        elif tag == "p":
+            self.in_p = False
+
+    def handle_data(self, data):
+        if self.in_td:
+            self.current_cell_parts.append(data)
+
+
+def parse_rows() -> list[dict]:
+    parser = ResultsTableParser()
+    parser.feed(INDEX_HTML.read_text(encoding="utf-8"))
+
+    records = []
+    for row in parser.rows:
+        # A full row has 9 cells: rank, model, input config, date, then the
+        # five score columns ending with Info at index 8.
+        if len(row) < 9:
+            continue
+
+        model_cell = row[1]
+        model_text = model_cell["text"]
+        parts = [part.strip() for part in model_text.split(" ") if part.strip()]
+        organization = ""
+
+        # The parser preserves the model and organization in a single cell text.
+        # Organization appears after the model title because of the nested <p>.
+        # We recover it by subtracting the anchor text prefix from the cell text.
+        model_name = model_text
+        if model_cell["href"]:
+            anchor_name = model_text.split(" ")[0]
+            if model_text.startswith(anchor_name):
+                model_name = anchor_name
+                organization = model_text[len(anchor_name):].strip()
+
+        # For names with spaces, use the full text before the trailing organization line.
+        if not organization and len(parts) >= 2:
+            organization = parts[-1]
+
+        if organization and model_name.endswith(organization):
+            model_name = model_name[: -len(organization)].strip()
+
+        if not model_name:
+            model_name = model_text
+
+        if model_cell["href"]:
+            model_md = f"[{model_name}]({model_cell['href']})"
+        else:
+            model_md = model_name
+
+        record = {
+            "Model": model_md,
+            "Organization": organization,
+            "Input Config": row[2]["text"].upper(),
+            "Date": row[3]["text"],
+            "Status": "published",
+            "Conciseness": row[4]["text"],
+            "Correctness": row[5]["text"],
+            "Completeness": row[6]["text"],
+            "F1-like": row[7]["text"],
+            "Info": row[8]["text"],
+        }
+        records.append(record)
+
+    return records
+
+
+def main() -> None:
+    rows = parse_rows()
+    with OUTPUT_CSV.open("w", encoding="utf-8", newline="") as f:
+        writer = csv.DictWriter(
+            f,
+            fieldnames=[
+                "Model",
+                "Organization",
+                "Input Config",
+                "Date",
+                "Status",
+                "Conciseness",
+                "Correctness",
+                "Completeness",
+                "F1-like",
+                "Info",
+            ],
+        )
+        writer.writeheader()
+        writer.writerows(rows)
+    print(f"Wrote {len(rows)} rows to {OUTPUT_CSV}")
+
+
+if __name__ == "__main__":
+    main()
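Because the parser only reacts inside `<table id="results">`, it can be smoke-tested on a minimal snippet before being run against the real `index.html`. A sketch with made-up cell values:

```python
# Sketch: feed a minimal results table through ResultsTableParser
# (made-up values; a full row has 9 cells, rank through Info).
from build_seed_leaderboard import ResultsTableParser

html = """
<table id="results"><tbody><tr>
  <td>1</td>
  <td><a href="https://example.org">ModelX</a>
      <p>OrgY</p></td>
  <td>text</td><td>2025-1-1</td>
  <td>50.0</td><td>60.0</td><td>55.0</td><td>57.4</td><td>28.7</td>
</tr></tbody></table>
"""

parser = ResultsTableParser()
parser.feed(html)
print(parser.rows[0][1])  # {'text': 'ModelX OrgY', 'href': 'https://example.org'}
```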
constants.py
ADDED
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+
+SPACE_ROOT = Path(__file__).resolve().parent
+DEFAULT_OUTPUT_DIR = SPACE_ROOT / ".cache"
+SEED_LEADERBOARD_PATH = SPACE_ROOT / "leaderboard_seed.csv"
+
+SPACE_TITLE = "RPC-Bench Leaderboard"
+SPACE_SUBTITLE = "Leaderboard and submission entry for RPC-Bench."
+
+SUBMISSION_REPO_ID = os.environ.get("SUBMISSION_REPO_ID", "").strip()
+SUBMISSION_REPO_TYPE = "dataset"
+SUBMISSION_CSV_PATH = os.environ.get("SUBMISSION_CSV_PATH", "leaderboard.csv").strip()
+
+HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() or None
+
+GOLD_DIR = Path(os.environ.get("RPC_BENCH_GOLD_DIR", SPACE_ROOT / "benchmark"))
+GOLD_PATHS = {
+    "dev": Path(os.environ.get("RPC_BENCH_GOLD_DEV", GOLD_DIR / "dev.json")),
+    "test": Path(os.environ.get("RPC_BENCH_GOLD_TEST", GOLD_DIR / "test.json")),
+}
+
+MODEL_COLUMNS = ["Model", "Organization", "Input Config", "Date", "Status"]
+SCORE_COLUMNS = [
+    "Conciseness",
+    "Correctness",
+    "Completeness",
+    "F1-like",
+    "Info",
+]
+ALL_COLUMNS = MODEL_COLUMNS + SCORE_COLUMNS
+
+EXTERNAL_LINKS = """
+<p align="center">
+  🌐 <a href="https://rpc-bench.github.io/" target="_blank">Project Page</a> •
+  💻 <a href="https://github.com/RPC-Bench/PRC-Bench" target="_blank">GitHub</a> •
+  📖 <a href="https://arxiv.org/abs/2601.14289" target="_blank">Paper</a> •
+  🤗 <a href="https://huggingface.co" target="_blank">Hugging Face</a> •
+  🧭 <a href="https://community.modelscope.cn/" target="_blank">ModelScope</a>
+</p>
+"""
+
+INTRODUCTION = (
+    "RPC-Bench Leaderboard provides a compact interface for browsing published results "
+    "and uploading new submissions for evaluation."
+)
+
+SUBMIT_GUIDANCE = (
+    "Upload a JSONL prediction file with fields `id`, `part_idx`, `question`, "
+    "`gen_answer`, and `category`. The Space will validate the format, optionally "
+    "run the judge, and then write the result into the submission repository."
+)
+
+CITATION = r"""@article{chen2026rpc,
+  title={RPC-Bench: A Fine-grained Benchmark for Research Paper Comprehension},
+  author={Chen, Yelin and Zhang, Fanjin and Sun, Suping and Pang, Yunhe and Wang, Yuanchun and Song, Jian and Li, Xiaoyan and Hou, Lei and Zhao, Shu and Tang, Jie and others},
+  journal={arXiv preprint arXiv:2601.14289},
+  year={2026}
+}"""
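Gold-path resolution in `constants.py` is layered: a per-split variable wins over `RPC_BENCH_GOLD_DIR`, which wins over the bundled `benchmark/` folder. A sketch of the effective lookup (hypothetical paths; run in a fresh interpreter, since the env is read at import time):

```python
# Sketch: layered gold-path resolution (hypothetical paths).
import os

os.environ["RPC_BENCH_GOLD_DIR"] = "/data/gold"                # directory-level override
os.environ["RPC_BENCH_GOLD_TEST"] = "/data/gold/v2/test.json"  # split-level override

import constants  # must come after the env vars are set

print(constants.GOLD_PATHS["dev"])   # /data/gold/dev.json
print(constants.GOLD_PATHS["test"])  # /data/gold/v2/test.json
```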
eval.py
ADDED
@@ -0,0 +1,264 @@
+from __future__ import annotations
+
+import json
+import os
+import re
+import time
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+from openai import OpenAI
+
+
+DEFAULT_GPT_MODEL = os.environ.get("RPC_BENCH_GPT_MODEL", "gpt-5-2025-08-07")
+DEFAULT_GEMINI_MODEL = os.environ.get("RPC_BENCH_GEMINI_MODEL", "gemini-2.5-pro")
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
+OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "")
+
+
+def _client() -> OpenAI:
+    return OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL or None)
+
+
+def _extract_json(text: str) -> str:
+    text = text.strip()
+    if "```json" in text:
+        match = re.search(r"```json(.*?)```", text, re.DOTALL)
+        if match:
+            return match.group(1).strip()
+    if "```" in text:
+        match = re.search(r"```(.*?)```", text, re.DOTALL)
+        if match:
+            return match.group(1).strip()
+    return text
+
+
+def _load_jsonl(path: str | Path) -> List[Dict]:
+    rows: List[Dict] = []
+    with open(path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            rows.append(json.loads(line))
+    return rows
+
+
+def _judge(messages: List[Dict], model: str) -> str:
+    client = _client()
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        stream=False,
+    )
+    return response.choices[0].message.content or ""
+
+
+def _score_prompt(title: str, abstract: str, question: str, reference_answer: str, predicted_answer: str) -> List[Dict]:
+    system_prompt = (
+        "You are a strict paper-answer judge. Return JSON only. "
+        "Score the prediction on three dimensions: Conciseness, Correctness, Completeness. "
+        "Each dimension must contain a numeric rating in [1, 5] and a short reason."
+    )
+    user_prompt = (
+        f"Title: {title}\n"
+        f"Abstract: {abstract}\n"
+        f"Question: {question}\n"
+        f"Reference answer: {reference_answer}\n"
+        f"Predicted answer: {predicted_answer}\n"
+        "Return JSON only with keys Conciseness, Correctness, Completeness."
+    )
+    return [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+
+
+def _normalize_rating_block(content: Dict) -> Dict:
+    result = {}
+    for key in ("Conciseness", "Correctness", "Completeness"):
+        value = content.get(key, {})
+        if isinstance(value, dict):
+            rating = float(value.get("rating", 0.0))
+            reason = value.get("reason", "")
+        else:
+            rating = float(value)
+            reason = ""
+        result[key] = {"rating": rating, "reason": reason}
+    return result
+
+
+def paper_qa_score(
+    file_path: str | Path,
+    eval_path: str | Path,
+    out_path: str | Path,
+    judge_model: str = "gpt",
+) -> None:
+    gold_items = _load_jsonl(file_path)
+    pred_items = _load_jsonl(eval_path)
+
+    paper_dict: Dict[str, Dict[str, str]] = {}
+    qa_items: List[Dict] = []
+    for paper in gold_items:
+        paper_dict[paper["id"]] = {
+            "title": paper.get("title", ""),
+            "abstract": paper.get("abstract", ""),
+        }
+        for idx, qa in enumerate(paper.get("qa_pairs", []), start=1):
+            qa_items.append(
+                {
+                    "id": paper["id"],
+                    "part_idx": idx,
+                    "question": qa["question"],
+                    "answer": qa["answer"],
+                    "category": qa["category"],
+                }
+            )
+
+    os.makedirs(Path(out_path).parent, exist_ok=True)
+    if len(qa_items) != len(pred_items):
+        raise ValueError(f"Prediction count mismatch: expected {len(qa_items)}, got {len(pred_items)}")
+
+    model_name = DEFAULT_GPT_MODEL if judge_model == "gpt" else DEFAULT_GEMINI_MODEL
+    for gold, pred in zip(qa_items, pred_items):
+        if gold["id"] != pred["id"] or gold["part_idx"] != pred["part_idx"]:
+            raise ValueError(f"Submission order mismatch at {gold['id']} / {gold['part_idx']}")
+
+        if gold["category"] == "Claim_Verification":
+            # Claim-verification items are scored by exact match in
+            # get_verification_score, so no judge call is made here.
+            score_block = []
+        else:
+            messages = _score_prompt(
+                paper_dict[gold["id"]]["title"],
+                paper_dict[gold["id"]]["abstract"],
+                gold["question"],
+                gold["answer"],
+                pred["gen_answer"],
+            )
+            raw = _judge(messages, model_name)
+            score_block = _normalize_rating_block(json.loads(_extract_json(raw)))
+            time.sleep(float(os.environ.get("RPC_BENCH_JUDGE_SLEEP", "0")))
+
+        with open(out_path, "a", encoding="utf-8") as fw:
+            fw.write(
+                json.dumps(
+                    {
+                        "id": gold["id"],
+                        "part_idx": gold["part_idx"],
+                        "question": gold["question"],
+                        "reference_answer": gold["answer"],
+                        "predicted_answer": pred["gen_answer"],
+                        "category": gold["category"],
+                        "score": score_block,
+                    },
+                    ensure_ascii=False,
+                )
+                + "\n"
+            )
+
+
+def get_llm_score(eval_path: str | Path) -> Tuple[Dict[str, float], Dict[str, Tuple[float, float, float]]]:
+    category_dict: Dict[str, Dict[str, float]] = {}
+    sum_c1 = sum_c2 = sum_c3 = 0.0
+    count = 0
+
+    with open(eval_path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            item = json.loads(line)
+            category = item["category"]
+            if category == "Claim_Verification":
+                continue
+
+            if category not in category_dict:
+                category_dict[category] = {"Conciseness": 0.0, "Correctness": 0.0, "Completeness": 0.0, "count": 0.0}
+
+            content = item.get("score", {})
+            c1 = float(content.get("Conciseness", {}).get("rating", 0.0))
+            c2 = float(content.get("Correctness", {}).get("rating", 0.0))
+            c3 = float(content.get("Completeness", {}).get("rating", 0.0))
+
+            category_dict[category]["Conciseness"] += c1
+            category_dict[category]["Correctness"] += c2
+            category_dict[category]["Completeness"] += c3
+            category_dict[category]["count"] += 1
+
+            sum_c1 += c1
+            sum_c2 += c2
+            sum_c3 += c3
+            count += 1
+
+    result: Dict[str, Tuple[float, float, float]] = {}
+    for category, values in category_dict.items():
+        denom = max(values["count"], 1.0)
+        result[category] = (
+            values["Conciseness"] / denom,
+            values["Correctness"] / denom,
+            values["Completeness"] / denom,
+        )
+
+    total_scores = {
+        "Conciseness": sum_c1 / max(count, 1),
+        "Correctness": sum_c2 / max(count, 1),
+        "Completeness": sum_c3 / max(count, 1),
+    }
+    return total_scores, result
+
+
+def calculate_acc(pred: List[str], gold: List[str]) -> float:
+    if not pred:
+        return 0.0
+    return sum(1 for p, g in zip(pred, gold) if p == g) / len(pred)
+
+
+def get_verification_score(gold_path: str | Path, eval_path: str | Path) -> float:
+    gold_answers: List[str] = []
+    pred_answers: List[str] = []
+
+    for paper in _load_jsonl(gold_path):
+        for qa in paper.get("qa_pairs", []):
+            if qa.get("category") == "Claim_Verification":
+                gold_answers.append(str(qa.get("answer", "")).strip())
+
+    for item in _load_jsonl(eval_path):
+        if item.get("category") == "Claim_Verification":
+            pred_answers.append(str(item.get("gen_answer", "")).strip())
+
+    if len(gold_answers) != len(pred_answers):
+        raise ValueError(
+            f"Claim verification count mismatch: expected {len(gold_answers)}, got {len(pred_answers)}"
+        )
+
+    # Answers that are not a literal "True"/"False" count as wrong: they are
+    # mapped to the opposite of the gold label before scoring.
+    normalized_pred: List[str] = []
+    for gold, pred in zip(gold_answers, pred_answers):
+        if pred not in {"True", "False"}:
+            normalized_pred.append("False" if gold == "True" else "True")
+        else:
+            normalized_pred.append(pred)
+
+    return calculate_acc(normalized_pred, gold_answers[: len(normalized_pred)])
+
+
+def evaluate_submission(gold_path: str | Path, pred_path: str | Path, out_dir: str | Path, judge_model: str = "gpt") -> Dict[str, float]:
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    judged_path = out_dir / f"{Path(pred_path).stem}_{judge_model}_judge.jsonl"
+    if judged_path.exists():
+        judged_path.unlink()
+
+    paper_qa_score(gold_path, pred_path, judged_path, judge_model=judge_model)
+    llm_total, _ = get_llm_score(judged_path)
+    claim_acc = get_verification_score(gold_path, pred_path)
+
+    f1_like = (
+        2 * llm_total["Correctness"] * llm_total["Completeness"]
+        / (llm_total["Correctness"] + llm_total["Completeness"] + 1e-8)
+    )
+    # Mean ratings are in [1, 5]; multiplying by 20 rescales them to 0-100.
+    # Info = Conciseness rating * F1-like rating * 4 is already on the 0-100
+    # scale (the raw product tops out at 25), so it is not rescaled again.
+    info = llm_total["Conciseness"] * f1_like * 4
+    return {
+        "Conciseness": round(llm_total["Conciseness"] * 20, 4),
+        "Correctness": round(llm_total["Correctness"] * 20, 4),
+        "Completeness": round(llm_total["Completeness"] * 20, 4),
+        "F1-like": round(f1_like * 20, 4),
+        "Info": round(info, 4),
+        "Claim Accuracy": round(claim_acc * 100, 4),
+    }
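On the leaderboard scale, each judged dimension is the mean 1-5 rating times 20, `F1-like` is the harmonic mean of Correctness and Completeness (again times 20), and `Info` is the Conciseness rating times the F1-like rating times 4. The published GPT-5 TEXT seed row reproduces under these formulas:

```python
# Sanity check: recover the 1-5 ratings from the published GPT-5 TEXT
# columns and recompute F1-like and Info.
conc, corr, comp = 54.93 / 20, 69.10 / 20, 67.33 / 20

f1_like = 2 * corr * comp / (corr + comp + 1e-8)
info = conc * f1_like * 4  # already on the 0-100 scale

print(round(f1_like * 20, 2))  # 68.2  (published: 68.20)
print(round(info, 2))          # 37.46 (published: 37.46)
```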
leaderboard_seed.csv
ADDED
@@ -0,0 +1,29 @@
+Model,Organization,Input Config,Date,Status,Conciseness,Correctness,Completeness,F1-like,Info
+[GPT-5](https://openai.com/index/introducing-gpt-5/),OpenAI,TEXT,2025-8-7,published,54.93,69.10,67.33,68.20,37.46
+[GPT-5.2](https://openai.com/index/introducing-gpt-5-2/),OpenAI,TEXT,2025-12-11,published,53.81,66.84,64.03,65.40,35.19
+[GPT-5](https://openai.com/index/introducing-gpt-5/),OpenAI,VISUAL,2025-8-7,published,61.47,58.90,55.34,57.07,35.08
+[Gemini-2.5-Pro](https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/),Google,TEXT,2025-3-25,published,54.87,62.65,59.03,60.79,33.35
+[Gemini-3-Pro](https://blog.google/products-and-platforms/products/gemini/gemini-3/),Google,TEXT,2025-11-18,published,52.81,62.69,60.28,61.46,32.46
+[DeepSeek-V3.2](https://api-docs.deepseek.com/news/news251201),DeepSeek-AI,TEXT,2025-12-1,published,56.31,58.73,55.19,56.91,32.04
+[GPT-5.2](https://openai.com/index/introducing-gpt-5-2/),OpenAI,VISUAL,2025-12-11,published,56.43,56.75,52.82,54.72,30.88
+[DeepSeek-V3.1](https://api-docs.deepseek.com/news/news250821),DeepSeek-AI,TEXT,2025-8-21,published,54.76,57.85,54.85,56.31,30.84
+[GLM-4.6V](https://github.com/zai-org/GLM-V),Z.ai,VISUAL,2025-12-8,published,64.55,47.32,43.43,45.29,29.23
+[GLM-4.7](https://z.ai/blog/glm-4.7),Z.ai,TEXT,2025-12-22,published,54.34,54.36,51.75,53.02,28.81
+[GLM-4.5V](https://github.com/zai-org/GLM-V),Z.ai,VISUAL,2025-8-11,published,59.44,48.79,43.62,46.06,27.38
+[gemini-3-pro](https://blog.google/products-and-platforms/products/gemini/gemini-3/),Google,VISUAL,2025-11-18,published,50.22,56.06,52.69,54.32,27.28
+[GLM-4.5](https://z.ai/blog/glm-4.5),Z.ai,TEXT,2025-7-28,published,43.41,58.95,59.54,59.24,25.72
+[gemini-2.5-pro](https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/),Google,VISUAL,2025-3-25,published,51.71,48.39,45.59,46.95,24.28
+[Claude-Sonnet-4](https://www.anthropic.com/news/claude-4),Anthropic,TEXT,2025-5-23,published,41.37,58.53,58.44,58.48,24.19
+[Qwen3](https://github.com/QwenLM/Qwen3),Alibaba,TEXT,2025-7-21,published,41.44,55.88,56.64,56.26,23.31
+[Claude-Sonnet-4.5](https://www.anthropic.com/news/claude-sonnet-4-5),Anthropic,TEXT,2025-9-30,published,31.02,64.31,64.97,64.64,20.05
+[Claude-Sonnet-4.5](https://www.anthropic.com/news/claude-sonnet-4-5),Anthropic,VISUAL,2025-9-30,published,31.95,55.35,54.45,54.89,17.54
+[Claude-Sonnet-4](https://www.anthropic.com/news/claude-4),Anthropic,VISUAL,2025-5-23,published,31.63,54.16,53.32,53.74,16.99
+[HippoRAG2](https://github.com/ianliuwd/HippoRAG2),The Ohio State University,TEXT,2025-6-19,published,45.77,33.13,27.88,30.28,13.86
+[MemoRAG](https://github.com/qhjqhj00/MemoRAG),Peking University & Hong Kong Polytechnic University,TEXT,2025-4-9,published,51.31,24.19,19.10,21.35,10.96
+[VdocRAG](https://vdocrag.github.io/),NTT Corporation & Tohoku University,VISUAL,2025-4-14,published,61.54,21.17,13.88,16.77,10.32
+[VisRAG](https://github.com/OpenBMB/VisRAG),Tsinghua University & ModelBest Inc.,VISUAL,2025-3-2,published,39.90,26.24,23.63,24.87,9.92
+[Raptor](https://github.com/parthsarthi03/raptor),Stanford University,TEXT,2024-1-31,published,36.47,25.28,20.82,22.84,8.33
+[Monkey](https://github.com/Yuliang-Liu/Monkey),Huazhong University of Science and Technology,VISUAL,2024-8-26,published,54.61,17.08,11.27,13.58,7.41
+[Docopilot](https://github.com/OpenGVLab/Docopilot),Shanghai AI Laboratory,VISUAL,2025-7-19,published,39.31,18.31,17.12,17.69,6.96
+[Qwen3](https://github.com/QwenLM/Qwen3),Alibaba,VISUAL,2025-7-21,published,22.64,20.17,20.14,20.16,4.56
+[DocOwl2](https://github.com/X-PLUG/mPLUG-DocOwl),Alibaba,VISUAL,2024-9-9,published,50.19,11.75,6.66,8.50,4.27
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.44.1
|
| 2 |
+
huggingface-hub>=0.23.0
|
| 3 |
+
pandas>=2.0.0
|
| 4 |
+
numpy>=1.24.0
|
| 5 |
+
openai>=1.40.0
|
| 6 |
+
tqdm>=4.66.0
|
| 7 |
+
python-dateutil>=2.8.2
|