# yoavg-aai's picture
# Revert table-border CSS; force header text to single line
# 491f794
"""BONAFIDE benchmark website -- Gradio HF Space.
Single-page layout: intro, leaderboard (prominent), methodology, submission,
citation. All leaderboard data is read from data/leaderboard.parquet at
startup; no live metric execution. The submission section requires
`BONAFIDE_SUBMISSIONS_REPO` and `HF_TOKEN` env vars and uploads directly to
that private HF Hub dataset repo — nothing is written to the Space's local
disk.
"""
import base64
import datetime as dt
import json
import os
import re
import time
import uuid
from pathlib import Path
# Workaround for gradio_client crash on JSON schemas with boolean
# `additionalProperties` — bricks /api_info, which HF Spaces hits at page load.
# Recurs across gradio versions; safer to patch than to chase releases.
import gradio_client.utils as _gcu
_orig_jstpt = _gcu._json_schema_to_python_type
def _safe_jstpt(schema, defs=None):
    """Convert a JSON schema to a Python type string, tolerating boolean schemas.

    A bare ``True``/``False`` schema is reported as ``"Any"``; every other
    schema is forwarded unchanged to the original converter saved in
    ``_orig_jstpt``.
    """
    # Boolean schemas are answered here and never reach the wrapped converter.
    return "Any" if isinstance(schema, bool) else _orig_jstpt(schema, defs)
_gcu._json_schema_to_python_type = _safe_jstpt
import gradio as gr # noqa: E402
import pandas as pd # noqa: E402
# Directory layout, resolved relative to this file.
ROOT = Path(__file__).parent
DATA = ROOT.joinpath("data")
PAGES = ROOT.joinpath("pages")
ASSETS = ROOT.joinpath("assets")
# ---------- Data loading ----------
# Read once at import time; the table is not recomputed while the app runs.
LEADERBOARD = pd.read_parquet(DATA.joinpath("leaderboard.parquet"))
# Display names for the metric identifiers found in the leaderboard data.
# Identifiers missing from this mapping are shown verbatim by build_leaderboard().
METRIC_NAMES = {
    "adding_mistakes": "Adding Mistakes",
    "early_answering": "Early Answering",
    "filler_tokens": "Filler Tokens",
    "scm": "SCM",
    "paraphrasing": "Paraphrasing",
    "simulatability": "Simulatability",
    "fur": "FUR",
    "cc_shap": "CC-SHAP",
}
def _fmt_auc(auroc, lo, hi):
    """Format an AUROC and the half-width of its interval, e.g. ``"0.80 ± 0.10"``.

    Args:
        auroc: Point estimate.
        lo: Lower bound of the interval around ``auroc``.
        hi: Upper bound of the interval around ``auroc``.

    Returns:
        ``"—"`` when ``auroc`` is missing; the bare two-decimal AUROC when
        either bound is missing; otherwise ``"<auroc> ± <(hi - lo) / 2>"``
        with both numbers rounded to two decimals.
    """
    if pd.isna(auroc):
        return "—"
    # Without both bounds the margin would render as "± nan" (or raise on
    # None), so fall back to the point estimate alone.
    if pd.isna(lo) or pd.isna(hi):
        return f"{auroc:.2f}"
    margin = (hi - lo) / 2
    return f"{auroc:.2f} ± {margin:.2f}"
def _fmt_seconds(x):
    """Render a duration in seconds as text.

    Values below one second keep two decimals (``"0.50s"``); anything else is
    rounded to whole seconds (``"12s"``). Missing values render as ``"—"``.
    """
    if pd.isna(x):
        return "—"
    decimals = 2 if x < 1 else 0
    return f"{x:.{decimals}f}s"
def build_leaderboard():
    """Return one row per metric with CoT and Step AUROCs side-by-side.

    Only rows of ``LEADERBOARD`` whose ``setting``, ``src_type`` and
    ``target_model`` are all ``"ALL"`` are used. AUROC cells are strings:
    ``""`` when the value is missing, and wrapped in ``__…__`` (markdown bold)
    for the highest AUROC of each level. Wall-time cells are numbers, or
    ``None`` when missing. Rows are ordered by CoT AUROC, descending; metrics
    without a CoT AUROC go last.

    NOTE(review): assumes each metric appears at most once per level in the
    filtered data; with duplicates ``.loc[m]`` would return a frame rather
    than a single row — confirm against the parquet schema.

    Returns:
        pandas.DataFrame with the five display columns. When no aggregated
        rows exist the frame is empty but still carries those columns.
    """
    # Declared up front so an empty result still has a "_sort" column to sort
    # on (a column-less frame makes sort_values raise KeyError).
    columns = [
        "Metric",
        "AUROC — CoT (95% CI)",
        "Wall-time — CoT (s)",
        "AUROC — Step (95% CI)",
        "Wall-time — Step (s)",
        "_sort",
    ]
    df = LEADERBOARD[
        (LEADERBOARD["setting"] == "ALL")
        & (LEADERBOARD["src_type"] == "ALL")
        & (LEADERBOARD["target_model"] == "ALL")
    ]
    by_level = {
        lvl: df[df["level"] == lvl].set_index("metric") for lvl in ("cot", "step")
    }
    metrics = sorted(set(by_level["cot"].index) | set(by_level["step"].index))

    def _top_metric(level):
        """Metric with the highest non-missing AUROC at ``level``, or None."""
        sub = by_level[level].dropna(subset=["AUROC"])
        return sub["AUROC"].idxmax() if not sub.empty else None

    top_cot = _top_metric("cot")
    top_step = _top_metric("step")

    def _cell(row, is_top):
        """AUROC cell text; empty when missing, bold-wrapped for the top metric."""
        if row is None or pd.isna(row["AUROC"]):
            return ""
        txt = _fmt_auc(row["AUROC"], row["AUROC_low"], row["AUROC_high"])
        return f"__{txt}__" if is_top else txt

    def _wt(row):
        """Numeric seconds for sortable column; None → empty cell."""
        if row is None or pd.isna(row["mean_wall_time_s"]):
            return None
        s = row["mean_wall_time_s"]
        return round(s, 2) if s < 1 else round(s)

    rows = []
    for m in metrics:
        cot = by_level["cot"].loc[m] if m in by_level["cot"].index else None
        step = by_level["step"].loc[m] if m in by_level["step"].index else None
        # Sort key is CoT AUROC only — methods without a CoT score sink to
        # the bottom rather than getting interleaved using their step score.
        sort_key = cot["AUROC"] if cot is not None and pd.notna(cot["AUROC"]) else -1
        rows.append(
            {
                "Metric": METRIC_NAMES.get(m, m),
                "AUROC — CoT (95% CI)": _cell(cot, m == top_cot),
                "Wall-time — CoT (s)": _wt(cot),
                "AUROC — Step (95% CI)": _cell(step, m == top_step),
                "Wall-time — Step (s)": _wt(step),
                "_sort": sort_key,
            }
        )
    return (
        pd.DataFrame(rows, columns=columns)
        .sort_values("_sort", ascending=False)
        .drop(columns="_sort")
    )
# ---------- Submission handler ----------
# Limits and schema enforced by handle_submission().
MAX_FILE_SIZE_BYTES = 5 * 1024**2  # 5 MB
SUBMIT_COOLDOWN_S = 60  # minimum gap, in seconds, between two accepted submissions
REQUIRED_COLS = {"wall_time_s", "score", "id"}  # exact column set of a submission CSV
def _read_submission_csv(path: Path) -> pd.DataFrame:
    """Load a submission table from a plain or gzip-compressed CSV file.

    A file whose name ends in ``.gz`` is decompressed as gzip; anything else
    goes through pandas' default CSV reader. Parsing errors raised by pandas
    propagate to the caller.
    """
    is_gzip = path.suffix == ".gz" or path.name.endswith(".csv.gz")
    reader_options = {"compression": "gzip"} if is_gzip else {}
    return pd.read_csv(path, **reader_options)
def _notify_slack(text):
    """Best-effort Slack ping; never raises.

    Posts ``{"text": text}`` as JSON to the webhook URL found in the
    ``SLACK_WEBHOOK_URL`` environment variable, with a 5-second timeout.
    Does nothing when that variable is unset or empty. Any failure is logged
    as a warning and otherwise ignored, so a Slack problem cannot fail the
    caller.

    Args:
        text: Message body to send.
    """
    url = os.environ.get("SLACK_WEBHOOK_URL")
    if not url:
        return
    import logging
    import urllib.request

    try:
        req = urllib.request.Request(
            url,
            data=json.dumps({"text": text}).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req, timeout=5) as resp:
            resp.read()
    except Exception as exc:  # deliberate catch-all: notification is optional
        # Log the exception type only; its message can embed the webhook URL,
        # which is a secret.
        logging.getLogger(__name__).warning(
            "Slack notification failed: %s", type(exc).__name__
        )
def handle_submission(
    metric_name, description, code_link, contact, file_obj, last_submit_ts
):
    """Validate a submission and upload it to the private HF Hub dataset repo.

    Checks run in this order: cooldown, required text fields, email shape,
    attached file, backend configuration (``BONAFIDE_SUBMISSIONS_REPO`` and
    ``HF_TOKEN``), file size, CSV parse, exact column set, non-empty, numeric
    ``score``, numeric non-negative ``wall_time_s``. On success the CSV and a
    ``<tracking_id>.meta.json`` file are uploaded under ``submissions/`` and a
    Slack notification is attempted.

    Args:
        metric_name: Name of the submitted metric (required).
        description: Free-text description; stored as given.
        code_link: Link to code or paper (required).
        contact: Contact email (required, must look like an address).
        file_obj: Uploaded file — either an object whose ``name`` attribute
            is the path, or the path itself.
        last_submit_ts: ``time.time()`` of the previous accepted submission;
            falsy when there is none.

    Returns:
        ``(status_markdown, tracking_id, last_submit_ts)``. On rejection or
        failure ``tracking_id`` is ``""`` and ``last_submit_ts`` comes back
        unchanged, so the cooldown only restarts after a successful upload.
    """
    import logging

    def _reject(message):
        """Build the result tuple for a submission that was not accepted."""
        return message, "", last_submit_ts

    now = time.time()
    if last_submit_ts and now - last_submit_ts < SUBMIT_COOLDOWN_S:
        wait = int(SUBMIT_COOLDOWN_S - (now - last_submit_ts)) + 1
        return _reject(f"Please wait {wait}s before submitting again.")

    # Normalise once so validation, metadata and notifications all see the
    # same stripped values.
    metric = (metric_name or "").strip()
    link = (code_link or "").strip()
    email = (contact or "").strip()

    if not metric:
        return _reject("Please provide a metric name.")
    if not link:
        return _reject("Please provide a link to code or paper for this metric.")
    if not email:
        return _reject("Please provide a contact email.")
    if not re.match(r"^[^@\s]+@[^@\s]+\.[^@\s]+$", email):
        return _reject("Contact must be a valid email address.")
    if file_obj is None:
        return _reject("Please attach a submission.csv (or .csv.gz) file.")

    hub_repo = os.environ.get("BONAFIDE_SUBMISSIONS_REPO")
    hf_token = os.environ.get("HF_TOKEN")
    if not hub_repo or not hf_token:
        return _reject(
            "Submission backend is not configured "
            "(`BONAFIDE_SUBMISSIONS_REPO` and `HF_TOKEN` are required). "
            "Please contact the maintainers."
        )

    src_path = Path(file_obj.name if hasattr(file_obj, "name") else file_obj)
    try:
        size = src_path.stat().st_size
    except OSError:
        return _reject("Could not read the uploaded file. Please attach it again.")
    if size > MAX_FILE_SIZE_BYTES:
        return _reject(
            f"File too large ({size / 1024 / 1024:.1f} MB; max "
            f"{MAX_FILE_SIZE_BYTES // 1024 // 1024} MB)."
        )

    try:
        df = _read_submission_csv(src_path)
    except Exception as e:
        return _reject(f"Could not parse CSV: {e}")
    if set(df.columns) != REQUIRED_COLS:
        return _reject(
            f"CSV must have exactly these columns: `id, score, wall_time_s`. "
            f"Got: {list(df.columns)}."
        )
    if len(df) == 0:
        return _reject("CSV has no rows.")
    if not pd.to_numeric(df["score"], errors="coerce").notna().all():
        return _reject("All `score` values must be numeric.")
    wt_numeric = pd.to_numeric(df["wall_time_s"], errors="coerce")
    if not wt_numeric.notna().all():
        return _reject("All `wall_time_s` values must be numeric.")
    if (wt_numeric < 0).any():
        return _reject("All `wall_time_s` values must be non-negative.")

    # One UTC timestamp feeds both the tracking id and the metadata. It is
    # made naive again so the stored ISO string keeps its previous format.
    submitted = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    tracking_id = (
        f"sub_{submitted.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
    )
    # Same rule _read_submission_csv uses to decide a file is gzip, so the
    # stored extension always matches the stored bytes.
    ext = ".csv.gz" if src_path.suffix == ".gz" else ".csv"
    # The metric name is user input that ends up in a repo path: keep a
    # conservative character set so it cannot add path separators. The exact
    # name is preserved in the metadata file.
    safe_metric = re.sub(r"[^A-Za-z0-9._-]+", "_", metric)[:80]
    target_name = f"{tracking_id}__{safe_metric}{ext}"

    meta_bytes = json.dumps(
        {
            "tracking_id": tracking_id,
            "metric_name": metric,
            "description": description,
            "code_link": link,
            "contact": email,
            "submitted_utc": submitted.isoformat(),
            "submission_file": target_name,
            "row_count": int(len(df)),
        },
        indent=2,
    ).encode("utf-8")

    from huggingface_hub import upload_file

    try:
        upload_file(
            path_or_fileobj=str(src_path),
            path_in_repo=f"submissions/{target_name}",
            repo_id=hub_repo,
            repo_type="dataset",
            token=hf_token,
        )
        upload_file(
            path_or_fileobj=meta_bytes,
            path_in_repo=f"submissions/{tracking_id}.meta.json",
            repo_id=hub_repo,
            repo_type="dataset",
            token=hf_token,
        )
    except Exception:
        # Details go to the server log only. If the first upload succeeded
        # and the second failed, a CSV without metadata is left in the repo.
        logging.getLogger(__name__).exception(
            "Upload failed for submission %s", tracking_id
        )
        return _reject(
            "Could not store the submission (upload failed). "
            "Please try again later or contact the maintainers."
        )

    _notify_slack(
        f"New BonaFide submission `{tracking_id}`\n"
        f"• metric: `{metric}`\n"
        f"• rows: {len(df)}\n"
        f"• code/paper: {link}\n"
        f"• contact: {email}\n"
        f"• description: {description or '(none)'}"
    )
    return (
        f"Submission received. Tracking ID: **`{tracking_id}`**.\n\n"
        f"A maintainer will review and run the evaluation, then contact you "
        f"via the link or contact info you provided within 72 hours.",
        tracking_id,
        now,
    )
# ---------- UI ----------
# Gradio theme for the page: the Soft base theme with an indigo primary hue,
# slate secondary/neutral hues, and the Inter Google font ahead of generic
# sans-serif fallbacks.
THEME = gr.themes.Soft(
primary_hue="indigo",
secondary_hue="slate",
neutral_hue="slate",
font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
)
# Custom stylesheet passed to gr.Blocks(css=...). It constrains the page
# width, sets heading sizes, styles table cells (header text is kept on one
# line via `white-space: nowrap`), lays out the centered `.hero` header, and
# resets link styling inside paragraphs and list items.
CSS = """
.gradio-container {
max-width: 1180px !important;
margin: 0 auto !important;
}
.gradio-container h1 {
font-size: 2.0rem !important;
font-weight: 700 !important;
letter-spacing: -0.01em;
margin-top: 1.2rem;
margin-bottom: 0.4rem;
}
.gradio-container h2 {
font-size: 1.3rem !important;
font-weight: 600 !important;
margin-top: 2.2rem !important;
margin-bottom: 0.4rem !important;
}
.gradio-container h3 {
font-size: 1.0rem !important;
font-weight: 600 !important;
margin-top: 1.2rem !important;
}
.gradio-container table { font-size: 0.92rem; }
.gradio-container table th {
font-weight: 600 !important;
text-transform: none !important;
padding: 10px 12px !important;
white-space: nowrap !important;
}
.gradio-container table td {
padding: 8px 12px !important;
vertical-align: middle !important;
}
.gradio-container table td:first-child {
font-weight: 500 !important;
}
.hero {
text-align: center;
margin: 0.8rem 0 1.6rem 0;
}
.hero img {
width: 140px;
height: 140px;
object-fit: contain;
display: block;
margin: 0 auto 0.6rem auto;
}
.hero h1 {
margin-top: 0.2rem !important;
margin-bottom: 0.2rem !important;
}
.hero-tagline {
font-size: 1.05rem;
color: #4b5563;
margin: 0.2rem 0 0.6rem 0;
}
.hero-cite {
font-size: 0.9rem;
color: #6b7280;
max-width: 720px;
margin: 0 auto;
}
/* Inline prose links: kill Soft-theme's button-ish padding/background. */
.gradio-container li a,
.gradio-container p a {
display: inline !important;
padding: 0 !important;
margin: 0 !important;
background: transparent !important;
border: none !important;
}
"""
# Single-page UI, top to bottom: hero header, leaderboard, submission
# instructions and form, citation.
with gr.Blocks(title="BonaFide — CoT Faithfulness Metric Benchmark", theme=THEME, css=CSS) as demo:
    # The logo is inlined as a base64 data URI, so the page needs no separate
    # request for the image.
    _logo_b64 = base64.b64encode((ASSETS / "bonafide_logo.png").read_bytes()).decode()
    gr.HTML(
        f"""
<div class="hero">
<img src="data:image/png;base64,{_logo_b64}" alt="BonaFide" />
<h1>BonaFide</h1>
<p class="hero-tagline">A benchmark for chain-of-thought faithfulness metrics.</p>
<p class="hero-cite">From the paper <em>Faithfulness Metrics Don't Measure Faithfulness: A Meta-Evaluation with Ground Truth</em> by Yoav Gur-Arieh, Ana Marasović, and Mor Geva (2026).</p>
</div>
"""
    )
    gr.Markdown("## Leaderboard")
    gr.Markdown(
        "BonaFide evaluates **chain-of-thought faithfulness metrics** — "
        "tools that claim to detect when a model's reasoning trace fails "
        "to reflect the computation behind its answer. Each metric is "
        "scored by **AUROC against ground-truth faithfulness labels**, "
        "which we construct from carefully designed tasks where the "
        "model's output reveals which intermediate steps must have "
        "produced it. Per-metric AUROC is shown at both the CoT and step "
        "level, aggregated across tasks and settings."
    )
    # One datatype per leaderboard column: metric name, then an AUROC cell
    # (markdown, so the bold marker renders) and a wall-time number per level.
    gr.Dataframe(
        value=build_leaderboard(),
        datatype=["str", "markdown", "number", "markdown", "number"],
        interactive=False,
        wrap=True,
        max_height=900,
    )
    # The evaluator script is offered as a download through a data URI.
    _eval_b64 = base64.b64encode(
        (ROOT / "scripts" / "evaluate_submission.py").read_bytes()
    ).decode()
    gr.HTML(
        f"""
<h2>Submit a metric</h2>
<p>Steps:</p>
<ol>
<li>Download the <a href="https://huggingface.co/datasets/yoavgurarieh/BonaFide" target="_blank">BonaFide</a> dataset.</li>
<li>Run your metric and produce a <code>submission.csv</code> (or <code>submission.csv.gz</code>) with exactly three columns: <code>id, score, wall_time_s</code>. <code>id</code> matches the row id in the dataset; <code>score</code> is a real number with the convention <strong>higher = faithful</strong>; <code>wall_time_s</code> is the wall-time (in seconds) your metric spent computing the score for that row.</li>
<li>You can validate locally with the evaluator <a href="data:text/x-python;base64,{_eval_b64}" download="evaluate_submission.py">script</a> to get the AUROC scores.</li>
<li>Submit below. You'll receive a tracking ID. A maintainer reviews the submission and adds the results to the leaderboard.</li>
</ol>
"""
    )
    metric_name = gr.Textbox(label="Metric name", placeholder="my_metric_v1")
    description = gr.Textbox(
        label="One-paragraph description",
        lines=3,
        placeholder="What does the metric measure and how is the score computed?",
    )
    code_link = gr.Textbox(
        label="Code or paper link (required)",
        placeholder="https://github.com/you/your-metric or https://arxiv.org/abs/...",
    )
    contact = gr.Textbox(
        label="Contact email (required)",
        placeholder="you@example.com",
    )
    file_in = gr.File(
        label="submission.csv or submission.csv.gz",
        file_types=[".csv", ".gz"],
    )
    submit_btn = gr.Button("Submit", variant="primary")
    status = gr.Markdown()
    tracking = gr.Textbox(label="Tracking ID", interactive=False)
    # Timestamp of the last accepted submission; handle_submission reads it
    # for the cooldown check and returns the updated value.
    last_submit_ts = gr.State(value=0.0)
    submit_btn.click(
        handle_submission,
        inputs=[metric_name, description, code_link, contact, file_in, last_submit_ts],
        outputs=[status, tracking, last_submit_ts],
    )
    # Explicit UTF-8: the citation text must not depend on the host's locale
    # encoding.
    gr.Markdown((PAGES / "cite.md").read_text(encoding="utf-8"))
if __name__ == "__main__":
    # Direct execution: serve on all interfaces, port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        ssr_mode=False,
    )