Spaces:

yoavgurarieh
/

BonaFide-Benchmark

Running

App Files Files Community

BonaFide-Benchmark / app.py

yoavg-aai

Revert table-border CSS; force header text to single line

491f794 10 days ago

raw

history blame contribute delete

15.3 kB

	"""BONAFIDE benchmark website -- Gradio HF Space.

	Single-page layout: intro, leaderboard (prominent), methodology, submission,
	citation. All leaderboard data is read from data/leaderboard.parquet at
	startup; no live metric execution. The submission section requires
	`BONAFIDE_SUBMISSIONS_REPO` and `HF_TOKEN` env vars and uploads directly to
	that private HF Hub dataset repo — nothing is written to the Space's local
	disk.
	"""

	import base64
	import datetime as dt
	import json
	import os
	import re
	import time
	import uuid
	from pathlib import Path

	# Workaround for gradio_client crash on JSON schemas with boolean
	# `additionalProperties` — bricks /api_info, which HF Spaces hits at page load.
	# Recurs across gradio versions; safer to patch than to chase releases.
	import gradio_client.utils as _gcu

	# Keep a handle on the stock converter so the wrapper below can delegate to it.
	_orig_jstpt = _gcu._json_schema_to_python_type


	def _safe_jstpt(schema, defs=None):
	    """Convert a JSON schema to a type string, tolerating boolean schemas.

	    A bare ``True``/``False`` schema is reported as "Any"; every other
	    schema is passed unchanged to the original gradio_client converter.
	    """
	    return "Any" if isinstance(schema, bool) else _orig_jstpt(schema, defs)


	# Install the tolerant wrapper in place of the stock converter.
	_gcu._json_schema_to_python_type = _safe_jstpt

	import gradio as gr # noqa: E402
	import pandas as pd # noqa: E402


	# All resources are located relative to this file rather than the current
	# working directory.
	ROOT = Path(__file__).parent
	# Leaderboard data (leaderboard.parquet is read from here).
	DATA = ROOT / "data"
	# Markdown pages rendered into the UI (cite.md is read from here).
	PAGES = ROOT / "pages"
	# Static assets (the logo PNG is read from here).
	ASSETS = ROOT / "assets"


	# ---------- Data loading ----------

	# Loaded once at import time; build_leaderboard() filters this frame and
	# reads its setting, src_type, target_model, level, metric, AUROC,
	# AUROC_low, AUROC_high and mean_wall_time_s columns.
	LEADERBOARD = pd.read_parquet(DATA / "leaderboard.parquet")

	# Maps the metric identifiers found in the leaderboard data to the display
	# names shown in the table. Identifiers missing from this mapping are
	# displayed as-is by build_leaderboard().
	METRIC_NAMES = {
	    "adding_mistakes": "Adding Mistakes",
	    "early_answering": "Early Answering",
	    "filler_tokens": "Filler Tokens",
	    "scm": "SCM",
	    "paraphrasing": "Paraphrasing",
	    "simulatability": "Simulatability",
	    "fur": "FUR",
	    "cc_shap": "CC-SHAP",
	}


	def _fmt_auc(auroc, lo, hi):
	if pd.isna(auroc):
	return "—"
	margin = (hi - lo) / 2
	return f"{auroc:.2f} ± {margin:.2f}"


	def _fmt_seconds(x):
	if pd.isna(x):
	return "—"
	if x < 1:
	return f"{x:.2f}s"
	return f"{x:.0f}s"


	def build_leaderboard():
	    """Return one row per metric with CoT and Step AUROCs side-by-side.

	    Only the aggregate rows of ``LEADERBOARD`` are used, i.e. those whose
	    ``setting``, ``src_type`` and ``target_model`` are all "ALL". Missing
	    AUROCs render as empty strings (so they sort to the bottom in
	    descending order) and missing wall-times as ``None``. The best AUROC in
	    each level column is wrapped in double underscores (markdown bold); the
	    leading underscores keep it sorting above the non-bolded numerics in
	    descending order.

	    Rows are ordered by CoT AUROC, descending. Metrics without a CoT score
	    go last.

	    Returns:
	        A DataFrame with the five display columns. When there are no
	        aggregate rows an empty frame with those columns is returned
	        instead of failing on the missing sort column.
	    """
	    columns = [
	        "Metric",
	        "AUROC — CoT (95% CI)",
	        "Wall-time — CoT (s)",
	        "AUROC — Step (95% CI)",
	        "Wall-time — Step (s)",
	    ]

	    df = LEADERBOARD[
	        (LEADERBOARD["setting"] == "ALL")
	        & (LEADERBOARD["src_type"] == "ALL")
	        & (LEADERBOARD["target_model"] == "ALL")
	    ]
	    by_level = {
	        lvl: df[df["level"] == lvl].set_index("metric") for lvl in ("cot", "step")
	    }
	    metrics = sorted(set(by_level["cot"].index) | set(by_level["step"].index))
	    if not metrics:
	        # Nothing to show: keep the table shape so the UI still renders.
	        return pd.DataFrame(columns=columns)

	    def _top_metric(level):
	        """Name of the metric with the highest AUROC at ``level``, or None."""
	        sub = by_level[level].dropna(subset=["AUROC"])
	        return sub["AUROC"].idxmax() if not sub.empty else None

	    top_cot = _top_metric("cot")
	    top_step = _top_metric("step")

	    def _row_for(level, metric):
	        """Row of ``metric`` at ``level``, or None when it has no entry."""
	        table = by_level[level]
	        return table.loc[metric] if metric in table.index else None

	    def _cell(row, is_top):
	        """AUROC cell text; empty when missing, markdown-bold when best."""
	        if row is None or pd.isna(row["AUROC"]):
	            return ""
	        txt = _fmt_auc(row["AUROC"], row["AUROC_low"], row["AUROC_high"])
	        return f"__{txt}__" if is_top else txt

	    def _wt(row):
	        """Numeric seconds for sortable column; None → empty cell."""
	        if row is None or pd.isna(row["mean_wall_time_s"]):
	            return None
	        s = row["mean_wall_time_s"]
	        return round(s, 2) if s < 1 else round(s)

	    rows = []
	    for m in metrics:
	        cot = _row_for("cot", m)
	        step = _row_for("step", m)
	        # Sort key is CoT AUROC only — methods without a CoT score sink to
	        # the bottom rather than getting interleaved using their step score.
	        sort_key = cot["AUROC"] if cot is not None and pd.notna(cot["AUROC"]) else -1
	        rows.append(
	            {
	                "Metric": METRIC_NAMES.get(m, m),
	                "AUROC — CoT (95% CI)": _cell(cot, m == top_cot),
	                "Wall-time — CoT (s)": _wt(cot),
	                "AUROC — Step (95% CI)": _cell(step, m == top_step),
	                "Wall-time — Step (s)": _wt(step),
	                "_sort": sort_key,
	            }
	        )

	    # A stable sort keeps tied rows (e.g. several metrics without a CoT
	    # score) in a deterministic order from one start-up to the next.
	    return (
	        pd.DataFrame(rows, columns=columns + ["_sort"])
	        .sort_values("_sort", ascending=False, kind="stable")
	        .drop(columns="_sort")
	    )


	# ---------- Submission handler ----------

	# Upper bound on the size of the uploaded file as stored on disk, checked
	# by handle_submission() before the CSV is parsed.
	MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024 # 5 MB
	# Minimum number of seconds between two accepted submissions for the same
	# `last_submit_ts` state value.
	SUBMIT_COOLDOWN_S = 60
	# A submission CSV must contain exactly this set of columns.
	REQUIRED_COLS = {"id", "score", "wall_time_s"}


	def _read_submission_csv(path: Path) -> pd.DataFrame:
	"""Read .csv or .csv.gz; raise on malformed input."""
	if path.name.endswith(".csv.gz") or path.suffix == ".gz":
	return pd.read_csv(path, compression="gzip")
	return pd.read_csv(path)


	def _notify_slack(text):
	"""Best-effort Slack ping. Silent if no webhook configured or it errors."""
	url = os.environ.get("SLACK_WEBHOOK_URL")
	if not url:
	return
	import urllib.request

	try:
	req = urllib.request.Request(
	url,
	data=json.dumps({"text": text}).encode("utf-8"),
	headers={"Content-Type": "application/json"},
	)
	urllib.request.urlopen(req, timeout=5).read()
	except Exception:
	pass


	def _safe_filename_part(text):
	    """Turn free-form text into a token that is safe inside a repo file name.

	    Every character that is not a word character, dot or dash (spaces,
	    slashes, backslashes, control characters, ...) becomes "_", and the
	    result is capped at 64 characters.
	    """
	    return re.sub(r"[^\w.\-]", "_", text.strip())[:64]


	def _check_submission_form(metric_name, code_link, contact, file_obj):
	    """Return the error message for the first invalid form field, else None."""
	    if not metric_name or not metric_name.strip():
	        return "Please provide a metric name."
	    if not code_link or not code_link.strip():
	        return "Please provide a link to code or paper for this metric."
	    if not contact or not contact.strip():
	        return "Please provide a contact email."
	    if not re.match(r"^[^@\s]+@[^@\s]+\.[^@\s]+$", contact.strip()):
	        return "Contact must be a valid email address."
	    if file_obj is None:
	        return "Please attach a submission.csv (or .csv.gz) file."
	    return None


	def _check_submission_frame(df):
	    """Return the error message for the first problem in the parsed CSV, else None."""
	    if set(df.columns) != REQUIRED_COLS:
	        return (
	            f"CSV must have exactly these columns: `id, score, wall_time_s`. "
	            f"Got: {list(df.columns)}."
	        )
	    if len(df) == 0:
	        return "CSV has no rows."
	    if not pd.to_numeric(df["score"], errors="coerce").notna().all():
	        return "All `score` values must be numeric."
	    wt_numeric = pd.to_numeric(df["wall_time_s"], errors="coerce")
	    if not wt_numeric.notna().all():
	        return "All `wall_time_s` values must be numeric."
	    if (wt_numeric < 0).any():
	        return "All `wall_time_s` values must be non-negative."
	    return None


	def handle_submission(
	    metric_name, description, code_link, contact, file_obj, last_submit_ts
	):
	    """Validate a submission and upload it to the submissions dataset repo.

	    Checks run in this order: cooldown, form fields, backend configuration,
	    file size, CSV parsing, CSV schema/content. On success the CSV and a
	    ``<tracking_id>.meta.json`` file are uploaded under ``submissions/`` in
	    the dataset repo named by ``BONAFIDE_SUBMISSIONS_REPO`` (using
	    ``HF_TOKEN``), and a best-effort Slack notification is sent.

	    Args:
	        metric_name: Name of the submitted metric (required).
	        description: Free-text description; stored as given.
	        code_link: Link to code or paper (required).
	        contact: Contact email (required, must look like an email address).
	        file_obj: Uploaded file; either an object with a ``name`` attribute
	            holding the path, or the path itself.
	        last_submit_ts: ``time.time()`` value of the previous accepted
	            submission for this state, or a falsy value when there is none.

	    Returns:
	        A ``(status_message, tracking_id, last_submit_ts)`` tuple. On any
	        rejection or upload failure ``tracking_id`` is "" and
	        ``last_submit_ts`` is returned unchanged; on success it is the new
	        tracking ID and the current timestamp.
	    """
	    import logging

	    def _reject(message):
	        """Build the failure tuple: message, no tracking ID, cooldown untouched."""
	        return message, "", last_submit_ts

	    now = time.time()
	    if last_submit_ts and now - last_submit_ts < SUBMIT_COOLDOWN_S:
	        wait = int(SUBMIT_COOLDOWN_S - (now - last_submit_ts)) + 1
	        return _reject(f"Please wait {wait}s before submitting again.")

	    form_error = _check_submission_form(metric_name, code_link, contact, file_obj)
	    if form_error:
	        return _reject(form_error)

	    hub_repo = os.environ.get("BONAFIDE_SUBMISSIONS_REPO")
	    hf_token = os.environ.get("HF_TOKEN")
	    if not hub_repo or not hf_token:
	        return _reject(
	            "Submission backend is not configured "
	            "(`BONAFIDE_SUBMISSIONS_REPO` and `HF_TOKEN` are required). "
	            "Please contact the maintainers."
	        )

	    src_path = Path(file_obj.name if hasattr(file_obj, "name") else file_obj)
	    # NOTE(review): this caps the uploaded (possibly gzip-compressed) bytes;
	    # the decompressed size of a .gz upload is not bounded here.
	    size = src_path.stat().st_size
	    if size > MAX_FILE_SIZE_BYTES:
	        return _reject(
	            f"File too large ({size / 1024 / 1024:.1f} MB; max "
	            f"{MAX_FILE_SIZE_BYTES // 1024 // 1024} MB)."
	        )

	    try:
	        df = _read_submission_csv(src_path)
	    except Exception as e:
	        return _reject(f"Could not parse CSV: {e}")

	    frame_error = _check_submission_frame(df)
	    if frame_error:
	        return _reject(frame_error)

	    metric_clean = metric_name.strip()
	    code_clean = code_link.strip()
	    contact_clean = contact.strip()

	    # One timezone-aware timestamp feeds both the tracking ID and the
	    # metadata. tzinfo is dropped before isoformat() so the stored string
	    # keeps the same naive-UTC shape that utcnow().isoformat() produced.
	    submitted_at = dt.datetime.now(dt.timezone.utc)
	    tracking_id = (
	        f"sub_{submitted_at.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
	    )
	    ext = ".csv.gz" if src_path.name.endswith(".csv.gz") else ".csv"
	    # The metric name is user input and becomes part of a repo path, so it
	    # is reduced to safe characters first (no "/" or other separators).
	    target_name = f"{tracking_id}__{_safe_filename_part(metric_clean)}{ext}"

	    meta_bytes = json.dumps(
	        {
	            "tracking_id": tracking_id,
	            "metric_name": metric_clean,
	            "description": description,
	            "code_link": code_clean,
	            "contact": contact_clean,
	            "submitted_utc": submitted_at.replace(tzinfo=None).isoformat(),
	            "submission_file": target_name,
	            "row_count": int(len(df)),
	        },
	        indent=2,
	    ).encode("utf-8")

	    try:
	        from huggingface_hub import upload_file

	        upload_file(
	            path_or_fileobj=str(src_path),
	            path_in_repo=f"submissions/{target_name}",
	            repo_id=hub_repo,
	            repo_type="dataset",
	            token=hf_token,
	        )
	        upload_file(
	            path_or_fileobj=meta_bytes,
	            path_in_repo=f"submissions/{tracking_id}.meta.json",
	            repo_id=hub_repo,
	            repo_type="dataset",
	            token=hf_token,
	        )
	    except Exception:
	        # Keep the traceback in the server log, but show the user a plain
	        # message instead of a raw backend error.
	        logging.getLogger(__name__).exception(
	            "Upload of submission %s failed", tracking_id
	        )
	        return _reject(
	            "Could not store your submission because of a server-side error. "
	            "Please try again later or contact the maintainers."
	        )

	    _notify_slack(
	        f"New BonaFide submission `{tracking_id}`\n"
	        f"• metric: `{metric_clean}`\n"
	        f"• rows: {len(df)}\n"
	        f"• code/paper: {code_clean}\n"
	        f"• contact: {contact_clean or '(none)'}\n"
	        f"• description: {description or '(none)'}"
	    )

	    return (
	        f"Submission received. Tracking ID: `{tracking_id}`.\n\n"
	        f"A maintainer will review and run the evaluation, then contact you "
	        f"via the link or contact info you provided within 72 hours.",
	        tracking_id,
	        now,
	    )


	# ---------- UI ----------

	# Gradio theme passed to gr.Blocks below: the Soft theme with indigo/slate
	# hues and Inter as the preferred font, followed by generic sans-serif
	# fallbacks.
	THEME = gr.themes.Soft(
	    primary_hue="indigo",
	    secondary_hue="slate",
	    neutral_hue="slate",
	    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
	)

	# Custom stylesheet passed to gr.Blocks below. It constrains the page
	# width, sets heading sizes, styles table cells (single-line headers via
	# `white-space: nowrap`), styles the `.hero` banner classes used by the
	# header HTML, and resets padding/background on inline prose links.
	CSS = """
	.gradio-container {
	max-width: 1180px !important;
	margin: 0 auto !important;
	}
	.gradio-container h1 {
	font-size: 2.0rem !important;
	font-weight: 700 !important;
	letter-spacing: -0.01em;
	margin-top: 1.2rem;
	margin-bottom: 0.4rem;
	}
	.gradio-container h2 {
	font-size: 1.3rem !important;
	font-weight: 600 !important;
	margin-top: 2.2rem !important;
	margin-bottom: 0.4rem !important;
	}
	.gradio-container h3 {
	font-size: 1.0rem !important;
	font-weight: 600 !important;
	margin-top: 1.2rem !important;
	}
	.gradio-container table { font-size: 0.92rem; }
	.gradio-container table th {
	font-weight: 600 !important;
	text-transform: none !important;
	padding: 10px 12px !important;
	white-space: nowrap !important;
	}
	.gradio-container table td {
	padding: 8px 12px !important;
	vertical-align: middle !important;
	}
	.gradio-container table td:first-child {
	font-weight: 500 !important;
	}
	.hero {
	text-align: center;
	margin: 0.8rem 0 1.6rem 0;
	}
	.hero img {
	width: 140px;
	height: 140px;
	object-fit: contain;
	display: block;
	margin: 0 auto 0.6rem auto;
	}
	.hero h1 {
	margin-top: 0.2rem !important;
	margin-bottom: 0.2rem !important;
	}
	.hero-tagline {
	font-size: 1.05rem;
	color: #4b5563;
	margin: 0.2rem 0 0.6rem 0;
	}
	.hero-cite {
	font-size: 0.9rem;
	color: #6b7280;
	max-width: 720px;
	margin: 0 auto;
	}
	/* Inline prose links: kill Soft-theme's button-ish padding/background. */
	.gradio-container li a,
	.gradio-container p a {
	display: inline !important;
	padding: 0 !important;
	margin: 0 !important;
	background: transparent !important;
	border: none !important;
	}
	"""

	# Single-page UI: hero banner, leaderboard table, submission form, citation.
	with gr.Blocks(title="BonaFide — CoT Faithfulness Metric Benchmark", theme=THEME, css=CSS) as demo:
	    # The logo is inlined as a base64 data URI inside the banner HTML.
	    _logo_b64 = base64.b64encode((ASSETS / "bonafide_logo.png").read_bytes()).decode()
	    gr.HTML(
	        f"""
	<div class="hero">
	<img src="data:image/png;base64,{_logo_b64}" alt="BonaFide" />
	<h1>BonaFide</h1>
	<p class="hero-tagline">A benchmark for chain-of-thought faithfulness metrics.</p>
	<p class="hero-cite">From the paper <em>Faithfulness Metrics Don't Measure Faithfulness: A Meta-Evaluation with Ground Truth</em> by Yoav Gur-Arieh, Ana Marasović, and Mor Geva (2026).</p>
	</div>
	"""
	    )

	    gr.Markdown("## Leaderboard")
	    gr.Markdown(
	        "BonaFide evaluates chain-of-thought faithfulness metrics — "
	        "tools that claim to detect when a model's reasoning trace fails "
	        "to reflect the computation behind its answer. Each metric is "
	        "scored by AUROC against ground-truth faithfulness labels, "
	        "which we construct from carefully designed tasks where the "
	        "model's output reveals which intermediate steps must have "
	        "produced it. Per-metric AUROC is shown at both the CoT and step "
	        "level, aggregated across tasks and settings."
	    )
	    # The table is built once, when the module is loaded. The datatype list
	    # follows the column order of build_leaderboard(); AUROC cells are
	    # markdown so the best score can be rendered in bold.
	    gr.Dataframe(
	        value=build_leaderboard(),
	        datatype=["str", "markdown", "number", "markdown", "number"],
	        interactive=False,
	        wrap=True,
	        max_height=900,
	    )

	    # The evaluator script is offered as a download through a data URI.
	    _eval_b64 = base64.b64encode(
	        (ROOT / "scripts" / "evaluate_submission.py").read_bytes()
	    ).decode()
	    gr.HTML(
	        f"""
	<h2>Submit a metric</h2>
	<p>Steps:</p>
	<ol>
	<li>Download the <a href="https://huggingface.co/datasets/yoavgurarieh/BonaFide" target="_blank">BonaFide</a> dataset.</li>
	<li>Run your metric and produce a <code>submission.csv</code> (or <code>submission.csv.gz</code>) with exactly three columns: <code>id, score, wall_time_s</code>. <code>id</code> matches the row id in the dataset; <code>score</code> is a real number with the convention <strong>higher = faithful</strong>; <code>wall_time_s</code> is the wall-time (in seconds) your metric spent computing the score for that row.</li>
	<li>You can validate locally with the evaluator <a href="data:text/x-python;base64,{_eval_b64}" download="evaluate_submission.py">script</a> to get the AUROC scores.</li>
	<li>Submit below. You'll receive a tracking ID. A maintainer reviews the submission and adds the results to the leaderboard.</li>
	</ol>
	"""
	    )
	    metric_name = gr.Textbox(label="Metric name", placeholder="my_metric_v1")
	    description = gr.Textbox(
	        label="One-paragraph description",
	        lines=3,
	        placeholder="What does the metric measure and how is the score computed?",
	    )
	    code_link = gr.Textbox(
	        label="Code or paper link (required)",
	        placeholder="https://github.com/you/your-metric or https://arxiv.org/abs/...",
	    )
	    contact = gr.Textbox(
	        label="Contact email (required)",
	        placeholder="you@example.com",
	    )
	    file_in = gr.File(
	        label="submission.csv or submission.csv.gz",
	        file_types=[".csv", ".gz"],
	    )
	    submit_btn = gr.Button("Submit", variant="primary")
	    status = gr.Markdown()
	    tracking = gr.Textbox(label="Tracking ID", interactive=False)
	    # Timestamp of the last accepted submission; handle_submission() reads
	    # it for the cooldown check and returns the updated value.
	    last_submit_ts = gr.State(value=0.0)
	    submit_btn.click(
	        handle_submission,
	        inputs=[metric_name, description, code_link, contact, file_in, last_submit_ts],
	        outputs=[status, tracking, last_submit_ts],
	    )

	    # Read as UTF-8 explicitly so the page does not depend on the locale's
	    # default encoding.
	    gr.Markdown((PAGES / "cite.md").read_text(encoding="utf-8"))


	# Start the server only when this file is run as a script: listen on all
	# interfaces on port 7860, with error display enabled and SSR mode off.
	if __name__ == "__main__":
	    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, ssr_mode=False)