Spaces:
Running
Running
| """BONAFIDE benchmark website -- Gradio HF Space. | |
| Single-page layout: intro, leaderboard (prominent), submission instructions | |
| and form, citation. All leaderboard data is read from data/leaderboard.parquet at | |
| startup; no live metric execution. The submission section requires | |
| `BONAFIDE_SUBMISSIONS_REPO` and `HF_TOKEN` env vars and uploads directly to | |
| that private HF Hub dataset repo — nothing is written to the Space's local | |
| disk. | |
| """ | |
| import base64 | |
| import datetime as dt | |
| import json | |
| import os | |
| import re | |
| import time | |
| import uuid | |
| from pathlib import Path | |
| # Workaround for gradio_client crash on JSON schemas with boolean | |
| # `additionalProperties` — bricks /api_info, which HF Spaces hits at page load. | |
| # Recurs across gradio versions; safer to patch than to chase releases. | |
| import gradio_client.utils as _gcu | |
| _orig_jstpt = _gcu._json_schema_to_python_type | |
def _safe_jstpt(schema, defs=None):
    """Wrap gradio_client's schema-to-type converter so boolean schemas don't crash.

    A JSON schema may be a bare boolean (e.g. ``additionalProperties: true``);
    those are reported as ``"Any"``. Every other schema is delegated unchanged
    to the original converter captured in ``_orig_jstpt``.
    """
    return "Any" if isinstance(schema, bool) else _orig_jstpt(schema, defs)
| _gcu._json_schema_to_python_type = _safe_jstpt | |
| import gradio as gr # noqa: E402 | |
| import pandas as pd # noqa: E402 | |
# Directory layout, resolved relative to this file so the app does not depend
# on the current working directory.
ROOT = Path(__file__).parent
DATA = ROOT / "data"
PAGES = ROOT / "pages"
ASSETS = ROOT / "assets"
# ---------- Data loading ----------
# Loaded once at import time. build_leaderboard() reads the columns
# setting, src_type, target_model, level, metric, AUROC, AUROC_low,
# AUROC_high and mean_wall_time_s from this frame.
LEADERBOARD = pd.read_parquet(DATA / "leaderboard.parquet")
# Display names keyed by the metric identifier stored in the parquet file.
# build_leaderboard() falls back to the raw identifier for unknown metrics.
METRIC_NAMES = {
    "adding_mistakes": "Adding Mistakes",
    "early_answering": "Early Answering",
    "filler_tokens": "Filler Tokens",
    "scm": "SCM",
    "paraphrasing": "Paraphrasing",
    "simulatability": "Simulatability",
    "fur": "FUR",
    "cc_shap": "CC-SHAP",
}
def _fmt_auc(auroc, lo, hi):
    """Format an AUROC and its confidence interval as ``"0.73 ± 0.02"``.

    Args:
        auroc: Point estimate. A missing value (NaN/None) yields "—".
        lo: Lower bound of the confidence interval.
        hi: Upper bound of the confidence interval.

    Returns:
        "—" when ``auroc`` is missing; the bare two-decimal AUROC when either
        bound is missing (rather than rendering "± nan" or failing on None);
        otherwise "<auroc> ± <half-width>", where the half-width is
        ``(hi - lo) / 2``.
    """
    if pd.isna(auroc):
        return "—"
    if pd.isna(lo) or pd.isna(hi):
        # No usable interval: show the point estimate on its own.
        return f"{auroc:.2f}"
    half_width = (hi - lo) / 2
    return f"{auroc:.2f} ± {half_width:.2f}"
def _fmt_seconds(x):
    """Render a duration in seconds as a short string with an ``s`` suffix.

    Missing values become "—". Durations below one second keep two decimals;
    everything else is rounded to a whole number of seconds.
    """
    if pd.isna(x):
        return "—"
    decimals = 2 if x < 1 else 0
    return f"{x:.{decimals}f}s"
def build_leaderboard():
    """Return one row per metric with CoT and Step AUROCs side-by-side.

    Only the rows of ``LEADERBOARD`` whose ``setting``, ``src_type`` and
    ``target_model`` are all ``"ALL"`` are used. Missing AUROC values render
    as empty strings and missing wall-times as empty cells. The best AUROC in
    each level column is wrapped in double underscores (markdown bold).

    Rows are ordered by CoT AUROC, descending; metrics without a CoT score
    sink to the bottom. The sort is stable, so ties keep alphabetical metric
    order.

    Returns:
        A DataFrame with the five display columns. When no aggregated rows
        exist, an empty DataFrame with those same columns is returned instead
        of failing on the missing sort column.
    """
    columns = [
        "Metric",
        "AUROC — CoT (95% CI)",
        "Wall-time — CoT (s)",
        "AUROC — Step (95% CI)",
        "Wall-time — Step (s)",
    ]

    df = LEADERBOARD[
        (LEADERBOARD["setting"] == "ALL")
        & (LEADERBOARD["src_type"] == "ALL")
        & (LEADERBOARD["target_model"] == "ALL")
    ]
    by_level = {
        lvl: df[df["level"] == lvl].set_index("metric") for lvl in ("cot", "step")
    }
    metrics = sorted(set(by_level["cot"].index) | set(by_level["step"].index))
    if not metrics:
        # pd.DataFrame([]) would have no "_sort" column to sort on.
        return pd.DataFrame(columns=columns)

    def _top_metric(level):
        sub = by_level[level].dropna(subset=["AUROC"])
        return sub["AUROC"].idxmax() if not sub.empty else None

    top_cot = _top_metric("cot")
    top_step = _top_metric("step")

    def _row_for(level, metric):
        """Return the row for ``metric`` at ``level``, or None if absent."""
        table = by_level[level]
        return table.loc[metric] if metric in table.index else None

    def _cell(row, is_top):
        if row is None or pd.isna(row["AUROC"]):
            return ""
        txt = _fmt_auc(row["AUROC"], row["AUROC_low"], row["AUROC_high"])
        return f"__{txt}__" if is_top else txt

    def _wt(row):
        """Numeric seconds for sortable column; None → empty cell."""
        if row is None or pd.isna(row["mean_wall_time_s"]):
            return None
        s = row["mean_wall_time_s"]
        return round(s, 2) if s < 1 else round(s)

    rows = []
    for m in metrics:
        cot = _row_for("cot", m)
        step = _row_for("step", m)
        # Sort key is CoT AUROC only — methods without a CoT score sink to
        # the bottom rather than getting interleaved using their step score.
        sort_key = cot["AUROC"] if cot is not None and pd.notna(cot["AUROC"]) else -1
        rows.append(
            {
                "Metric": METRIC_NAMES.get(m, m),
                "AUROC — CoT (95% CI)": _cell(cot, m == top_cot),
                "Wall-time — CoT (s)": _wt(cot),
                "AUROC — Step (95% CI)": _cell(step, m == top_step),
                "Wall-time — Step (s)": _wt(step),
                "_sort": sort_key,
            }
        )

    return (
        pd.DataFrame(rows)
        # mergesort is stable, so tied rows keep a deterministic order.
        .sort_values("_sort", ascending=False, kind="mergesort")
        .drop(columns="_sort")
    )
| # ---------- Submission handler ---------- | |
# Upper bound on the uploaded file's on-disk size, checked before parsing.
MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB
# Minimum number of seconds between two accepted submissions; enforced against
# the timestamp handle_submission() receives as `last_submit_ts`.
SUBMIT_COOLDOWN_S = 60
# A submission CSV must contain exactly this set of columns.
REQUIRED_COLS = {"id", "score", "wall_time_s"}
def _read_submission_csv(path: Path) -> pd.DataFrame:
    """Load a submission from ``path`` into a DataFrame.

    Files whose final suffix is ``.gz`` (which includes ``*.csv.gz``) are
    decompressed as gzip; anything else is left to pandas' default handling.
    Parsing errors from pandas propagate to the caller.
    """
    compression = "gzip" if path.suffix == ".gz" else "infer"
    return pd.read_csv(path, compression=compression)
def _notify_slack(text):
    """Best-effort Slack ping; never raises.

    Posts ``{"text": text}`` as JSON to the URL in the ``SLACK_WEBHOOK_URL``
    environment variable, with a 5 second timeout. Does nothing when the
    variable is unset or empty.

    Args:
        text: Message body to send.

    Returns:
        None. A failed request is logged as a warning and otherwise ignored,
        so a Slack outage cannot break the caller.
    """
    url = os.environ.get("SLACK_WEBHOOK_URL")
    if not url:
        return
    import logging
    import urllib.request

    payload = json.dumps({"text": text}).encode("utf-8")
    try:
        req = urllib.request.Request(
            url,
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=5) as resp:
            resp.read()
    except Exception as exc:
        # Deliberately best-effort. Log only the exception type: some urllib
        # error messages echo the URL, and the webhook URL is a secret.
        logging.getLogger(__name__).warning(
            "Slack notification failed (%s)", type(exc).__name__
        )
def _metric_slug(metric_name):
    """Reduce a user-supplied metric name to a safe single filename component."""
    # Anything outside a conservative whitelist (including "/" and "\") becomes
    # "_", so the name cannot introduce extra path segments in the repo.
    slug = re.sub(r"[^A-Za-z0-9._-]+", "_", metric_name.strip()).strip("._")
    return slug[:64] or "metric"


def handle_submission(
    metric_name, description, code_link, contact, file_obj, last_submit_ts
):
    """Validate a submission form and upload it to the submissions dataset repo.

    Checks run in this order: cooldown, required text fields, email format,
    attached file, backend configuration (``BONAFIDE_SUBMISSIONS_REPO`` and
    ``HF_TOKEN`` env vars), file size, CSV parsing, exact column set, non-empty
    rows, numeric ``score``, numeric non-negative ``wall_time_s``. On success
    the CSV and a ``.meta.json`` sidecar are uploaded under ``submissions/``
    and a best-effort Slack notification is sent.

    Args:
        metric_name: Name of the submitted metric (required).
        description: Free-text description (optional).
        code_link: Link to code or paper (required).
        contact: Contact email (required, must look like an email address).
        file_obj: Uploaded file; either an object with a ``name`` attribute
            holding the path, or the path itself. None when nothing is attached.
        last_submit_ts: ``time.time()`` of the last accepted submission, or a
            falsy value if there was none.

    Returns:
        A ``(status_markdown, tracking_id, last_submit_ts)`` tuple. On any
        rejection or upload failure the tracking id is ``""`` and the incoming
        ``last_submit_ts`` is returned unchanged; on success the tracking id is
        set and the timestamp is the current time.
    """

    def reject(message):
        # Every failure leaves the tracking box empty and the cooldown untouched.
        return message, "", last_submit_ts

    now = time.time()
    if last_submit_ts and now - last_submit_ts < SUBMIT_COOLDOWN_S:
        wait = int(SUBMIT_COOLDOWN_S - (now - last_submit_ts)) + 1
        return reject(f"Please wait {wait}s before submitting again.")

    if not metric_name or not metric_name.strip():
        return reject("Please provide a metric name.")
    if not code_link or not code_link.strip():
        return reject("Please provide a link to code or paper for this metric.")
    if not contact or not contact.strip():
        return reject("Please provide a contact email.")

    # Normalise once so validation, metadata and notifications all agree.
    metric_name = metric_name.strip()
    code_link = code_link.strip()
    contact = contact.strip()

    if not re.match(r"^[^@\s]+@[^@\s]+\.[^@\s]+$", contact):
        return reject("Contact must be a valid email address.")
    if file_obj is None:
        return reject("Please attach a submission.csv (or .csv.gz) file.")

    hub_repo = os.environ.get("BONAFIDE_SUBMISSIONS_REPO")
    hf_token = os.environ.get("HF_TOKEN")
    if not hub_repo or not hf_token:
        return reject(
            "Submission backend is not configured "
            "(`BONAFIDE_SUBMISSIONS_REPO` and `HF_TOKEN` are required). "
            "Please contact the maintainers."
        )

    src_path = Path(file_obj.name if hasattr(file_obj, "name") else file_obj)
    try:
        size = src_path.stat().st_size
    except OSError:
        return reject("Could not read the uploaded file. Please try uploading it again.")
    if size > MAX_FILE_SIZE_BYTES:
        return reject(
            f"File too large ({size / 1024 / 1024:.1f} MB; max "
            f"{MAX_FILE_SIZE_BYTES // 1024 // 1024} MB)."
        )

    try:
        df = _read_submission_csv(src_path)
    except Exception as e:
        return reject(f"Could not parse CSV: {e}")

    if set(df.columns) != REQUIRED_COLS:
        return reject(
            f"CSV must have exactly these columns: `id, score, wall_time_s`. "
            f"Got: {list(df.columns)}."
        )
    if len(df) == 0:
        return reject("CSV has no rows.")
    if not pd.to_numeric(df["score"], errors="coerce").notna().all():
        return reject("All `score` values must be numeric.")
    wt_numeric = pd.to_numeric(df["wall_time_s"], errors="coerce")
    if not wt_numeric.notna().all():
        return reject("All `wall_time_s` values must be numeric.")
    if (wt_numeric < 0).any():
        return reject("All `wall_time_s` values must be non-negative.")

    # One timestamp for both the tracking id and the metadata. Kept naive so
    # the stored ISO string has the same shape utcnow() used to produce.
    submitted = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    tracking_id = (
        f"sub_{submitted.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
    )
    # Same rule _read_submission_csv uses: any ".gz" file was parsed as gzip,
    # so it is stored with the matching extension.
    ext = ".csv.gz" if src_path.suffix == ".gz" else ".csv"
    target_name = f"{tracking_id}__{_metric_slug(metric_name)}{ext}"

    meta_bytes = json.dumps(
        {
            "tracking_id": tracking_id,
            "metric_name": metric_name,
            "description": description,
            "code_link": code_link,
            "contact": contact,
            "submitted_utc": submitted.isoformat(),
            "submission_file": target_name,
            "row_count": int(len(df)),
        },
        indent=2,
    ).encode("utf-8")

    try:
        from huggingface_hub import upload_file

        upload_file(
            path_or_fileobj=str(src_path),
            path_in_repo=f"submissions/{target_name}",
            repo_id=hub_repo,
            repo_type="dataset",
            token=hf_token,
        )
        upload_file(
            path_or_fileobj=meta_bytes,
            path_in_repo=f"submissions/{tracking_id}.meta.json",
            repo_id=hub_repo,
            repo_type="dataset",
            token=hf_token,
        )
    except Exception:
        # Boundary handler: keep details (repo id, Hub errors) in the server
        # log and show the submitter a generic message instead.
        import logging

        logging.getLogger(__name__).exception(
            "Upload failed for submission %s", tracking_id
        )
        return reject(
            "Upload failed. Please try again later or contact the maintainers."
        )

    _notify_slack(
        f"New BonaFide submission `{tracking_id}`\n"
        f"• metric: `{metric_name}`\n"
        f"• rows: {len(df)}\n"
        f"• code/paper: {code_link}\n"
        f"• contact: {contact or '(none)'}\n"
        f"• description: {description or '(none)'}"
    )
    return (
        f"Submission received. Tracking ID: **`{tracking_id}`**.\n\n"
        f"A maintainer will review and run the evaluation, then contact you "
        f"via the link or contact info you provided within 72 hours.",
        tracking_id,
        now,
    )
| # ---------- UI ---------- | |
# Base Gradio theme passed to gr.Blocks: Soft with indigo/slate hues and the
# Inter web font, falling back to generic sans-serif families.
THEME = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="slate",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
)
# Custom stylesheet passed to gr.Blocks. It constrains the page width, sets
# heading and table typography, styles the `.hero` header block emitted by the
# first gr.HTML call, and resets link styling inside paragraphs and list items.
CSS = """
.gradio-container {
max-width: 1180px !important;
margin: 0 auto !important;
}
.gradio-container h1 {
font-size: 2.0rem !important;
font-weight: 700 !important;
letter-spacing: -0.01em;
margin-top: 1.2rem;
margin-bottom: 0.4rem;
}
.gradio-container h2 {
font-size: 1.3rem !important;
font-weight: 600 !important;
margin-top: 2.2rem !important;
margin-bottom: 0.4rem !important;
}
.gradio-container h3 {
font-size: 1.0rem !important;
font-weight: 600 !important;
margin-top: 1.2rem !important;
}
.gradio-container table { font-size: 0.92rem; }
.gradio-container table th {
font-weight: 600 !important;
text-transform: none !important;
padding: 10px 12px !important;
white-space: nowrap !important;
}
.gradio-container table td {
padding: 8px 12px !important;
vertical-align: middle !important;
}
.gradio-container table td:first-child {
font-weight: 500 !important;
}
.hero {
text-align: center;
margin: 0.8rem 0 1.6rem 0;
}
.hero img {
width: 140px;
height: 140px;
object-fit: contain;
display: block;
margin: 0 auto 0.6rem auto;
}
.hero h1 {
margin-top: 0.2rem !important;
margin-bottom: 0.2rem !important;
}
.hero-tagline {
font-size: 1.05rem;
color: #4b5563;
margin: 0.2rem 0 0.6rem 0;
}
.hero-cite {
font-size: 0.9rem;
color: #6b7280;
max-width: 720px;
margin: 0 auto;
}
/* Inline prose links: kill Soft-theme's button-ish padding/background. */
.gradio-container li a,
.gradio-container p a {
display: inline !important;
padding: 0 !important;
margin: 0 !important;
background: transparent !important;
border: none !important;
}
"""
# Single-page UI, top to bottom: hero header, leaderboard, submission
# instructions and form, citation.
with gr.Blocks(title="BonaFide — CoT Faithfulness Metric Benchmark", theme=THEME, css=CSS) as demo:
    # The logo is inlined as a base64 data URI, so no static file route is needed.
    _logo_b64 = base64.b64encode((ASSETS / "bonafide_logo.png").read_bytes()).decode()
    gr.HTML(
        f"""
<div class="hero">
<img src="data:image/png;base64,{_logo_b64}" alt="BonaFide" />
<h1>BonaFide</h1>
<p class="hero-tagline">A benchmark for chain-of-thought faithfulness metrics.</p>
<p class="hero-cite">From the paper <em>Faithfulness Metrics Don't Measure Faithfulness: A Meta-Evaluation with Ground Truth</em> by Yoav Gur-Arieh, Ana Marasović, and Mor Geva (2026).</p>
</div>
"""
    )
    gr.Markdown("## Leaderboard")
    gr.Markdown(
        "BonaFide evaluates **chain-of-thought faithfulness metrics** — "
        "tools that claim to detect when a model's reasoning trace fails "
        "to reflect the computation behind its answer. Each metric is "
        "scored by **AUROC against ground-truth faithfulness labels**, "
        "which we construct from carefully designed tasks where the "
        "model's output reveals which intermediate steps must have "
        "produced it. Per-metric AUROC is shown at both the CoT and step "
        "level, aggregated across tasks and settings."
    )
    # One datatype per leaderboard column: the two AUROC columns are markdown
    # so the "__…__" bold marker renders; the wall-time columns are numeric.
    gr.Dataframe(
        value=build_leaderboard(),
        datatype=["str", "markdown", "number", "markdown", "number"],
        interactive=False,
        wrap=True,
        max_height=900,
    )
    # The evaluator script is offered as a download through a data URI.
    _eval_b64 = base64.b64encode(
        (ROOT / "scripts" / "evaluate_submission.py").read_bytes()
    ).decode()
    gr.HTML(
        f"""
<h2>Submit a metric</h2>
<p>Steps:</p>
<ol>
<li>Download the <a href="https://huggingface.co/datasets/yoavgurarieh/BonaFide" target="_blank">BonaFide</a> dataset.</li>
<li>Run your metric and produce a <code>submission.csv</code> (or <code>submission.csv.gz</code>) with exactly three columns: <code>id, score, wall_time_s</code>. <code>id</code> matches the row id in the dataset; <code>score</code> is a real number with the convention <strong>higher = faithful</strong>; <code>wall_time_s</code> is the wall-time (in seconds) your metric spent computing the score for that row.</li>
<li>You can validate locally with the evaluator <a href="data:text/x-python;base64,{_eval_b64}" download="evaluate_submission.py">script</a> to get the AUROC scores.</li>
<li>Submit below. You'll receive a tracking ID. A maintainer reviews the submission and adds the results to the leaderboard.</li>
</ol>
"""
    )
    metric_name = gr.Textbox(label="Metric name", placeholder="my_metric_v1")
    description = gr.Textbox(
        label="One-paragraph description",
        lines=3,
        placeholder="What does the metric measure and how is the score computed?",
    )
    code_link = gr.Textbox(
        label="Code or paper link (required)",
        placeholder="https://github.com/you/your-metric or https://arxiv.org/abs/...",
    )
    contact = gr.Textbox(
        label="Contact email (required)",
        placeholder="you@example.com",
    )
    file_in = gr.File(
        label="submission.csv or submission.csv.gz",
        file_types=[".csv", ".gz"],
    )
    submit_btn = gr.Button("Submit", variant="primary")
    status = gr.Markdown()
    tracking = gr.Textbox(label="Tracking ID", interactive=False)
    # Timestamp of the last accepted submission; handle_submission() reads it
    # for the cooldown check and returns the updated value.
    last_submit_ts = gr.State(value=0.0)
    submit_btn.click(
        handle_submission,
        inputs=[metric_name, description, code_link, contact, file_in, last_submit_ts],
        outputs=[status, tracking, last_submit_ts],
    )
    # Explicit encoding: the default depends on the host locale.
    gr.Markdown((PAGES / "cite.md").read_text(encoding="utf-8"))
if __name__ == "__main__":
    # Listen on all interfaces at port 7860 when run as a script.
    # NOTE(review): show_error / ssr_mode semantics depend on the installed
    # Gradio version — confirm against its launch() documentation.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, ssr_mode=False)