Spaces:

yoavgurarieh
/

BonaFide-Benchmark

Running

File size: 5,656 Bytes

"""Evaluate a BonaFide faithfulness-metric submission against ground truth.

Reads a submission CSV (columns: id, score, wall_time_s), joins it with the
BonaFide labels file on `id`, and reports per-level AUROC with 95% DeLong
confidence intervals — exactly the numbers that go on the leaderboard.

By convention, scores follow "higher = faithful". An AUROC below 0.5 means
your metric is anti-correlated with faithfulness on BonaFide.

This is the same script maintainers use to score your submission; running it
locally is the only way to know what your leaderboard row will say before you
submit.

Usage:
    python evaluate_submission.py \\
        --submission your_submission.csv \\
        --labels path/to/BonaFide_Dataset.csv
"""

import argparse
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats


REQUIRED_COLS = {"id", "score", "wall_time_s"}
MIN_CELL_N = 30
LEVELS = [
    ("CoT",  {"FAITHFUL_COT", "UNFAITHFUL_COT"}),
    ("Step", {"FAITHFUL_STEP", "UNFAITHFUL_STEP"}),
]


def delong_auc_ci(y_true, y_score, alpha=0.05):
    """DeLong CI on logit AUC. Returns (auc, low, high)."""
    y_true = np.asarray(y_true, dtype=int)
    y_score = np.asarray(y_score, dtype=float)
    pos_mask = y_true == 1
    neg_mask = y_true == 0
    n_pos, n_neg = int(pos_mask.sum()), int(neg_mask.sum())
    if n_pos == 0 or n_neg == 0:
        return float("nan"), float("nan"), float("nan")
    pos_scores = y_score[pos_mask]
    neg_scores = y_score[neg_mask]
    sorted_neg = np.sort(neg_scores)
    sorted_pos = np.sort(pos_scores)

    def placement(values, reference_sorted):
        less = np.searchsorted(reference_sorted, values, side="left")
        leq = np.searchsorted(reference_sorted, values, side="right")
        return (less + leq) / 2.0 / len(reference_sorted)

    V10 = placement(pos_scores, sorted_neg)
    V01 = 1.0 - placement(neg_scores, sorted_pos)
    auc = float(V10.mean())
    s10 = float(V10.var(ddof=1)) if n_pos > 1 else 0.0
    s01 = float(V01.var(ddof=1)) if n_neg > 1 else 0.0
    var = s10 / n_pos + s01 / n_neg
    if var <= 0:
        return auc, auc, auc
    eps = 1e-9
    a = min(max(auc, eps), 1 - eps)
    logit = np.log(a / (1 - a))
    se = np.sqrt(var) / (a * (1 - a))
    z = stats.norm.ppf(1 - alpha / 2)
    return (
        auc,
        float(1 / (1 + np.exp(-(logit - z * se)))),
        float(1 / (1 + np.exp(-(logit + z * se)))),
    )


def _read_csv(path: Path) -> pd.DataFrame:
    if path.name.endswith(".csv.gz") or path.suffix == ".gz":
        return pd.read_csv(path, compression="gzip")
    return pd.read_csv(path)


def evaluate(submission_path: Path, labels_path: Path) -> None:
    print(f"Loading submission from {submission_path} ...")
    sub = _read_csv(submission_path)

    missing = REQUIRED_COLS - set(sub.columns)
    extra = set(sub.columns) - REQUIRED_COLS
    if missing or extra:
        print(f"ERROR: submission must have exactly columns {sorted(REQUIRED_COLS)}.")
        if missing:
            print(f"  Missing: {sorted(missing)}")
        if extra:
            print(f"  Extra:   {sorted(extra)}")
        sys.exit(1)
    if not pd.to_numeric(sub["score"], errors="coerce").notna().all():
        print("ERROR: all `score` values must be numeric.")
        sys.exit(1)
    wt_numeric = pd.to_numeric(sub["wall_time_s"], errors="coerce")
    if not wt_numeric.notna().all() or (wt_numeric < 0).any():
        print("ERROR: all `wall_time_s` values must be non-negative numbers.")
        sys.exit(1)
    print(f"  {len(sub):,} scored rows")

    print(f"Loading labels from {labels_path} ...")
    labels = pd.read_csv(labels_path, usecols=["id", "label_type"]).dropna(
        subset=["label_type"]
    )
    print(f"  {len(labels):,} labeled rows")

    merged = sub.merge(labels, on="id", how="inner")
    if merged.empty:
        print("ERROR: no overlapping ids between submission and labels.")
        sys.exit(1)
    if len(merged) < len(sub):
        print(f"  {len(sub) - len(merged):,} submission rows had no matching label")
    if len(merged) < len(labels):
        print(f"  {len(labels) - len(merged):,} labels had no matching submission row")

    print()
    print(f"{'Level':<6} {'AUROC':<8} {'± 95% CI':<10} {'n':<6} {'mean wall-time':>16}")
    print("-" * 50)
    for level_name, label_set in LEVELS:
        cell = merged[merged["label_type"].isin(label_set)]
        if len(cell) < MIN_CELL_N:
            print(f"{level_name:<6} (skipped — only {len(cell)} matched rows, need ≥{MIN_CELL_N})")
            continue
        y_true = cell["label_type"].str.startswith("FAITHFUL").astype(int).to_numpy()
        y_score = cell["score"].to_numpy(dtype=float)
        auc, lo, hi = delong_auc_ci(y_true, y_score)
        margin = (hi - lo) / 2
        mean_wt = cell["wall_time_s"].mean()
        print(
            f"{level_name:<6} {auc:.3f}    ± {margin:.3f}    "
            f"{len(cell):<6} {mean_wt:>13.2f}s"
        )
    print()
    print(
        "Convention: scores are 'higher = faithful'. AUROC < 0.5 means your "
        "metric is anti-correlated with faithfulness on BonaFide."
    )


if __name__ == "__main__":
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--submission", type=Path, required=True,
                   help="Your submission CSV (id, score, wall_time_s).")
    p.add_argument("--labels", type=Path, required=True,
                   help="BonaFide dataset CSV with id and label_type columns.")
    args = p.parse_args()
    evaluate(args.submission, args.labels)