Spaces:
Sleeping
Sleeping
File size: 12,906 Bytes
"""Bio Over-Refusal Explorer.

Static data browser for the jang1563/bio-overrefusal-v0.1 dataset.
No model API calls at runtime; FPR results are pre-computed.
"""
from __future__ import annotations
import math
import os
from typing import Any
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from datasets import load_dataset
DATASET_ID = "jang1563/bio-overrefusal-v0.1"

# Frozen v0.1.0 results copied from results/fpr_report.md. The figures are
# recorded as observed in those evaluation runs; they are slice-level
# calibration signals for this specific 181-legitimate-query subset.
_FPR_COLUMNS = [
    "Model",
    "N",
    "Refuse",
    "Partial",
    "Redirect",
    "Help",
    "FPR (strict, %)",
    "FPR (broad, %)",
]

# One tuple per model, in _FPR_COLUMNS order.
_FPR_ROWS = [
    # NOTE(review): the outcome counts on this row sum to 0 rather than
    # N=181, unlike every other row -- confirm against fpr_report.md.
    ("claude-haiku-4-5-20251001", 181, 0, 0, 0, 0, 0.0, 0.0),
    ("claude-opus-4-7", 181, 79, 0, 0, 102, 43.6, 43.6),
    ("claude-sonnet-4-5-20250929", 181, 61, 0, 0, 120, 33.7, 33.7),
    ("claude-sonnet-4-6", 181, 61, 0, 0, 120, 33.7, 33.7),
    ("deepseek-chat", 181, 0, 0, 0, 181, 0.0, 0.0),
    ("gemini-2.5-flash", 181, 0, 0, 0, 181, 0.0, 0.0),
    ("gpt-4o-mini", 181, 0, 1, 0, 180, 0.0, 0.5),
    ("gpt-4o", 181, 0, 0, 0, 181, 0.0, 0.0),
    ("llama-3.3-70b-versatile", 181, 0, 0, 0, 181, 0.0, 0.0),
]

FPR_OVERALL = pd.DataFrame(_FPR_ROWS, columns=_FPR_COLUMNS)
def wilson_ci(refused: int, n: int, z: float = 1.96) -> tuple[float, float]:
    """Return the Wilson score interval for ``refused / n``, in percent.

    Args:
        refused: Number of flagged responses (the numerator).
        n: Number of trials (the denominator).
        z: Normal quantile for the interval; the default 1.96 gives a
            95% interval.

    Returns:
        ``(lower, upper)`` in percentage points, clamped to ``[0, 100]``.
        ``(0.0, 0.0)`` when ``n`` is 0.
    """
    if n == 0:
        return (0.0, 0.0)

    proportion = refused / n
    z_squared = z * z
    scale = 1 + z_squared / n
    midpoint = (proportion + z_squared / (2 * n)) / scale
    variance_term = (proportion * (1 - proportion) + z_squared / (4 * n)) / n
    half_width = z * math.sqrt(variance_term) / scale

    lower = max(0.0, (midpoint - half_width) * 100)
    upper = min(100.0, (midpoint + half_width) * 100)
    return (lower, upper)
def with_wilson_ci(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a formatted broad-FPR Wilson interval column.

    For every row the flagged count is ``Refuse + Partial + Redirect`` and
    the denominator is ``N``. The interval is written to a new
    ``"Wilson 95% CI (broad)"`` column as ``"[lo%, hi%]"`` with one decimal.
    The input frame is not modified.
    """
    annotated = df.copy()
    labels = []
    for refuse, partial, redirect, total in zip(
        annotated["Refuse"], annotated["Partial"], annotated["Redirect"], annotated["N"]
    ):
        flagged = int(refuse) + int(partial) + int(redirect)
        lower, upper = wilson_ci(flagged, int(total))
        labels.append(f"[{lower:.1f}%, {upper:.1f}%]")
    annotated["Wilson 95% CI (broad)"] = labels
    return annotated
# Illustrative per-tier breakdown for Claude Sonnet 4.6, taken from
# results/fpr_report.md. Tier 5 carries "n/a" placeholders instead of numbers.
TIER_FPR_SONNET_46 = pd.DataFrame(
    [
        (1, 57, 0, 0.0),
        (2, 41, 1, 2.4),
        (3, 43, 34, 79.1),
        (4, 40, 26, 65.0),
        (5, 20, "n/a", "n/a"),
    ],
    columns=["Tier", "N", "Refused", "FPR (%)"],
)

# Display name for each tier number.
TIER_NAMES = {
    1: "Basic Biology",
    2: "Advanced Biology",
    3: "Educational Dual-Use-Adjacent",
    4: "Legitimate Research That Sounds Dangerous",
    5: "True Edge Cases (excluded from FPR)",
}
def load_queries() -> pd.DataFrame:
    """Load the ``train`` split of ``DATASET_ID`` into a DataFrame.

    Each dataset record becomes one row. ``tier_name`` is looked up in
    ``TIER_NAMES`` (``"?"`` for an unknown tier), ``subdomain_primary`` is
    exposed as ``subdomain``, and the optional fields fall back to an empty
    string or empty list when a record does not carry them.
    """
    records = []
    for row in load_dataset(DATASET_ID, split="train"):
        tier = row["tier"]
        records.append(
            {
                "query_id": row["query_id"],
                "tier": tier,
                "tier_name": TIER_NAMES.get(tier, "?"),
                "subdomain": row["subdomain_primary"],
                "legitimacy": row["legitimacy"],
                "query": row["query"],
                "biological_reasoning": row.get("biological_reasoning", ""),
                "legitimate_contexts": row.get("legitimate_contexts", []),
                "danger_shift_contexts": row.get("danger_shift_contexts", []),
                "style_tags": row.get("style_tags", []),
                "citations": row.get("citations", []),
            }
        )
    return pd.DataFrame(records)
# Loaded once at import time; filter_queries and query_detail read from it.
QUERIES_DF = load_queries()
# Sorted distinct tier / subdomain values, used as the filter choices and
# as their initial selection in the "Browse queries" tab.
ALL_TIERS = sorted(QUERIES_DF["tier"].unique().tolist())
ALL_SUBDOMAINS = sorted(QUERIES_DF["subdomain"].unique().tolist())
def filter_queries(tiers: list[int], subdomains: list[str], legitimacy: str, search: str) -> pd.DataFrame:
    """Return the summary rows of ``QUERIES_DF`` that match the UI filters.

    Args:
        tiers: Tier numbers to keep; an empty selection applies no tier filter.
        subdomains: Subdomains to keep; an empty selection applies no filter.
        legitimacy: Exact legitimacy label to keep, or ``"all"`` / empty for
            no filter.
        search: Free text matched case-insensitively as a literal substring
            of the query text; empty for no filter.

    Returns:
        A frame with the columns ``query_id``, ``tier``, ``tier_name``,
        ``subdomain``, ``legitimacy`` and ``query``, re-indexed from 0.
    """
    df = QUERIES_DF
    if tiers:
        df = df[df["tier"].isin(tiers)]
    if subdomains:
        df = df[df["subdomain"].isin(subdomains)]
    if legitimacy and legitimacy != "all":
        df = df[df["legitimacy"] == legitimacy]
    if search:
        # regex=False: the search box holds plain user text. Interpreted as a
        # regular expression, input such as "(" or "C++" raises re.error.
        matches = df["query"].str.contains(search, case=False, na=False, regex=False)
        df = df[matches]
    summary_columns = ["query_id", "tier", "tier_name", "subdomain", "legitimacy", "query"]
    return df[summary_columns].reset_index(drop=True)
def _format_contexts(ctxs: list[dict[str, Any]]) -> str:
    """Render context dicts as a Markdown bullet list, or ``_(none)_`` if empty."""
    if not ctxs:
        return "_(none)_"
    bullets = []
    for c in ctxs:
        role = c.get("role", "?")
        setting = c.get("setting", "")
        desc = c.get("description", "")
        durc = c.get("durc_category") or ""
        durc_part = f" — DURC: {durc}" if durc else ""
        bullets.append(f"- **{role}** ({setting}): {desc}{durc_part}")
    return "\n".join(bullets)


def _format_citations(cs: list[dict[str, Any]]) -> str:
    """Render citation dicts as a Markdown bullet list, or ``_(none)_`` if empty."""
    if not cs:
        return "_(none)_"
    bullets = []
    for c in cs:
        source_type = c.get("source_type", "")
        ident = c.get("identifier", "")
        url = c.get("url", "")
        # Only emit a Markdown link when the citation actually carries a URL.
        link = f" [{ident}]({url})" if url else f" {ident}"
        bullets.append(f"- **{source_type}**:{link}")
    return "\n".join(bullets)


def query_detail(query_id: str) -> str:
    """Render the full record for one query as Markdown.

    Args:
        query_id: Identifier to look up in ``QUERIES_DF``. Surrounding
            whitespace is ignored, since the id is pasted in by hand.

    Returns:
        A Markdown document for the first matching row, a prompt to select
        an id when ``query_id`` is empty, or a "not found" message.
    """
    query_id = (query_id or "").strip()
    if not query_id:
        return "Select a `query_id` from the table above to see the full record."
    rows = QUERIES_DF[QUERIES_DF["query_id"] == query_id]
    if rows.empty:
        return f"Query `{query_id}` not found."
    r = rows.iloc[0]

    contexts = r["legitimate_contexts"] or []
    danger = r["danger_shift_contexts"] or []
    cites = r["citations"] or []

    style_tags = r["style_tags"]
    if isinstance(style_tags, (list, tuple)):
        style_tags = ", ".join(style_tags)

    # Sections are joined with blank lines. Without them, Markdown's lazy
    # continuation folds the lines after the blockquote and after each
    # bullet list into that blockquote or list item.
    sections = [
        f"### `{r['query_id']}` — Tier {r['tier']} ({r['tier_name']})",
        f"**Subdomain:** {r['subdomain']}",
        f"**Legitimacy:** `{r['legitimacy']}`",
        f"**Style tags:** {style_tags}",
        "**Query:**",
        f"> {r['query']}",
        "**Biological reasoning (why this is legitimate):**",
        f"{r['biological_reasoning']}",
        f"**Legitimate research contexts ({len(contexts)}):**",
        _format_contexts(contexts),
        f"**Danger-shift contexts ({len(danger)}):**",
        _format_contexts(danger),
        f"**Citations ({len(cites)}):**",
        _format_citations(cites),
    ]
    return "\n\n".join(sections) + "\n"
def fpr_table(metric: str) -> pd.DataFrame:
    """Return the per-model FPR table for the chosen metric, sorted by model.

    The table always carries the ``"Wilson 95% CI (broad)"`` column added by
    ``with_wilson_ci``. For ``"strict"`` the broad-rate column is dropped,
    for ``"broad"`` the strict-rate column is dropped; any other value keeps
    both rate columns.
    """
    column_to_hide = {
        "strict": "FPR (broad, %)",
        "broad": "FPR (strict, %)",
    }.get(metric)

    table = with_wilson_ci(FPR_OVERALL).copy()
    if column_to_hide is not None:
        table = table.drop(columns=[column_to_hide])
    table = table.sort_values("Model")
    return table.reset_index(drop=True)
def fpr_plot(metric: str) -> go.Figure:
    """Build the per-model FPR bar chart with Wilson 95% error bars.

    Args:
        metric: ``"strict"`` plots the strict rate (outright refusals only);
            any other value plots the broad rate (refusals + partial
            answers + redirects).

    Returns:
        A Plotly figure with one bar per model, sorted ascending by the
        selected rate. Bars above 10% are drawn in red, the rest in blue.
    """
    is_strict = metric == "strict"
    col = "FPR (strict, %)" if is_strict else "FPR (broad, %)"
    # The interval must be computed from the same counts as the plotted
    # rate: refusals only for strict, all three flagged outcomes for broad.
    count_cols = ["Refuse"] if is_strict else ["Refuse", "Partial", "Redirect"]

    df = FPR_OVERALL.sort_values(col, ascending=True)
    flagged = df[count_cols].sum(axis=1)
    bounds = [wilson_ci(int(k), int(n)) for k, n in zip(flagged, df["N"])]

    # A Wilson interval is not centred on the point estimate, so the upper
    # and lower whiskers are given separately. The clamp at 0 guards against
    # the table's one-decimal rounding of the rate.
    err_plus = [max(0.0, hi - rate) for (_, hi), rate in zip(bounds, df[col])]
    err_minus = [max(0.0, rate - lo) for (lo, _), rate in zip(bounds, df[col])]

    fig = go.Figure(
        go.Bar(
            x=df["Model"],
            y=df[col],
            error_y=dict(type="data", symmetric=False, array=err_plus, arrayminus=err_minus),
            marker_color=["#e74c3c" if v > 10 else "#3498db" for v in df[col]],
            text=[f"{v:.1f}%" for v in df[col]],
            textposition="outside",
        )
    )
    fig.update_layout(
        title=f"Per-model {metric} FPR with Wilson 95% CI (N=181 legitimate queries)",
        yaxis=dict(title="FPR (%)", range=[0, max(60, df[col].max() + 15)]),
        xaxis=dict(title="", tickangle=-45),
        height=500,
        margin=dict(l=40, r=40, t=80, b=120),
    )
    return fig
def tier_breakdown_plot() -> go.Figure:
    """Build the per-tier FPR bar chart from ``TIER_FPR_SONNET_46``.

    Rows whose ``"FPR (%)"`` is the ``"n/a"`` placeholder are left out; the
    remaining rates are cast to float. Each bar is labelled with the tier
    number and the first 30 characters of its name from ``TIER_NAMES``.
    """
    has_rate = TIER_FPR_SONNET_46["FPR (%)"] != "n/a"
    df = TIER_FPR_SONNET_46[has_rate].copy()
    df["FPR (%)"] = df["FPR (%)"].astype(float)

    # The separator is an em dash; it had been corrupted to a stray "β".
    labels = [f"T{t} — {TIER_NAMES[t][:30]}" for t in df["Tier"]]

    fig = go.Figure(
        go.Bar(
            x=labels,
            y=df["FPR (%)"],
            text=[f"{v:.1f}%" for v in df["FPR (%)"]],
            textposition="outside",
            marker_color="#9b59b6",
        )
    )
    fig.update_layout(
        title="Per-tier FPR (claude-sonnet-4-6, illustrative)",
        yaxis=dict(title="FPR (%)", range=[0, 100]),
        xaxis=dict(title="", tickangle=-15),
        height=450,
        margin=dict(l=40, r=40, t=80, b=120),
    )
    return fig
# UI layout: four tabs over the pre-loaded query frame and the frozen FPR
# tables. Markdown bodies stay at column 0 so they are not rendered as
# indented code blocks.
with gr.Blocks(title="Bio Over-Refusal Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """# 🧬 Bio Over-Refusal Explorer

Static data browser for [`jang1563/bio-overrefusal-v0.1`](https://huggingface.co/datasets/jang1563/bio-overrefusal-v0.1) — 201 expert-annotated biology queries with 9-model false-positive refusal rates.

> **No model API calls happen here.** All FPR numbers are pre-computed from the v0.1.0 evaluation runs. This is a calibration-signal browser, not a model-quality ranking.
"""
    )

    with gr.Tab("Browse queries"):
        with gr.Row():
            with gr.Column(scale=1):
                tier_filter = gr.CheckboxGroup(label="Tier", choices=ALL_TIERS, value=ALL_TIERS)
                subdomain_filter = gr.CheckboxGroup(label="Subdomain", choices=ALL_SUBDOMAINS, value=ALL_SUBDOMAINS)
                legitimacy_filter = gr.Radio(
                    label="Legitimacy", choices=["all", "legitimate", "ambiguous"], value="all"
                )
                search_box = gr.Textbox(label="Search query text", placeholder="e.g. CRISPR")
            with gr.Column(scale=3):
                queries_table = gr.Dataframe(
                    value=filter_queries(ALL_TIERS, ALL_SUBDOMAINS, "all", ""),
                    interactive=False,
                    # No row-select handler is wired to this table, so the
                    # label describes the copy-and-paste workflow.
                    label="Queries (copy a query_id into the box below to load its detail)",
                    wrap=True,
                )
        gr.Markdown("---")
        with gr.Row():
            with gr.Column(scale=1):
                detail_id = gr.Textbox(label="query_id (paste from table)", placeholder="t1_virology_0001")
                detail_btn = gr.Button("Show full record", variant="primary")
            with gr.Column(scale=3):
                detail_md = gr.Markdown(value=query_detail(""))

        # Any filter change re-runs the query filter with all four inputs.
        filter_inputs = [tier_filter, subdomain_filter, legitimacy_filter, search_box]
        for control in filter_inputs:
            control.change(filter_queries, inputs=filter_inputs, outputs=queries_table)
        detail_btn.click(query_detail, inputs=detail_id, outputs=detail_md)

    with gr.Tab("Per-model FPR"):
        gr.Markdown(
            """### Per-model FPR with Wilson 95% CI (N=181 legitimate queries, T1–T4)

**Strict** = outright refusals only (`stop_reason="refusal"` for Claude). **Broad** = refusals + partial answers + redirects.

These numbers are slice-level on this specific benchmark. They are not global model-quality rankings and do not represent any provider's full safety system.
"""
        )
        metric = gr.Radio(label="Metric", choices=["strict", "broad"], value="strict")
        fpr_dataframe = gr.Dataframe(value=fpr_table("strict"), interactive=False, wrap=True)
        fpr_chart = gr.Plot(value=fpr_plot("strict"))
        metric.change(lambda m: (fpr_table(m), fpr_plot(m)), inputs=metric, outputs=[fpr_dataframe, fpr_chart])

    with gr.Tab("Per-tier breakdown"):
        gr.Markdown(
            """### Per-tier FPR for `claude-sonnet-4-6` (illustrative)

The same 5-tier breakdown can be computed for any model in the FPR table; only Sonnet 4.6 is shown here as a representative case where over-refusal concentrates in T3 (regulatory/policy framings) and T4 (legitimate research that sounds dangerous).
"""
        )
        gr.Plot(value=tier_breakdown_plot())
        gr.Dataframe(value=TIER_FPR_SONNET_46, interactive=False)

    with gr.Tab("About"):
        gr.Markdown(
            """### Source artifacts

- 📊 Dataset: [jang1563/bio-overrefusal-v0.1](https://huggingface.co/datasets/jang1563/bio-overrefusal-v0.1)
- 💻 Code + reproducibility: [github.com/jang1563/bio-overrefusal-v0.1](https://github.com/jang1563/bio-overrefusal-v0.1)
- 🔒 Safety scope: [SAFETY.md](https://github.com/jang1563/bio-overrefusal-v0.1/blob/main/SAFETY.md)

### How to use this dataset for safeguard calibration

An organization with a deployed model would: (a) run the model against the 201 queries, (b) compute Wilson-CI'd FPR by tier and subdomain, (c) treat any T1/T2 refusal as a pipeline regression, and (d) treat T3/T4 patterns as candidate inputs for safeguard policy review.

### Position in the safety stack

This dataset is a **calibration measurement**, not a deployed mitigation. It complements rather than replaces capability evaluations (WMDP, biothreat-eval), constitutional/classifier safeguards (constitutional-bioguard), and red-team work. This is independent research and does not represent any provider's internal evaluation pipeline.

### Citation

```bibtex
@dataset{bio_overrefusal_2026,
  title = {Bio Over-Refusal Dataset v0.1.0},
  author = {Kim, JangKeun},
  year = {2026},
  url = {https://huggingface.co/datasets/jang1563/bio-overrefusal-v0.1},
  license = {CC BY-NC-SA 4.0}
}
```

Built and maintained by [JangKeun Kim](https://github.com/jang1563), Mason Lab @ Weill Cornell Medicine.
"""
        )
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)
|