polyguard-openenv / app /evaluation /abstention_eval.py
TheJackBright's picture
Deploy PolyGuard OpenEnv Space
877add7 verified
"""Abstention behavior evaluation."""
from __future__ import annotations
from app.evaluation.simulator_rollouts import run_rollouts
def abstention_eval(
    rows: list[dict[str, object]] | None = None,
    *,
    episodes: int = 8,
    difficulty: str = "hard",
    quality_threshold: float = 0.6,
) -> dict[str, float]:
    """Return the share of qualifying rollout rows in which the agent abstained.

    A row qualifies when its ``reward_breakdown["abstention_quality_score"]``
    is at least ``quality_threshold``. Among qualifying rows, the rate is the
    fraction whose ``"abstain"`` entry is truthy, rounded to 6 decimals.

    Args:
        rows: Rollout records to score. When ``None`` (the default), records
            are obtained from ``run_rollouts(episodes=episodes,
            difficulty=difficulty)``.
        episodes: Forwarded to ``run_rollouts`` when ``rows`` is ``None``.
        difficulty: Forwarded to ``run_rollouts`` when ``rows`` is ``None``.
        quality_threshold: Minimum abstention quality score for a row to be
            counted in the denominator.

    Returns:
        ``{"appropriate_abstention_rate": rate}``; ``rate`` is ``0.0`` when
        there are no rows or none of them qualifies.
    """
    if rows is None:
        rows = run_rollouts(episodes=episodes, difficulty=difficulty)
    # Materialize once so one-shot iterables can be filtered safely.
    records = list(rows or ())

    qualified = [
        record
        for record in records
        if _abstention_quality(record) >= quality_threshold
    ]
    # Covers both "no rows at all" and "no row met the threshold".
    if not qualified:
        return {"appropriate_abstention_rate": 0.0}

    abstained = sum(1 for record in qualified if record.get("abstain", False))
    return {"appropriate_abstention_rate": round(abstained / len(qualified), 6)}


def _abstention_quality(row: dict[str, object]) -> float:
    """Read a row's abstention quality score, treating missing/None as 0.0."""
    breakdown = row.get("reward_breakdown") or {}
    score = breakdown.get("abstention_quality_score")
    # An explicit None score would make float() raise; score it like a
    # missing key instead.
    return 0.0 if score is None else float(score)