"""Abstention behavior evaluation.""" from __future__ import annotations from app.evaluation.simulator_rollouts import run_rollouts def abstention_eval() -> dict[str, float]: rows = run_rollouts(episodes=8, difficulty="hard") if not rows: return {"appropriate_abstention_rate": 0.0} qualified = [ row for row in rows if float((row.get("reward_breakdown", {}) or {}).get("abstention_quality_score", 0.0)) >= 0.6 ] if not qualified: return {"appropriate_abstention_rate": 0.0} appropriate = sum(1.0 for row in qualified if bool(row.get("abstain", False))) return {"appropriate_abstention_rate": round(appropriate / len(qualified), 6)}