Spaces:
Running
Running
File size: 705 Bytes
877add7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | """Abstention behavior evaluation."""
from __future__ import annotations
from app.evaluation.simulator_rollouts import run_rollouts
def abstention_eval(
    *,
    episodes: int = 8,
    difficulty: str = "hard",
    min_quality: float = 0.6,
) -> dict[str, float]:
    """Report how often qualifying rollout rows carry a truthy ``abstain`` flag.

    Calls ``run_rollouts`` and keeps the rows whose
    ``reward_breakdown["abstention_quality_score"]`` is at least
    ``min_quality``. The result is the fraction of those rows whose
    ``abstain`` value is truthy.

    Args:
        episodes: Forwarded to ``run_rollouts`` as ``episodes``.
        difficulty: Forwarded to ``run_rollouts`` as ``difficulty``.
        min_quality: Inclusive lower bound on the abstention quality score
            for a row to count towards the rate.

    Returns:
        A dict with the single key ``"appropriate_abstention_rate"``, rounded
        to six decimal places. The rate is ``0.0`` when there are no rows or
        when no row reaches ``min_quality``.
    """
    # NOTE(review): rows are assumed to be dict-like (they are read via
    # ``.get``) -- confirm against ``run_rollouts``.
    rows = run_rollouts(episodes=episodes, difficulty=difficulty)
    if not rows:
        return {"appropriate_abstention_rate": 0.0}

    def quality_score(row) -> float:
        # Both the breakdown and the score may be missing or ``None``; the
        # ``or`` fallbacks treat either case as a score of 0.0 instead of
        # letting ``float(None)`` raise ``TypeError``.
        breakdown = row.get("reward_breakdown") or {}
        return float(breakdown.get("abstention_quality_score") or 0.0)

    # Keep the ``>=`` comparison so a NaN score never qualifies.
    qualified = [row for row in rows if quality_score(row) >= min_quality]
    if not qualified:
        # Avoid dividing by zero when nothing meets the threshold.
        return {"appropriate_abstention_rate": 0.0}

    abstained = sum(1 for row in qualified if row.get("abstain", False))
    return {"appropriate_abstention_rate": round(abstained / len(qualified), 6)}
|