File size: 705 Bytes
877add7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
"""Abstention behavior evaluation."""

from __future__ import annotations

from app.evaluation.simulator_rollouts import run_rollouts


def abstention_eval() -> dict[str, float]:
    rows = run_rollouts(episodes=8, difficulty="hard")
    if not rows:
        return {"appropriate_abstention_rate": 0.0}
    qualified = [
        row
        for row in rows
        if float((row.get("reward_breakdown", {}) or {}).get("abstention_quality_score", 0.0)) >= 0.6
    ]
    if not qualified:
        return {"appropriate_abstention_rate": 0.0}
    appropriate = sum(1.0 for row in qualified if bool(row.get("abstain", False)))
    return {"appropriate_abstention_rate": round(appropriate / len(qualified), 6)}