# polyguard-openenv/app/evaluation/robustness_eval.py
# Commit 877add7 (verified), uploaded by TheJackBright: "Deploy PolyGuard OpenEnv Space"
"""Robustness evaluation suite computed from perturbed rollouts."""
from __future__ import annotations
from app.evaluation.simulator_rollouts import run_rollouts
def _safety_rate(rows: list[dict]) -> float:
if not rows:
return 0.0
return round(sum(1.0 for row in rows if bool(row.get("legal", False))) / len(rows), 6)
def robustness_eval(*, episodes: int = 6) -> dict[str, float]:
    """Run one batch of rollouts per robustness scenario and score each batch.

    Every scenario calls ``run_rollouts`` with the shared ``episodes`` count
    plus its own keyword arguments, and the returned rows are reduced to a
    single number by ``_safety_rate``.

    Args:
        episodes: Number of episodes requested from ``run_rollouts`` for each
            scenario. Defaults to ``6``, the value previously hard-coded in
            every call, so existing zero-argument callers are unaffected.

    Returns:
        A mapping from metric name to the score of that scenario's rollouts,
        in the order the scenarios are listed below.
    """
    # (metric name, keyword arguments forwarded to run_rollouts).
    # Order matters: rollouts are executed in this order and the result dict
    # preserves it.
    scenarios: tuple[tuple[str, dict[str, str]], ...] = (
        (
            "missing_labs_safety_rate",
            {"difficulty": "hard", "perturbation": "missing_labs"},
        ),
        (
            "noisy_dose_info_safety_rate",
            {"difficulty": "medium", "perturbation": "noisy_dose_info"},
        ),
        (
            "conflicting_meds_safety_rate",
            {"difficulty": "hard", "perturbation": "conflicting_meds"},
        ),
        (
            "alias_noise_safety_rate",
            {"difficulty": "medium", "perturbation": "alias_noise"},
        ),
        # NOTE(review): despite the "detection_rate" name, this metric is
        # scored with _safety_rate like every other entry — confirm intended.
        (
            "hidden_duplicate_detection_rate",
            {"difficulty": "hard", "perturbation": "hidden_duplicate"},
        ),
        # NOTE(review): this is the only scenario that passes ``policy_stack``
        # and no ``perturbation`` — confirm this is how wrong-candidate-id
        # resilience is meant to be exercised.
        (
            "wrong_candidate_id_resilience",
            {"difficulty": "medium", "policy_stack": "bandit-only"},
        ),
        (
            "stale_evidence_safety_rate",
            {"difficulty": "hard", "perturbation": "stale_evidence"},
        ),
        (
            "delayed_ade_manifestation_safety_rate",
            {"difficulty": "hard", "perturbation": "delayed_ade"},
        ),
    )

    return {
        metric: _safety_rate(run_rollouts(episodes=episodes, **rollout_kwargs))
        for metric, rollout_kwargs in scenarios
    }