# polyguard-openenv/app/training/reward_functions.py
# TheJackBright's picture
# Deploy PolyGuard OpenEnv Space
# 877add7 verified
"""Standalone reward functions with strict [0.001, 0.999] output."""
from __future__ import annotations
from app.common.normalization import clamp_reward
def format_compliance_score(valid: bool) -> float:
    """Score schema validity.

    A truthy ``valid`` selects 0.999 and a falsy one selects 0.001; the
    selected value is then passed through ``clamp_reward``.
    """
    if valid:
        raw_score = 0.999
    else:
        raw_score = 0.001
    return clamp_reward(raw_score)
def candidate_alignment_score(aligned: bool) -> float:
    """Score whether the selected action references the legal candidate set.

    Aligned selections map to 0.999, everything else to 0.001, with the
    value routed through ``clamp_reward`` before being returned.
    """
    if aligned:
        return clamp_reward(0.999)
    return clamp_reward(0.001)
def legality_score(legal: bool) -> float:
    """Score hard-constraint satisfaction.

    Starts from the 0.001 value used for an illegal action and switches to
    0.999 when ``legal`` is truthy; the result goes through ``clamp_reward``.
    """
    raw_score = 0.001
    if legal:
        raw_score = 0.999
    return clamp_reward(raw_score)
def safety_delta_score(delta: float) -> float:
    """Map a risk delta to a score; a positive delta means lower safety risk.

    The delta is scaled by 0.4 and added to a 0.5 midpoint, so a zero delta
    yields 0.5 before ``clamp_reward`` is applied.
    """
    scaled_delta = delta * 0.4
    raw_score = 0.5 + scaled_delta
    return clamp_reward(raw_score)
def burden_improvement_score(delta: float) -> float:
    """Score burden reduction; a positive delta indicates lower burden.

    Uses a 0.5 midpoint plus the delta scaled by 0.4, then hands the value
    to ``clamp_reward``.
    """
    scaled_delta = delta * 0.4
    raw_score = 0.5 + scaled_delta
    return clamp_reward(raw_score)
def disease_stability_score(stability: float) -> float:
    """Return the disease-stability proxy after ``clamp_reward``.

    The proxy is expected in [0, 1]; imputing a default for a missing value
    is left to the caller.
    """
    stability_proxy = stability
    return clamp_reward(stability_proxy)
def dosing_quality_score(quality: float) -> float:
    """Return the dose-quality proxy after ``clamp_reward``.

    The proxy is expected in [0, 1]; callers supply a neutral default for
    scenarios that involve no dosing decision.
    """
    quality_proxy = quality
    return clamp_reward(quality_proxy)
def abstention_quality_score(good_abstain: bool) -> float:
    """Judge the quality of an abstention, not merely that one occurred.

    A good abstention selects 0.8 and any other case selects 0.3; the
    selected value is passed through ``clamp_reward``.
    """
    if good_abstain:
        raw_score = 0.8
    else:
        raw_score = 0.3
    return clamp_reward(raw_score)
def efficiency_score(step_fraction: float) -> float:
    """Reward shorter successful trajectories with a higher score.

    The score is the complement of ``step_fraction`` (``1.0 - step_fraction``)
    passed through ``clamp_reward``.
    """
    unused_fraction = 1.0 - step_fraction
    return clamp_reward(unused_fraction)
def process_fidelity_score(fidelity: float) -> float:
    """Process-supervision score for a valid clinical decision sequence.

    The supplied ``fidelity`` value is returned after ``clamp_reward``.
    """
    fidelity_value = fidelity
    return clamp_reward(fidelity_value)
def explanation_grounding_score(grounded: float) -> float:
    """Score how well an explanation is supported by grounding.

    The supplied ``grounded`` value is returned after ``clamp_reward``.
    """
    grounding_value = grounded
    return clamp_reward(grounding_value)
def anti_cheat_score(exploit: bool) -> float:
    """Give exploit-like behavior the floor score.

    A truthy ``exploit`` maps to 0.001; otherwise the score is 0.999. Both
    values are routed through ``clamp_reward``.
    """
    if exploit:
        return clamp_reward(0.001)
    return clamp_reward(0.999)
def uncertainty_calibration_score(calibration: float) -> float:
    """Score confidence calibration.

    The supplied ``calibration`` value is returned after ``clamp_reward``.
    """
    calibration_value = calibration
    return clamp_reward(calibration_value)