"""
reward.py β€” Gov Workflow OpenEnv Phase 4: Dense Reward Shaping
Formula (per step):
R_t = progress_reward + completion_reward + recovery_reward + stability_bonus
- waiting_penalty - sla_penalty - fairness_penalty
- invalid_action_penalty - idle_capacity_penalty - oscillation_penalty
All coefficients are named constants β€” never magic numbers inline.
"""
from __future__ import annotations
from app.models import RewardModel
# ── Positive coefficients ─────────────────────────────────────────
COEFF_PROGRESS = 0.7 # per stage advance
COEFF_COMPLETION = 4.0 # per completed case
COEFF_RECOVERY = 1.5 # per unblocked missing-doc case resolved
COEFF_STABILITY = 0.1 # per step with no new SLA breach and a valid action (when enabled)
# ── Negative coefficients ─────────────────────────────────────────
COEFF_WAITING = 0.04 # per case per day in backlog
COEFF_SLA = 1.5 # per new SLA breach
COEFF_FAIRNESS = 2.0 # per unit of fairness excess above threshold
COEFF_INVALID = 1.5 # flat penalty per invalid action
COEFF_IDLE = 0.05 # per idle officer-day
COEFF_OSCILLATION = 0.15 # per oscillation event (repeated contradictory actions)
# ── Fairness default tolerance (when no threshold set by task) ────
DEFAULT_FAIRNESS_TOLERANCE = 0.40
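# Worked example of the formula (illustrative numbers, not from any logged
# episode): 2 stage advances, 1 completion, a backlog of 10 cases, and an
# otherwise clean step (no SLA breaches, valid action, bonus enabled) gives
#   R_t = 0.7*2 + 4.0*1 + 0.1 - 0.04*10 = 5.1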
def compute_reward(
*,
stage_advances: int,
completions: int,
active_backlog: int,
new_sla_breaches: int,
fairness_gap: float,
fairness_threshold: float | None,
invalid_action: bool,
idle_capacity: int,
newly_unblocked_docs: int = 0,
oscillation_detected: bool = False,
award_stability_bonus: bool = True,
) -> RewardModel:
"""
Compute one-step dense reward.
Args:
stage_advances: Number of applications that moved forward one stage today.
completions: Number of applications fully completed today.
active_backlog: Total cases still pending (creates waiting pressure).
new_sla_breaches: New SLA deadline violations this step.
fairness_gap: Cross-service completion fairness gap [0.0, 1.0].
        fairness_threshold: Task-defined acceptable fairness gap (None -> use the default).
invalid_action: Whether the submitted action was invalid.
idle_capacity: Officer-days wasted idle while backlog exists.
newly_unblocked_docs: Cases unblocked after missing-doc resolution (positive signal).
        oscillation_detected: True if the agent is rapidly reversing recent decisions.
        award_stability_bonus: Whether this step is eligible for the stability bonus.
Returns:
RewardModel with all components filled and total_reward as the scalar.
"""
# ── Positive components ───────────────────────────────────────
progress_reward = COEFF_PROGRESS * stage_advances
completion_reward = COEFF_COMPLETION * completions
recovery_reward = COEFF_RECOVERY * newly_unblocked_docs
stability_bonus = (
COEFF_STABILITY
if (award_stability_bonus and new_sla_breaches == 0 and not invalid_action)
else 0.0
)
# ── Negative components ───────────────────────────────────────
waiting_penalty = COEFF_WAITING * active_backlog
sla_penalty = COEFF_SLA * new_sla_breaches
tolerance = fairness_threshold if fairness_threshold is not None else DEFAULT_FAIRNESS_TOLERANCE
unfairness_excess = max(0.0, fairness_gap - tolerance)
fairness_penalty = COEFF_FAIRNESS * unfairness_excess
invalid_action_penalty = COEFF_INVALID if invalid_action else 0.0
idle_capacity_penalty = COEFF_IDLE * idle_capacity
oscillation_penalty = COEFF_OSCILLATION if oscillation_detected else 0.0
# ── Total ─────────────────────────────────────────────────────
total_reward = (
progress_reward + completion_reward + recovery_reward + stability_bonus
- waiting_penalty - sla_penalty - fairness_penalty
- invalid_action_penalty - idle_capacity_penalty - oscillation_penalty
)
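    # Penalty fields are stored as signed (negative) contributions, so the
    # component fields of the returned model sum back to total_reward.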
return RewardModel(
total_reward=round(total_reward, 4),
progress_reward=round(progress_reward, 4),
completion_reward=round(completion_reward, 4),
recovery_reward=round(recovery_reward, 4),
stability_bonus=round(stability_bonus, 4),
waiting_penalty=round(-waiting_penalty, 4),
sla_penalty=round(-sla_penalty, 4),
fairness_penalty=round(-fairness_penalty, 4),
invalid_action_penalty=round(-invalid_action_penalty, 4),
idle_capacity_penalty=round(-idle_capacity_penalty, 4),
oscillation_penalty=round(-oscillation_penalty, 4),
)
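# ── Smoke test ────────────────────────────────────────────────────
# Minimal usage sketch, not part of the environment loop. The scenario numbers
# are illustrative; run as `python -m app.reward` from the repo root. Assumes
# app.models is importable and that printing a RewardModel shows its fields.
if __name__ == "__main__":
    reward = compute_reward(
        stage_advances=2,
        completions=1,
        active_backlog=10,
        new_sla_breaches=0,
        fairness_gap=0.25,  # below the 0.40 default tolerance -> no penalty
        fairness_threshold=None,
        invalid_action=False,
        idle_capacity=3,
        newly_unblocked_docs=1,
    )
    # Expected total: 0.7*2 + 4.0*1 + 1.5*1 + 0.1 - 0.04*10 - 0.05*3 = 6.45
    print(reward)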