# Spaces: Otter21/Gov_Workflow_RL (status: Running)
# File size: 5,055 Bytes
# Revision: df97e68

"""
reward.py — Gov Workflow OpenEnv Phase 4: Dense Reward Shaping

Formula (per step):
  R_t = progress_reward + completion_reward + recovery_reward + stability_bonus
        - waiting_penalty - sla_penalty - fairness_penalty
        - invalid_action_penalty - idle_capacity_penalty - oscillation_penalty

All coefficients are named constants — never magic numbers inline.
"""
from __future__ import annotations
from app.models import RewardModel

# ── Reward weights (added to the step total) ──────────────────────
# Granted for every stage advance.
COEFF_PROGRESS: float = 0.7
# Granted for every completed case.
COEFF_COMPLETION: float = 4.0
# Granted for every missing-doc case that gets unblocked.
COEFF_RECOVERY: float = 1.5
# Granted for a step with zero SLA breaches and zero invalid actions.
COEFF_STABILITY: float = 0.1

# ── Penalty weights (subtracted from the step total) ──────────────
# Charged per case per day sitting in the backlog.
COEFF_WAITING: float = 0.04
# Charged per new SLA breach.
COEFF_SLA: float = 1.5
# Charged per unit of fairness gap in excess of the tolerance.
COEFF_FAIRNESS: float = 2.0
# Flat charge per invalid action.
COEFF_INVALID: float = 1.5
# Charged per idle officer-day.
COEFF_IDLE: float = 0.05
# Charged per oscillation event (repeated contradictory actions).
COEFF_OSCILLATION: float = 0.15

# ── Fallback fairness tolerance ───────────────────────────────────
# Used when the task does not set its own fairness threshold.
DEFAULT_FAIRNESS_TOLERANCE: float = 0.40


def _rounded(value: float) -> float:
    """Round *value* to 4 decimals, mapping negative zero to ``0.0``.

    ``round(-0.0, 4)`` evaluates to ``-0.0``; adding ``0.0`` turns that into
    ``0.0`` and leaves every other value unchanged.
    """
    return round(value, 4) + 0.0


def compute_reward(
    *,
    stage_advances: int,
    completions: int,
    active_backlog: int,
    new_sla_breaches: int,
    fairness_gap: float,
    fairness_threshold: float | None,
    invalid_action: bool,
    idle_capacity: int,
    newly_unblocked_docs: int = 0,
    oscillation_detected: bool = False,
    award_stability_bonus: bool = True,
) -> RewardModel:
    """
    Compute one-step dense reward.

    Args:
        stage_advances:        Number of applications that moved forward one stage today.
        completions:           Number of applications fully completed today.
        active_backlog:        Total cases still pending (creates waiting pressure).
        new_sla_breaches:      New SLA deadline violations this step.
        fairness_gap:          Cross-service completion fairness gap [0.0, 1.0].
        fairness_threshold:    Task-defined acceptable fairness gap (or None → default).
        invalid_action:        Whether the submitted action was invalid.
        idle_capacity:         Officer-days wasted idle while backlog exists.
        newly_unblocked_docs:  Cases unblocked after missing-doc resolution (positive signal).
        oscillation_detected:  True if agent is rapidly reversing recent decisions.
        award_stability_bonus: When False, the stability bonus is 0.0 even if the
                               step had no SLA breaches and no invalid action.

    Returns:
        RewardModel with every component rounded to 4 decimals. Reward
        components are stored as computed; penalty components are stored
        negated (the signed contribution to the total), with a zero penalty
        reported as 0.0 rather than -0.0. total_reward is the scalar sum,
        computed from the unrounded components and then rounded.
    """
    # ── Positive components ───────────────────────────────────────
    progress_reward = COEFF_PROGRESS * stage_advances
    completion_reward = COEFF_COMPLETION * completions
    recovery_reward = COEFF_RECOVERY * newly_unblocked_docs
    # The bonus is all-or-nothing: a single breach or an invalid action voids it.
    stability_bonus = (
        COEFF_STABILITY
        if (award_stability_bonus and new_sla_breaches == 0 and not invalid_action)
        else 0.0
    )

    # ── Negative components (kept as positive magnitudes here) ────
    waiting_penalty = COEFF_WAITING * active_backlog

    sla_penalty = COEFF_SLA * new_sla_breaches

    # Only the part of the gap above the tolerance is penalised.
    tolerance = fairness_threshold if fairness_threshold is not None else DEFAULT_FAIRNESS_TOLERANCE
    unfairness_excess = max(0.0, fairness_gap - tolerance)
    fairness_penalty = COEFF_FAIRNESS * unfairness_excess

    invalid_action_penalty = COEFF_INVALID if invalid_action else 0.0

    idle_capacity_penalty = COEFF_IDLE * idle_capacity

    oscillation_penalty = COEFF_OSCILLATION if oscillation_detected else 0.0

    # ── Total ─────────────────────────────────────────────────────
    total_reward = (
        progress_reward + completion_reward + recovery_reward + stability_bonus
        - waiting_penalty - sla_penalty - fairness_penalty
        - invalid_action_penalty - idle_capacity_penalty - oscillation_penalty
    )

    # Penalties are negated before rounding so each field holds its signed
    # contribution; _rounded() keeps a zero penalty from surfacing as -0.0.
    return RewardModel(
        total_reward=_rounded(total_reward),
        progress_reward=_rounded(progress_reward),
        completion_reward=_rounded(completion_reward),
        recovery_reward=_rounded(recovery_reward),
        stability_bonus=_rounded(stability_bonus),
        waiting_penalty=_rounded(-waiting_penalty),
        sla_penalty=_rounded(-sla_penalty),
        fairness_penalty=_rounded(-fairness_penalty),
        invalid_action_penalty=_rounded(-invalid_action_penalty),
        idle_capacity_penalty=_rounded(-idle_capacity_penalty),
        oscillation_penalty=_rounded(-oscillation_penalty),
    )