"""
reward.py β€” Gov Workflow OpenEnv Phase 4: Dense Reward Shaping
Formula (per step):
R_t = progress_reward + completion_reward + recovery_reward + stability_bonus
- waiting_penalty - sla_penalty - fairness_penalty
- invalid_action_penalty - idle_capacity_penalty - oscillation_penalty
All coefficients are named constants β€” never magic numbers inline.
"""
from __future__ import annotations
from app.models import RewardModel
# ── Positive coefficients ─────────────────────────────────────────
COEFF_PROGRESS = 0.7 # per stage advance
COEFF_COMPLETION = 4.0 # per completed case
COEFF_RECOVERY = 1.5 # per unblocked missing-doc case resolved
COEFF_STABILITY = 0.1 # per step with no new SLA breach and a valid action (when enabled)
# ── Negative coefficients ─────────────────────────────────────────
COEFF_WAITING = 0.04 # per case per day in backlog
COEFF_SLA = 1.5 # per new SLA breach
COEFF_FAIRNESS = 2.0 # per unit of fairness excess above threshold
COEFF_INVALID = 1.5 # flat penalty per invalid action
COEFF_IDLE = 0.05 # per idle officer-day
COEFF_OSCILLATION = 0.15 # per oscillation event (repeated contradictory actions)
# ── Fairness default tolerance (when no threshold set by task) ────
DEFAULT_FAIRNESS_TOLERANCE = 0.40
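# Worked example of the formula (illustrative numbers, not from any logged
# episode): 2 stage advances, 1 completion, a backlog of 10 cases, and an
# otherwise clean step (no SLA breaches, valid action, bonus enabled) gives
#   R_t = 0.7*2 + 4.0*1 + 0.1 - 0.04*10 = 5.1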
def compute_reward(
*,
stage_advances: int,
completions: int,
active_backlog: int,
new_sla_breaches: int,
fairness_gap: float,
fairness_threshold: float | None,
invalid_action: bool,
idle_capacity: int,
newly_unblocked_docs: int = 0,
oscillation_detected: bool = False,
award_stability_bonus: bool = True,
) -> RewardModel:
"""
Compute one-step dense reward.
Args:
stage_advances: Number of applications that moved forward one stage today.
completions: Number of applications fully completed today.
active_backlog: Total cases still pending (creates waiting pressure).
new_sla_breaches: New SLA deadline violations this step.
fairness_gap: Cross-service completion fairness gap [0.0, 1.0].
        fairness_threshold: Task-defined acceptable fairness gap (None -> use the default).
invalid_action: Whether the submitted action was invalid.
idle_capacity: Officer-days wasted idle while backlog exists.
newly_unblocked_docs: Cases unblocked after missing-doc resolution (positive signal).
        oscillation_detected: True if the agent is rapidly reversing recent decisions.
        award_stability_bonus: Whether this step is eligible for the stability bonus.
Returns:
RewardModel with all components filled and total_reward as the scalar.
"""
# ── Positive components ───────────────────────────────────────
progress_reward = COEFF_PROGRESS * stage_advances
completion_reward = COEFF_COMPLETION * completions
recovery_reward = COEFF_RECOVERY * newly_unblocked_docs
stability_bonus = (
COEFF_STABILITY
if (award_stability_bonus and new_sla_breaches == 0 and not invalid_action)
else 0.0
)
# ── Negative components ───────────────────────────────────────
waiting_penalty = COEFF_WAITING * active_backlog
sla_penalty = COEFF_SLA * new_sla_breaches
tolerance = fairness_threshold if fairness_threshold is not None else DEFAULT_FAIRNESS_TOLERANCE
unfairness_excess = max(0.0, fairness_gap - tolerance)
fairness_penalty = COEFF_FAIRNESS * unfairness_excess
invalid_action_penalty = COEFF_INVALID if invalid_action else 0.0
idle_capacity_penalty = COEFF_IDLE * idle_capacity
oscillation_penalty = COEFF_OSCILLATION if oscillation_detected else 0.0
# ── Total ─────────────────────────────────────────────────────
total_reward = (
progress_reward + completion_reward + recovery_reward + stability_bonus
- waiting_penalty - sla_penalty - fairness_penalty
- invalid_action_penalty - idle_capacity_penalty - oscillation_penalty
)
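    # Penalty fields are stored as signed (negative) contributions, so the
    # component fields of the returned model sum back to total_reward.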
return RewardModel(
total_reward=round(total_reward, 4),
progress_reward=round(progress_reward, 4),
completion_reward=round(completion_reward, 4),
recovery_reward=round(recovery_reward, 4),
stability_bonus=round(stability_bonus, 4),
waiting_penalty=round(-waiting_penalty, 4),
sla_penalty=round(-sla_penalty, 4),
fairness_penalty=round(-fairness_penalty, 4),
invalid_action_penalty=round(-invalid_action_penalty, 4),
idle_capacity_penalty=round(-idle_capacity_penalty, 4),
oscillation_penalty=round(-oscillation_penalty, 4),
)
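# ── Smoke test ────────────────────────────────────────────────────
# Minimal usage sketch, not part of the environment loop. The scenario numbers
# are illustrative; run as `python -m app.reward` from the repo root. Assumes
# app.models is importable and that printing a RewardModel shows its fields.
if __name__ == "__main__":
    reward = compute_reward(
        stage_advances=2,
        completions=1,
        active_backlog=10,
        new_sla_breaches=0,
        fairness_gap=0.25,  # below the 0.40 default tolerance -> no penalty
        fairness_threshold=None,
        invalid_action=False,
        idle_capacity=3,
        newly_unblocked_docs=1,
    )
    # Expected total: 0.7*2 + 4.0*1 + 1.5*1 + 0.1 - 0.04*10 - 0.05*3 = 6.45
    print(reward)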