TheJackBright Claude Opus 4.6 committed on
Commit
c314a65
·
1 Parent(s): 3948a09

Enforce strict (0.001, 0.999) bounds on ALL rewards and scores

Browse files

- rewards.py: fix early returns for invalid/timeout that skipped clamp
- env_core.py: align _REWARD_MIN/MAX to 0.001/0.999
- graders.py: align _SCORE_MIN/MAX to 0.001/0.999
- inference.py: align _clamp01 to 0.001/0.999

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

backend/src/polypharmacy_env/env_core.py CHANGED
@@ -25,6 +25,15 @@ from .models import (
25
  PolypharmacyState,
26
  )
27
  from .rewards import compute_regimen_risk, compute_shaped_reward
 
 
 
 
 
 
 
 
 
28
  from .tasks import get_task_config, sample_episode
29
 
30
 
@@ -123,10 +132,10 @@ class PolypharmacyEnv(
123
  # Validate basic action structure
124
  valid, err = self._validate_action(action)
125
  if not valid:
126
- reward = compute_shaped_reward(
127
  self._current_risk, self._current_risk,
128
  action.action_type, is_invalid=True,
129
- )
130
  info["error"] = err
131
  self._step_count += 1
132
  return self._check_timeout_and_build_obs(reward, info)
@@ -140,7 +149,7 @@ class PolypharmacyEnv(
140
  elif action.action_type == "finish_review":
141
  self._done = True
142
  score = self._run_grader()
143
- reward = score # terminal bonus
144
  info["grader_score"] = score
145
 
146
  self._step_count += 1
@@ -187,10 +196,10 @@ class PolypharmacyEnv(
187
  assert action.drug_id_1 and action.drug_id_2
188
 
189
  if self._remaining_query_budget <= 0:
190
- reward = compute_shaped_reward(
191
  self._current_risk, self._current_risk,
192
  "query_ddi", is_invalid=True,
193
- )
194
  info["error"] = "Query budget exhausted"
195
  return reward, info
196
 
@@ -210,12 +219,12 @@ class PolypharmacyEnv(
210
  if discovered_severe:
211
  self._severe_moderate_discovered += 1
212
 
213
- reward = compute_shaped_reward(
214
  self._current_risk, self._current_risk,
215
  "query_ddi",
216
  discovered_severe=(result.severity == "severe"),
217
  discovered_moderate=(result.severity == "moderate"),
218
- )
219
  info["ddi_result"] = {
220
  "severity": result.severity,
221
  "recommendation": result.recommendation,
@@ -229,10 +238,10 @@ class PolypharmacyEnv(
229
  assert action.intervention_type and action.intervention_type != "none"
230
 
231
  if self._remaining_intervention_budget <= 0:
232
- reward = compute_shaped_reward(
233
  self._current_risk, self._current_risk,
234
  "propose_intervention", is_invalid=True,
235
- )
236
  info["error"] = "Intervention budget exhausted"
237
  return reward, info
238
 
@@ -244,10 +253,10 @@ class PolypharmacyEnv(
244
  break
245
 
246
  if target_idx is None:
247
- reward = compute_shaped_reward(
248
  self._current_risk, self._current_risk,
249
  "propose_intervention", is_invalid=True,
250
- )
251
  info["error"] = f"Drug {action.target_drug_id} not in current medications"
252
  return reward, info
253
 
@@ -321,7 +330,7 @@ class PolypharmacyEnv(
321
  step_index=self._step_count,
322
  ))
323
 
324
- reward = compute_shaped_reward(previous_risk, self._current_risk, "propose_intervention")
325
  info["risk_delta"] = risk_delta
326
  return reward, info
327
 
@@ -354,7 +363,7 @@ class PolypharmacyEnv(
354
  self._total_drug_changes,
355
  self._critical_stopped_without_sub,
356
  )
357
- return 0.001 # strict (0, 1) range required
358
 
359
  def _get_severe_pairs(self) -> List[Tuple[str, str]]:
360
  """Return all severe DDI pairs present in the *initial* medication list."""
@@ -377,8 +386,8 @@ class PolypharmacyEnv(
377
  if not self._done and self._step_count >= self._task_cfg.max_steps:
378
  self._done = True
379
  score = self._run_grader()
380
- # Terminal reward = grader score only (strictly in (0, 1))
381
- reward = score
382
  info["timeout"] = True
383
  info["grader_score"] = score
384
 
 
25
  PolypharmacyState,
26
  )
27
  from .rewards import compute_regimen_risk, compute_shaped_reward
28
+
29
+ # ── Reward clamping ──────────────────────────────────────────────────────────
30
+ _REWARD_MIN = 0.001
31
+ _REWARD_MAX = 0.999
32
+
33
+
34
+ def _clamp_reward(v: float) -> float:
35
+ """Clamp any reward to strict (0.001, 0.999) bounds."""
36
+ return max(_REWARD_MIN, min(_REWARD_MAX, v))
37
  from .tasks import get_task_config, sample_episode
38
 
39
 
 
132
  # Validate basic action structure
133
  valid, err = self._validate_action(action)
134
  if not valid:
135
+ reward = _clamp_reward(compute_shaped_reward(
136
  self._current_risk, self._current_risk,
137
  action.action_type, is_invalid=True,
138
+ ))
139
  info["error"] = err
140
  self._step_count += 1
141
  return self._check_timeout_and_build_obs(reward, info)
 
149
  elif action.action_type == "finish_review":
150
  self._done = True
151
  score = self._run_grader()
152
+ reward = _clamp_reward(score) # terminal bonus
153
  info["grader_score"] = score
154
 
155
  self._step_count += 1
 
196
  assert action.drug_id_1 and action.drug_id_2
197
 
198
  if self._remaining_query_budget <= 0:
199
+ reward = _clamp_reward(compute_shaped_reward(
200
  self._current_risk, self._current_risk,
201
  "query_ddi", is_invalid=True,
202
+ ))
203
  info["error"] = "Query budget exhausted"
204
  return reward, info
205
 
 
219
  if discovered_severe:
220
  self._severe_moderate_discovered += 1
221
 
222
+ reward = _clamp_reward(compute_shaped_reward(
223
  self._current_risk, self._current_risk,
224
  "query_ddi",
225
  discovered_severe=(result.severity == "severe"),
226
  discovered_moderate=(result.severity == "moderate"),
227
+ ))
228
  info["ddi_result"] = {
229
  "severity": result.severity,
230
  "recommendation": result.recommendation,
 
238
  assert action.intervention_type and action.intervention_type != "none"
239
 
240
  if self._remaining_intervention_budget <= 0:
241
+ reward = _clamp_reward(compute_shaped_reward(
242
  self._current_risk, self._current_risk,
243
  "propose_intervention", is_invalid=True,
244
+ ))
245
  info["error"] = "Intervention budget exhausted"
246
  return reward, info
247
 
 
253
  break
254
 
255
  if target_idx is None:
256
+ reward = _clamp_reward(compute_shaped_reward(
257
  self._current_risk, self._current_risk,
258
  "propose_intervention", is_invalid=True,
259
+ ))
260
  info["error"] = f"Drug {action.target_drug_id} not in current medications"
261
  return reward, info
262
 
 
330
  step_index=self._step_count,
331
  ))
332
 
333
+ reward = _clamp_reward(compute_shaped_reward(previous_risk, self._current_risk, "propose_intervention"))
334
  info["risk_delta"] = risk_delta
335
  return reward, info
336
 
 
363
  self._total_drug_changes,
364
  self._critical_stopped_without_sub,
365
  )
366
+ return 0.001 # strict (0.001, 0.999) range required
367
 
368
  def _get_severe_pairs(self) -> List[Tuple[str, str]]:
369
  """Return all severe DDI pairs present in the *initial* medication list."""
 
386
  if not self._done and self._step_count >= self._task_cfg.max_steps:
387
  self._done = True
388
  score = self._run_grader()
389
+ # Terminal reward = grader score only (clamped to [0.001, 0.999])
390
+ reward = _clamp_reward(score)
391
  info["timeout"] = True
392
  info["grader_score"] = score
393
 
backend/src/polypharmacy_env/graders.py CHANGED
@@ -12,7 +12,7 @@ from .models import InterventionRecord
12
 
13
  _EPS = 1e-8
14
 
15
- # Scores must be strictly in (0, 1) β€” never exactly 0.0 or 1.0
16
  _SCORE_MIN = 0.001
17
  _SCORE_MAX = 0.999
18
 
 
12
 
13
  _EPS = 1e-8
14
 
15
+ # Scores must be strictly in (0.001, 0.999) β€” never outside this range
16
  _SCORE_MIN = 0.001
17
  _SCORE_MAX = 0.999
18
 
backend/src/polypharmacy_env/rewards.py CHANGED
@@ -75,22 +75,20 @@ def compute_shaped_reward(
75
  reward = 0.0
76
 
77
  if is_invalid:
78
- return -INVALID_ACTION_PENALTY
79
-
80
- if is_timeout:
81
- return -TIMEOUT_PENALTY
82
-
83
- if action_type == "query_ddi":
84
  reward -= QUERY_COST
85
  if discovered_severe:
86
  reward += SEVERE_DDI_DISCOVERY_BONUS
87
  elif discovered_moderate:
88
  reward += MODERATE_DDI_DISCOVERY_BONUS
89
-
90
  elif action_type == "propose_intervention":
91
  reward += (previous_risk - new_risk)
92
  reward -= INTERVENTION_COST
93
 
94
  # finish_review terminal bonus is added by the caller after grading
95
 
96
- return max(0.001, reward)
 
 
75
  reward = 0.0
76
 
77
  if is_invalid:
78
+ reward = -INVALID_ACTION_PENALTY
79
+ elif is_timeout:
80
+ reward = -TIMEOUT_PENALTY
81
+ elif action_type == "query_ddi":
 
 
82
  reward -= QUERY_COST
83
  if discovered_severe:
84
  reward += SEVERE_DDI_DISCOVERY_BONUS
85
  elif discovered_moderate:
86
  reward += MODERATE_DDI_DISCOVERY_BONUS
 
87
  elif action_type == "propose_intervention":
88
  reward += (previous_risk - new_risk)
89
  reward -= INTERVENTION_COST
90
 
91
  # finish_review terminal bonus is added by the caller after grading
92
 
93
+ # Clamp all rewards to strict (0.001, 0.999) range
94
+ return max(0.001, min(0.999, reward))
inference.py CHANGED
@@ -68,7 +68,7 @@ def _fmt_reward(v: float) -> str:
68
 
69
 
70
  def _clamp01(v: float) -> float:
71
- """Clamp score to strict (0, 1) β€” never exactly 0.0 or 1.0."""
72
  return max(0.001, min(0.999, float(v)))
73
 
74
 
@@ -179,8 +179,8 @@ def _reset(task_id: str) -> Dict[str, Any]:
179
  def _step(action: Dict[str, Any]) -> Dict[str, Any]:
180
  r = requests.post(f"{ENV_URL}/step", json={"action": action}, timeout=45)
181
  if r.status_code == 422:
182
- # Invalid action β€” return a penalty and let the agent continue
183
- return {"observation": {}, "reward": -0.1, "done": False, "info": {"error": r.text[:200]}}
184
  r.raise_for_status()
185
  return r.json()
186
 
@@ -189,7 +189,7 @@ def run_task(client: OpenAI, task_id: str) -> None:
189
  rewards: List[float] = []
190
  steps = 0
191
  success = False
192
- score = 0.001 # strict (0, 1) β€” never exactly 0.0
193
  log_start(task_id)
194
  try:
195
  reset_payload = _reset(task_id)
@@ -203,7 +203,7 @@ def run_task(client: OpenAI, task_id: str) -> None:
203
  action_str = json.dumps(action, separators=(",", ":"))
204
  step_payload = _step(action)
205
  obs = step_payload.get("observation", {})
206
- reward = float(step_payload.get("reward") or 0.0)
207
  done = bool(step_payload.get("done", False))
208
  metadata = (obs or {}).get("metadata", {}) or {}
209
  last_error = metadata.get("error")
@@ -217,7 +217,7 @@ def run_task(client: OpenAI, task_id: str) -> None:
217
  score = _clamp01(float(raw_score))
218
  else:
219
  score = _clamp01(sum(max(0.0, r) for r in rewards) / max(len(rewards), 1))
220
- success = score > 0.0
221
  break
222
  except Exception:
223
  success = False
 
68
 
69
 
70
  def _clamp01(v: float) -> float:
71
+ """Clamp score to strict (0.001, 0.999) β€” never outside this range."""
72
  return max(0.001, min(0.999, float(v)))
73
 
74
 
 
179
  def _step(action: Dict[str, Any]) -> Dict[str, Any]:
180
  r = requests.post(f"{ENV_URL}/step", json={"action": action}, timeout=45)
181
  if r.status_code == 422:
182
+ # Invalid action β€” return a clamped penalty and let the agent continue
183
+ return {"observation": {}, "reward": 0.01, "done": False, "info": {"error": r.text[:200]}}
184
  r.raise_for_status()
185
  return r.json()
186
 
 
189
  rewards: List[float] = []
190
  steps = 0
191
  success = False
192
+ score = 0.001 # strict (0.001, 0.999) β€” never outside this range
193
  log_start(task_id)
194
  try:
195
  reset_payload = _reset(task_id)
 
203
  action_str = json.dumps(action, separators=(",", ":"))
204
  step_payload = _step(action)
205
  obs = step_payload.get("observation", {})
206
+ reward = _clamp01(float(step_payload.get("reward") or 0.0))
207
  done = bool(step_payload.get("done", False))
208
  metadata = (obs or {}).get("metadata", {}) or {}
209
  last_error = metadata.get("error")
 
217
  score = _clamp01(float(raw_score))
218
  else:
219
  score = _clamp01(sum(max(0.0, r) for r in rewards) / max(len(rewards), 1))
220
+ success = score > 0.001
221
  break
222
  except Exception:
223
  success = False