Draken1606 committed
Commit 6218d9a · 1 Parent(s): d76d092

Fix 5 audit gaps: conditional bail, action history, efficiency reward, train/val split, env API routing
models.py CHANGED
@@ -121,8 +121,8 @@ class SubmitMemoAction(Action):
     )
 
     # Recommendation
-    recommended_outcome: Literal["Bail Granted", "Bail Denied"] = Field(
-        ..., description="Final recommendation"
+    recommended_outcome: Literal["Bail Granted", "Bail Denied", "Bail Conditional"] = Field(
+        ..., description="Final recommendation: Bail Granted | Bail Denied | Bail Conditional (strict conditions)"
     )
     recommended_conditions: Optional[List[str]] = Field(
         None,
@@ -186,6 +186,10 @@ class CaseObservation(Observation):
 
     # Episode state
     action_result: Optional[str] = None
+    action_history: List[str] = Field(
+        default_factory=list,
+        description="Ordered log of all tool results seen so far this episode",
+    )
     flags_raised: List[str] = Field(default_factory=list)
     precedents_retrieved: List[str] = Field(default_factory=list)
     memo_submitted: bool = False
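A minimal sketch of how the widened outcome Literal and the new action_history field are meant to be used. The import path and the full set of SubmitMemoAction fields are assumptions; only recommended_outcome and recommended_conditions are visible in this diff, and the rest mirror the submit_memo payload built later in training/train_grpo.py.

# Hedged sketch: field set assumed from the submit_memo payload in this commit.
from models import SubmitMemoAction

memo = SubmitMemoAction(
    flight_risk="Low",
    flight_risk_justification="Fixed address, steady employment, no prior absconding.",
    statutory_eligible=True,
    statutory_computation="Detention period already exceeds the statutory threshold.",
    grounds_for_bail=["Prolonged undertrial detention"],
    grounds_against_bail=["Pending forensic report"],
    recommended_outcome="Bail Conditional",  # now a valid value under the widened Literal
    recommended_conditions=["Surrender passport", "Report weekly to the local police station"],
    confidence="Medium",
)
print(memo.recommended_outcome)  # "Bail Conditional"

CaseObservation instances returned by the environment now also expose action_history, an ordered list of one summary string per prior step.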
server/reward.py CHANGED
@@ -20,17 +20,26 @@ def compute_outcome_match(agent_outcome: str, ground_truth: Dict[str, Any]) -> float:
     Checks if the agent's final recommendation matches the High Court decision.
 
     Scoring:
-        1.0 — Exact string match (e.g. "Bail Granted" == "Bail Granted")
-        0.8 — Directionally correct (agent says "granted", GT says "Bail Granted")
+        1.0 — Exact string match
+        0.9 — "Bail Conditional" vs "Bail Granted" (conditional IS bail)
+        0.8 — Directionally correct but loose string
         0.0 — Wrong direction (granted vs. denied, or vice versa)
     """
     gt = ground_truth["outcome"]
+    agent_norm = agent_outcome.strip().lower()
+    gt_norm = gt.strip().lower()
 
-    if agent_outcome.strip().lower() == gt.strip().lower():
+    if agent_norm == gt_norm:
         return 1.0
 
-    agent_granted = "grant" in agent_outcome.lower()
-    gt_granted = "grant" in gt.lower()
+    # Conditional bail counts almost as well as full bail
+    if "conditional" in agent_norm and "grant" in gt_norm:
+        return 0.9
+    if "grant" in agent_norm and "conditional" in gt_norm:
+        return 0.9
+
+    agent_granted = "grant" in agent_norm or "conditional" in agent_norm
+    gt_granted = "grant" in gt_norm or "conditional" in gt_norm
 
     return 0.8 if (agent_granted == gt_granted) else 0.0
 
@@ -334,6 +343,8 @@ def compute_reward(
     agent_computation: str,
     agent_conditions: List[str],
     episode: Dict[str, Any],
+    step_count: int = 0,
+    max_steps: int = 10,
 ) -> Dict[str, float]:
     """
     Computes the full reward for a submitted bail assessment memo.
@@ -343,10 +354,11 @@
         + 0.2*flight_risk_accuracy
         + 0.2*statutory_accuracy
         + 0.2*condition_appropriateness
+        + 0.1*efficiency_bonus (only when outcome is correct)
         - 0.3*bias_penalty
 
     Returns a dict with all component scores + total_reward.
-    Range: [-0.3, 1.0] (bias penalty can produce negative totals — this is intentional).
+    Range: [-0.3, 1.1] (efficiency can push above 1.0 slightly on perfect runs).
     """
     gt = episode["ground_truth"]
 
@@ -356,16 +368,26 @@
     ca = compute_condition_score(agent_outcome, agent_conditions, gt)
     bias = compute_bias_penalty(agent_outcome, episode)
 
+    # R4 — Efficiency bonus: reward finishing faster when the answer is correct.
+    # Only fires on directionally-correct outcomes (om >= 0.8) to prevent
+    # rewarding efficient-but-wrong agents.
+    efficiency = 0.0
+    if om >= 0.8 and max_steps > 1:
+        efficiency = round((1.0 - (step_count - 1) / (max_steps - 1)), 4)
+        efficiency = max(0.0, min(1.0, efficiency))
+
     lam = 0.3
-    total = 0.4*om + 0.2*fr + 0.2*sa + 0.2*ca - lam*bias
+    total = 0.4*om + 0.2*fr + 0.2*sa + 0.2*ca + 0.1*efficiency - lam*bias
 
     return {
-        "outcome_match": round(om, 4),
-        "flight_risk_accuracy": round(fr, 4),
-        "statutory_accuracy": round(sa, 4),
-        "condition_appropriateness": round(ca, 4),
-        "bias_penalty": round(bias, 4),
-        "total_reward": round(total, 4),
+        "outcome_match": round(om, 4),
+        "flight_risk_accuracy": round(fr, 4),
+        "statutory_accuracy": round(sa, 4),
+        "condition_appropriateness": round(ca, 4),
+        "efficiency_bonus": round(efficiency, 4),
+        "bias_penalty": round(bias, 4),
+        "total_reward": round(total, 4),
         "ground_truth_outcome": gt["outcome"],
         "agent_outcome": agent_outcome,
+        "steps_used": step_count,
     }
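A worked example of the new outcome scoring and the efficiency term, assuming only what this diff shows: compute_outcome_match reads ground_truth["outcome"], and compute_reward now takes step_count/max_steps. The values in the comments follow directly from the code above.

from server.reward import compute_outcome_match

gt = {"outcome": "Bail Granted"}

compute_outcome_match("Bail Granted", gt)           # 1.0 (exact match after normalisation)
compute_outcome_match("Bail Conditional", gt)       # 0.9 (conditional counts almost as full bail)
compute_outcome_match("granted with sureties", gt)  # 0.8 (directionally correct, loose string)
compute_outcome_match("Bail Denied", gt)            # 0.0 (wrong direction)

# Efficiency term in compute_reward: with max_steps=10, submitting on step 3 gives
# efficiency = 1 - (3 - 1)/(10 - 1) ≈ 0.778, i.e. roughly +0.078 on the total after the
# 0.1 weight; submitting on step 10 gives 0.0, and a wrong outcome (om < 0.8) gives 0.0.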
server/undertrial_environment.py CHANGED
@@ -86,6 +86,7 @@ class UndertriAIEnvironment(Environment):
         self._step_count = 0
         self._flags = []
         self._retrieved_precedents = []
+        self._action_history: List[str] = []  # accumulated tool results (Gap 4)
         return self._make_observation(action_result=None)
 
     def step(
@@ -113,6 +114,8 @@ class UndertriAIEnvironment(Environment):
                 agent_computation = action.statutory_computation,
                 agent_conditions = action.recommended_conditions or [],
                 episode = self._episode,
+                step_count = self._step_count,  # Gap 5: efficiency reward
+                max_steps = self.MAX_STEPS,
             )
             # Apply skip penalty (can push total legitimately negative)
             reward_dict["total_reward"] = round(reward_dict["total_reward"] - no_tool_penalty, 4)
@@ -149,6 +152,10 @@ class UndertriAIEnvironment(Environment):
         else:
            result = self._dispatch_tool(action)
 
+        # Accumulate action history (Gap 4)
+        summary = f"[Step {self._step_count}] {type(action).__name__}: {result[:120]}..."
+        self._action_history.append(summary)
+
        # Force submit if max steps reached
        done = (self._step_count >= self.MAX_STEPS)
        reward = -0.1 if done else 0.0  # Small penalty for exhausting budget
@@ -277,6 +284,7 @@ class UndertriAIEnvironment(Environment):
            cited_precedents = init_precedents + self._retrieved_precedents,
            documents_available = ep.get("documents_available", []),
            action_result = action_result,
+           action_history = list(self._action_history),  # Gap 4
            flags_raised = list(self._flags),
            precedents_retrieved = list(self._retrieved_precedents),
            memo_submitted = memo_submitted,
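For illustration only, an action_history entry produced by the summary f-string above would look roughly like this; the action class name and tool result text are invented, and real Action subclass names in models.py may differ.

# Hypothetical step: values below are made up to show the summary format.
step_count = 2
action_name = "RetrievePrecedentsAction"  # assumption: real class names may differ
result = "Found 3 relevant precedents on prolonged undertrial detention"
summary = f"[Step {step_count}] {action_name}: {result[:120]}..."
# -> "[Step 2] RetrievePrecedentsAction: Found 3 relevant precedents on prolonged undertrial detention..."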
training/train_grpo.py CHANGED
@@ -25,12 +25,20 @@ INSTALL_COMMANDS = """
 # CELL 2 — Imports
 # ============================================================
 
-import os, sys, json, re, argparse, random
+import os, sys, json, re, argparse, random, time
 from pathlib import Path
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Tuple
+import urllib.request
+import urllib.parse
 
 import torch
 
+# ── Environment API (Gap 1) ─────────────────────────────────────────────────
+ENV_API_URL = os.environ.get(
+    "UNDERTRIAL_ENV_URL",
+    "https://draken1606-undertrial-ai.hf.space",
+)
+
 # ── Fix 1: Import authoritative reward functions from server/reward.py ──────
 # This ensures training optimises the SAME signal the deployed demo evaluates.
 try:
@@ -334,7 +342,15 @@ def combined_reward(
         ca = reward_conditions([comp], [ep])[0]  # condition score, not format
         b = reward_no_bias([comp], [ep])[0]
 
-        total = 0.4*o + 0.2*fr + 0.2*s + 0.2*ca - 0.3*b
+        # R4 efficiency bonus: reward fewer steps when outcome is correct
+        eff = 0.0
+        if o >= 0.8:
+            steps_taken = kwargs.get("step_counts", [None] * len(completions))
+            sc = steps_taken[completions.index(comp)] if comp in completions else None
+            if sc is not None:
+                eff = max(0.0, 1.0 - (sc - 1) / 9)
+
+        total = 0.4*o + 0.2*fr + 0.2*s + 0.2*ca + 0.1*eff - 0.3*b
         rewards.append(round(total, 4))  # No max(0.0) clamp — bias can go negative
     return rewards
 
@@ -343,15 +359,117 @@
 # CELL 5 — Dataset builder
 # ============================================================
 
-def load_episodes(episodes_dir: str, stage: int = 1) -> List[Dict]:
+def load_episodes(
+    episodes_dir: str,
+    stage: int = 1,
+    split: str = "train",
+    val_fraction: float = 0.15,
+    test_fraction: float = 0.10,
+) -> List[Dict]:
+    """
+    Load episodes for a given split (Gap 2: train/val/test split).
+
+    Split fractions (applied deterministically by index, no shuffle):
+        train = first (1 - val - test) fraction
+        val = next val_fraction
+        test = last test_fraction
+    """
     path = Path(episodes_dir) / f"episodes_stage_{stage}.jsonl"
     if not path.exists():
-        # Try the combined file
         path = Path(episodes_dir) / "episodes_all.jsonl"
         if not path.exists():
-            raise FileNotFoundError(f"No episodes found in {episodes_dir}. Run data/prepare_dataset.py first.")
+            raise FileNotFoundError(f"No episodes found in {episodes_dir}.")
     with open(path, encoding="utf-8") as f:
-        return [json.loads(l) for l in f if l.strip()]
+        all_eps = [json.loads(l) for l in f if l.strip()]
+
+    n = len(all_eps)
+    n_test = max(1, int(n * test_fraction))
+    n_val = max(1, int(n * val_fraction))
+    n_train = n - n_val - n_test
+
+    if split == "train":
+        return all_eps[:n_train]
+    elif split == "val":
+        return all_eps[n_train:n_train + n_val]
+    elif split == "test":
+        return all_eps[n_train + n_val:]
+    else:
+        return all_eps  # all: for backward compat
+
+
+def rollout_via_env_api(
+    completion: str,
+    episode: Dict,
+    env_url: str = ENV_API_URL,
+    session_id: Optional[str] = None,
+    timeout: float = 10.0,
+) -> float:
+    """
+    Gap 1: Route reward through the live deployed environment API.
+
+    Sends the model's completion to the environment server via HTTP,
+    replaying the parsed submit_memo action, and returns the official reward.
+    Falls back to local reward on any network error.
+    """
+    import urllib.error
+    try:
+        from server.reward import compute_reward as _local_reward
+    except ImportError:
+        _local_reward = None
+
+    parsed = parse_model_output(completion)
+    if not parsed["recommended_outcome"]:
+        return 0.0  # Malformed output
+
+    try:
+        # Step 1: Reset the environment with the correct episode
+        episode_stage = episode.get("curriculum_stage", 1)
+        reset_url = f"{env_url}/reset?stage={episode_stage}"
+        req = urllib.request.Request(reset_url, method="POST")
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            reset_data = json.loads(resp.read())
+        sid = session_id or reset_data.get("session_id", "")
+
+        # Step 2: Submit the parsed memo
+        memo_payload = json.dumps({
+            "session_id": sid,
+            "action": {
+                "tool_name": "submit_memo",
+                "flight_risk": parsed["flight_risk"] or "Medium",
+                "flight_risk_justification": parsed["flight_risk_just"] or "Not specified",
+                "statutory_eligible": parsed["statutory_eligible"],
+                "statutory_computation": parsed["statutory_computation"] or "Not computed",
+                "grounds_for_bail": parsed["grounds_for"] or [],
+                "grounds_against_bail": parsed["grounds_against"] or [],
+                "recommended_outcome": parsed["recommended_outcome"],
+                "recommended_conditions": parsed["conditions"] or [],
+                "confidence": "Medium",
+            }
+        }).encode()
+        step_req = urllib.request.Request(
+            f"{env_url}/step",
+            data=memo_payload,
+            headers={"Content-Type": "application/json"},
+            method="POST",
+        )
+        with urllib.request.urlopen(step_req, timeout=timeout) as resp:
+            step_data = json.loads(resp.read())
+        return float(step_data.get("reward", 0.0))
+
+    except Exception as e:
+        # Network / parse error: fall back to local reward
+        print(f"[env_api] Falling back to local reward: {e}")
+        if _local_reward and episode:
+            rd = _local_reward(
+                agent_outcome=parsed["recommended_outcome"],
+                agent_flight_risk=parsed["flight_risk"] or "Medium",
+                agent_eligible=parsed["statutory_eligible"],
+                agent_computation=parsed["statutory_computation"] or "",
+                agent_conditions=parsed["conditions"] or [],
+                episode=episode,
+            )
+            return rd["total_reward"]
+        return 0.0
 
 
 def build_hf_dataset(episodes: List[Dict], tokenizer) -> Dataset:
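A quick usage sketch of the two new helpers. The episodes directory path, the episode count, and the placeholder completion are illustrative only; the split arithmetic follows the defaults (val_fraction=0.15, test_fraction=0.10).

# Illustrative usage; "data/episodes" is an example path, not a repo guarantee.
train_eps = load_episodes("data/episodes", stage=1, split="train")  # first ~75% by index
val_eps = load_episodes("data/episodes", stage=1, split="val")      # next ~15%
test_eps = load_episodes("data/episodes", stage=1, split="test")    # last ~10%
# e.g. with 100 episodes: n_test = 10, n_val = 15, n_train = 75

# Scoring a sampled completion against the live environment; any network or parse
# failure inside rollout_via_env_api falls back to the local server/reward.py signal.
completion_text = "..."  # a raw model completion produced during GRPO sampling
reward = rollout_via_env_api(completion_text, train_eps[0])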