modelbuilderhq committed on
Commit 9ab33d8 · verified · 1 Parent(s): 60c0453

Upload folder using huggingface_hub

Files changed (7):
  1. README.md +4 -2
  2. env.py +4 -4
  3. inference.py +29 -23
  4. models.py +3 -3
  5. openenv.yaml +6 -4
  6. tasks.py +3 -3
  7. tests/test_env.py +48 -5
README.md CHANGED
@@ -35,7 +35,8 @@ Pharmacovigilance teams are responsible for detecting harmful safety patterns af
 | Episode length | 2-step triage and review workflow |
 | Task count | 3 |
 | Difficulties | Easy, Medium, Hard |
-| Reward range | `0.0` to `1.0` |
+| Step reward range | `-0.25` to `1.0` |
+| Final grader range | strict `(0, 1)` |
 | API | `reset()`, `step()`, `state()` |
 | Server | FastAPI |
 
@@ -114,7 +115,8 @@ triage story.
 | Hard-task reasoning bonus if explanation mentions `drug interaction`, `tacrolimus`, `voriconazole`, `azole`, `calcineurin`, or `level monitoring` | `+0.05` |
 
 Notes:
-- Final reward is clamped to `[0.0, 1.0]`.
+- Step-level rewards may be slightly negative for clearly unsafe or suboptimal actions.
+- Final grader outputs remain deterministic and strictly bounded inside `(0, 1)` for evaluation safety.
 - `suspect_drug` matching is forgiving for the hard task and allows substring matches.
 - The environment is deterministic and reproducible because all tasks and grading logic are hardcoded.
 - Confidence is optional, but calibrated confidence can improve reward while reckless overconfidence is penalized.
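The two ranges in the updated table are enforced in different places: step rewards are clamped in `env.py` with a `-0.25` floor, while final grader scores are pinned strictly inside `(0, 1)`. A minimal sketch of the two contracts, with helper names that are illustrative rather than part of this repo:

```python
# Illustrative helpers only; the real logic lives in env.py and the graders.

def clamp_step_reward(total: float) -> float:
    # Step contract: slightly negative rewards allowed, floored at -0.25.
    return max(-0.25, min(1.0, round(total, 4)))

def pin_final_score(score: float) -> float:
    # Final-grader contract: deterministic and strictly inside (0, 1).
    return min(max(round(score, 4), 0.01), 0.99)

assert clamp_step_reward(-0.4) == -0.25
assert pin_final_score(0.0) == 0.01   # never exactly 0
assert pin_final_score(1.5) == 0.99   # never exactly 1
```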
env.py CHANGED
@@ -36,9 +36,9 @@ class Action(BaseModel):
     confidence: Optional[int] = Field(default=None, ge=0, le=100)
 
 
 class Reward(BaseModel):
-    total: float = Field(..., ge=0.0, le=1.0)
+    total: float = Field(..., ge=-1.0, le=1.0)
     breakdown: dict
 
 
 class PharmaVigilanceEnv:
@@ -71,7 +71,7 @@ class PharmaVigilanceEnv:
 
     @staticmethod
     def _clamp_reward(total: float, breakdown: dict) -> Reward:
-        return Reward(total=max(0.0, min(1.0, round(total, 4))), breakdown=breakdown)
+        return Reward(total=max(-0.25, min(1.0, round(total, 4))), breakdown=breakdown)
 
     def _initial_triage_reward(self, action: Action) -> Reward:
         truth = self.current_task.ground_truth
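Note that the widened `Field` bound (`ge=-1.0`) is looser than the runtime floor applied by `_clamp_reward`. A quick sketch, assuming the pydantic declarations above (the breakdown key is hypothetical), showing that the clamp keeps every constructed `Reward` inside the tighter `[-0.25, 1.0]` range:

```python
from pydantic import BaseModel, Field

class Reward(BaseModel):
    total: float = Field(..., ge=-1.0, le=1.0)  # validation bound (loose)
    breakdown: dict

def _clamp_reward(total: float, breakdown: dict) -> Reward:
    # Runtime contract is tighter than the field bound: [-0.25, 1.0].
    return Reward(total=max(-0.25, min(1.0, round(total, 4))), breakdown=breakdown)

r = _clamp_reward(-0.9, {"unsafe_dismissal": -0.9})  # hypothetical key
assert r.total == -0.25  # raw totals below the floor are clipped, not rejected
```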
inference.py CHANGED
@@ -6,19 +6,20 @@ to the environment server, and prints the exact machine-readable lines expected
 by the evaluator.
 """
 
 import argparse
 import json
 import os
-from typing import Iterable, List
+from typing import Any, Iterable, List
 
 import requests
-from openai import OpenAI
 from pydantic import ValidationError
 
 try:
+    from .graders import TASK_TO_GRADER
     from .models import PharmaAction
 except ImportError:
+    from graders import TASK_TO_GRADER
     from models import PharmaAction
 
 
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
@@ -85,10 +86,12 @@ def choose_tasks(selection: str) -> Iterable[str]:
     return TASK_SETS[selection]
 
 
-def client() -> OpenAI:
+def client() -> Any:
     if not HF_TOKEN:
         raise EnvironmentError("HF_TOKEN or API_KEY must be set before running inference.py")
-    return OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
+    from openai import OpenAI
+
+    return OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
 
 
 def fetch_reset(task_name: str) -> dict:
@@ -121,7 +124,7 @@ def prompt_for_case(observation: dict) -> str:
     )
 
 
-def ask_model(llm: OpenAI, observation: dict) -> PharmaAction:
+def ask_model(llm: Any, observation: dict) -> PharmaAction:
     completion = llm.chat.completions.create(
         model=MODEL_NAME,
         messages=[
@@ -144,12 +147,15 @@ def compact_action(action: PharmaAction) -> str:
     return label
 
 
-def final_score(rewards: List[float]) -> float:
-    score = sum(rewards) / len(rewards) if rewards else 0.0
-    return min(max(round(score, 4), 0.01), 0.99)
+def final_score(task_name: str, rewards: List[float]) -> float:
+    grader = TASK_TO_GRADER.get(task_name)
+    if grader is None:
+        score = sum(rewards) / len(rewards) if rewards else 0.0
+        return min(max(round(score, 4), 0.01), 0.99)
+    return float(grader({"rewards": rewards}))
 
 
-def run_one_task(llm: OpenAI, task_name: str) -> None:
+def run_one_task(llm: Any, task_name: str) -> None:
     rewards: List[float] = []
     steps_taken = 0
     score = 0.0
@@ -179,7 +185,7 @@ def run_one_task(llm: OpenAI, task_name: str) -> None:
         steps_taken += 1
         emit_step(steps_taken, action_text, reward, done, None)
 
-        score = final_score(rewards)
+        score = final_score(task_name, rewards)
         success = score >= 0.60
 
     except json.JSONDecodeError:
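The reworked `final_score` only falls back to the legacy mean-and-pin path when no grader is registered for the task. A minimal sketch of the dispatch contract, assuming `TASK_TO_GRADER` maps task names to callables taking a `{"rewards": [...]}` payload; its real definition lives in `graders.py`, which this commit does not show:

```python
from typing import Callable, Dict, List

# Assumed shape of graders.TASK_TO_GRADER (not shown in this commit).
TASK_TO_GRADER: Dict[str, Callable[[dict], float]] = {}

def final_score(task_name: str, rewards: List[float]) -> float:
    grader = TASK_TO_GRADER.get(task_name)
    if grader is None:
        # Legacy fallback: mean of step rewards, pinned strictly inside (0, 1).
        score = sum(rewards) / len(rewards) if rewards else 0.0
        return min(max(round(score, 4), 0.01), 0.99)
    return float(grader({"rewards": rewards}))

assert final_score("unregistered_task", [0.4, 1.0]) == 0.7  # fallback path
```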
models.py CHANGED
@@ -53,6 +53,6 @@ class PharmaAction(Action):
     )
 
 
 class PharmaReward(BaseModel):
-    total: float = Field(..., description="Total reward in the 0.0-1.0 range")
+    total: float = Field(..., description="Step reward total, which may be slightly negative for penalties")
     breakdown: dict = Field(default_factory=dict, description="Per-component reward breakdown")
openenv.yaml CHANGED
@@ -69,9 +69,9 @@ observation_space:
     required: false
     description: "Human-readable feedback from the previous action"
 
 reward:
-  min: 0.0
+  min: -0.25
   max: 1.0
   description: >
     Reward is computed over a staged pharmacovigilance decision pipeline:
     classification, causal suspect selection, severity assessment, and
@@ -83,7 +83,9 @@
     penalty of -0.20 applies when the agent dismisses a true new signal. The
     hard task can earn an additional +0.05 reasoning bonus when the
     explanation explicitly references the interaction mechanism or therapeutic
-    drug monitoring clues.
+    drug monitoring clues. Step-level rewards may dip slightly below zero for
+    clearly unsafe or suboptimal behavior, while final grader scores remain
+    deterministic and normalized for evaluation.
 
 difficulties:
   - easy
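Worked arithmetic for the declared bounds; the stage weights below are hypothetical, while the `-0.20` dismissal penalty and `+0.05` reasoning bonus come from the description above:

```python
# Hypothetical stage weights that sum past 1.0 with the hard-task bonus:
perfect_hard = 0.30 + 0.30 + 0.20 + 0.20 + 0.05
assert min(1.0, max(-0.25, perfect_hard)) == 1.0   # capped at max: 1.0

# Dismissing a true signal plus another small penalty dips below the floor:
unsafe_step = -0.20 - 0.10
assert min(1.0, max(-0.25, unsafe_step)) == -0.25  # floored at min: -0.25
```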
tasks.py CHANGED
@@ -104,9 +104,9 @@ def _reward_from_breakdown(breakdown: dict):
         from .env import Reward
     except ImportError:
         from env import Reward
 
     total = round(sum(breakdown.values()), 4)
-    return Reward(total=max(0.0, min(1.0, total)), breakdown=breakdown)
+    return Reward(total=max(-0.25, min(1.0, total)), breakdown=breakdown)
 
 
 def known_signal_easy_action_grader(action: Any):
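A hedged usage sketch of `_reward_from_breakdown` under the new floor; the breakdown keys and the `-0.10` value are illustrative, though the `-0.20` dismissal penalty and the overconfidence penalty are named elsewhere in this commit:

```python
# Assuming _reward_from_breakdown as shown above, imported from tasks.py.
reward = _reward_from_breakdown({"classification": 0.30, "dismissal_penalty": -0.20})
assert reward.total == 0.1    # ordinary sums pass through after rounding

reward = _reward_from_breakdown({"dismissal_penalty": -0.20, "overconfidence_penalty": -0.10})
assert reward.total == -0.25  # the -0.30 raw sum is clipped to the -0.25 floor
```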
tests/test_env.py CHANGED
@@ -6,15 +6,15 @@ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
 from env import Action, PharmaVigilanceEnv
 from tasks import (
-    cluster_signal_medium_action_grader,
-    cluster_signal_medium_grader,
+    cluster_signal_medium_action_grader,
+    cluster_signal_medium_grader,
     confounded_hard_action_grader,
     confounded_hard_grader,
     get_task,
     get_tasks,
-    known_signal_easy_action_grader,
-    known_signal_easy_grader,
-)
+    known_signal_easy_action_grader,
+    known_signal_easy_grader,
+)
 
 
 def test_reset_loads_easy_task():
@@ -179,6 +179,39 @@ def test_final_step_applies_stubborn_penalty_for_repeating_weak_answer():
     assert reward.breakdown["stubborn_penalty"] == -0.05
 
 
+def test_initial_step_can_return_negative_reward_for_unsafe_triage():
+    env = PharmaVigilanceEnv()
+    env.reset("cluster_signal_medium")
+
+    _, reward, done, info = env.step(
+        Action(
+            classification="noise",
+            suspect_drug="Unknown",
+            severity_assessment="mild",
+            recommended_action="dismiss",
+            reasoning="No obvious concern.",
+            confidence=95,
+        )
+    )
+    assert done is False
+    assert info["phase"] == "initial_triage"
+    assert reward.total < 0.0
+
+
+def test_single_step_action_grader_can_return_negative_total():
+    reward = cluster_signal_medium_action_grader(
+        Action(
+            classification="noise",
+            suspect_drug="Unknown",
+            severity_assessment="mild",
+            recommended_action="dismiss",
+            reasoning="Probably unrelated.",
+            confidence=95,
+        )
+    )
+    assert reward.total < 0.0
+
+
 def test_overconfidence_penalty_applies_on_weak_single_step_grading():
     reward = cluster_signal_medium_action_grader(
         Action(
@@ -280,6 +313,16 @@ def test_public_graders_are_strictly_bounded():
     assert confounded_hard_grader({"score": 1.5}) == 0.99
 
 
+def test_inference_final_score_uses_public_task_grader():
+    pytest.importorskip("openenv")
+    from inference import final_score
+
+    rewards = [0.4, 1.0]
+    assert final_score("known_signal_easy", rewards) == known_signal_easy_grader({"rewards": rewards})
+    assert final_score("cluster_signal_medium", rewards) == cluster_signal_medium_grader({"rewards": rewards})
+    assert final_score("confounded_hard", rewards) == confounded_hard_grader({"rewards": rewards})
+
+
 def test_http_reset_then_step_roundtrip():
     pytest.importorskip("openenv")
     from fastapi.testclient import TestClient