Aman Khare committed on
Commit
7655d3c
·
1 Parent(s): 3856d60

final changes

Browse files
.gitignore ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Python ---
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ *.egg
10
+
11
+ # --- Virtual environments ---
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # --- IDE ---
17
+ .vscode/
18
+ .idea/
19
+ *.swp
20
+ *.swo
21
+ *~
22
+
23
+ # --- OS ---
24
+ .DS_Store
25
+ Thumbs.db
26
+ desktop.ini
27
+
28
+ # --- Test artifacts ---
29
+ out.txt
30
+ err.txt
31
+ test_output.txt
32
+ test_full.py
33
+ test_all_fixes.py
34
+ test_inference.py
35
+ test_reward.py
36
+ test_presubmission.py
37
+
38
+ # --- Non-submission folders ---
39
+ next step/
40
+ play/
41
+
42
+ # --- Logs ---
43
+ *.log
44
+
45
+ # --- Secrets ---
46
+ .env
47
+ .env.*
__pycache__/inference.cpython-314.pyc DELETED
Binary file (15.4 kB)
 
environment/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (443 Bytes)
 
environment/__pycache__/env.cpython-314.pyc DELETED
Binary file (15.3 kB)
 
environment/__pycache__/models.cpython-314.pyc DELETED
Binary file (5.51 kB)
 
environment/__pycache__/reward.cpython-314.pyc DELETED
Binary file (7.6 kB)
 
environment/env.py CHANGED
@@ -240,9 +240,31 @@ def state(self) -> EnvironmentState:
240
  # --------------------------------------------------------------------- #
241
 
242
  def _handle_submit(self, action: Action, info: dict) -> Reward:
243
- """Process a ``submit_note`` action."""
244
- if action.soap_note is None:
245
- error = "submit_note requires a non-null soap_note."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  self._errors_so_far.append(error)
247
  return compute_reward(
248
  action,
@@ -253,7 +275,7 @@ def _handle_submit(self, action: Action, info: dict) -> Reward:
253
  info={"error": error},
254
  )
255
 
256
- self._current_draft = _soap_to_text(action.soap_note)
257
  self._done = True
258
 
259
  # Attempt to grade via the task-specific grader
@@ -270,7 +292,7 @@ def _handle_submit(self, action: Action, info: dict) -> Reward:
270
  )
271
 
272
  try:
273
- raw_signals = grader(action.soap_note, self._task)
274
  # Grader returns a signals dict; extract a single scalar score
275
  # as the mean of its values for use as grader_score.
276
  grader_score = (
@@ -278,9 +300,9 @@ def _handle_submit(self, action: Action, info: dict) -> Reward:
278
  if raw_signals else 0.0
279
  )
280
  info["grader_signals"] = raw_signals
281
- except NotImplementedError:
282
- info["warning"] = "Grader not yet implemented; returning placeholder."
283
- grader_score = 0.5
284
 
285
  return compute_reward(
286
  action,
@@ -297,11 +319,9 @@ def _handle_clarify(self, action: Action, info: dict) -> Reward:
297
  if not question:
298
  error = "request_clarify requires a non-empty clarify_question."
299
  self._errors_so_far.append(error)
300
- return compute_reward(
301
- action,
302
- grader_score=0.0,
303
- step_count=self._step_count,
304
- errors_so_far=self._errors_so_far,
305
  done=False,
306
  info={"error": error},
307
  )
@@ -315,12 +335,10 @@ def _handle_clarify(self, action: Action, info: dict) -> Reward:
315
  "No additional information available for that question."
316
  )
317
 
318
- # Clarification steps earn no grader_score; step_penalty accrues naturally
319
- return compute_reward(
320
- action,
321
- grader_score=0.0,
322
- step_count=self._step_count,
323
- errors_so_far=self._errors_so_far,
324
  done=False,
325
  info=info,
326
  )
@@ -330,11 +348,9 @@ def _handle_revise(self, action: Action, info: dict) -> Reward:
330
  if action.section is None or action.revision_text is None:
331
  error = "revise_section requires both 'section' and 'revision_text'."
332
  self._errors_so_far.append(error)
333
- return compute_reward(
334
- action,
335
- grader_score=0.0,
336
- step_count=self._step_count,
337
- errors_so_far=self._errors_so_far,
338
  done=False,
339
  info={"error": error},
340
  )
@@ -358,12 +374,10 @@ def _handle_revise(self, action: Action, info: dict) -> Reward:
358
 
359
  info["revised_section"] = action.section
360
 
361
- # Revision steps earn no grader_score; deductions still apply
362
- return compute_reward(
363
- action,
364
- grader_score=0.0,
365
- step_count=self._step_count,
366
- errors_so_far=self._errors_so_far,
367
  done=False,
368
  info=info,
369
  )
 
240
  # --------------------------------------------------------------------- #
241
 
242
  def _handle_submit(self, action: Action, info: dict) -> Reward:
243
+ """Process a ``submit_note`` action.
244
+
245
+ If ``action.soap_note`` is provided, it is used directly.
246
+ Otherwise, if the agent has built up a draft via ``revise_section``,
247
+ the draft is parsed into a SOAPNote automatically.
248
+ """
249
+ soap = action.soap_note
250
+
251
+ # Fall back to the current draft if no explicit note is provided
252
+ if soap is None and self._current_draft:
253
+ sections: dict[str, str] = {}
254
+ for line in self._current_draft.split("\n"):
255
+ for prefix in ("S: ", "O: ", "A: ", "P: "):
256
+ if line.startswith(prefix):
257
+ sections[prefix[0]] = line[len(prefix):]
258
+ if all(k in sections for k in "SOAP"):
259
+ soap = SOAPNote(
260
+ subjective=sections["S"],
261
+ objective=sections["O"],
262
+ assessment=sections["A"],
263
+ plan=sections["P"],
264
+ )
265
+
266
+ if soap is None:
267
+ error = "submit_note requires a non-null soap_note (or a complete draft from revise_section)."
268
  self._errors_so_far.append(error)
269
  return compute_reward(
270
  action,
 
275
  info={"error": error},
276
  )
277
 
278
+ self._current_draft = _soap_to_text(soap)
279
  self._done = True
280
 
281
  # Attempt to grade via the task-specific grader
 
292
  )
293
 
294
  try:
295
+ raw_signals = grader(soap, self._task)
296
  # Grader returns a signals dict; extract a single scalar score
297
  # as the mean of its values for use as grader_score.
298
  grader_score = (
 
300
  if raw_signals else 0.0
301
  )
302
  info["grader_signals"] = raw_signals
303
+ except Exception as exc:
304
+ info["warning"] = f"Grader error: {exc}"
305
+ grader_score = 0.0
306
 
307
  return compute_reward(
308
  action,
 
319
  if not question:
320
  error = "request_clarify requires a non-empty clarify_question."
321
  self._errors_so_far.append(error)
322
+ return Reward(
323
+ value=0.0,
324
+ signals={"error": 1.0},
 
 
325
  done=False,
326
  info={"error": error},
327
  )
 
335
  "No additional information available for that question."
336
  )
337
 
338
+ # Intermediate actions get zero reward; only submit_note earns score
339
+ return Reward(
340
+ value=0.0,
341
+ signals={"intermediate_step": 1.0},
 
 
342
  done=False,
343
  info=info,
344
  )
 
348
  if action.section is None or action.revision_text is None:
349
  error = "revise_section requires both 'section' and 'revision_text'."
350
  self._errors_so_far.append(error)
351
+ return Reward(
352
+ value=0.0,
353
+ signals={"error": 1.0},
 
 
354
  done=False,
355
  info={"error": error},
356
  )
 
374
 
375
  info["revised_section"] = action.section
376
 
377
+ # Intermediate actions get zero reward; only submit_note earns score
378
+ return Reward(
379
+ value=0.0,
380
+ signals={"intermediate_step": 1.0},
 
 
381
  done=False,
382
  info=info,
383
  )
environment/tasks/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (1.15 kB)
 
environment/tasks/__pycache__/task_easy.cpython-314.pyc DELETED
Binary file (1.69 kB)
 
environment/tasks/__pycache__/task_hard.cpython-314.pyc DELETED
Binary file (2.4 kB)
 
environment/tasks/__pycache__/task_medium.cpython-314.pyc DELETED
Binary file (2.02 kB)
 
environment/tasks/task_easy.py CHANGED
@@ -1,6 +1,7 @@
1
  """Easy task — routine check-up.
2
 
3
- Grader is intentionally left unimplemented.
 
4
  """
5
 
6
  from __future__ import annotations
@@ -32,26 +33,53 @@
32
 
33
 
34
  # ---------------------------------------------------------------------------
35
- # Grader (not yet implemented)
36
  # ---------------------------------------------------------------------------
37
 
38
  def grade_easy(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
39
  """Score a submitted SOAP note against the easy-task rubric.
40
 
41
- Parameters
42
- ----------
43
- soap_note:
44
- The agent's submitted clinical note.
45
- task:
46
- The task definition dict (``EASY_TASK``).
47
 
48
  Returns
49
  -------
50
- dict mapping signal names float scores in [0, 1].
51
-
52
- Raises
53
- ------
54
- NotImplementedError
55
- Grader has not been implemented yet.
56
  """
57
- raise NotImplementedError("Easy-task grader is not yet implemented.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Easy task — routine check-up.
2
 
3
+ Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
+ against expected findings from a simple cold / blood pressure check visit.
5
  """
6
 
7
  from __future__ import annotations
 
33
 
34
 
35
  # ---------------------------------------------------------------------------
36
+ # Grader
37
  # ---------------------------------------------------------------------------
38
 
39
  def grade_easy(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
40
  """Score a submitted SOAP note against the easy-task rubric.
41
 
42
+ Checks for mention of key clinical findings from the transcript:
43
+ chief complaints, vitals, viral URI assessment, and supportive plan.
 
 
 
 
44
 
45
  Returns
46
  -------
47
+ dict mapping signal names to float scores in [0, 1].
 
 
 
 
 
48
  """
49
+ text_s = soap_note.subjective.lower()
50
+ text_o = soap_note.objective.lower()
51
+ text_a = soap_note.assessment.lower()
52
+ text_p = soap_note.plan.lower()
53
+
54
+ # 1. Subjective — chief complaints
55
+ s_score = 0.0
56
+ if "sore throat" in text_s or "runny nose" in text_s or "congestion" in text_s:
57
+ s_score += 0.5
58
+ if "5 days" in text_s or "five days" in text_s or "headache" in text_s:
59
+ s_score += 0.5
60
+
61
+ # 2. Objective — vitals
62
+ o_score = 0.0
63
+ if "118/76" in text_o or "118 over 76" in text_o or "blood pressure" in text_o:
64
+ o_score += 0.5
65
+ if "72" in text_o or "heart rate" in text_o or "lungs clear" in text_o:
66
+ o_score += 0.5
67
+
68
+ # 3. Assessment — viral URI
69
+ a_score = 0.0
70
+ if "viral" in text_a or "uri" in text_a or "upper respiratory" in text_a:
71
+ a_score += 1.0
72
+
73
+ # 4. Plan — supportive care
74
+ p_score = 0.0
75
+ if "fluids" in text_p or "rest" in text_p or "hydrat" in text_p:
76
+ p_score += 0.5
77
+ if "dayquil" in text_p or "follow" in text_p or "return" in text_p:
78
+ p_score += 0.5
79
+
80
+ return {
81
+ "subjective_accuracy": min(s_score, 1.0),
82
+ "objective_accuracy": min(o_score, 1.0),
83
+ "assessment_accuracy": min(a_score, 1.0),
84
+ "plan_accuracy": min(p_score, 1.0),
85
+ }
environment/tasks/task_hard.py CHANGED
@@ -1,6 +1,8 @@
1
  """Hard task — complex ER visit.
2
 
3
- Grader is intentionally left unimplemented.
 
 
4
  """
5
 
6
  from __future__ import annotations
@@ -61,26 +63,56 @@
61
 
62
 
63
  # ---------------------------------------------------------------------------
64
- # Grader (not yet implemented)
65
  # ---------------------------------------------------------------------------
66
 
67
  def grade_hard(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
68
  """Score a submitted SOAP note against the hard-task rubric.
69
 
70
- Parameters
71
- ----------
72
- soap_note:
73
- The agent's submitted clinical note.
74
- task:
75
- The task definition dict (``HARD_TASK``).
76
 
77
  Returns
78
  -------
79
- dict mapping signal names float scores in [0, 1].
80
-
81
- Raises
82
- ------
83
- NotImplementedError
84
- Grader has not been implemented yet.
85
  """
86
- raise NotImplementedError("Hard-task grader is not yet implemented.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Hard task — complex ER visit.
2
 
3
+ Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
+ against expected findings from a complex ER visit with overlapping chest pain,
5
+ SOB, and a possible PE complicated by a contrast dye allergy.
6
  """
7
 
8
  from __future__ import annotations
 
63
 
64
 
65
  # ---------------------------------------------------------------------------
66
+ # Grader
67
  # ---------------------------------------------------------------------------
68
 
69
  def grade_hard(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
70
  """Score a submitted SOAP note against the hard-task rubric.
71
 
72
+ Checks for chest pain / SOB and the nitroglycerin contradiction (subjective),
73
+ D-dimer and contrast allergy (objective), ACS vs PE differential (assessment),
74
+ and V/Q scan + ICU admission (plan).
 
 
 
75
 
76
  Returns
77
  -------
78
+ dict mapping signal names to float scores in [0, 1].
 
 
 
 
 
79
  """
80
+ text_s = soap_note.subjective.lower()
81
+ text_o = soap_note.objective.lower()
82
+ text_a = soap_note.assessment.lower()
83
+ text_p = soap_note.plan.lower()
84
+
85
+ # 1. Subjective — catching the contradiction and presenting complaints
86
+ s_score = 0.0
87
+ if "chest pain" in text_s or "shortness of breath" in text_s or "sob" in text_s:
88
+ s_score += 0.5
89
+ if "nitroglycerin" in text_s or "contradict" in text_s or "denied" in text_s:
90
+ s_score += 0.5
91
+
92
+ # 2. Objective — elevated D-dimer and allergy awareness
93
+ o_score = 0.0
94
+ if "d-dimer" in text_o or "1840" in text_o or "d dimer" in text_o:
95
+ o_score += 0.5
96
+ if "allergy" in text_o or "contrast" in text_o or "troponin" in text_o:
97
+ o_score += 0.5
98
+
99
+ # 3. Assessment — the dual differential (ACS vs PE)
100
+ a_score = 0.0
101
+ if "acs" in text_a or "acute coronary" in text_a or "coronary" in text_a or "ischemia" in text_a:
102
+ a_score += 0.5
103
+ if "pe" in text_a or "pulmonary embolism" in text_a or "embolism" in text_a:
104
+ a_score += 0.5
105
+
106
+ # 4. Plan — adapting to the allergy (V/Q scan) and admission
107
+ p_score = 0.0
108
+ if "v/q" in text_p or "ventilation" in text_p or "perfusion" in text_p:
109
+ p_score += 0.5
110
+ if "icu" in text_p or "admit" in text_p or "cardiac" in text_p:
111
+ p_score += 0.5
112
+
113
+ return {
114
+ "subjective_accuracy": min(s_score, 1.0),
115
+ "objective_accuracy": min(o_score, 1.0),
116
+ "assessment_accuracy": min(a_score, 1.0),
117
+ "plan_accuracy": min(p_score, 1.0),
118
+ }
environment/tasks/task_medium.py CHANGED
@@ -1,6 +1,7 @@
1
  """Medium task — chronic disease follow-up.
2
 
3
- Grader is intentionally left unimplemented.
 
4
  """
5
 
6
  from __future__ import annotations
@@ -43,26 +44,55 @@
43
 
44
 
45
  # ---------------------------------------------------------------------------
46
- # Grader (not yet implemented)
47
  # ---------------------------------------------------------------------------
48
 
49
  def grade_medium(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
50
  """Score a submitted SOAP note against the medium-task rubric.
51
 
52
- Parameters
53
- ----------
54
- soap_note:
55
- The agent's submitted clinical note.
56
- task:
57
- The task definition dict (``MEDIUM_TASK``).
58
 
59
  Returns
60
  -------
61
- dict mapping signal names float scores in [0, 1].
62
-
63
- Raises
64
- ------
65
- NotImplementedError
66
- Grader has not been implemented yet.
67
  """
68
- raise NotImplementedError("Medium-task grader is not yet implemented.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Medium task — chronic disease follow-up.
2
 
3
+ Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
+ against expected findings from a Type 2 Diabetes / Hypertension follow-up.
5
  """
6
 
7
  from __future__ import annotations
 
44
 
45
 
46
  # ---------------------------------------------------------------------------
47
+ # Grader
48
  # ---------------------------------------------------------------------------
49
 
50
  def grade_medium(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
51
  """Score a submitted SOAP note against the medium-task rubric.
52
 
53
+ Checks for mention of dietary habits, HbA1c lab values, core diagnoses,
54
+ and medication adjustments (glipizide, lisinopril uptitration).
 
 
 
 
55
 
56
  Returns
57
  -------
58
+ dict mapping signal names to float scores in [0, 1].
 
 
 
 
 
59
  """
60
+ text_s = soap_note.subjective.lower()
61
+ text_o = soap_note.objective.lower()
62
+ text_a = soap_note.assessment.lower()
63
+ text_p = soap_note.plan.lower()
64
+
65
+ # 1. Subjective — dietary habits / statin gap
66
+ s_score = 0.0
67
+ if "restaurant" in text_s or "diet" in text_s or "eating" in text_s:
68
+ s_score += 0.5
69
+ if "statin" in text_s or "gap" in text_s or "missed" in text_s:
70
+ s_score += 0.5
71
+
72
+ # 2. Objective — HbA1c values
73
+ o_score = 0.0
74
+ if "7.8" in text_o or "7.2" in text_o or "a1c" in text_o or "hba1c" in text_o:
75
+ o_score += 0.5
76
+ if "156" in text_o or "fasting glucose" in text_o or "glucose" in text_o:
77
+ o_score += 0.5
78
+
79
+ # 3. Assessment — core diagnoses
80
+ a_score = 0.0
81
+ if "diabetes" in text_a or "t2dm" in text_a or "dm" in text_a:
82
+ a_score += 0.5
83
+ if "hypertension" in text_a or "htn" in text_a or "blood pressure" in text_a:
84
+ a_score += 0.5
85
+
86
+ # 4. Plan — medication changes
87
+ p_score = 0.0
88
+ if "glipizide" in text_p and ("5" in text_p or "add" in text_p):
89
+ p_score += 0.5
90
+ if "lisinopril" in text_p and ("40" in text_p or "increase" in text_p or "uptitrat" in text_p):
91
+ p_score += 0.5
92
+
93
+ return {
94
+ "subjective_accuracy": min(s_score, 1.0),
95
+ "objective_accuracy": min(o_score, 1.0),
96
+ "assessment_accuracy": min(a_score, 1.0),
97
+ "plan_accuracy": min(p_score, 1.0),
98
+ }
err.txt DELETED
@@ -1,24 +0,0 @@
1
- {"event": "START", "timestamp": 1775576189.364181, "task_id": "easy_routine_checkup"}
2
- [DEBUG] Model request failed: Error code: 401 - {'error': 'Invalid username or password.'}
3
- {"event": "STEP", "timestamp": 1775576190.2672057, "step": 1, "action_type": "submit_note", "reward": 0.7}
4
- {"event": "END", "timestamp": 1775576190.2674263, "task_id": "easy_routine_checkup", "final_score": 0.7}
5
- {"event": "START", "timestamp": 1775576190.269494, "task_id": "medium_chronic_disease_followup"}
6
- [DEBUG] Model request failed: Error code: 401 - {'error': 'Invalid username or password.'}
7
- {"event": "STEP", "timestamp": 1775576190.6036963, "step": 1, "action_type": "submit_note", "reward": 0.7}
8
- {"event": "END", "timestamp": 1775576190.6037915, "task_id": "medium_chronic_disease_followup", "final_score": 0.7}
9
- {"event": "START", "timestamp": 1775576190.604777, "task_id": "hard_complex_er_visit"}
10
- [DEBUG] Model request failed: Error code: 401 - {'error': 'Invalid username or password.'}
11
- {"event": "STEP", "timestamp": 1775576190.9611442, "step": 1, "action_type": "submit_note", "reward": 0.7}
12
- {"event": "END", "timestamp": 1775576190.961212, "task_id": "hard_complex_er_visit", "final_score": 0.7}
13
-
14
- ============================================================
15
- SUMMARY
16
- ============================================================
17
- Task Score Steps
18
- ------------------------------- ------- -----
19
- easy_routine_checkup 0.7000 1
20
- medium_chronic_disease_followup 0.7000 1
21
- hard_complex_er_visit 0.7000 1
22
- ------------------------------- ------- -----
23
- AVERAGE 0.7000
24
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inference.py CHANGED
@@ -212,17 +212,18 @@ def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[s
212
  action = Action(**action_dict)
213
  action_str = f"submit_note(sections=S,O,A,P)"
214
  except Exception as exc:
215
- # On model / parse failure, submit a minimal note to avoid hanging
 
216
  action = Action(
217
  action_type="submit_note",
218
  soap_note=SOAPNote(
219
- subjective="Unable to generate.",
220
- objective="Unable to generate.",
221
- assessment="Unable to generate.",
222
- plan="Unable to generate.",
223
  ),
224
  )
225
- action_str = f"submit_note(fallback)"
226
  last_error = str(exc)
227
 
228
  # ---- step ----
 
212
  action = Action(**action_dict)
213
  action_str = f"submit_note(sections=S,O,A,P)"
214
  except Exception as exc:
215
+ # On model / parse failure, submit an empty note so all sub-signals
216
+ # grade to 0.0 (format_valid=0 because fields are empty, grader=0).
217
  action = Action(
218
  action_type="submit_note",
219
  soap_note=SOAPNote(
220
+ subjective="",
221
+ objective="",
222
+ assessment="",
223
+ plan="",
224
  ),
225
  )
226
+ action_str = "submit_note(fallback)"
227
  last_error = str(exc)
228
 
229
  # ---- step ----
openenv.yaml CHANGED
@@ -206,6 +206,6 @@ graders:
206
  inference:
207
  script: inference.py
208
  env_vars:
209
- - OPENAI_API_KEY
210
  - API_BASE_URL
211
  - MODEL_NAME
 
206
  inference:
207
  script: inference.py
208
  env_vars:
209
+ - HF_TOKEN
210
  - API_BASE_URL
211
  - MODEL_NAME
out.txt DELETED
@@ -1,9 +0,0 @@
1
- [START] task=easy_routine_checkup env=clinical-note-scribe model=gpt-4o-mini
2
- [STEP] step=1 action=submit_note(fallback) reward=0.70 done=true error=Error code: 401 - {'error': 'Invalid username or password.'}
3
- [END] success=true steps=1 score=0.70 rewards=0.70
4
- [START] task=medium_chronic_disease_followup env=clinical-note-scribe model=gpt-4o-mini
5
- [STEP] step=1 action=submit_note(fallback) reward=0.70 done=true error=Error code: 401 - {'error': 'Invalid username or password.'}
6
- [END] success=true steps=1 score=0.70 rewards=0.70
7
- [START] task=hard_complex_er_visit env=clinical-note-scribe model=gpt-4o-mini
8
- [STEP] step=1 action=submit_note(fallback) reward=0.70 done=true error=Error code: 401 - {'error': 'Invalid username or password.'}
9
- [END] success=true steps=1 score=0.70 rewards=0.70
 
 
 
 
 
 
 
 
 
 
server/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (221 Bytes)
 
server/__pycache__/app.cpython-314.pyc DELETED
Binary file (1.26 kB)
 
server/__pycache__/routes.cpython-314.pyc DELETED
Binary file (6.37 kB)
 
server/routes.py CHANGED
@@ -110,12 +110,39 @@ async def reset(body: ResetRequest) -> Observation:
110
  response_model=StepResponse,
111
  summary="Submit an action and advance the environment by one step",
112
  )
113
- async def step(action: Action) -> StepResponse:
114
- """Execute *action* in the current episode.
115
 
116
- The underlying environment emits a ``[STEP]`` log event (and ``[END]``
117
- when the episode terminates).
 
118
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  _log("STEP", endpoint="/step", action_type=action.action_type)
120
  try:
121
  obs, reward, done, info = _env.step(action)
 
110
  response_model=StepResponse,
111
  summary="Submit an action and advance the environment by one step",
112
  )
113
+ async def step(payload: dict[str, Any]) -> StepResponse:
114
+ """Execute an action in the current episode.
115
 
116
+ Accepts a raw JSON body and validates it into an ``Action``.
117
+ If validation fails, the error is recorded in the environment
118
+ instead of returning an HTTP 422.
119
  """
120
+ from pydantic import ValidationError
121
+ from environment.models import Reward
122
+
123
+ try:
124
+ action = Action(**payload)
125
+ except (ValidationError, TypeError) as exc:
126
+ # Gracefully absorb bad payloads instead of crashing with HTTP 422
127
+ _log("STEP", endpoint="/step", action_type="invalid", error=str(exc))
128
+ error_msg = f"Invalid action payload: {exc}"
129
+ _env._errors_so_far.append(error_msg)
130
+ _env._step_count += 1
131
+
132
+ obs = _env._build_observation()
133
+ reward = Reward(
134
+ value=0.0,
135
+ signals={"error": 1.0},
136
+ done=False,
137
+ info={"error": error_msg},
138
+ )
139
+ return StepResponse(
140
+ observation=obs,
141
+ reward=reward,
142
+ done=False,
143
+ info={"error": error_msg},
144
+ )
145
+
146
  _log("STEP", endpoint="/step", action_type=action.action_type)
147
  try:
148
  obs, reward, done, info = _env.step(action)
test_inference.py DELETED
@@ -1,26 +0,0 @@
1
- import sys
2
- sys.path.insert(0, ".")
3
- from inference import SYSTEM_PROMPT, TASK_IDS, _parse_json, _build_user_prompt
4
- from environment import Action
5
-
6
- print("Imports OK")
7
- print("Tasks:", TASK_IDS)
8
-
9
- # Test JSON parsing
10
- j = _parse_json('{"action_type": "submit_note", "soap_note": {"subjective": "S", "objective": "O", "assessment": "A", "plan": "P"}}')
11
- print("Parse OK:", j["action_type"])
12
-
13
- # Test markdown fence stripping
14
- fenced = '```json\n{"action_type": "submit_note", "soap_note": {"subjective": "S", "objective": "O", "assessment": "A", "plan": "P"}}\n```'
15
- j2 = _parse_json(fenced)
16
- print("Fence strip OK:", j2["action_type"])
17
-
18
- # Test Action creation from parsed output
19
- action = Action(**j2)
20
- print("Action created:", action.action_type, "/ sections:", list(action.soap_note.model_fields.keys()))
21
-
22
- # Test prompt building
23
- p = _build_user_prompt("Hello doctor", {"name": "Test", "age": 30})
24
- print("Prompt len:", len(p), "chars")
25
-
26
- print("\nAll checks passed.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_output.txt DELETED
@@ -1,9 +0,0 @@
1
- 
2
- --- Sub-signal unit tests ---
3
- [OK] conciseness(short): got=1.0 want=1.0
4
- [OK] conciseness(long) : got=0.0 want=0.0
5
- [OK] safe_lang(clean) : got=1.0 want=1.0
6
- [OK] safe_lang(unsafe) : got=0.0 want=0.0
7
- [OK] format_valid(ok) : got=1.0 want=1.0
8
- [OK] format_valid(bad) : got=0.0 want=0.0
9
- [OK] format_valid(clfy): got=1.0 want=1.0
 
 
 
 
 
 
 
 
 
 
test_reward.py DELETED
@@ -1,75 +0,0 @@
1
- import sys
2
- sys.path.insert(0, ".")
3
-
4
- from environment import ClinicalNoteScribeEnv, Action, SOAPNote
5
- from environment.reward import (
6
- compute_reward, _conciseness_bonus, _safe_language_score, _format_valid,
7
- WORD_LIMIT, FREE_STEPS, STEP_PENALTY_RATE, ERROR_PENALTY_RATE,
8
- )
9
-
10
- def check(label, got, want):
11
- ok = abs(got - want) < 1e-6
12
- sym = "OK" if ok else "FAIL"
13
- print(f" [{sym}] {label}: got={got} want={want}")
14
- return ok
15
-
16
- short_note = SOAPNote(
17
- subjective="Headache and runny nose for 5 days.",
18
- objective="BP 118/76, HR 72, afebrile, clear lungs.",
19
- assessment="Viral URI.",
20
- plan="DayQuil, fluids, rest. Follow up if fever develops.",
21
- )
22
- long_note = SOAPNote(subjective=" ".join(["word"] * (WORD_LIMIT + 1)), objective="O", assessment="A", plan="P")
23
- unsafe_note = SOAPNote(subjective="Patient definitely has pneumonia.", objective="O", assessment="A", plan="P")
24
- empty_note = SOAPNote(subjective="", objective="O", assessment="A", plan="P")
25
-
26
- submit_ok = Action(action_type="submit_note", soap_note=short_note)
27
- submit_bad = Action(action_type="submit_note", soap_note=empty_note)
28
- clarify = Action(action_type="request_clarify", clarify_question="fever?")
29
-
30
- print("\n--- Sub-signal unit tests ---")
31
- check("conciseness(short)", _conciseness_bonus(short_note), 1.0)
32
- check("conciseness(long) ", _conciseness_bonus(long_note), 0.0)
33
- check("safe_lang(clean) ", _safe_language_score(short_note), 1.0)
34
- check("safe_lang(unsafe) ", _safe_language_score(unsafe_note), 0.0)
35
- check("format_valid(ok) ", _format_valid(submit_ok), 1.0)
36
- check("format_valid(bad) ", _format_valid(submit_bad), 0.0)
37
- check("format_valid(clfy)", _format_valid(clarify), 1.0)
38
-
39
- print("\n--- grader=1.0, steps=2, errors=0 → expect value=1.0 ---")
40
- r = compute_reward(submit_ok, grader_score=1.0, step_count=2, errors_so_far=[])
41
- check("value ", r.value, 1.0)
42
- check("grader_score wt ", r.signals["grader_score"], 0.60)
43
- check("conciseness wt ", r.signals["conciseness_bonus"], 0.10)
44
- check("safe_lang wt ", r.signals["safe_language_score"], 0.15)
45
- check("format_valid wt ", r.signals["format_valid"], 0.15)
46
- check("step_penalty ", r.signals["step_penalty"], 0.0)
47
- check("error_penalty ", r.signals["error_penalty"], 0.0)
48
-
49
- print("\n--- grader=1.0, steps=5 (+2 extra) → expect deduct 0.10 ---")
50
- r2 = compute_reward(submit_ok, grader_score=1.0, step_count=5, errors_so_far=[])
51
- check("step_penalty ", r2.signals["step_penalty"], -(2 * STEP_PENALTY_RATE))
52
- check("value ", r2.value, round(1.0 - 2 * STEP_PENALTY_RATE, 4))
53
-
54
- print("\n--- grader=1.0, steps=2, errors=2 → expect deduct 0.20 ---")
55
- r3 = compute_reward(submit_ok, grader_score=1.0, step_count=2, errors_so_far=["e1", "e2"])
56
- check("error_penalty ", r3.signals["error_penalty"], -(2 * ERROR_PENALTY_RATE))
57
- check("value ", r3.value, round(1.0 - 2 * ERROR_PENALTY_RATE, 4))
58
-
59
- print("\n--- all bad signals → expect value clamped to 0.0 ---")
60
- bad_note = SOAPNote(subjective=" ".join(["word"] * 500) + " Patient definitely has cancer.", objective="", assessment="A", plan="P")
61
- bad_act = Action(action_type="submit_note", soap_note=bad_note)
62
- r4 = compute_reward(bad_act, grader_score=0.0, step_count=10, errors_so_far=["e1","e2","e3"])
63
- check("value clamped ", r4.value, 0.0)
64
-
65
- print("\n--- end-to-end env: clarify(step1) then submit(step2) ---")
66
- env = ClinicalNoteScribeEnv()
67
- env.reset("easy_routine_checkup")
68
- _, rc, dc, _ = env.step(Action(action_type="request_clarify", clarify_question="did the patient report any fever?"))
69
- check("clarify done=False", float(dc), 0.0)
70
- _, rs, ds, _ = env.step(submit_ok)
71
- check("submit done=True ", float(ds), 1.0)
72
- assert 0.0 <= rs.value <= 1.0
73
- print(f" Final value: {rs.value}")
74
- print(f" Signals: { {k:v for k,v in rs.signals.items() if not k.startswith('_')} }")
75
- print("\nAll done.")