Aman Khare committed on
Commit
7655d3c
·
1 Parent(s): 3856d60

final changes

Browse files
.gitignore ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Python ---
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ *.egg
10
+
11
+ # --- Virtual environments ---
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # --- IDE ---
17
+ .vscode/
18
+ .idea/
19
+ *.swp
20
+ *.swo
21
+ *~
22
+
23
+ # --- OS ---
24
+ .DS_Store
25
+ Thumbs.db
26
+ desktop.ini
27
+
28
+ # --- Test artifacts ---
29
+ out.txt
30
+ err.txt
31
+ test_output.txt
32
+ test_full.py
33
+ test_all_fixes.py
34
+ test_inference.py
35
+ test_reward.py
36
+ test_presubmission.py
37
+
38
+ # --- Non-submission folders ---
39
+ next step/
40
+ play/
41
+
42
+ # --- Logs ---
43
+ *.log
44
+
45
+ # --- Secrets ---
46
+ .env
47
+ .env.*
__pycache__/inference.cpython-314.pyc DELETED
Binary file (15.4 kB)
 
environment/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (443 Bytes)
 
environment/__pycache__/env.cpython-314.pyc DELETED
Binary file (15.3 kB)
 
environment/__pycache__/models.cpython-314.pyc DELETED
Binary file (5.51 kB)
 
environment/__pycache__/reward.cpython-314.pyc DELETED
Binary file (7.6 kB)
 
environment/env.py CHANGED
@@ -240,9 +240,31 @@ def state(self) -> EnvironmentState:
240
  # --------------------------------------------------------------------- #
241
 
242
  def _handle_submit(self, action: Action, info: dict) -> Reward:
243
- """Process a ``submit_note`` action."""
244
- if action.soap_note is None:
245
- error = "submit_note requires a non-null soap_note."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  self._errors_so_far.append(error)
247
  return compute_reward(
248
  action,
@@ -253,7 +275,7 @@ def _handle_submit(self, action: Action, info: dict) -> Reward:
253
  info={"error": error},
254
  )
255
 
256
- self._current_draft = _soap_to_text(action.soap_note)
257
  self._done = True
258
 
259
  # Attempt to grade via the task-specific grader
@@ -270,7 +292,7 @@ def _handle_submit(self, action: Action, info: dict) -> Reward:
270
  )
271
 
272
  try:
273
- raw_signals = grader(action.soap_note, self._task)
274
  # Grader returns a signals dict; extract a single scalar score
275
  # as the mean of its values for use as grader_score.
276
  grader_score = (
@@ -278,9 +300,9 @@ def _handle_submit(self, action: Action, info: dict) -> Reward:
278
  if raw_signals else 0.0
279
  )
280
  info["grader_signals"] = raw_signals
281
- except NotImplementedError:
282
- info["warning"] = "Grader not yet implemented; returning placeholder."
283
- grader_score = 0.5
284
 
285
  return compute_reward(
286
  action,
@@ -297,11 +319,9 @@ def _handle_clarify(self, action: Action, info: dict) -> Reward:
297
  if not question:
298
  error = "request_clarify requires a non-empty clarify_question."
299
  self._errors_so_far.append(error)
300
- return compute_reward(
301
- action,
302
- grader_score=0.0,
303
- step_count=self._step_count,
304
- errors_so_far=self._errors_so_far,
305
  done=False,
306
  info={"error": error},
307
  )
@@ -315,12 +335,10 @@ def _handle_clarify(self, action: Action, info: dict) -> Reward:
315
  "No additional information available for that question."
316
  )
317
 
318
- # Clarification steps earn no grader_score; step_penalty accrues naturally
319
- return compute_reward(
320
- action,
321
- grader_score=0.0,
322
- step_count=self._step_count,
323
- errors_so_far=self._errors_so_far,
324
  done=False,
325
  info=info,
326
  )
@@ -330,11 +348,9 @@ def _handle_revise(self, action: Action, info: dict) -> Reward:
330
  if action.section is None or action.revision_text is None:
331
  error = "revise_section requires both 'section' and 'revision_text'."
332
  self._errors_so_far.append(error)
333
- return compute_reward(
334
- action,
335
- grader_score=0.0,
336
- step_count=self._step_count,
337
- errors_so_far=self._errors_so_far,
338
  done=False,
339
  info={"error": error},
340
  )
@@ -358,12 +374,10 @@ def _handle_revise(self, action: Action, info: dict) -> Reward:
358
 
359
  info["revised_section"] = action.section
360
 
361
- # Revision steps earn no grader_score; deductions still apply
362
- return compute_reward(
363
- action,
364
- grader_score=0.0,
365
- step_count=self._step_count,
366
- errors_so_far=self._errors_so_far,
367
  done=False,
368
  info=info,
369
  )
 
240
  # --------------------------------------------------------------------- #
241
 
242
  def _handle_submit(self, action: Action, info: dict) -> Reward:
243
+ """Process a ``submit_note`` action.
244
+
245
+ If ``action.soap_note`` is provided, it is used directly.
246
+ Otherwise, if the agent has built up a draft via ``revise_section``,
247
+ the draft is parsed into a SOAPNote automatically.
248
+ """
249
+ soap = action.soap_note
250
+
251
+ # Fall back to the current draft if no explicit note is provided
252
+ if soap is None and self._current_draft:
253
+ sections: dict[str, str] = {}
254
+ for line in self._current_draft.split("\n"):
255
+ for prefix in ("S: ", "O: ", "A: ", "P: "):
256
+ if line.startswith(prefix):
257
+ sections[prefix[0]] = line[len(prefix):]
258
+ if all(k in sections for k in "SOAP"):
259
+ soap = SOAPNote(
260
+ subjective=sections["S"],
261
+ objective=sections["O"],
262
+ assessment=sections["A"],
263
+ plan=sections["P"],
264
+ )
265
+
266
+ if soap is None:
267
+ error = "submit_note requires a non-null soap_note (or a complete draft from revise_section)."
268
  self._errors_so_far.append(error)
269
  return compute_reward(
270
  action,
 
275
  info={"error": error},
276
  )
277
 
278
+ self._current_draft = _soap_to_text(soap)
279
  self._done = True
280
 
281
  # Attempt to grade via the task-specific grader
 
292
  )
293
 
294
  try:
295
+ raw_signals = grader(soap, self._task)
296
  # Grader returns a signals dict; extract a single scalar score
297
  # as the mean of its values for use as grader_score.
298
  grader_score = (
 
300
  if raw_signals else 0.0
301
  )
302
  info["grader_signals"] = raw_signals
303
+ except Exception as exc:
304
+ info["warning"] = f"Grader error: {exc}"
305
+ grader_score = 0.0
306
 
307
  return compute_reward(
308
  action,
 
319
  if not question:
320
  error = "request_clarify requires a non-empty clarify_question."
321
  self._errors_so_far.append(error)
322
+ return Reward(
323
+ value=0.0,
324
+ signals={"error": 1.0},
 
 
325
  done=False,
326
  info={"error": error},
327
  )
 
335
  "No additional information available for that question."
336
  )
337
 
338
+ # Intermediate actions get zero reward; only submit_note earns score
339
+ return Reward(
340
+ value=0.0,
341
+ signals={"intermediate_step": 1.0},
 
 
342
  done=False,
343
  info=info,
344
  )
 
348
  if action.section is None or action.revision_text is None:
349
  error = "revise_section requires both 'section' and 'revision_text'."
350
  self._errors_so_far.append(error)
351
+ return Reward(
352
+ value=0.0,
353
+ signals={"error": 1.0},
 
 
354
  done=False,
355
  info={"error": error},
356
  )
 
374
 
375
  info["revised_section"] = action.section
376
 
377
+ # Intermediate actions get zero reward; only submit_note earns score
378
+ return Reward(
379
+ value=0.0,
380
+ signals={"intermediate_step": 1.0},
 
 
381
  done=False,
382
  info=info,
383
  )
environment/tasks/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (1.15 kB)
 
environment/tasks/__pycache__/task_easy.cpython-314.pyc DELETED
Binary file (1.69 kB)
 
environment/tasks/__pycache__/task_hard.cpython-314.pyc DELETED
Binary file (2.4 kB)
 
environment/tasks/__pycache__/task_medium.cpython-314.pyc DELETED
Binary file (2.02 kB)
 
environment/tasks/task_easy.py CHANGED
@@ -1,6 +1,7 @@
1
  """Easy task — routine check-up.
2
 
3
- Grader is intentionally left unimplemented.
 
4
  """
5
 
6
  from __future__ import annotations
@@ -32,26 +33,53 @@
32
 
33
 
34
  # ---------------------------------------------------------------------------
35
- # Grader (not yet implemented)
36
  # ---------------------------------------------------------------------------
37
 
38
  def grade_easy(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
39
  """Score a submitted SOAP note against the easy-task rubric.
40
 
41
- Parameters
42
- ----------
43
- soap_note:
44
- The agent's submitted clinical note.
45
- task:
46
- The task definition dict (``EASY_TASK``).
47
 
48
  Returns
49
  -------
50
- dict mapping signal names float scores in [0, 1].
51
-
52
- Raises
53
- ------
54
- NotImplementedError
55
- Grader has not been implemented yet.
56
  """
57
- raise NotImplementedError("Easy-task grader is not yet implemented.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Easy task — routine check-up.
2
 
3
+ Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
+ against expected findings from a simple cold / blood pressure check visit.
5
  """
6
 
7
  from __future__ import annotations
 
33
 
34
 
35
  # ---------------------------------------------------------------------------
36
+ # Grader
37
  # ---------------------------------------------------------------------------
38
 
39
  def grade_easy(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
40
  """Score a submitted SOAP note against the easy-task rubric.
41
 
42
+ Checks for mention of key clinical findings from the transcript:
43
+ chief complaints, vitals, viral URI assessment, and supportive plan.
 
 
 
 
44
 
45
  Returns
46
  -------
47
+ dict mapping signal names to float scores in [0, 1].
 
 
 
 
 
48
  """
49
+ text_s = soap_note.subjective.lower()
50
+ text_o = soap_note.objective.lower()
51
+ text_a = soap_note.assessment.lower()
52
+ text_p = soap_note.plan.lower()
53
+
54
+ # 1. Subjective — chief complaints
55
+ s_score = 0.0
56
+ if "sore throat" in text_s or "runny nose" in text_s or "congestion" in text_s:
57
+ s_score += 0.5
58
+ if "5 days" in text_s or "five days" in text_s or "headache" in text_s:
59
+ s_score += 0.5
60
+
61
+ # 2. Objective — vitals
62
+ o_score = 0.0
63
+ if "118/76" in text_o or "118 over 76" in text_o or "blood pressure" in text_o:
64
+ o_score += 0.5
65
+ if "72" in text_o or "heart rate" in text_o or "lungs clear" in text_o:
66
+ o_score += 0.5
67
+
68
+ # 3. Assessment — viral URI
69
+ a_score = 0.0
70
+ if "viral" in text_a or "uri" in text_a or "upper respiratory" in text_a:
71
+ a_score += 1.0
72
+
73
+ # 4. Plan — supportive care
74
+ p_score = 0.0
75
+ if "fluids" in text_p or "rest" in text_p or "hydrat" in text_p:
76
+ p_score += 0.5
77
+ if "dayquil" in text_p or "follow" in text_p or "return" in text_p:
78
+ p_score += 0.5
79
+
80
+ return {
81
+ "subjective_accuracy": min(s_score, 1.0),
82
+ "objective_accuracy": min(o_score, 1.0),
83
+ "assessment_accuracy": min(a_score, 1.0),
84
+ "plan_accuracy": min(p_score, 1.0),
85
+ }
environment/tasks/task_hard.py CHANGED
@@ -1,6 +1,8 @@
1
  """Hard task — complex ER visit.
2
 
3
- Grader is intentionally left unimplemented.
 
 
4
  """
5
 
6
  from __future__ import annotations
@@ -61,26 +63,56 @@
61
 
62
 
63
  # ---------------------------------------------------------------------------
64
- # Grader (not yet implemented)
65
  # ---------------------------------------------------------------------------
66
 
67
  def grade_hard(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
68
  """Score a submitted SOAP note against the hard-task rubric.
69
 
70
- Parameters
71
- ----------
72
- soap_note:
73
- The agent's submitted clinical note.
74
- task:
75
- The task definition dict (``HARD_TASK``).
76
 
77
  Returns
78
  -------
79
- dict mapping signal names float scores in [0, 1].
80
-
81
- Raises
82
- ------
83
- NotImplementedError
84
- Grader has not been implemented yet.
85
  """
86
- raise NotImplementedError("Hard-task grader is not yet implemented.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Hard task — complex ER visit.
2
 
3
+ Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
+ against expected findings from a complex ER visit with overlapping chest pain,
5
+ SOB, and a possible PE complicated by a contrast dye allergy.
6
  """
7
 
8
  from __future__ import annotations
 
63
 
64
 
65
  # ---------------------------------------------------------------------------
66
+ # Grader
67
  # ---------------------------------------------------------------------------
68
 
69
  def grade_hard(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
70
  """Score a submitted SOAP note against the hard-task rubric.
71
 
72
+ Checks for chest pain / SOB and the nitroglycerin contradiction (subjective),
73
+ D-dimer and contrast allergy (objective), ACS vs PE differential (assessment),
74
+ and V/Q scan + ICU admission (plan).
 
 
 
75
 
76
  Returns
77
  -------
78
+ dict mapping signal names to float scores in [0, 1].
 
 
 
 
 
79
  """
80
+ text_s = soap_note.subjective.lower()
81
+ text_o = soap_note.objective.lower()
82
+ text_a = soap_note.assessment.lower()
83
+ text_p = soap_note.plan.lower()
84
+
85
+ # 1. Subjective — catching the contradiction and presenting complaints
86
+ s_score = 0.0
87
+ if "chest pain" in text_s or "shortness of breath" in text_s or "sob" in text_s:
88
+ s_score += 0.5
89
+ if "nitroglycerin" in text_s or "contradict" in text_s or "denied" in text_s:
90
+ s_score += 0.5
91
+
92
+ # 2. Objective — elevated D-dimer and allergy awareness
93
+ o_score = 0.0
94
+ if "d-dimer" in text_o or "1840" in text_o or "d dimer" in text_o:
95
+ o_score += 0.5
96
+ if "allergy" in text_o or "contrast" in text_o or "troponin" in text_o:
97
+ o_score += 0.5
98
+
99
+ # 3. Assessment — the dual differential (ACS vs PE)
100
+ a_score = 0.0
101
+ if "acs" in text_a or "acute coronary" in text_a or "coronary" in text_a or "ischemia" in text_a:
102
+ a_score += 0.5
103
+ if "pe" in text_a or "pulmonary embolism" in text_a or "embolism" in text_a:
104
+ a_score += 0.5
105
+
106
+ # 4. Plan — adapting to the allergy (V/Q scan) and admission
107
+ p_score = 0.0
108
+ if "v/q" in text_p or "ventilation" in text_p or "perfusion" in text_p:
109
+ p_score += 0.5
110
+ if "icu" in text_p or "admit" in text_p or "cardiac" in text_p:
111
+ p_score += 0.5
112
+
113
+ return {
114
+ "subjective_accuracy": min(s_score, 1.0),
115
+ "objective_accuracy": min(o_score, 1.0),
116
+ "assessment_accuracy": min(a_score, 1.0),
117
+ "plan_accuracy": min(p_score, 1.0),
118
+ }
environment/tasks/task_medium.py CHANGED
@@ -1,6 +1,7 @@
1
  """Medium task — chronic disease follow-up.
2
 
3
- Grader is intentionally left unimplemented.
 
4
  """
5
 
6
  from __future__ import annotations
@@ -43,26 +44,55 @@
43
 
44
 
45
  # ---------------------------------------------------------------------------
46
- # Grader (not yet implemented)
47
  # ---------------------------------------------------------------------------
48
 
49
  def grade_medium(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
50
  """Score a submitted SOAP note against the medium-task rubric.
51
 
52
- Parameters
53
- ----------
54
- soap_note:
55
- The agent's submitted clinical note.
56
- task:
57
- The task definition dict (``MEDIUM_TASK``).
58
 
59
  Returns
60
  -------
61
- dict mapping signal names float scores in [0, 1].
62
-
63
- Raises
64
- ------
65
- NotImplementedError
66
- Grader has not been implemented yet.
67
  """
68
- raise NotImplementedError("Medium-task grader is not yet implemented.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Medium task — chronic disease follow-up.
2
 
3
+ Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
+ against expected findings from a Type 2 Diabetes / Hypertension follow-up.
5
  """
6
 
7
  from __future__ import annotations
 
44
 
45
 
46
  # ---------------------------------------------------------------------------
47
+ # Grader
48
  # ---------------------------------------------------------------------------
49
 
50
  def grade_medium(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
51
  """Score a submitted SOAP note against the medium-task rubric.
52
 
53
+ Checks for mention of dietary habits, HbA1c lab values, core diagnoses,
54
+ and medication adjustments (glipizide, lisinopril uptitration).
 
 
 
 
55
 
56
  Returns
57
  -------
58
+ dict mapping signal names to float scores in [0, 1].
 
 
 
 
 
59
  """
60
+ text_s = soap_note.subjective.lower()
61
+ text_o = soap_note.objective.lower()
62
+ text_a = soap_note.assessment.lower()
63
+ text_p = soap_note.plan.lower()
64
+
65
+ # 1. Subjective — dietary habits / statin gap
66
+ s_score = 0.0
67
+ if "restaurant" in text_s or "diet" in text_s or "eating" in text_s:
68
+ s_score += 0.5
69
+ if "statin" in text_s or "gap" in text_s or "missed" in text_s:
70
+ s_score += 0.5
71
+
72
+ # 2. Objective — HbA1c values
73
+ o_score = 0.0
74
+ if "7.8" in text_o or "7.2" in text_o or "a1c" in text_o or "hba1c" in text_o:
75
+ o_score += 0.5
76
+ if "156" in text_o or "fasting glucose" in text_o or "glucose" in text_o:
77
+ o_score += 0.5
78
+
79
+ # 3. Assessment — core diagnoses
80
+ a_score = 0.0
81
+ if "diabetes" in text_a or "t2dm" in text_a or "dm" in text_a:
82
+ a_score += 0.5
83
+ if "hypertension" in text_a or "htn" in text_a or "blood pressure" in text_a:
84
+ a_score += 0.5
85
+
86
+ # 4. Plan — medication changes
87
+ p_score = 0.0
88
+ if "glipizide" in text_p and ("5" in text_p or "add" in text_p):
89
+ p_score += 0.5
90
+ if "lisinopril" in text_p and ("40" in text_p or "increase" in text_p or "uptitrat" in text_p):
91
+ p_score += 0.5
92
+
93
+ return {
94
+ "subjective_accuracy": min(s_score, 1.0),
95
+ "objective_accuracy": min(o_score, 1.0),
96
+ "assessment_accuracy": min(a_score, 1.0),
97
+ "plan_accuracy": min(p_score, 1.0),
98
+ }
err.txt DELETED
@@ -1,24 +0,0 @@
1
- {"event": "START", "timestamp": 1775576189.364181, "task_id": "easy_routine_checkup"}
2
- [DEBUG] Model request failed: Error code: 401 - {'error': 'Invalid username or password.'}
3
- {"event": "STEP", "timestamp": 1775576190.2672057, "step": 1, "action_type": "submit_note", "reward": 0.7}
4
- {"event": "END", "timestamp": 1775576190.2674263, "task_id": "easy_routine_checkup", "final_score": 0.7}
5
- {"event": "START", "timestamp": 1775576190.269494, "task_id": "medium_chronic_disease_followup"}
6
- [DEBUG] Model request failed: Error code: 401 - {'error': 'Invalid username or password.'}
7
- {"event": "STEP", "timestamp": 1775576190.6036963, "step": 1, "action_type": "submit_note", "reward": 0.7}
8
- {"event": "END", "timestamp": 1775576190.6037915, "task_id": "medium_chronic_disease_followup", "final_score": 0.7}
9
- {"event": "START", "timestamp": 1775576190.604777, "task_id": "hard_complex_er_visit"}
10
- [DEBUG] Model request failed: Error code: 401 - {'error': 'Invalid username or password.'}
11
- {"event": "STEP", "timestamp": 1775576190.9611442, "step": 1, "action_type": "submit_note", "reward": 0.7}
12
- {"event": "END", "timestamp": 1775576190.961212, "task_id": "hard_complex_er_visit", "final_score": 0.7}
13
-
14
- ============================================================
15
- SUMMARY
16
- ============================================================
17
- Task Score Steps
18
- ------------------------------- ------- -----
19
- easy_routine_checkup 0.7000 1
20
- medium_chronic_disease_followup 0.7000 1
21
- hard_complex_er_visit 0.7000 1
22
- ------------------------------- ------- -----
23
- AVERAGE 0.7000
24
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inference.py CHANGED
@@ -212,17 +212,18 @@ def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[s
212
  action = Action(**action_dict)
213
  action_str = f"submit_note(sections=S,O,A,P)"
214
  except Exception as exc:
215
- # On model / parse failure, submit a minimal note to avoid hanging
 
216
  action = Action(
217
  action_type="submit_note",
218
  soap_note=SOAPNote(
219
- subjective="Unable to generate.",
220
- objective="Unable to generate.",
221
- assessment="Unable to generate.",
222
- plan="Unable to generate.",
223
  ),
224
  )
225
- action_str = f"submit_note(fallback)"
226
  last_error = str(exc)
227
 
228
  # ---- step ----
 
212
  action = Action(**action_dict)
213
  action_str = f"submit_note(sections=S,O,A,P)"
214
  except Exception as exc:
215
+ # On model / parse failure, submit an empty note so all sub-signals
216
+ # grade to 0.0 (format_valid=0 because fields are empty, grader=0).
217
  action = Action(
218
  action_type="submit_note",
219
  soap_note=SOAPNote(
220
+ subjective="",
221
+ objective="",
222
+ assessment="",
223
+ plan="",
224
  ),
225
  )
226
+ action_str = "submit_note(fallback)"
227
  last_error = str(exc)
228
 
229
  # ---- step ----
openenv.yaml CHANGED
@@ -206,6 +206,6 @@ graders:
206
  inference:
207
  script: inference.py
208
  env_vars:
209
- - OPENAI_API_KEY
210
  - API_BASE_URL
211
  - MODEL_NAME
 
206
  inference:
207
  script: inference.py
208
  env_vars:
209
+ - HF_TOKEN
210
  - API_BASE_URL
211
  - MODEL_NAME
out.txt DELETED
@@ -1,9 +0,0 @@
1
- [START] task=easy_routine_checkup env=clinical-note-scribe model=gpt-4o-mini
2
- [STEP] step=1 action=submit_note(fallback) reward=0.70 done=true error=Error code: 401 - {'error': 'Invalid username or password.'}
3
- [END] success=true steps=1 score=0.70 rewards=0.70
4
- [START] task=medium_chronic_disease_followup env=clinical-note-scribe model=gpt-4o-mini
5
- [STEP] step=1 action=submit_note(fallback) reward=0.70 done=true error=Error code: 401 - {'error': 'Invalid username or password.'}
6
- [END] success=true steps=1 score=0.70 rewards=0.70
7
- [START] task=hard_complex_er_visit env=clinical-note-scribe model=gpt-4o-mini
8
- [STEP] step=1 action=submit_note(fallback) reward=0.70 done=true error=Error code: 401 - {'error': 'Invalid username or password.'}
9
- [END] success=true steps=1 score=0.70 rewards=0.70
 
 
 
 
 
 
 
 
 
 
server/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (221 Bytes)
 
server/__pycache__/app.cpython-314.pyc DELETED
Binary file (1.26 kB)
 
server/__pycache__/routes.cpython-314.pyc DELETED
Binary file (6.37 kB)
 
server/routes.py CHANGED
@@ -110,12 +110,39 @@ async def reset(body: ResetRequest) -> Observation:
110
  response_model=StepResponse,
111
  summary="Submit an action and advance the environment by one step",
112
  )
113
- async def step(action: Action) -> StepResponse:
114
- """Execute *action* in the current episode.
115
 
116
- The underlying environment emits a ``[STEP]`` log event (and ``[END]``
117
- when the episode terminates).
 
118
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  _log("STEP", endpoint="/step", action_type=action.action_type)
120
  try:
121
  obs, reward, done, info = _env.step(action)
 
110
  response_model=StepResponse,
111
  summary="Submit an action and advance the environment by one step",
112
  )
113
+ async def step(payload: dict[str, Any]) -> StepResponse:
114
+ """Execute an action in the current episode.
115
 
116
+ Accepts a raw JSON body and validates it into an ``Action``.
117
+ If validation fails, the error is recorded in the environment
118
+ instead of returning an HTTP 422.
119
  """
120
+ from pydantic import ValidationError
121
+ from environment.models import Reward
122
+
123
+ try:
124
+ action = Action(**payload)
125
+ except (ValidationError, TypeError) as exc:
126
+ # Gracefully absorb bad payloads instead of crashing with HTTP 422
127
+ _log("STEP", endpoint="/step", action_type="invalid", error=str(exc))
128
+ error_msg = f"Invalid action payload: {exc}"
129
+ _env._errors_so_far.append(error_msg)
130
+ _env._step_count += 1
131
+
132
+ obs = _env._build_observation()
133
+ reward = Reward(
134
+ value=0.0,
135
+ signals={"error": 1.0},
136
+ done=False,
137
+ info={"error": error_msg},
138
+ )
139
+ return StepResponse(
140
+ observation=obs,
141
+ reward=reward,
142
+ done=False,
143
+ info={"error": error_msg},
144
+ )
145
+
146
  _log("STEP", endpoint="/step", action_type=action.action_type)
147
  try:
148
  obs, reward, done, info = _env.step(action)
test_inference.py DELETED
@@ -1,26 +0,0 @@
1
- import sys
2
- sys.path.insert(0, ".")
3
- from inference import SYSTEM_PROMPT, TASK_IDS, _parse_json, _build_user_prompt
4
- from environment import Action
5
-
6
- print("Imports OK")
7
- print("Tasks:", TASK_IDS)
8
-
9
- # Test JSON parsing
10
- j = _parse_json('{"action_type": "submit_note", "soap_note": {"subjective": "S", "objective": "O", "assessment": "A", "plan": "P"}}')
11
- print("Parse OK:", j["action_type"])
12
-
13
- # Test markdown fence stripping
14
- fenced = '```json\n{"action_type": "submit_note", "soap_note": {"subjective": "S", "objective": "O", "assessment": "A", "plan": "P"}}\n```'
15
- j2 = _parse_json(fenced)
16
- print("Fence strip OK:", j2["action_type"])
17
-
18
- # Test Action creation from parsed output
19
- action = Action(**j2)
20
- print("Action created:", action.action_type, "/ sections:", list(action.soap_note.model_fields.keys()))
21
-
22
- # Test prompt building
23
- p = _build_user_prompt("Hello doctor", {"name": "Test", "age": 30})
24
- print("Prompt len:", len(p), "chars")
25
-
26
- print("\nAll checks passed.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_output.txt DELETED
@@ -1,9 +0,0 @@
1
- 
2
- --- Sub-signal unit tests ---
3
- [OK] conciseness(short): got=1.0 want=1.0
4
- [OK] conciseness(long) : got=0.0 want=0.0
5
- [OK] safe_lang(clean) : got=1.0 want=1.0
6
- [OK] safe_lang(unsafe) : got=0.0 want=0.0
7
- [OK] format_valid(ok) : got=1.0 want=1.0
8
- [OK] format_valid(bad) : got=0.0 want=0.0
9
- [OK] format_valid(clfy): got=1.0 want=1.0
 
 
 
 
 
 
 
 
 
 
test_reward.py DELETED
@@ -1,75 +0,0 @@
1
- import sys
2
- sys.path.insert(0, ".")
3
-
4
- from environment import ClinicalNoteScribeEnv, Action, SOAPNote
5
- from environment.reward import (
6
- compute_reward, _conciseness_bonus, _safe_language_score, _format_valid,
7
- WORD_LIMIT, FREE_STEPS, STEP_PENALTY_RATE, ERROR_PENALTY_RATE,
8
- )
9
-
10
- def check(label, got, want):
11
- ok = abs(got - want) < 1e-6
12
- sym = "OK" if ok else "FAIL"
13
- print(f" [{sym}] {label}: got={got} want={want}")
14
- return ok
15
-
16
- short_note = SOAPNote(
17
- subjective="Headache and runny nose for 5 days.",
18
- objective="BP 118/76, HR 72, afebrile, clear lungs.",
19
- assessment="Viral URI.",
20
- plan="DayQuil, fluids, rest. Follow up if fever develops.",
21
- )
22
- long_note = SOAPNote(subjective=" ".join(["word"] * (WORD_LIMIT + 1)), objective="O", assessment="A", plan="P")
23
- unsafe_note = SOAPNote(subjective="Patient definitely has pneumonia.", objective="O", assessment="A", plan="P")
24
- empty_note = SOAPNote(subjective="", objective="O", assessment="A", plan="P")
25
-
26
- submit_ok = Action(action_type="submit_note", soap_note=short_note)
27
- submit_bad = Action(action_type="submit_note", soap_note=empty_note)
28
- clarify = Action(action_type="request_clarify", clarify_question="fever?")
29
-
30
- print("\n--- Sub-signal unit tests ---")
31
- check("conciseness(short)", _conciseness_bonus(short_note), 1.0)
32
- check("conciseness(long) ", _conciseness_bonus(long_note), 0.0)
33
- check("safe_lang(clean) ", _safe_language_score(short_note), 1.0)
34
- check("safe_lang(unsafe) ", _safe_language_score(unsafe_note), 0.0)
35
- check("format_valid(ok) ", _format_valid(submit_ok), 1.0)
36
- check("format_valid(bad) ", _format_valid(submit_bad), 0.0)
37
- check("format_valid(clfy)", _format_valid(clarify), 1.0)
38
-
39
- print("\n--- grader=1.0, steps=2, errors=0 → expect value=1.0 ---")
40
- r = compute_reward(submit_ok, grader_score=1.0, step_count=2, errors_so_far=[])
41
- check("value ", r.value, 1.0)
42
- check("grader_score wt ", r.signals["grader_score"], 0.60)
43
- check("conciseness wt ", r.signals["conciseness_bonus"], 0.10)
44
- check("safe_lang wt ", r.signals["safe_language_score"], 0.15)
45
- check("format_valid wt ", r.signals["format_valid"], 0.15)
46
- check("step_penalty ", r.signals["step_penalty"], 0.0)
47
- check("error_penalty ", r.signals["error_penalty"], 0.0)
48
-
49
- print("\n--- grader=1.0, steps=5 (+2 extra) → expect deduct 0.10 ---")
50
- r2 = compute_reward(submit_ok, grader_score=1.0, step_count=5, errors_so_far=[])
51
- check("step_penalty ", r2.signals["step_penalty"], -(2 * STEP_PENALTY_RATE))
52
- check("value ", r2.value, round(1.0 - 2 * STEP_PENALTY_RATE, 4))
53
-
54
- print("\n--- grader=1.0, steps=2, errors=2 → expect deduct 0.20 ---")
55
- r3 = compute_reward(submit_ok, grader_score=1.0, step_count=2, errors_so_far=["e1", "e2"])
56
- check("error_penalty ", r3.signals["error_penalty"], -(2 * ERROR_PENALTY_RATE))
57
- check("value ", r3.value, round(1.0 - 2 * ERROR_PENALTY_RATE, 4))
58
-
59
- print("\n--- all bad signals → expect value clamped to 0.0 ---")
60
- bad_note = SOAPNote(subjective=" ".join(["word"] * 500) + " Patient definitely has cancer.", objective="", assessment="A", plan="P")
61
- bad_act = Action(action_type="submit_note", soap_note=bad_note)
62
- r4 = compute_reward(bad_act, grader_score=0.0, step_count=10, errors_so_far=["e1","e2","e3"])
63
- check("value clamped ", r4.value, 0.0)
64
-
65
- print("\n--- end-to-end env: clarify(step1) then submit(step2) ---")
66
- env = ClinicalNoteScribeEnv()
67
- env.reset("easy_routine_checkup")
68
- _, rc, dc, _ = env.step(Action(action_type="request_clarify", clarify_question="did the patient report any fever?"))
69
- check("clarify done=False", float(dc), 0.0)
70
- _, rs, ds, _ = env.step(submit_ok)
71
- check("submit done=True ", float(ds), 1.0)
72
- assert 0.0 <= rs.value <= 1.0
73
- print(f" Final value: {rs.value}")
74
- print(f" Signals: { {k:v for k,v in rs.signals.items() if not k.startswith('_')} }")
75
- print("\nAll done.")