Aman Khare committed on
Commit
8b7bdb7
·
1 Parent(s): 3c850f8

Optimize codebase + add minimalist frontend

Browse files
environment/env.py CHANGED
@@ -1,18 +1,4 @@
1
- """ClinicalNoteScribeEnv — core environment loop.
2
-
3
- Implements the ``reset() → Observation``, ``step(Action) → (Observation, Reward, bool, dict)``,
4
- and ``state() → EnvironmentState`` interface required by the OpenEnv spec.
5
-
6
- Structured logging
7
- ------------------
8
- Every episode emits exactly three kinds of JSON log lines to **stdout**:
9
-
10
- - ``{"event": "START", "task_id": "...", "timestamp": ...}``
11
- - ``{"event": "STEP", "step": N, "action_type": "...", "reward": R}``
12
- - ``{"event": "END", "task_id": "...", "final_score": S}``
13
-
14
- The OpenEnv validator scrapes ``[START]``, ``[STEP]``, ``[END]`` keywords.
15
- """
16
 
17
  from __future__ import annotations
18
 
@@ -27,115 +13,55 @@
27
  from environment.tasks import GRADER_REGISTRY, TASK_REGISTRY
28
 
29
  logger = logging.getLogger("clinical_note_scribe")
 
30
 
31
 
32
- # ---------------------------------------------------------------------------
33
- # Helpers
34
- # ---------------------------------------------------------------------------
35
-
36
- def _load_transcript(transcript_path: str) -> str:
37
- """Load a transcript text file from *project-root-relative* path."""
38
- base = Path(__file__).resolve().parent.parent # clinical-note-scribe/
39
- full_path = base / transcript_path
40
- if full_path.exists():
41
- return full_path.read_text(encoding="utf-8")
42
- return f"[Transcript file not found: {transcript_path}]"
43
-
44
-
45
- def _log_event(event: str, **kwargs: Any) -> None:
46
- """Emit a structured JSON log line to stdout via the logger."""
47
- payload: dict[str, Any] = {"event": event, "timestamp": time.time()}
48
- payload.update(kwargs)
49
- logger.info(json.dumps(payload))
50
 
51
 
52
- def _soap_to_text(soap: SOAPNote) -> str:
53
- """Flatten a SOAPNote into a readable multi-line string."""
54
- return (
55
- f"S: {soap.subjective}\n"
56
- f"O: {soap.objective}\n"
57
- f"A: {soap.assessment}\n"
58
- f"P: {soap.plan}"
59
- )
60
-
61
 
62
- # ---------------------------------------------------------------------------
63
- # Main environment class
64
- # ---------------------------------------------------------------------------
 
65
 
66
- class ClinicalNoteScribeEnv:
67
- """Open-environment wrapper for the clinical note-scribe tasks.
68
-
69
- Lifecycle
70
- ---------
71
- 1. ``env.reset(task_id)`` → returns initial ``Observation``
72
- 2. ``env.step(action)`` → returns ``(Observation, Reward, done, info)``
73
- 3. ``env.state()`` → returns full ``EnvironmentState`` snapshot
74
-
75
- Parameters
76
- ----------
77
- clarify_answers_path:
78
- Project-root-relative path to the clarification lookup JSON.
79
- """
80
-
81
- def __init__(
82
- self,
83
- clarify_answers_path: str = "data/clarify_answers.json",
84
- ) -> None:
85
- self._clarify_answers: dict[str, str] = {}
86
- base = Path(__file__).resolve().parent.parent
87
- ca_path = base / clarify_answers_path
88
- if ca_path.exists():
89
- self._clarify_answers = json.loads(ca_path.read_text(encoding="utf-8"))
90
-
91
- # Episode state (initialised properly in reset())
92
  self._task: dict[str, Any] = {}
93
- self._task_id: str = ""
94
- self._transcript: str = ""
95
  self._patient_context: dict[str, Any] = {}
96
- self._max_steps: int = 10
97
- self._step_count: int = 0
98
- self._done: bool = True
99
  self._current_draft: str | None = None
100
  self._errors_so_far: list[str] = []
101
  self._last_reward: Reward | None = None
102
  self._last_observation: Observation | None = None
103
 
104
- # --------------------------------------------------------------------- #
 
 
 
 
 
 
 
 
 
105
  # Public API
106
- # --------------------------------------------------------------------- #
107
 
108
  def reset(self, task_id: str | None = None) -> Observation:
109
- """Start (or restart) an episode for the given *task_id*.
110
-
111
- Parameters
112
- ----------
113
- task_id:
114
- One of the keys in ``TASK_REGISTRY``. When ``None`` the first
115
- registered task is used.
116
-
117
- Returns
118
- -------
119
- Observation
120
- The initial observation for the episode.
121
-
122
- Raises
123
- ------
124
- ValueError
125
- If *task_id* is not found in the registry.
126
- """
127
- if task_id is None:
128
- task_id = next(iter(TASK_REGISTRY))
129
-
130
  if task_id not in TASK_REGISTRY:
131
- available = ", ".join(TASK_REGISTRY.keys())
132
- raise ValueError(
133
- f"Unknown task_id '{task_id}'. Available: {available}"
134
- )
135
 
136
  self._task = TASK_REGISTRY[task_id]
137
  self._task_id = task_id
138
- self._transcript = _load_transcript(self._task["transcript_file"])
 
139
  self._patient_context = self._task.get("patient_context", {})
140
  self._max_steps = self._task.get("max_steps", 10)
141
  self._step_count = 0
@@ -144,254 +70,118 @@ def reset(self, task_id: str | None = None) -> Observation:
144
  self._errors_so_far = []
145
  self._last_reward = None
146
 
147
- _log_event("START", task_id=self._task_id)
148
-
149
- obs = self._build_observation()
150
- self._last_observation = obs
151
- return obs
152
 
153
  def step(self, action: Action) -> tuple[Observation, Reward, bool, dict]:
154
- """Execute one agent action and return the resulting observation, reward,
155
- done flag, and info dict.
156
-
157
- Parameters
158
- ----------
159
- action:
160
- The agent's chosen action.
161
-
162
- Returns
163
- -------
164
- tuple[Observation, Reward, bool, dict]
165
- """
166
  if self._done:
167
- raise RuntimeError(
168
- "Episode is done. Call reset() before stepping again."
169
- )
170
 
171
  self._step_count += 1
172
  info: dict[str, Any] = {}
173
 
174
- # ---- dispatch by action type ----
175
- if action.action_type == "submit_note":
176
- reward = self._handle_submit(action, info)
177
- elif action.action_type == "request_clarify":
178
- reward = self._handle_clarify(action, info)
179
- elif action.action_type == "revise_section":
180
- reward = self._handle_revise(action, info)
 
 
181
  else:
182
- # Should never happen thanks to the Literal type, but be safe
183
- self._errors_so_far.append(
184
- f"Unknown action_type: {action.action_type}"
185
- )
186
- reward = compute_reward(
187
- action,
188
- grader_score=0.0,
189
- step_count=self._step_count,
190
- errors_so_far=self._errors_so_far,
191
- done=False,
192
- info={"error": "bad_action"},
193
- )
194
-
195
- # ---- enforce max-step termination ----
196
  if self._step_count >= self._max_steps and not self._done:
197
  self._done = True
198
- reward = Reward(
199
- value=reward.value,
200
- signals=reward.signals,
201
- done=True,
202
- info={**reward.info, "termination_reason": "max_steps_reached"},
203
- )
204
 
205
  self._last_reward = reward
206
-
207
- _log_event(
208
- "STEP",
209
- step=self._step_count,
210
- action_type=action.action_type,
211
- reward=reward.value,
212
- )
213
-
214
  if self._done:
215
- _log_event(
216
- "END",
217
- task_id=self._task_id,
218
- final_score=reward.value,
219
- )
220
 
221
- obs = self._build_observation()
222
- self._last_observation = obs
223
- return obs, reward, self._done, info
224
 
225
  def state(self) -> EnvironmentState:
226
- """Return the full internal state snapshot."""
227
  return EnvironmentState(
228
- task_id=self._task_id,
229
- step_count=self._step_count,
230
- max_steps=self._max_steps,
231
- done=self._done,
232
- current_draft=self._current_draft,
233
  errors_so_far=list(self._errors_so_far),
234
- last_reward=self._last_reward,
235
- observation=self._last_observation,
236
  )
237
 
238
- # --------------------------------------------------------------------- #
239
  # Action handlers
240
- # --------------------------------------------------------------------- #
241
-
242
- def _handle_submit(self, action: Action, info: dict) -> Reward:
243
- """Process a ``submit_note`` action.
244
 
245
- If ``action.soap_note`` is provided, it is used directly.
246
- Otherwise, if the agent has built up a draft via ``revise_section``,
247
- the draft is parsed into a SOAPNote automatically.
248
- """
249
  soap = action.soap_note
250
 
251
- # Fall back to the current draft if no explicit note is provided
252
  if soap is None and self._current_draft:
253
- sections: dict[str, str] = {}
254
  for line in self._current_draft.split("\n"):
255
- for prefix in ("S: ", "O: ", "A: ", "P: "):
256
- if line.startswith(prefix):
257
- sections[prefix[0]] = line[len(prefix):]
258
- if all(k in sections for k in "SOAP"):
259
- soap = SOAPNote(
260
- subjective=sections["S"],
261
- objective=sections["O"],
262
- assessment=sections["A"],
263
- plan=sections["P"],
264
- )
265
 
266
  if soap is None:
267
- error = "submit_note requires a non-null soap_note (or a complete draft from revise_section)."
268
- self._errors_so_far.append(error)
269
- return compute_reward(
270
- action,
271
- grader_score=0.0,
272
- step_count=self._step_count,
273
- errors_so_far=self._errors_so_far,
274
- done=False,
275
- info={"error": error},
276
- )
277
-
278
- self._current_draft = _soap_to_text(soap)
279
  self._done = True
280
 
281
- # Attempt to grade via the task-specific grader
282
  grader = GRADER_REGISTRY.get(self._task_id)
283
- if grader is None:
284
  info["warning"] = "No grader registered; returning default reward."
285
- return compute_reward(
286
- action,
287
- grader_score=0.5,
288
- step_count=self._step_count,
289
- errors_so_far=self._errors_so_far,
290
- done=True,
291
- info=info,
292
- )
293
 
294
  try:
295
- raw_signals = grader(soap, self._task)
296
- # Grader returns a signals dict; extract a single scalar score
297
- # as the mean of its values for use as grader_score.
298
- grader_score = (
299
- sum(raw_signals.values()) / len(raw_signals)
300
- if raw_signals else 0.0
301
- )
302
- info["grader_signals"] = raw_signals
303
  except Exception as exc:
304
  info["warning"] = f"Grader error: {exc}"
305
- grader_score = 0.0
306
 
307
- return compute_reward(
308
- action,
309
- grader_score=grader_score,
310
- step_count=self._step_count,
311
- errors_so_far=self._errors_so_far,
312
- done=True,
313
- info=info,
314
- )
315
 
316
- def _handle_clarify(self, action: Action, info: dict) -> Reward:
317
- """Process a ``request_clarify`` action."""
318
- question = (action.clarify_question or "").strip()
319
- if not question:
320
- error = "request_clarify requires a non-empty clarify_question."
321
- self._errors_so_far.append(error)
322
- return Reward(
323
- value=0.0,
324
- signals={"error": 1.0},
325
- done=False,
326
- info={"error": error},
327
- )
328
-
329
- # Lookup a canned answer (case-insensitive key match)
330
- answer = self._clarify_answers.get(question.lower())
331
- if answer:
332
- info["clarify_answer"] = answer
333
- else:
334
- info["clarify_answer"] = (
335
- "No additional information available for that question."
336
- )
337
-
338
- # Intermediate actions get zero reward — only submit_note earns score
339
- return Reward(
340
- value=0.0,
341
- signals={"intermediate_step": 1.0},
342
- done=False,
343
- info=info,
344
- )
345
 
346
- def _handle_revise(self, action: Action, info: dict) -> Reward:
347
- """Process a ``revise_section`` action."""
348
  if action.section is None or action.revision_text is None:
349
- error = "revise_section requires both 'section' and 'revision_text'."
350
- self._errors_so_far.append(error)
351
- return Reward(
352
- value=0.0,
353
- signals={"error": 1.0},
354
- done=False,
355
- info={"error": error},
356
- )
357
-
358
- # If there is an existing draft, patch the requested section
359
  if self._current_draft:
360
  lines = self._current_draft.split("\n")
361
- prefix = f"{action.section}: "
362
- patched = False
363
  for i, line in enumerate(lines):
364
  if line.startswith(prefix):
365
- lines[i] = f"{prefix}{action.revision_text}"
366
- patched = True
367
  break
368
- if patched:
369
- self._current_draft = "\n".join(lines)
370
  else:
371
  self._current_draft += f"\n{prefix}{action.revision_text}"
372
  else:
373
- self._current_draft = f"{action.section}: {action.revision_text}"
374
 
375
  info["revised_section"] = action.section
376
-
377
- # Intermediate actions get zero reward — only submit_note earns score
378
- return Reward(
379
- value=0.0,
380
- signals={"intermediate_step": 1.0},
381
- done=False,
382
- info=info,
383
- )
384
-
385
- # --------------------------------------------------------------------- #
386
- # Internal helpers
387
- # --------------------------------------------------------------------- #
388
-
389
- def _build_observation(self) -> Observation:
390
- return Observation(
391
- transcript=self._transcript,
392
- task_id=self._task_id,
393
- patient_context=self._patient_context,
394
- current_draft=self._current_draft,
395
- errors_so_far=list(self._errors_so_far),
396
- step_count=self._step_count,
397
- )
 
1
+ """ClinicalNoteScribeEnv — core environment implementing reset/step/state."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
13
  from environment.tasks import GRADER_REGISTRY, TASK_REGISTRY
14
 
15
  logger = logging.getLogger("clinical_note_scribe")
16
+ _ROOT = Path(__file__).resolve().parent.parent
17
 
18
 
19
+ def _log(event: str, **kw: Any) -> None:
20
+ logger.info(json.dumps({"event": event, "timestamp": time.time(), **kw}))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
+ class ClinicalNoteScribeEnv:
24
+ """OpenEnv-compliant environment for clinical SOAP-note generation."""
 
 
 
 
 
 
 
25
 
26
+ def __init__(self, clarify_answers_path: str = "data/clarify_answers.json") -> None:
27
+ ca = _ROOT / clarify_answers_path
28
+ self._clarify_answers: dict[str, str] = json.loads(ca.read_text()) if ca.exists() else {}
29
+ self._reset_state()
30
 
31
+ def _reset_state(self) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  self._task: dict[str, Any] = {}
33
+ self._task_id = ""
34
+ self._transcript = ""
35
  self._patient_context: dict[str, Any] = {}
36
+ self._max_steps = 10
37
+ self._step_count = 0
38
+ self._done = True
39
  self._current_draft: str | None = None
40
  self._errors_so_far: list[str] = []
41
  self._last_reward: Reward | None = None
42
  self._last_observation: Observation | None = None
43
 
44
+ def _obs(self) -> Observation:
45
+ return Observation(
46
+ transcript=self._transcript,
47
+ task_id=self._task_id,
48
+ patient_context=self._patient_context,
49
+ current_draft=self._current_draft,
50
+ errors_so_far=list(self._errors_so_far),
51
+ step_count=self._step_count,
52
+ )
53
+
54
  # Public API
 
55
 
56
  def reset(self, task_id: str | None = None) -> Observation:
57
+ task_id = task_id or next(iter(TASK_REGISTRY))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  if task_id not in TASK_REGISTRY:
59
+ raise ValueError(f"Unknown task_id '{task_id}'. Available: {', '.join(TASK_REGISTRY)}")
 
 
 
60
 
61
  self._task = TASK_REGISTRY[task_id]
62
  self._task_id = task_id
63
+ path = _ROOT / self._task["transcript_file"]
64
+ self._transcript = path.read_text(encoding="utf-8") if path.exists() else f"[Not found: {path}]"
65
  self._patient_context = self._task.get("patient_context", {})
66
  self._max_steps = self._task.get("max_steps", 10)
67
  self._step_count = 0
 
70
  self._errors_so_far = []
71
  self._last_reward = None
72
 
73
+ _log("START", task_id=task_id)
74
+ self._last_observation = self._obs()
75
+ return self._last_observation
 
 
76
 
77
  def step(self, action: Action) -> tuple[Observation, Reward, bool, dict]:
 
 
 
 
 
 
 
 
 
 
 
 
78
  if self._done:
79
+ raise RuntimeError("Episode is done. Call reset() before stepping again.")
 
 
80
 
81
  self._step_count += 1
82
  info: dict[str, Any] = {}
83
 
84
+ # Dispatch
85
+ handler = {
86
+ "submit_note": self._submit,
87
+ "request_clarify": self._clarify,
88
+ "revise_section": self._revise,
89
+ }.get(action.action_type)
90
+
91
+ if handler:
92
+ reward = handler(action, info)
93
  else:
94
+ self._errors_so_far.append(f"Unknown action_type: {action.action_type}")
95
+ reward = compute_reward(action, 0.0, self._step_count, self._errors_so_far, done=False, info={"error": "bad_action"})
96
+
97
+ # Max-step termination
 
 
 
 
 
 
 
 
 
 
98
  if self._step_count >= self._max_steps and not self._done:
99
  self._done = True
100
+ reward = Reward(value=reward.value, signals=reward.signals, done=True,
101
+ info={**reward.info, "termination_reason": "max_steps_reached"})
 
 
 
 
102
 
103
  self._last_reward = reward
104
+ _log("STEP", step=self._step_count, action_type=action.action_type, reward=reward.value)
 
 
 
 
 
 
 
105
  if self._done:
106
+ _log("END", task_id=self._task_id, final_score=reward.value)
 
 
 
 
107
 
108
+ self._last_observation = self._obs()
109
+ return self._last_observation, reward, self._done, info
 
110
 
111
  def state(self) -> EnvironmentState:
 
112
  return EnvironmentState(
113
+ task_id=self._task_id, step_count=self._step_count, max_steps=self._max_steps,
114
+ done=self._done, current_draft=self._current_draft,
 
 
 
115
  errors_so_far=list(self._errors_so_far),
116
+ last_reward=self._last_reward, observation=self._last_observation,
 
117
  )
118
 
 
119
  # Action handlers
 
 
 
 
120
 
121
+ def _submit(self, action: Action, info: dict) -> Reward:
 
 
 
122
  soap = action.soap_note
123
 
124
+ # Fall back to draft
125
  if soap is None and self._current_draft:
126
+ secs = {}
127
  for line in self._current_draft.split("\n"):
128
+ for p in ("S: ", "O: ", "A: ", "P: "):
129
+ if line.startswith(p):
130
+ secs[p[0]] = line[len(p):]
131
+ if all(k in secs for k in "SOAP"):
132
+ soap = SOAPNote(subjective=secs["S"], objective=secs["O"], assessment=secs["A"], plan=secs["P"])
 
 
 
 
 
133
 
134
  if soap is None:
135
+ err = "submit_note requires a non-null soap_note (or a complete draft from revise_section)."
136
+ self._errors_so_far.append(err)
137
+ return compute_reward(action, 0.0, self._step_count, self._errors_so_far, done=False, info={"error": err})
138
+
139
+ self._current_draft = f"S: {soap.subjective}\nO: {soap.objective}\nA: {soap.assessment}\nP: {soap.plan}"
 
 
 
 
 
 
 
140
  self._done = True
141
 
 
142
  grader = GRADER_REGISTRY.get(self._task_id)
143
+ if not grader:
144
  info["warning"] = "No grader registered; returning default reward."
145
+ return compute_reward(action, 0.5, self._step_count, self._errors_so_far, done=True, info=info)
 
 
 
 
 
 
 
146
 
147
  try:
148
+ signals = grader(soap, self._task)
149
+ score = sum(signals.values()) / len(signals) if signals else 0.0
150
+ info["grader_signals"] = signals
 
 
 
 
 
151
  except Exception as exc:
152
  info["warning"] = f"Grader error: {exc}"
153
+ score = 0.0
154
 
155
+ return compute_reward(action, score, self._step_count, self._errors_so_far, done=True, info=info)
 
 
 
 
 
 
 
156
 
157
+ def _clarify(self, action: Action, info: dict) -> Reward:
158
+ q = (action.clarify_question or "").strip()
159
+ if not q:
160
+ err = "request_clarify requires a non-empty clarify_question."
161
+ self._errors_so_far.append(err)
162
+ return Reward(value=0.0, signals={"error": 1.0}, done=False, info={"error": err})
163
+
164
+ info["clarify_answer"] = self._clarify_answers.get(q.lower(), "No additional information available for that question.")
165
+ return Reward(value=0.0, signals={"intermediate_step": 1.0}, done=False, info=info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
+ def _revise(self, action: Action, info: dict) -> Reward:
 
168
  if action.section is None or action.revision_text is None:
169
+ err = "revise_section requires both 'section' and 'revision_text'."
170
+ self._errors_so_far.append(err)
171
+ return Reward(value=0.0, signals={"error": 1.0}, done=False, info={"error": err})
172
+
173
+ prefix = f"{action.section}: "
 
 
 
 
 
174
  if self._current_draft:
175
  lines = self._current_draft.split("\n")
 
 
176
  for i, line in enumerate(lines):
177
  if line.startswith(prefix):
178
+ lines[i] = prefix + action.revision_text
179
+ self._current_draft = "\n".join(lines)
180
  break
 
 
181
  else:
182
  self._current_draft += f"\n{prefix}{action.revision_text}"
183
  else:
184
+ self._current_draft = prefix + action.revision_text
185
 
186
  info["revised_section"] = action.section
187
+ return Reward(value=0.0, signals={"intermediate_step": 1.0}, done=False, info=info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
environment/models.py CHANGED
@@ -1,175 +1,47 @@
1
- """Pydantic v2 models for the Clinical Note Scribe environment.
2
-
3
- Defines the typed contracts for observations, actions, rewards,
4
- and overall environment state used by the OpenEnv spec.
5
- """
6
 
7
  from __future__ import annotations
8
-
9
  from typing import Any, Literal, Optional
10
-
11
  from pydantic import BaseModel, Field
12
 
13
 
14
- # ---------------------------------------------------------------------------
15
- # Observation — what the agent sees after each step
16
- # ---------------------------------------------------------------------------
17
-
18
  class Observation(BaseModel):
19
- """Snapshot of the environment returned to the agent."""
 
 
 
 
 
20
 
21
- transcript: str = Field(
22
- ...,
23
- description="Full doctor–patient transcript for the current task.",
24
- )
25
- task_id: str = Field(
26
- ...,
27
- description="Unique identifier for the task (e.g. 'easy_routine_checkup').",
28
- )
29
- patient_context: dict[str, Any] = Field(
30
- default_factory=dict,
31
- description="Structured patient demographics and history.",
32
- )
33
- current_draft: Optional[str] = Field(
34
- default=None,
35
- description="The agent's most recent SOAP-note draft, if any.",
36
- )
37
- errors_so_far: list[str] = Field(
38
- default_factory=list,
39
- description="Accumulated error/feedback messages from prior steps.",
40
- )
41
- step_count: int = Field(
42
- default=0,
43
- ge=0,
44
- description="Number of steps taken in the current episode.",
45
- )
46
-
47
-
48
- # ---------------------------------------------------------------------------
49
- # SOAPNote — structured clinical note
50
- # ---------------------------------------------------------------------------
51
 
52
  class SOAPNote(BaseModel):
53
- """Standard SOAP clinical-note format."""
54
-
55
- subjective: str = Field(
56
- ...,
57
- description="Patient's self-reported symptoms and history.",
58
- )
59
- objective: str = Field(
60
- ...,
61
- description="Clinician's measurable findings (vitals, exam, labs).",
62
- )
63
- assessment: str = Field(
64
- ...,
65
- description="Clinician's diagnosis or differential.",
66
- )
67
- plan: str = Field(
68
- ...,
69
- description="Treatment plan, follow-ups, and prescriptions.",
70
- )
71
-
72
 
73
- # ---------------------------------------------------------------------------
74
- # Action — what the agent can do
75
- # ---------------------------------------------------------------------------
76
 
77
  class Action(BaseModel):
78
- """An action the agent submits to the environment."""
 
 
 
 
79
 
80
- action_type: Literal["submit_note", "request_clarify", "revise_section"] = Field(
81
- ...,
82
- description="The kind of action the agent is taking.",
83
- )
84
-
85
- # --- submit_note fields ---
86
- soap_note: Optional[SOAPNote] = Field(
87
- default=None,
88
- description="Complete SOAP note (required when action_type == 'submit_note').",
89
- )
90
-
91
- # --- revise_section fields ---
92
- section: Optional[Literal["S", "O", "A", "P"]] = Field(
93
- default=None,
94
- description="Which SOAP section to revise (required when action_type == 'revise_section').",
95
- )
96
- revision_text: Optional[str] = Field(
97
- default=None,
98
- description="Replacement text for the specified section.",
99
- )
100
-
101
- # --- request_clarify fields ---
102
- clarify_question: Optional[str] = Field(
103
- default=None,
104
- description="Free-text question the agent asks for clarification.",
105
- )
106
-
107
-
108
- # ---------------------------------------------------------------------------
109
- # Reward — multi-signal feedback
110
- # ---------------------------------------------------------------------------
111
 
112
  class Reward(BaseModel):
113
- """Reward returned after each step."""
114
-
115
- value: float = Field(
116
- ...,
117
- ge=0.0,
118
- le=1.0,
119
- description="Aggregate reward in the range [0.0, 1.0].",
120
- )
121
- signals: dict[str, float] = Field(
122
- default_factory=dict,
123
- description="Breakdown of individual reward sub-signals.",
124
- )
125
- done: bool = Field(
126
- ...,
127
- description="Whether the episode has ended.",
128
- )
129
- info: dict[str, Any] = Field(
130
- default_factory=dict,
131
- description="Auxiliary metadata (e.g. grader diagnostics).",
132
- )
133
 
134
 
135
- # ---------------------------------------------------------------------------
136
- # EnvironmentState — full internal state exposed by state()
137
- # ---------------------------------------------------------------------------
138
-
139
  class EnvironmentState(BaseModel):
140
- """Complete snapshot of the environment's internal state."""
141
-
142
- task_id: str = Field(
143
- ...,
144
- description="Active task identifier.",
145
- )
146
- step_count: int = Field(
147
- default=0,
148
- ge=0,
149
- description="Steps taken so far in this episode.",
150
- )
151
- max_steps: int = Field(
152
- default=10,
153
- ge=1,
154
- description="Maximum steps allowed per episode.",
155
- )
156
- done: bool = Field(
157
- default=False,
158
- description="Whether the current episode has terminated.",
159
- )
160
- current_draft: Optional[str] = Field(
161
- default=None,
162
- description="Latest SOAP-note draft text, if any.",
163
- )
164
- errors_so_far: list[str] = Field(
165
- default_factory=list,
166
- description="Accumulated feedback/error messages.",
167
- )
168
- last_reward: Optional[Reward] = Field(
169
- default=None,
170
- description="Most recent reward object, if a step has been taken.",
171
- )
172
- observation: Optional[Observation] = Field(
173
- default=None,
174
- description="Most recent observation returned to the agent.",
175
- )
 
1
+ """Pydantic v2 models for the Clinical Note Scribe environment."""
 
 
 
 
2
 
3
  from __future__ import annotations
 
4
  from typing import Any, Literal, Optional
 
5
  from pydantic import BaseModel, Field
6
 
7
 
 
 
 
 
8
  class Observation(BaseModel):
9
+ transcript: str = Field(..., description="Full doctor–patient transcript.")
10
+ task_id: str = Field(..., description="Unique task identifier.")
11
+ patient_context: dict[str, Any] = Field(default_factory=dict, description="Patient demographics and history.")
12
+ current_draft: Optional[str] = Field(None, description="Most recent SOAP-note draft.")
13
+ errors_so_far: list[str] = Field(default_factory=list, description="Accumulated error messages.")
14
+ step_count: int = Field(0, ge=0, description="Steps taken in the current episode.")
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  class SOAPNote(BaseModel):
18
+ subjective: str = Field(..., description="Patient's self-reported symptoms and history.")
19
+ objective: str = Field(..., description="Clinician's measurable findings.")
20
+ assessment: str = Field(..., description="Clinician's diagnosis or differential.")
21
+ plan: str = Field(..., description="Treatment plan, follow-ups, and prescriptions.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
 
 
 
23
 
24
  class Action(BaseModel):
25
+ action_type: Literal["submit_note", "request_clarify", "revise_section"] = Field(..., description="Action kind.")
26
+ soap_note: Optional[SOAPNote] = Field(None, description="SOAP note (required for submit_note).")
27
+ section: Optional[Literal["S", "O", "A", "P"]] = Field(None, description="Section to revise.")
28
+ revision_text: Optional[str] = Field(None, description="Replacement text for the section.")
29
+ clarify_question: Optional[str] = Field(None, description="Clarification question.")
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  class Reward(BaseModel):
33
+ value: float = Field(..., ge=0.0, le=1.0, description="Aggregate reward [0, 1].")
34
+ signals: dict[str, float] = Field(default_factory=dict, description="Reward sub-signals.")
35
+ done: bool = Field(..., description="Whether the episode ended.")
36
+ info: dict[str, Any] = Field(default_factory=dict, description="Auxiliary metadata.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
 
 
 
 
 
39
  class EnvironmentState(BaseModel):
40
+ task_id: str = Field(..., description="Active task identifier.")
41
+ step_count: int = Field(0, ge=0, description="Steps taken so far.")
42
+ max_steps: int = Field(10, ge=1, description="Max steps per episode.")
43
+ done: bool = Field(False, description="Whether the episode terminated.")
44
+ current_draft: Optional[str] = Field(None, description="Latest SOAP draft text.")
45
+ errors_so_far: list[str] = Field(default_factory=list, description="Error messages.")
46
+ last_reward: Optional[Reward] = Field(None, description="Most recent reward.")
47
+ observation: Optional[Observation] = Field(None, description="Most recent observation.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
environment/reward.py CHANGED
@@ -1,18 +1,4 @@
1
- """Multi-signal reward computation for the Clinical Note Scribe environment.
2
-
3
- Reward formula (all weights sum to 1.0 before penalties):
4
-
5
- weighted_sum = grader_score × 0.60 (clinical accuracy from task grader)
6
- + conciseness_bonus × 0.10 (1.0 if note ≤ 400 words, else 0.0)
7
- + safe_language_score× 0.15 (1.0 if no unsafe-certainty phrases)
8
- + format_valid × 0.15 (1.0 if SOAP JSON is well-formed)
9
-
10
- Deductions (applied after weighted sum):
11
- - 0.05 × max(0, step_count - 3) (penalty for excessive clarification steps)
12
- - 0.10 × len(errors_so_far) (penalty for each invalid action)
13
-
14
- Final value is clamped to [0.0, 1.0].
15
- """
16
 
17
  from __future__ import annotations
18
 
@@ -21,99 +7,38 @@
21
 
22
  from environment.models import Action, Reward, SOAPNote
23
 
 
 
24
 
25
- # ---------------------------------------------------------------------------
26
- # Weights
27
- # ---------------------------------------------------------------------------
28
-
29
- W_GRADER = 0.60
30
- W_CONCISE = 0.10
31
- W_SAFE_LANG = 0.15
32
- W_FORMAT = 0.15
33
-
34
- # Deduction constants
35
- STEP_PENALTY_RATE = 0.05 # per step beyond FREE_STEPS
36
- FREE_STEPS = 3
37
- ERROR_PENALTY_RATE = 0.10 # per item in errors_so_far
38
-
39
- # Conciseness threshold
40
  WORD_LIMIT = 400
41
 
42
- # Phrases that indicate unsafe clinical certainty
43
- # (over-confident language that a scribe should avoid in a note)
44
- _UNSAFE_PATTERNS: list[re.Pattern[str]] = [
45
- re.compile(p, re.IGNORECASE)
46
- for p in [
47
- r"\bpatient definitely has\b",
48
- r"\bdiagnosis is certain\b",
49
- r"\bno doubt\b",
50
- r"\babsolutely confirmed\b",
51
- r"\b100%\s+certain\b",
52
- r"\bwill definitely\b",
53
- r"\bguaranteed to\b",
54
- r"\bcannot be\s+\w+\s+else\b",
55
- r"\bwithout question\b",
56
- r"\bthis is clearly\b",
57
- ]
58
- ]
59
-
60
-
61
- # ---------------------------------------------------------------------------
62
- # Sub-signal helpers
63
- # ---------------------------------------------------------------------------
64
-
65
- def _conciseness_bonus(soap_note: Optional[SOAPNote]) -> float:
66
- """Return 1.0 if the total SOAP note word count is at or below WORD_LIMIT."""
67
- if soap_note is None:
68
- return 0.0
69
- text = " ".join([
70
- soap_note.subjective,
71
- soap_note.objective,
72
- soap_note.assessment,
73
- soap_note.plan,
74
- ])
75
- word_count = len(text.split())
76
- return 1.0 if word_count <= WORD_LIMIT else 0.0
77
-
78
-
79
- def _safe_language_score(soap_note: Optional[SOAPNote]) -> float:
80
- """Return 1.0 if no unsafe-certainty phrases are found in the SOAP note."""
81
- if soap_note is None:
82
- return 1.0 # no note submitted → no unsafe language
83
- text = " ".join([
84
- soap_note.subjective,
85
- soap_note.objective,
86
- soap_note.assessment,
87
- soap_note.plan,
88
- ])
89
- for pattern in _UNSAFE_PATTERNS:
90
- if pattern.search(text):
91
- return 0.0
92
- return 1.0
93
 
94
 
95
- def _format_valid(action: Action) -> float:
96
- """Return 1.0 if the submitted note has all required non-empty SOAP fields.
97
-
98
- This acts as a lightweight structural / «JSON well-formed» check:
99
- each of S, O, A, P must be a non-empty string, and the action_type
100
- must be ``submit_note``.
101
- """
102
- if action.action_type != "submit_note":
103
- return 1.0 # non-submission actions are not graded on format
104
- if action.soap_note is None:
105
- return 0.0
106
- soap = action.soap_note
107
- fields = [soap.subjective, soap.objective, soap.assessment, soap.plan]
108
- if all(isinstance(f, str) and f.strip() for f in fields):
109
- return 1.0
110
- return 0.0
111
-
112
-
113
- # ---------------------------------------------------------------------------
114
- # Public API
115
- # ---------------------------------------------------------------------------
116
-
117
  def compute_reward(
118
  action: Action,
119
  grader_score: float,
@@ -123,75 +48,54 @@ def compute_reward(
123
  done: bool = False,
124
  info: Optional[dict[str, Any]] = None,
125
  ) -> Reward:
126
- """Compute the multi-signal reward for a completed step.
127
-
128
- Parameters
129
- ----------
130
- action:
131
- The action that was just executed.
132
- grader_score:
133
- Clinical-accuracy score returned by the task-specific grader (0.0–1.0).
134
- Use 0.0 for non-submission actions.
135
- step_count:
136
- Total number of steps taken so far in the episode (including this one).
137
- errors_so_far:
138
- List of error messages accumulated during the episode.
139
- done:
140
- Whether the episode ended with this step.
141
- info:
142
- Optional auxiliary metadata dict to include in the Reward.
143
-
144
- Returns
145
- -------
146
- Reward
147
- Fully populated Reward with ``value`` and ``signals`` breakdown.
148
- """
149
  grader_score = max(0.0, min(1.0, grader_score))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
- # ---- per-signal scores ----
152
- conciseness = _conciseness_bonus(action.soap_note)
153
- safe_lang = _safe_language_score(action.soap_note)
154
- fmt = _format_valid(action)
155
-
156
- # ---- weighted sum ----
157
- weighted = (
158
  grader_score * W_GRADER
159
- + conciseness * W_CONCISE
160
- + safe_lang * W_SAFE_LANG
161
- + fmt * W_FORMAT
 
 
162
  )
163
-
164
- # ---- deductions ----
165
- extra_steps = max(0, step_count - FREE_STEPS)
166
- step_penalty = extra_steps * STEP_PENALTY_RATE
167
- error_penalty = len(errors_so_far) * ERROR_PENALTY_RATE
168
-
169
- raw = weighted - step_penalty - error_penalty
170
-
171
- # ---- clamp ----
172
- value = max(0.01, min(0.99, raw))
173
-
174
- signals: dict[str, float] = {
175
- # positive contributions
176
- "grader_score": round(grader_score * W_GRADER, 4),
177
- "conciseness_bonus": round(conciseness * W_CONCISE, 4),
178
- "safe_language_score": round(safe_lang * W_SAFE_LANG, 4),
179
- "format_valid": round(fmt * W_FORMAT, 4),
180
- # deductions (stored as negative numbers for clarity)
181
- "step_penalty": round(-step_penalty, 4),
182
- "error_penalty": round(-error_penalty, 4),
183
- # raw sub-signal values (unweighted, for introspection)
184
- "_grader_score_raw": round(grader_score, 4),
185
- "_conciseness_raw": round(conciseness, 4),
186
- "_safe_language_raw": round(safe_lang, 4),
187
- "_format_valid_raw": round(fmt, 4),
188
- "_extra_steps": float(extra_steps),
189
- "_error_count": float(len(errors_so_far)),
190
- }
191
 
192
  return Reward(
193
- value=round(value, 4),
194
- signals=signals,
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  done=done,
196
  info=info or {},
197
  )
 
1
+ """Multi-signal reward computation for the Clinical Note Scribe environment."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
7
 
8
  from environment.models import Action, Reward, SOAPNote
9
 
10
# --- Reward weights -------------------------------------------------------
# The four positive sub-signals are weighted so a perfect note scores 1.0
# before any deductions are applied.
W_GRADER = 0.60
W_CONCISE = 0.10
W_SAFE_LANG = 0.15
W_FORMAT = 0.15

# --- Deductions -----------------------------------------------------------
STEP_PENALTY_RATE = 0.05   # charged per step beyond FREE_STEPS
FREE_STEPS = 3             # steps allowed before the step penalty starts
ERROR_PENALTY_RATE = 0.10  # charged per accumulated episode error

# Conciseness threshold: the bonus requires the whole SOAP note to fit
# within this many words.
WORD_LIMIT = 400

# Phrases expressing unsafe clinical certainty. Kept as a tuple so new
# phrases can be appended without touching the compiled pattern below.
_UNSAFE_PHRASES = (
    r"\bpatient definitely has\b",
    r"\bdiagnosis is certain\b",
    r"\bno doubt\b",
    r"\babsolutely confirmed\b",
    r"\b100%\s+certain\b",
    r"\bwill definitely\b",
    r"\bguaranteed to\b",
    r"\bcannot be\s+\w+\s+else\b",
    r"\bwithout question\b",
    r"\bthis is clearly\b",
)

# Single pre-compiled, case-insensitive alternation over all unsafe phrases.
_UNSAFE_RE = re.compile("|".join(_UNSAFE_PHRASES), re.IGNORECASE)


def _soap_text(soap: Optional[SOAPNote]) -> Optional[str]:
    """Join the four SOAP fields into one space-separated string.

    Returns ``None`` when no note was provided so callers can tell
    "no note submitted" apart from "note with empty fields".
    """
    if soap is None:
        return None
    return " ".join((soap.subjective, soap.objective, soap.assessment, soap.plan))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
def compute_reward(
    action: Action,
    grader_score: float,
    step_count: int,
    errors_so_far: list[str],
    done: bool = False,
    info: Optional[dict[str, Any]] = None,
) -> Reward:
    """Compute the multi-signal reward for a completed step.

    Combines the task grader's clinical-accuracy score with conciseness,
    safe-language, and format sub-signals, then subtracts step and error
    penalties. The final value is clamped into (0.01, 0.99) so neither
    endpoint is ever reported.

    Parameters
    ----------
    action:
        The action that was just executed.
    grader_score:
        Clinical-accuracy score in [0, 1]; use 0.0 for non-submissions.
    step_count:
        Total steps taken so far this episode, including this one.
    errors_so_far:
        Error messages accumulated during the episode.
    done:
        Whether the episode ended with this step.
    info:
        Optional auxiliary metadata dict to include in the Reward.
    """
    grader_score = max(0.0, min(1.0, grader_score))
    text = _soap_text(action.soap_note)

    # Conciseness bonus: only when a note exists and fits the word limit.
    # (Previously the else-branch was a redundant `0.0 if text else 0.0`.)
    conciseness = 1.0 if text is not None and len(text.split()) <= WORD_LIMIT else 0.0

    # Safe language: fails only when a note exists and an unsafe phrase matches;
    # "no note" trivially contains no unsafe language.
    safe_lang = 0.0 if text is not None and _UNSAFE_RE.search(text) else 1.0

    # Format check: submissions must carry a note whose four SOAP fields are
    # all non-empty strings; non-submission actions are not graded on format.
    if action.action_type != "submit_note":
        fmt = 1.0
    elif action.soap_note is None:
        fmt = 0.0
    else:
        fields = (
            action.soap_note.subjective,
            action.soap_note.objective,
            action.soap_note.assessment,
            action.soap_note.plan,
        )
        fmt = 1.0 if all(isinstance(f, str) and f.strip() for f in fields) else 0.0

    # Weighted sum minus deductions, clamped away from the exact endpoints.
    extra_steps = max(0, step_count - FREE_STEPS)
    step_penalty = extra_steps * STEP_PENALTY_RATE
    error_penalty = len(errors_so_far) * ERROR_PENALTY_RATE
    raw = (
        grader_score * W_GRADER
        + conciseness * W_CONCISE
        + safe_lang * W_SAFE_LANG
        + fmt * W_FORMAT
        - step_penalty
        - error_penalty
    )
    value = round(max(0.01, min(0.99, raw)), 4)

    return Reward(
        value=value,
        signals={
            # positive, weighted contributions
            "grader_score": round(grader_score * W_GRADER, 4),
            "conciseness_bonus": round(conciseness * W_CONCISE, 4),
            "safe_language_score": round(safe_lang * W_SAFE_LANG, 4),
            "format_valid": round(fmt * W_FORMAT, 4),
            # deductions (stored as negatives for clarity)
            "step_penalty": round(-step_penalty, 4),
            "error_penalty": round(-error_penalty, 4),
            # raw, unweighted sub-signals for introspection
            "_grader_score_raw": round(grader_score, 4),
            "_conciseness_raw": round(conciseness, 4),
            "_safe_language_raw": round(safe_lang, 4),
            "_format_valid_raw": round(fmt, 4),
            "_extra_steps": float(extra_steps),
            "_error_count": float(len(errors_so_far)),
        },
        done=done,
        info=info or {},
    )
environment/tasks/task_easy.py CHANGED
@@ -1,85 +1,37 @@
1
- """Easy task — routine check-up.
2
-
3
- Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
- against expected findings from a simple cold / blood pressure check visit.
5
- """
6
 
7
  from __future__ import annotations
8
-
9
  from typing import Any
10
-
11
  from environment.models import SOAPNote
12
 
13
-
14
- # ---------------------------------------------------------------------------
15
- # Task definition
16
- # ---------------------------------------------------------------------------
17
-
18
  EASY_TASK: dict[str, Any] = {
19
  "task_id": "easy_routine_checkup",
20
  "description": "Generate a SOAP note for a routine annual check-up visit.",
21
  "transcript_file": "data/transcripts/easy.txt",
22
  "patient_context": {
23
- "patient_id": "P-1001",
24
- "name": "Jane Doe",
25
- "age": 34,
26
- "sex": "F",
27
- "known_conditions": [],
28
- "current_medications": [],
29
- "allergies": ["Penicillin"],
30
  },
31
  "max_steps": 5,
32
  }
33
 
34
 
35
- # ---------------------------------------------------------------------------
36
- # Grader
37
- # ---------------------------------------------------------------------------
38
-
39
  def grade_easy(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
40
- """Score a submitted SOAP note against the easy-task rubric.
41
-
42
- Checks for mention of key clinical findings from the transcript:
43
- chief complaints, vitals, viral URI assessment, and supportive plan.
44
-
45
- Returns
46
- -------
47
- dict mapping signal names to float scores in [0, 1].
48
- """
49
- text_s = soap_note.subjective.lower()
50
- text_o = soap_note.objective.lower()
51
- text_a = soap_note.assessment.lower()
52
- text_p = soap_note.plan.lower()
53
-
54
- # 1. Subjective — chief complaints
55
- s_score = 0.0
56
- if "sore throat" in text_s or "runny nose" in text_s or "congestion" in text_s:
57
- s_score += 0.5
58
- if "5 days" in text_s or "five days" in text_s or "headache" in text_s:
59
- s_score += 0.5
60
-
61
- # 2. Objective — vitals
62
- o_score = 0.0
63
- if "118/76" in text_o or "118 over 76" in text_o or "blood pressure" in text_o:
64
- o_score += 0.5
65
- if "72" in text_o or "heart rate" in text_o or "lungs clear" in text_o:
66
- o_score += 0.5
67
-
68
- # 3. Assessment — viral URI
69
- a_score = 0.0
70
- if "viral" in text_a or "uri" in text_a or "upper respiratory" in text_a:
71
- a_score += 1.0
72
-
73
- # 4. Plan — supportive care
74
- p_score = 0.0
75
- if "fluids" in text_p or "rest" in text_p or "hydrat" in text_p:
76
- p_score += 0.5
77
- if "dayquil" in text_p or "follow" in text_p or "return" in text_p:
78
- p_score += 0.5
79
-
80
  return {
81
- "subjective_accuracy": max(0.01, min(s_score, 0.99)),
82
- "objective_accuracy": max(0.01, min(o_score, 0.99)),
83
- "assessment_accuracy": max(0.01, min(a_score, 0.99)),
84
- "plan_accuracy": max(0.01, min(p_score, 0.99)),
85
  }
 
1
+ """Easy task — routine check-up."""
 
 
 
 
2
 
3
  from __future__ import annotations
 
4
  from typing import Any
 
5
  from environment.models import SOAPNote
6
 
 
 
 
 
 
7
# Task definition: simple cold / blood-pressure check during an annual visit.
EASY_TASK: dict[str, Any] = {
    "task_id": "easy_routine_checkup",
    "description": "Generate a SOAP note for a routine annual check-up visit.",
    "transcript_file": "data/transcripts/easy.txt",
    "patient_context": {
        "patient_id": "P-1001",
        "name": "Jane Doe",
        "age": 34,
        "sex": "F",
        "known_conditions": [],
        "current_medications": [],
        "allergies": ["Penicillin"],
    },
    "max_steps": 5,
}
 
18
 
 
 
 
 
19
def grade_easy(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
    """Score a submitted SOAP note against the easy-task keyword rubric.

    Each SOAP section earns partial credit for mentioning expected findings
    from the transcript: chief complaints (S), vitals (O), the viral-URI
    assessment (A), and supportive care (P). Each section score is clamped
    into [0.01, 0.99].
    """
    def hits(text: str, keywords: tuple[str, ...]) -> float:
        # 0.5 partial credit when any keyword of the group appears.
        return 0.5 if any(k in text for k in keywords) else 0.0

    def clamp(score: float) -> float:
        return max(0.01, min(score, 0.99))

    s = soap_note.subjective.lower()
    o = soap_note.objective.lower()
    a = soap_note.assessment.lower()
    p = soap_note.plan.lower()

    # Subjective: chief complaints and duration/associated symptoms.
    s_score = (
        hits(s, ("sore throat", "runny nose", "congestion"))
        + hits(s, ("5 days", "five days", "headache"))
    )
    # Objective: blood pressure and heart-rate / lung findings.
    o_score = (
        hits(o, ("118/76", "118 over 76", "blood pressure"))
        + hits(o, ("72", "heart rate", "lungs clear"))
    )
    # Assessment: viral upper-respiratory infection (single full-credit group).
    a_score = 1.0 if any(k in a for k in ("viral", "uri", "upper respiratory")) else 0.0
    # Plan: supportive care and follow-up.
    p_score = (
        hits(p, ("fluids", "rest", "hydrat"))
        + hits(p, ("dayquil", "follow", "return"))
    )

    return {
        "subjective_accuracy": clamp(s_score),
        "objective_accuracy": clamp(o_score),
        "assessment_accuracy": clamp(a_score),
        "plan_accuracy": clamp(p_score),
    }
environment/tasks/task_hard.py CHANGED
@@ -1,118 +1,42 @@
1
- """Hard task — complex ER visit.
2
-
3
- Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
- against expected findings from a complex ER visit with overlapping chest pain,
5
- SOB, and a possible PE complicated by a contrast dye allergy.
6
- """
7
 
8
  from __future__ import annotations
9
-
10
  from typing import Any
11
-
12
  from environment.models import SOAPNote
13
 
14
-
15
- # ---------------------------------------------------------------------------
16
- # Task definition
17
- # ---------------------------------------------------------------------------
18
-
19
  HARD_TASK: dict[str, Any] = {
20
  "task_id": "hard_complex_er_visit",
21
- "description": (
22
- "Generate a SOAP note for a complex emergency-room visit involving "
23
- "chest pain, polytrauma assessment, and multiple co-morbidities."
24
- ),
25
  "transcript_file": "data/transcripts/hard.txt",
26
  "patient_context": {
27
- "patient_id": "P-3782",
28
- "name": "Maria Garcia",
29
- "age": 72,
30
- "sex": "F",
31
- "known_conditions": [
32
- "Coronary Artery Disease",
33
- "Atrial Fibrillation",
34
- "Chronic Kidney Disease Stage 3",
35
- "Osteoarthritis",
36
- ],
37
- "current_medications": [
38
- "Aspirin 81 mg daily",
39
- "Warfarin 5 mg daily",
40
- "Metoprolol 50 mg BID",
41
- "Furosemide 40 mg daily",
42
- "Amlodipine 5 mg daily",
43
- ],
44
  "allergies": ["Sulfa drugs", "Contrast dye"],
45
- "recent_labs": {
46
- "troponin_I": "0.08 ng/mL",
47
- "BNP": "450 pg/mL",
48
- "creatinine": "1.9 mg/dL",
49
- "eGFR": "34 mL/min",
50
- "INR": "2.6",
51
- "hemoglobin": "10.2 g/dL",
52
- },
53
- "vitals_on_arrival": {
54
- "BP": "168/94 mmHg",
55
- "HR": "112 bpm (irregular)",
56
- "RR": "22 breaths/min",
57
- "SpO2": "91% on room air",
58
- "Temp": "37.2°C",
59
- },
60
  },
61
  "max_steps": 10,
62
  }
63
 
64
 
65
- # ---------------------------------------------------------------------------
66
- # Grader
67
- # ---------------------------------------------------------------------------
68
-
69
  def grade_hard(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
70
- """Score a submitted SOAP note against the hard-task rubric.
71
-
72
- Checks for chest pain / SOB and the nitroglycerin contradiction (subjective),
73
- D-dimer and contrast allergy (objective), ACS vs PE differential (assessment),
74
- and V/Q scan + ICU admission (plan).
75
-
76
- Returns
77
- -------
78
- dict mapping signal names to float scores in [0, 1].
79
- """
80
- text_s = soap_note.subjective.lower()
81
- text_o = soap_note.objective.lower()
82
- text_a = soap_note.assessment.lower()
83
- text_p = soap_note.plan.lower()
84
-
85
- # 1. Subjective — catching the contradiction and presenting complaints
86
- s_score = 0.0
87
- if "chest pain" in text_s or "shortness of breath" in text_s or "sob" in text_s:
88
- s_score += 0.5
89
- if "nitroglycerin" in text_s or "contradict" in text_s or "denied" in text_s:
90
- s_score += 0.5
91
-
92
- # 2. Objective — elevated D-dimer and allergy awareness
93
- o_score = 0.0
94
- if "d-dimer" in text_o or "1840" in text_o or "d dimer" in text_o:
95
- o_score += 0.5
96
- if "allergy" in text_o or "contrast" in text_o or "troponin" in text_o:
97
- o_score += 0.5
98
-
99
- # 3. Assessment — the dual differential (ACS vs PE)
100
- a_score = 0.0
101
- if "acs" in text_a or "acute coronary" in text_a or "coronary" in text_a or "ischemia" in text_a:
102
- a_score += 0.5
103
- if "pe" in text_a or "pulmonary embolism" in text_a or "embolism" in text_a:
104
- a_score += 0.5
105
-
106
- # 4. Plan — adapting to the allergy (V/Q scan) and admission
107
- p_score = 0.0
108
- if "v/q" in text_p or "ventilation" in text_p or "perfusion" in text_p:
109
- p_score += 0.5
110
- if "icu" in text_p or "admit" in text_p or "cardiac" in text_p:
111
- p_score += 0.5
112
-
113
  return {
114
- "subjective_accuracy": max(0.01, min(s_score, 0.99)),
115
- "objective_accuracy": max(0.01, min(o_score, 0.99)),
116
- "assessment_accuracy": max(0.01, min(a_score, 0.99)),
117
- "plan_accuracy": max(0.01, min(p_score, 0.99)),
118
  }
 
1
+ """Hard task — complex ER visit."""
 
 
 
 
 
2
 
3
  from __future__ import annotations
 
4
  from typing import Any
 
5
  from environment.models import SOAPNote
6
 
 
 
 
 
 
7
# Task definition: complex ER visit — chest pain, possible PE, contrast allergy.
HARD_TASK: dict[str, Any] = {
    "task_id": "hard_complex_er_visit",
    "description": "Generate a SOAP note for a complex ER visit with chest pain, PE differential, and contrast allergy.",
    "transcript_file": "data/transcripts/hard.txt",
    "patient_context": {
        "patient_id": "P-3782",
        "name": "Maria Garcia",
        "age": 72,
        "sex": "F",
        "known_conditions": [
            "Coronary Artery Disease",
            "Atrial Fibrillation",
            "Chronic Kidney Disease Stage 3",
            "Osteoarthritis",
        ],
        "current_medications": [
            "Aspirin 81 mg daily",
            "Warfarin 5 mg daily",
            "Metoprolol 50 mg BID",
            "Furosemide 40 mg daily",
            "Amlodipine 5 mg daily",
        ],
        "allergies": ["Sulfa drugs", "Contrast dye"],
        "recent_labs": {
            "troponin_I": "0.08 ng/mL",
            "BNP": "450 pg/mL",
            "creatinine": "1.9 mg/dL",
            "eGFR": "34 mL/min",
            "INR": "2.6",
            "hemoglobin": "10.2 g/dL",
        },
        "vitals_on_arrival": {
            "BP": "168/94 mmHg",
            "HR": "112 bpm (irregular)",
            "RR": "22 breaths/min",
            "SpO2": "91% on room air",
            "Temp": "37.2°C",
        },
    },
    "max_steps": 10,
}
21
 
22
 
 
 
 
 
23
def grade_hard(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
    """Score a submitted SOAP note against the hard-task keyword rubric.

    Looks for the presenting complaints and the nitroglycerin contradiction
    (Subjective), the elevated D-dimer and contrast-allergy awareness
    (Objective), the ACS-vs-PE differential (Assessment), and the V/Q scan
    plus admission (Plan). Each section score is clamped into [0.01, 0.99].
    """
    def hits(text: str, keywords: tuple[str, ...]) -> float:
        # 0.5 partial credit when any keyword of the group appears.
        return 0.5 if any(k in text for k in keywords) else 0.0

    def clamp(score: float) -> float:
        return max(0.01, min(score, 0.99))

    s = soap_note.subjective.lower()
    o = soap_note.objective.lower()
    a = soap_note.assessment.lower()
    p = soap_note.plan.lower()

    # Subjective: complaints plus catching the nitroglycerin contradiction.
    s_score = (
        hits(s, ("chest pain", "shortness of breath", "sob"))
        + hits(s, ("nitroglycerin", "contradict", "denied"))
    )
    # Objective: elevated D-dimer and allergy / troponin awareness.
    o_score = (
        hits(o, ("d-dimer", "1840", "d dimer"))
        + hits(o, ("allergy", "contrast", "troponin"))
    )
    # Assessment: the dual differential (ACS vs PE).
    a_score = (
        hits(a, ("acs", "acute coronary", "coronary", "ischemia"))
        + hits(a, ("pe", "pulmonary embolism", "embolism"))
    )
    # Plan: adapting to the contrast allergy (V/Q scan) and admission.
    p_score = (
        hits(p, ("v/q", "ventilation", "perfusion"))
        + hits(p, ("icu", "admit", "cardiac"))
    )

    return {
        "subjective_accuracy": clamp(s_score),
        "objective_accuracy": clamp(o_score),
        "assessment_accuracy": clamp(a_score),
        "plan_accuracy": clamp(p_score),
    }
environment/tasks/task_medium.py CHANGED
@@ -1,98 +1,41 @@
1
- """Medium task — chronic disease follow-up.
2
-
3
- Grader uses keyword-based clinical rubric scoring to evaluate the SOAP note
4
- against expected findings from a Type 2 Diabetes / Hypertension follow-up.
5
- """
6
 
7
  from __future__ import annotations
8
-
9
  from typing import Any
10
-
11
  from environment.models import SOAPNote
12
 
13
-
14
- # ---------------------------------------------------------------------------
15
- # Task definition
16
- # ---------------------------------------------------------------------------
17
-
18
  MEDIUM_TASK: dict[str, Any] = {
19
  "task_id": "medium_chronic_disease_followup",
20
  "description": "Generate a SOAP note for a Type 2 Diabetes follow-up visit.",
21
  "transcript_file": "data/transcripts/medium.txt",
22
  "patient_context": {
23
- "patient_id": "P-2045",
24
- "name": "Robert Smith",
25
- "age": 58,
26
- "sex": "M",
27
  "known_conditions": ["Type 2 Diabetes Mellitus", "Hypertension"],
28
- "current_medications": [
29
- "Metformin 1000 mg BID",
30
- "Lisinopril 20 mg daily",
31
- "Atorvastatin 40 mg daily",
32
- ],
33
  "allergies": [],
34
- "recent_labs": {
35
- "HbA1c": "7.8%",
36
- "fasting_glucose": "156 mg/dL",
37
- "creatinine": "1.1 mg/dL",
38
- "eGFR": "78 mL/min",
39
- "LDL": "102 mg/dL",
40
- },
41
  },
42
  "max_steps": 8,
43
  }
44
 
45
 
46
- # ---------------------------------------------------------------------------
47
- # Grader
48
- # ---------------------------------------------------------------------------
49
-
50
  def grade_medium(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
51
- """Score a submitted SOAP note against the medium-task rubric.
52
-
53
- Checks for mention of dietary habits, HbA1c lab values, core diagnoses,
54
- and medication adjustments (glipizide, lisinopril uptitration).
55
-
56
- Returns
57
- -------
58
- dict mapping signal names to float scores in [0, 1].
59
- """
60
- text_s = soap_note.subjective.lower()
61
- text_o = soap_note.objective.lower()
62
- text_a = soap_note.assessment.lower()
63
- text_p = soap_note.plan.lower()
64
-
65
- # 1. Subjective — dietary habits / statin gap
66
- s_score = 0.0
67
- if "restaurant" in text_s or "diet" in text_s or "eating" in text_s:
68
- s_score += 0.5
69
- if "statin" in text_s or "gap" in text_s or "missed" in text_s:
70
- s_score += 0.5
71
-
72
- # 2. Objective — HbA1c values
73
- o_score = 0.0
74
- if "7.8" in text_o or "7.2" in text_o or "a1c" in text_o or "hba1c" in text_o:
75
- o_score += 0.5
76
- if "156" in text_o or "fasting glucose" in text_o or "glucose" in text_o:
77
- o_score += 0.5
78
-
79
- # 3. Assessment — core diagnoses
80
- a_score = 0.0
81
- if "diabetes" in text_a or "t2dm" in text_a or "dm" in text_a:
82
- a_score += 0.5
83
- if "hypertension" in text_a or "htn" in text_a or "blood pressure" in text_a:
84
- a_score += 0.5
85
-
86
- # 4. Plan — medication changes
87
- p_score = 0.0
88
- if "glipizide" in text_p and ("5" in text_p or "add" in text_p):
89
- p_score += 0.5
90
- if "lisinopril" in text_p and ("40" in text_p or "increase" in text_p or "uptitrat" in text_p):
91
- p_score += 0.5
92
-
93
  return {
94
- "subjective_accuracy": max(0.01, min(s_score, 0.99)),
95
- "objective_accuracy": max(0.01, min(o_score, 0.99)),
96
- "assessment_accuracy": max(0.01, min(a_score, 0.99)),
97
- "plan_accuracy": max(0.01, min(p_score, 0.99)),
98
  }
 
1
+ """Medium task — chronic disease follow-up."""
 
 
 
 
2
 
3
  from __future__ import annotations
 
4
  from typing import Any
 
5
  from environment.models import SOAPNote
6
 
 
 
 
 
 
7
# Task definition: Type 2 Diabetes / Hypertension chronic-disease follow-up.
MEDIUM_TASK: dict[str, Any] = {
    "task_id": "medium_chronic_disease_followup",
    "description": "Generate a SOAP note for a Type 2 Diabetes follow-up visit.",
    "transcript_file": "data/transcripts/medium.txt",
    "patient_context": {
        "patient_id": "P-2045",
        "name": "Robert Smith",
        "age": 58,
        "sex": "M",
        "known_conditions": ["Type 2 Diabetes Mellitus", "Hypertension"],
        "current_medications": [
            "Metformin 1000 mg BID",
            "Lisinopril 20 mg daily",
            "Atorvastatin 40 mg daily",
        ],
        "allergies": [],
        "recent_labs": {
            "HbA1c": "7.8%",
            "fasting_glucose": "156 mg/dL",
            "creatinine": "1.1 mg/dL",
            "eGFR": "78 mL/min",
            "LDL": "102 mg/dL",
        },
    },
    "max_steps": 8,
}
20
 
21
 
 
 
 
 
22
def grade_medium(soap_note: SOAPNote, task: dict[str, Any]) -> dict[str, float]:
    """Score a submitted SOAP note against the medium-task keyword rubric.

    Looks for dietary habits and the statin gap (Subjective), HbA1c / glucose
    values (Objective), the two core diagnoses (Assessment), and the
    medication adjustments — glipizide start, lisinopril uptitration (Plan).
    Each section score is clamped into [0.01, 0.99].
    """
    def hits(text: str, keywords: tuple[str, ...]) -> float:
        # 0.5 partial credit when any keyword of the group appears.
        return 0.5 if any(k in text for k in keywords) else 0.0

    def clamp(score: float) -> float:
        return max(0.01, min(score, 0.99))

    s = soap_note.subjective.lower()
    o = soap_note.objective.lower()
    a = soap_note.assessment.lower()
    p = soap_note.plan.lower()

    # Subjective: dietary habits and the missed-statin gap.
    s_score = (
        hits(s, ("restaurant", "diet", "eating"))
        + hits(s, ("statin", "gap", "missed"))
    )
    # Objective: HbA1c and fasting-glucose values.
    o_score = (
        hits(o, ("7.8", "7.2", "a1c", "hba1c"))
        + hits(o, ("156", "fasting glucose", "glucose"))
    )
    # Assessment: both core diagnoses.
    a_score = (
        hits(a, ("diabetes", "t2dm", "dm"))
        + hits(a, ("hypertension", "htn", "blood pressure"))
    )
    # Plan: each medication change requires the drug name plus a dose/verb cue.
    p_score = 0.0
    if "glipizide" in p and any(k in p for k in ("5", "add")):
        p_score += 0.5
    if "lisinopril" in p and any(k in p for k in ("40", "increase", "uptitrat")):
        p_score += 0.5

    return {
        "subjective_accuracy": clamp(s_score),
        "objective_accuracy": clamp(o_score),
        "assessment_accuracy": clamp(a_score),
        "plan_accuracy": clamp(p_score),
    }
frontend/app.js ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Clinical Note Scribe — Frontend Logic

const API = "";

/** Shorthand for document.getElementById. */
const $ = (id) => document.getElementById(id);

// Controls
const taskSelect = $("taskSelect");
const resetBtn = $("resetBtn");
const stepBtn = $("stepBtn");
const statusBadge = $("statusBadge");

// Context + transcript panels
const contextSection = $("contextSection");
const contextGrid = $("contextGrid");
const transcriptArea = $("transcriptArea");

// Action panel
const actionSection = $("actionSection");
const actionType = $("actionType");
const sectionSelect = $("sectionSelect");
const soapInputs = $("soapInputs");
const reviseInput = $("reviseInput");
const clarifyInput = $("clarifyInput");

// Reward + draft panels
const rewardSection = $("rewardSection");
const scoreValue = $("scoreValue");
const rewardFill = $("rewardFill");
const draftArea = $("draftArea");
const draftEmpty = $("draftEmpty");
const soapDraft = $("soapDraft");
const soapGrid = $("soapGrid");

// Log panel
const logContainer = $("logContainer");

// Episode state
let currentObs = null;
let isDone = false;
30
+
31
// Logging
/**
 * Prepend a timestamped entry to the activity log.
 * @param {string} msg  - message text (rendered as plain text, never HTML)
 * @param {string} type - optional CSS modifier class ("success", "error", ...)
 */
function addLog(msg, type = "") {
  const time = new Date().toLocaleTimeString("en-US", { hour12: false });
  const entry = document.createElement("div");
  entry.className = "log-entry " + type;
  const ts = document.createElement("span");
  ts.className = "log-time";
  ts.textContent = time;
  entry.appendChild(ts);
  // Use a text node (not innerHTML interpolation) so server-supplied error
  // text in `msg` cannot inject markup into the page.
  entry.appendChild(document.createTextNode(msg));
  logContainer.prepend(entry);
}
39
+
40
// Status badge
/** Update the header status badge to "idle" | "active" | "done". */
function setStatus(state) {
  const labels = { idle: "Idle", active: "Active" };
  statusBadge.className = "status-badge " + state;
  // Any unrecognized state falls back to "Done", matching the badge default.
  statusBadge.textContent = labels[state] ?? "Done";
}
45
+
46
// Show only the inputs relevant to the currently selected action type.
actionType.addEventListener("change", () => {
  const val = actionType.value;
  const isSubmit = val === "submit_note";
  const isRevise = val === "revise_section";
  const isClarify = val === "request_clarify";
  soapInputs.style.display = isSubmit ? "block" : "none";
  reviseInput.style.display = isRevise ? "block" : "none";
  clarifyInput.style.display = isClarify ? "block" : "none";
  sectionSelect.style.display = isRevise ? "inline-block" : "none";
});
54
+
55
// Format transcript with speaker highlighting
/** Render transcript text as HTML, color-coding doctor vs patient lines. */
function renderTranscript(text) {
  if (!text) return "";
  const speakerClass = (line) => {
    const trimmed = line.trim();
    if (/^(Dr\.|Doctor)/i.test(trimmed)) return "speaker-doctor";
    if (/^(Patient|Pt)/i.test(trimmed)) return "speaker-patient";
    return null;
  };
  return text
    .split("\n")
    .map((line) => {
      const cls = speakerClass(line);
      const safe = escapeHtml(line);
      return cls
        ? `<div><span class="${cls}">${safe}</span></div>`
        : `<div>${safe}</div>`;
    })
    .join("");
}
68
+
69
/** Escape a string for safe interpolation into an HTML template. */
function escapeHtml(str) {
  // A detached element's textContent -> innerHTML round-trip escapes
  // &, <, and > for us.
  const scratch = document.createElement("div");
  scratch.textContent = str;
  return scratch.innerHTML;
}
74
+
75
// Render patient context as cards
/** Render the patient-context object as a grid of label/value cards. */
function renderContext(ctx) {
  const hasData = ctx && Object.keys(ctx).length > 0;
  contextSection.style.display = hasData ? "block" : "none";
  if (!hasData) return;

  contextGrid.innerHTML = "";
  for (const [label, value] of Object.entries(flattenContext(ctx))) {
    const card = document.createElement("div");
    card.className = "context-card";
    card.innerHTML =
      `<div class="label">${escapeHtml(label)}</div>` +
      `<div class="value">${escapeHtml(String(value))}</div>`;
    contextGrid.appendChild(card);
  }
}
92
+
93
/**
 * Flatten a nested context object into a single level of "a › b" keys.
 * Arrays become comma-joined strings; null/undefined values and empty
 * arrays become an em-dash placeholder.
 */
function flattenContext(obj, prefix = "") {
  const out = {};
  for (const [k, v] of Object.entries(obj)) {
    const label = prefix ? `${prefix} › ${k}` : k;
    if (Array.isArray(v)) {
      out[label] = v.length ? v.join(", ") : "—";
    } else if (v && typeof v === "object") {
      Object.assign(out, flattenContext(v, label));
    } else {
      out[label] = v ?? "—";
    }
  }
  return out;
}
107
+
108
// Render SOAP draft
/** Render the current SOAP draft, or the empty-state placeholder. */
function renderDraft(draftText) {
  if (!draftText) {
    draftEmpty.style.display = "flex";
    soapDraft.style.display = "none";
    return;
  }
  draftEmpty.style.display = "none";
  soapDraft.style.display = "block";

  // Collect section text from "S: ", "O: ", "A: ", "P: " prefixed lines
  // (a later line for the same section replaces an earlier one).
  const sections = { S: "", O: "", A: "", P: "" };
  for (const line of draftText.split("\n")) {
    for (const key of ["S", "O", "A", "P"]) {
      const prefix = key + ": ";
      if (line.startsWith(prefix)) {
        sections[key] = line.slice(prefix.length);
      }
    }
  }

  const labels = { S: "Subjective", O: "Objective", A: "Assessment", P: "Plan" };
  soapGrid.innerHTML = "";
  for (const [key, label] of Object.entries(labels)) {
    const card = document.createElement("div");
    card.className = `soap-card ${key.toLowerCase()}`;
    card.innerHTML = `
      <div class="soap-label">${label}</div>
      <div class="soap-text">${escapeHtml(sections[key]) || '<em style="opacity:0.4">Empty</em>'}</div>
    `;
    soapGrid.appendChild(card);
  }
}
140
+
141
// Render reward
/** Show the reward panel: numeric score, fill bar, and traffic-light color. */
function renderReward(rewardObj) {
  if (!rewardObj) {
    rewardSection.style.display = "none";
    return;
  }
  rewardSection.style.display = "block";

  const val = rewardObj.value;
  scoreValue.textContent = val.toFixed(4);
  rewardFill.style.width = (val * 100) + "%";

  // Color-code the score: green >= 0.7, yellow >= 0.4, red below.
  const color =
    val >= 0.7 ? "var(--green)" : val >= 0.4 ? "var(--yellow)" : "var(--red)";
  scoreValue.style.color = color;
}
160
+
161
// Update UI from observation
/** Refresh every panel from a fresh observation (plus optional reward/done). */
function updateUI(obs, reward = null, done = false) {
  currentObs = obs;
  isDone = done;

  transcriptArea.innerHTML =
    `<div class="transcript-box">${renderTranscript(obs.transcript)}</div>`;
  renderContext(obs.patient_context);
  renderDraft(obs.current_draft);
  if (reward) renderReward(reward);

  actionSection.style.display = done ? "none" : "block";
  setStatus(done ? "done" : "active");

  if (done) {
    const score = reward ? reward.value.toFixed(4) : "N/A";
    addLog(`Episode complete — score: ${score}`, "success");
  }
}
179
+
180
// Reset
// POST /reset with the selected task, then repaint the whole UI.
resetBtn.addEventListener("click", async () => {
  const taskId = taskSelect.value;
  resetBtn.disabled = true;
  addLog(`Resetting with task: ${taskId}`);

  try {
    const res = await fetch(`${API}/reset`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ task_id: taskId }),
    });
    if (!res.ok) {
      throw new Error(await res.text());
    }
    const obs = await res.json();
    // Hide any stale reward from the previous episode before repainting.
    rewardSection.style.display = "none";
    updateUI(obs);
    addLog("Environment reset successfully", "success");
  } catch (err) {
    addLog(`Reset failed: ${err.message}`, "error");
  } finally {
    resetBtn.disabled = false;
  }
});
203
+
204
// Step
// Build the action payload from the form, POST /step, and render the result.
stepBtn.addEventListener("click", async () => {
  if (isDone) {
    addLog("Episode is done. Reset first.", "error");
    return;
  }

  const action = actionType.value;
  let payload = {};

  if (action === "submit_note") {
    payload = {
      action_type: "submit_note",
      soap_note: {
        subjective: document.getElementById("inputS").value,
        objective: document.getElementById("inputO").value,
        assessment: document.getElementById("inputA").value,
        plan: document.getElementById("inputP").value,
      },
    };
  } else if (action === "revise_section") {
    payload = {
      action_type: "revise_section",
      section: sectionSelect.value,
      revision_text: document.getElementById("inputRevision").value,
    };
  } else if (action === "request_clarify") {
    payload = {
      action_type: "request_clarify",
      clarify_question: document.getElementById("inputClarify").value,
    };
  }

  stepBtn.disabled = true;
  addLog(`Sending action: ${action}`);

  try {
    const res = await fetch(`${API}/step`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
    });
    if (!res.ok) throw new Error(await res.text());
    const data = await res.json();
    updateUI(data.observation, data.reward, data.done);

    if (data.info && data.info.clarify_answer) {
      addLog(`Clarify answer: ${data.info.clarify_answer}`);
    }
    // Guard against a response with no reward object — previously this
    // dereferenced data.reward.value unconditionally and could throw.
    const rewardText = data.reward ? data.reward.value.toFixed(4) : "N/A";
    addLog(`Step done — reward: ${rewardText}, done: ${data.done}`);
  } catch (err) {
    addLog(`Step failed: ${err.message}`, "error");
  } finally {
    stepBtn.disabled = false;
  }
});
260
+
261
+ // Init
262
+ addLog("Frontend loaded. Select a task and click Reset.");
frontend/index.html ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Clinical Note Scribe</title>
7
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
8
+ <style>
9
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
10
+
11
+ :root {
12
+ --bg: #0f1117;
13
+ --surface: #1a1d27;
14
+ --surface-2: #242836;
15
+ --border: #2e3345;
16
+ --text: #e4e6ef;
17
+ --text-muted: #8b8fa3;
18
+ --accent: #4f8cff;
19
+ --accent-glow: rgba(79, 140, 255, 0.15);
20
+ --green: #34d399;
21
+ --red: #f87171;
22
+ --yellow: #fbbf24;
23
+ --radius: 10px;
24
+ --font: 'Inter', -apple-system, sans-serif;
25
+ }
26
+
27
+ body {
28
+ font-family: var(--font);
29
+ background: var(--bg);
30
+ color: var(--text);
31
+ min-height: 100vh;
32
+ line-height: 1.5;
33
+ }
34
+
35
+ /* Header */
36
+ .header {
37
+ display: flex;
38
+ align-items: center;
39
+ justify-content: space-between;
40
+ padding: 16px 28px;
41
+ border-bottom: 1px solid var(--border);
42
+ background: var(--surface);
43
+ }
44
+ .header h1 {
45
+ font-size: 18px;
46
+ font-weight: 600;
47
+ display: flex;
48
+ align-items: center;
49
+ gap: 8px;
50
+ }
51
+ .header h1 span { font-size: 22px; }
52
+ .header-right {
53
+ display: flex;
54
+ align-items: center;
55
+ gap: 12px;
56
+ }
57
+ .status-badge {
58
+ font-size: 12px;
59
+ padding: 4px 10px;
60
+ border-radius: 20px;
61
+ font-weight: 500;
62
+ }
63
+ .status-badge.idle { background: var(--surface-2); color: var(--text-muted); }
64
+ .status-badge.active { background: rgba(52,211,153,0.15); color: var(--green); }
65
+ .status-badge.done { background: rgba(79,140,255,0.15); color: var(--accent); }
66
+
67
+ /* Layout */
68
+ .layout {
69
+ display: grid;
70
+ grid-template-columns: 1fr 1fr;
71
+ gap: 0;
72
+ height: calc(100vh - 57px);
73
+ }
74
+
75
+ .panel {
76
+ display: flex;
77
+ flex-direction: column;
78
+ overflow: hidden;
79
+ }
80
+ .panel-left { border-right: 1px solid var(--border); }
81
+
82
+ .panel-section {
83
+ padding: 16px 20px;
84
+ border-bottom: 1px solid var(--border);
85
+ }
86
+ .panel-section-title {
87
+ font-size: 11px;
88
+ font-weight: 600;
89
+ text-transform: uppercase;
90
+ letter-spacing: 0.8px;
91
+ color: var(--text-muted);
92
+ margin-bottom: 10px;
93
+ }
94
+
95
+ .scrollable {
96
+ flex: 1;
97
+ overflow-y: auto;
98
+ padding: 16px 20px;
99
+ }
100
+ .scrollable::-webkit-scrollbar { width: 6px; }
101
+ .scrollable::-webkit-scrollbar-track { background: transparent; }
102
+ .scrollable::-webkit-scrollbar-thumb { background: var(--border); border-radius: 3px; }
103
+
104
+ /* Controls */
105
+ .controls {
106
+ display: flex;
107
+ gap: 8px;
108
+ flex-wrap: wrap;
109
+ align-items: center;
110
+ }
111
+ select, input, textarea {
112
+ font-family: var(--font);
113
+ font-size: 13px;
114
+ background: var(--surface-2);
115
+ border: 1px solid var(--border);
116
+ color: var(--text);
117
+ border-radius: 6px;
118
+ padding: 7px 10px;
119
+ outline: none;
120
+ transition: border-color 0.15s;
121
+ }
122
+ select:focus, input:focus, textarea:focus {
123
+ border-color: var(--accent);
124
+ }
125
+ textarea {
126
+ width: 100%;
127
+ resize: vertical;
128
+ min-height: 80px;
129
+ }
130
+
131
+ .btn {
132
+ font-family: var(--font);
133
+ font-size: 13px;
134
+ font-weight: 500;
135
+ padding: 7px 16px;
136
+ border: none;
137
+ border-radius: 6px;
138
+ cursor: pointer;
139
+ transition: all 0.15s;
140
+ }
141
+ .btn:disabled { opacity: 0.4; cursor: not-allowed; }
142
+ .btn-primary { background: var(--accent); color: #fff; }
143
+ .btn-primary:hover:not(:disabled) { background: #3d7ae8; }
144
+ .btn-secondary { background: var(--surface-2); color: var(--text); border: 1px solid var(--border); }
145
+ .btn-secondary:hover:not(:disabled) { background: var(--border); }
146
+ .btn-green { background: var(--green); color: #0f1117; }
147
+ .btn-green:hover:not(:disabled) { background: #2bc48d; }
148
+
149
+ /* Transcript */
150
+ .transcript-box {
151
+ font-size: 13px;
152
+ white-space: pre-wrap;
153
+ color: var(--text);
154
+ line-height: 1.7;
155
+ }
156
+ .transcript-box .speaker-doctor { color: var(--accent); font-weight: 500; }
157
+ .transcript-box .speaker-patient { color: var(--green); font-weight: 500; }
158
+
159
+ /* Context cards */
160
+ .context-grid {
161
+ display: grid;
162
+ grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
163
+ gap: 8px;
164
+ }
165
+ .context-card {
166
+ background: var(--surface-2);
167
+ border-radius: 8px;
168
+ padding: 10px 12px;
169
+ }
170
+ .context-card .label {
171
+ font-size: 10px;
172
+ font-weight: 600;
173
+ text-transform: uppercase;
174
+ letter-spacing: 0.5px;
175
+ color: var(--text-muted);
176
+ margin-bottom: 3px;
177
+ }
178
+ .context-card .value {
179
+ font-size: 13px;
180
+ color: var(--text);
181
+ }
182
+
183
+ /* SOAP Draft */
184
+ .soap-grid {
185
+ display: grid;
186
+ grid-template-columns: 1fr 1fr;
187
+ gap: 10px;
188
+ }
189
+ .soap-card {
190
+ background: var(--surface-2);
191
+ border-radius: 8px;
192
+ padding: 12px 14px;
193
+ border-left: 3px solid var(--border);
194
+ }
195
+ .soap-card.s { border-left-color: #60a5fa; }
196
+ .soap-card.o { border-left-color: #34d399; }
197
+ .soap-card.a { border-left-color: #fbbf24; }
198
+ .soap-card.p { border-left-color: #c084fc; }
199
+ .soap-card .soap-label {
200
+ font-size: 11px;
201
+ font-weight: 600;
202
+ text-transform: uppercase;
203
+ letter-spacing: 0.5px;
204
+ margin-bottom: 6px;
205
+ }
206
+ .soap-card.s .soap-label { color: #60a5fa; }
207
+ .soap-card.o .soap-label { color: #34d399; }
208
+ .soap-card.a .soap-label { color: #fbbf24; }
209
+ .soap-card.p .soap-label { color: #c084fc; }
210
+ .soap-card .soap-text {
211
+ font-size: 13px;
212
+ color: var(--text-muted);
213
+ line-height: 1.6;
214
+ }
215
+
216
+ /* Reward bar */
217
+ .reward-bar {
218
+ display: flex;
219
+ align-items: center;
220
+ gap: 12px;
221
+ padding: 10px 0;
222
+ }
223
+ .reward-bar .score-value {
224
+ font-size: 28px;
225
+ font-weight: 700;
226
+ font-variant-numeric: tabular-nums;
227
+ }
228
+ .reward-meter {
229
+ flex: 1;
230
+ height: 8px;
231
+ background: var(--surface-2);
232
+ border-radius: 4px;
233
+ overflow: hidden;
234
+ }
235
+ .reward-meter-fill {
236
+ height: 100%;
237
+ border-radius: 4px;
238
+ background: linear-gradient(90deg, var(--accent), var(--green));
239
+ transition: width 0.4s ease;
240
+ }
241
+
242
+ /* Action form */
243
+ .action-form {
244
+ display: flex;
245
+ flex-direction: column;
246
+ gap: 10px;
247
+ }
248
+ .action-row {
249
+ display: flex;
250
+ gap: 8px;
251
+ align-items: flex-start;
252
+ }
253
+ .action-row select { min-width: 160px; }
254
+
255
+ /* Log */
256
+ .log-container {
257
+ font-size: 12px;
258
+ font-family: 'SF Mono', 'Fira Code', monospace;
259
+ color: var(--text-muted);
260
+ max-height: 120px;
261
+ overflow-y: auto;
262
+ padding: 8px 0;
263
+ }
264
+ .log-entry {
265
+ padding: 2px 0;
266
+ border-bottom: 1px solid rgba(46, 51, 69, 0.4);
267
+ }
268
+ .log-entry .log-time { color: var(--border); margin-right: 8px; }
269
+ .log-entry.error { color: var(--red); }
270
+ .log-entry.success { color: var(--green); }
271
+
272
+ /* Empty state */
273
+ .empty-state {
274
+ display: flex;
275
+ flex-direction: column;
276
+ align-items: center;
277
+ justify-content: center;
278
+ height: 100%;
279
+ color: var(--text-muted);
280
+ text-align: center;
281
+ gap: 12px;
282
+ }
283
+ .empty-state .icon { font-size: 36px; opacity: 0.5; }
284
+ .empty-state p { font-size: 14px; max-width: 280px; }
285
+ </style>
286
+ </head>
287
+ <body>
288
+
289
+ <header class="header">
290
+ <h1><span>🏥</span> Clinical Note Scribe</h1>
291
+ <div class="header-right">
292
+ <span class="status-badge idle" id="statusBadge">Idle</span>
293
+ </div>
294
+ </header>
295
+
296
+ <div class="layout">
297
+
298
+ <!-- Left Panel: Transcript + Context -->
299
+ <div class="panel panel-left">
300
+ <div class="panel-section">
301
+ <div class="controls">
302
+ <select id="taskSelect">
303
+ <option value="easy_routine_checkup">🟢 Easy — Routine Check-Up</option>
304
+ <option value="medium_chronic_disease_followup">🟡 Medium — Chronic Follow-Up</option>
305
+ <option value="hard_complex_er_visit">🔴 Hard — Complex ER Visit</option>
306
+ </select>
307
+ <button class="btn btn-primary" id="resetBtn">Reset</button>
308
+ </div>
309
+ </div>
310
+
311
+ <div class="panel-section" id="contextSection" style="display:none;">
312
+ <div class="panel-section-title">Patient Context</div>
313
+ <div class="context-grid" id="contextGrid"></div>
314
+ </div>
315
+
316
+ <div class="scrollable" id="transcriptArea">
317
+ <div class="empty-state">
318
+ <div class="icon">📋</div>
319
+ <p>Select a task and click <strong>Reset</strong> to load the transcript.</p>
320
+ </div>
321
+ </div>
322
+ </div>
323
+
324
+ <!-- Right Panel: Actions + Draft + Reward -->
325
+ <div class="panel">
326
+
327
+ <div class="panel-section" id="actionSection" style="display:none;">
328
+ <div class="panel-section-title">Action</div>
329
+ <div class="action-form">
330
+ <div class="action-row">
331
+ <select id="actionType">
332
+ <option value="submit_note">Submit Note</option>
333
+ <option value="revise_section">Revise Section</option>
334
+ <option value="request_clarify">Request Clarify</option>
335
+ </select>
336
+ <select id="sectionSelect" style="display:none;">
337
+ <option value="S">Subjective</option>
338
+ <option value="O">Objective</option>
339
+ <option value="A">Assessment</option>
340
+ <option value="P">Plan</option>
341
+ </select>
342
+ <button class="btn btn-green" id="stepBtn">Send</button>
343
+ </div>
344
+ <div id="soapInputs">
345
+ <div style="display:grid; grid-template-columns:1fr 1fr; gap:8px;">
346
+ <textarea id="inputS" placeholder="Subjective..." rows="3"></textarea>
347
+ <textarea id="inputO" placeholder="Objective..." rows="3"></textarea>
348
+ <textarea id="inputA" placeholder="Assessment..." rows="3"></textarea>
349
+ <textarea id="inputP" placeholder="Plan..." rows="3"></textarea>
350
+ </div>
351
+ </div>
352
+ <div id="reviseInput" style="display:none;">
353
+ <textarea id="inputRevision" placeholder="Revision text..." rows="3"></textarea>
354
+ </div>
355
+ <div id="clarifyInput" style="display:none;">
356
+ <input id="inputClarify" type="text" placeholder="Your question..." style="width:100%;">
357
+ </div>
358
+ </div>
359
+ </div>
360
+
361
+ <div class="panel-section" id="rewardSection" style="display:none;">
362
+ <div class="panel-section-title">Reward</div>
363
+ <div class="reward-bar">
364
+ <div class="score-value" id="scoreValue">0.00</div>
365
+ <div class="reward-meter">
366
+ <div class="reward-meter-fill" id="rewardFill" style="width:0%"></div>
367
+ </div>
368
+ </div>
369
+ </div>
370
+
371
+ <div class="scrollable" id="draftArea">
372
+ <div class="empty-state" id="draftEmpty">
373
+ <div class="icon">📝</div>
374
+ <p>Your SOAP note draft will appear here after you submit or revise.</p>
375
+ </div>
376
+ <div id="soapDraft" style="display:none;">
377
+ <div class="panel-section-title" style="margin-bottom:12px;">Current Draft</div>
378
+ <div class="soap-grid" id="soapGrid"></div>
379
+ </div>
380
+ </div>
381
+
382
+ <div class="panel-section" style="border-top:1px solid var(--border); border-bottom:none;">
383
+ <div class="panel-section-title">Log</div>
384
+ <div class="log-container" id="logContainer"></div>
385
+ </div>
386
+
387
+ </div>
388
+ </div>
389
+
390
+ <script src="/static/app.js"></script>
391
+ </body>
392
+ </html>
inference.py CHANGED
@@ -1,19 +1,18 @@
1
  """
2
  Inference Script — Clinical Note Scribe
3
- ===================================
4
  MANDATORY
5
- - Before submitting, ensure the following variables are defined in your environment configuration:
6
  API_BASE_URL The API endpoint for the LLM.
7
  MODEL_NAME The model identifier to use for inference.
8
  HF_TOKEN Your Hugging Face / API key.
9
- LOCAL_IMAGE_NAME The name of the local image to use for the environment
10
- if you are using from_docker_image() method.
11
 
12
  - Defaults are set only for API_BASE_URL and MODEL_NAME:
13
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
14
  MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
15
 
16
- - The inference script must be named `inference.py` and placed in the root directory.
17
  - Participants must use OpenAI Client for all LLM calls using above variables.
18
 
19
  STDOUT FORMAT
@@ -47,29 +46,20 @@
47
 
48
  from openai import OpenAI
49
 
50
- # ---------------------------------------------------------------------------
51
  # Silence the underlying env's stdout JSON logs (redirect them to stderr)
52
- # ---------------------------------------------------------------------------
53
  env_logger = logging.getLogger("clinical_note_scribe")
54
  env_logger.setLevel(logging.INFO)
55
  env_logger.handlers.clear()
56
  env_logger.addHandler(logging.StreamHandler(sys.stderr))
57
  env_logger.propagate = False
58
 
59
-
60
- # ---------------------------------------------------------------------------
61
  # Environment imports
62
- # ---------------------------------------------------------------------------
63
-
64
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
65
 
66
  from environment import ClinicalNoteScribeEnv, Action, SOAPNote # noqa: E402
67
  from environment.tasks import TASK_REGISTRY # noqa: E402
68
 
69
- # ---------------------------------------------------------------------------
70
  # Config
71
- # ---------------------------------------------------------------------------
72
-
73
  LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
74
  HF_TOKEN = os.getenv("HF_TOKEN")
75
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
@@ -77,14 +67,11 @@
77
 
78
  BENCHMARK = "clinical-note-scribe"
79
  TASK_IDS = list(TASK_REGISTRY.keys())
80
- MAX_STEPS = 5 # Max steps per task (submit + optional clarify/revise)
81
  MAX_TOKENS = 1024
82
  TEMPERATURE = 0.2
83
 
84
- # ---------------------------------------------------------------------------
85
  # System prompt
86
- # ---------------------------------------------------------------------------
87
-
88
  SYSTEM_PROMPT = textwrap.dedent("""\
89
  You are a clinical documentation assistant. Given a doctor-patient transcript
90
  and patient context, generate a concise, clinically accurate SOAP note.
@@ -109,10 +96,7 @@
109
  """).strip()
110
 
111
 
112
- # ---------------------------------------------------------------------------
113
  # Stdout logging — mandatory hackathon format
114
- # ---------------------------------------------------------------------------
115
-
116
  def log_start(task: str, env: str, model: str) -> None:
117
  print(f"[START] task={task} env={env} model={model}", flush=True)
118
 
@@ -134,10 +118,7 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
134
  )
135
 
136
 
137
- # ---------------------------------------------------------------------------
138
  # Helpers
139
- # ---------------------------------------------------------------------------
140
-
141
  def _build_user_prompt(transcript: str, patient_context: dict[str, Any]) -> str:
142
  """Build the user message containing the transcript and context."""
143
  ctx_str = json.dumps(patient_context, indent=2, default=str)
@@ -187,10 +168,7 @@ def get_soap_note(client: OpenAI, transcript: str, patient_context: dict[str, An
187
  raise
188
 
189
 
190
- # ---------------------------------------------------------------------------
191
  # Per-task runner
192
- # ---------------------------------------------------------------------------
193
-
194
  def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[str, Any]:
195
  """Run a single task episode and return the result dict."""
196
  rewards: List[float] = []
@@ -202,18 +180,15 @@ def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[s
202
  log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
203
 
204
  try:
205
- # ---- reset ----
206
  obs = env.reset(task_id)
207
 
208
  for step in range(1, MAX_STEPS + 1):
209
- # ---- generate SOAP note via LLM ----
210
  try:
211
  action_dict = get_soap_note(client, obs.transcript, obs.patient_context)
212
  action = Action(**action_dict)
213
  action_str = f"submit_note(sections=S,O,A,P)"
214
  except Exception as exc:
215
- # On model / parse failure, submit an empty note so all sub-signals
216
- # grade to 0.0 (format_valid=0 because fields are empty, grader=0).
217
  action = Action(
218
  action_type="submit_note",
219
  soap_note=SOAPNote(
@@ -226,14 +201,12 @@ def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[s
226
  action_str = "submit_note(fallback)"
227
  last_error = str(exc)
228
 
229
- # ---- step ----
230
  obs, reward_obj, done, info = env.step(action)
231
 
232
  reward_val = reward_obj.value
233
  rewards.append(reward_val)
234
  steps_taken = step
235
 
236
- # Check for env-level errors
237
  error_msg = None
238
  if obs.errors_so_far:
239
  error_msg = obs.errors_so_far[-1]
@@ -252,7 +225,6 @@ def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[s
252
  if done:
253
  break
254
 
255
- # Final score = last reward value (already in [0, 1])
256
  score = rewards[-1] if rewards else 0.0
257
  score = min(max(score, 0.0), 1.0)
258
  success = score > 0.0
@@ -274,10 +246,7 @@ def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[s
274
  }
275
 
276
 
277
- # ---------------------------------------------------------------------------
278
  # Main
279
- # ---------------------------------------------------------------------------
280
-
281
  def main() -> None:
282
  if not HF_TOKEN:
283
  print(
@@ -295,7 +264,7 @@ def main() -> None:
295
  result = run_task(client, env, task_id)
296
  results.append(result)
297
 
298
- # ---- Summary table ----
299
  print("", file=sys.stderr, flush=True)
300
  print("=" * 60, file=sys.stderr, flush=True)
301
  print(" SUMMARY", file=sys.stderr, flush=True)
 
1
  """
2
  Inference Script — Clinical Note Scribe
3
+ ========================================
4
  MANDATORY
5
+ - Before submitting, ensure the following variables are defined:
6
  API_BASE_URL The API endpoint for the LLM.
7
  MODEL_NAME The model identifier to use for inference.
8
  HF_TOKEN Your Hugging Face / API key.
9
+ LOCAL_IMAGE_NAME The name of the local image for the environment.
 
10
 
11
  - Defaults are set only for API_BASE_URL and MODEL_NAME:
12
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
13
  MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
14
 
15
+ - The inference script must be named inference.py and placed in the root directory.
16
  - Participants must use OpenAI Client for all LLM calls using above variables.
17
 
18
  STDOUT FORMAT
 
46
 
47
  from openai import OpenAI
48
 
 
49
  # Silence the underlying env's stdout JSON logs (redirect them to stderr)
 
50
  env_logger = logging.getLogger("clinical_note_scribe")
51
  env_logger.setLevel(logging.INFO)
52
  env_logger.handlers.clear()
53
  env_logger.addHandler(logging.StreamHandler(sys.stderr))
54
  env_logger.propagate = False
55
 
 
 
56
  # Environment imports
 
 
57
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
58
 
59
  from environment import ClinicalNoteScribeEnv, Action, SOAPNote # noqa: E402
60
  from environment.tasks import TASK_REGISTRY # noqa: E402
61
 
 
62
  # Config
 
 
63
  LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
64
  HF_TOKEN = os.getenv("HF_TOKEN")
65
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 
67
 
68
  BENCHMARK = "clinical-note-scribe"
69
  TASK_IDS = list(TASK_REGISTRY.keys())
70
+ MAX_STEPS = 5
71
  MAX_TOKENS = 1024
72
  TEMPERATURE = 0.2
73
 
 
74
  # System prompt
 
 
75
  SYSTEM_PROMPT = textwrap.dedent("""\
76
  You are a clinical documentation assistant. Given a doctor-patient transcript
77
  and patient context, generate a concise, clinically accurate SOAP note.
 
96
  """).strip()
97
 
98
 
 
99
  # Stdout logging — mandatory hackathon format
 
 
100
  def log_start(task: str, env: str, model: str) -> None:
101
  print(f"[START] task={task} env={env} model={model}", flush=True)
102
 
 
118
  )
119
 
120
 
 
121
  # Helpers
 
 
122
  def _build_user_prompt(transcript: str, patient_context: dict[str, Any]) -> str:
123
  """Build the user message containing the transcript and context."""
124
  ctx_str = json.dumps(patient_context, indent=2, default=str)
 
168
  raise
169
 
170
 
 
171
  # Per-task runner
 
 
172
  def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[str, Any]:
173
  """Run a single task episode and return the result dict."""
174
  rewards: List[float] = []
 
180
  log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
181
 
182
  try:
 
183
  obs = env.reset(task_id)
184
 
185
  for step in range(1, MAX_STEPS + 1):
 
186
  try:
187
  action_dict = get_soap_note(client, obs.transcript, obs.patient_context)
188
  action = Action(**action_dict)
189
  action_str = f"submit_note(sections=S,O,A,P)"
190
  except Exception as exc:
191
+ # On model / parse failure, submit an empty note
 
192
  action = Action(
193
  action_type="submit_note",
194
  soap_note=SOAPNote(
 
201
  action_str = "submit_note(fallback)"
202
  last_error = str(exc)
203
 
 
204
  obs, reward_obj, done, info = env.step(action)
205
 
206
  reward_val = reward_obj.value
207
  rewards.append(reward_val)
208
  steps_taken = step
209
 
 
210
  error_msg = None
211
  if obs.errors_so_far:
212
  error_msg = obs.errors_so_far[-1]
 
225
  if done:
226
  break
227
 
 
228
  score = rewards[-1] if rewards else 0.0
229
  score = min(max(score, 0.0), 1.0)
230
  success = score > 0.0
 
246
  }
247
 
248
 
 
249
  # Main
 
 
250
  def main() -> None:
251
  if not HF_TOKEN:
252
  print(
 
264
  result = run_task(client, env, task_id)
265
  results.append(result)
266
 
267
+ # Summary table
268
  print("", file=sys.stderr, flush=True)
269
  print("=" * 60, file=sys.stderr, flush=True)
270
  print(" SUMMARY", file=sys.stderr, flush=True)
server/app.py CHANGED
@@ -1,60 +1,40 @@
1
- """FastAPI application for the Clinical Note Scribe environment.
2
-
3
- Run locally::
4
-
5
- uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
6
-
7
- Or via Docker (see ``Dockerfile`` in project root).
8
- """
9
 
10
  from __future__ import annotations
11
 
12
  import logging
13
  import sys
 
14
 
15
  from fastapi import FastAPI
 
 
16
 
17
  from server.routes import router
18
 
19
- # ---------------------------------------------------------------------------
20
- # Configure root logging → structured JSON to stdout
21
- # ---------------------------------------------------------------------------
22
-
23
- logging.basicConfig(
24
- level=logging.INFO,
25
- format="%(message)s",
26
- handlers=[logging.StreamHandler(sys.stdout)],
27
- )
28
-
29
- # Silence noisy uvicorn access logs so our structured events stay clean
30
  logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
31
 
32
- # ---------------------------------------------------------------------------
33
- # Application factory
34
- # ---------------------------------------------------------------------------
35
-
36
  app = FastAPI(
37
  title="Clinical Note Scribe – OpenEnv",
38
- description=(
39
- "An OpenEnv-compliant environment for evaluating AI agents on "
40
- "clinical SOAP-note generation from doctor–patient transcripts."
41
- ),
42
- version="0.1.0",
43
  )
 
44
 
45
- from fastapi.responses import RedirectResponse
 
46
 
47
- # Mount all routes at root (/)
48
- app.include_router(router)
49
 
50
  @app.get("/", include_in_schema=False)
51
  async def root():
52
- """Redirect to the FastAPI interactive documentation."""
53
- return RedirectResponse(url="/docs")
54
 
55
  def main():
56
  import uvicorn
57
  uvicorn.run("server.app:app", host="0.0.0.0", port=7860)
58
 
 
59
  if __name__ == "__main__":
60
  main()
 
1
+ """FastAPI application for the Clinical Note Scribe environment."""
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
  import logging
6
  import sys
7
+ from pathlib import Path
8
 
9
  from fastapi import FastAPI
10
+ from fastapi.staticfiles import StaticFiles
11
+ from fastapi.responses import FileResponse
12
 
13
  from server.routes import router
14
 
15
+ logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=[logging.StreamHandler(sys.stdout)])
 
 
 
 
 
 
 
 
 
 
16
  logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
17
 
 
 
 
 
18
  app = FastAPI(
19
  title="Clinical Note Scribe – OpenEnv",
20
+ description="OpenEnv-compliant environment for evaluating AI agents on clinical SOAP-note generation.",
21
+ version="1.0.0",
 
 
 
22
  )
23
+ app.include_router(router)
24
 
25
+ FRONTEND_DIR = Path(__file__).resolve().parent.parent / "frontend"
26
+ app.mount("/static", StaticFiles(directory=str(FRONTEND_DIR)), name="static")
27
 
 
 
28
 
29
  @app.get("/", include_in_schema=False)
30
  async def root():
31
+ return FileResponse(str(FRONTEND_DIR / "index.html"))
32
+
33
 
34
  def main():
35
  import uvicorn
36
  uvicorn.run("server.app:app", host="0.0.0.0", port=7860)
37
 
38
+
39
  if __name__ == "__main__":
40
  main()
server/routes.py CHANGED
@@ -1,61 +1,29 @@
1
- """FastAPI route definitions for the Clinical Note Scribe environment.
2
-
3
- Endpoints
4
- ---------
5
- POST /reset – start a new episode (takes ``task_id``, returns ``Observation``)
6
- POST /step – execute an action (takes ``Action``, returns step result)
7
- GET /state – inspect env state (returns ``EnvironmentState``)
8
- GET /health – liveness probe (returns ``{"status": "ok"}``)
9
-
10
- Structured logging
11
- ------------------
12
- The underlying ``ClinicalNoteScribeEnv`` already emits ``[START]``, ``[STEP]``,
13
- and ``[END]`` JSON lines to stdout via Python's ``logging`` module. This router
14
- adds a thin request-level log wrapper so every inbound HTTP call is also
15
- traceable.
16
- """
17
 
18
  from __future__ import annotations
19
 
20
- import logging
21
  import json
 
22
  import time
23
  from typing import Any, Optional
24
 
25
  from fastapi import APIRouter, HTTPException
26
- from pydantic import BaseModel, Field
27
-
28
- from environment.models import (
29
- Action,
30
- EnvironmentState,
31
- Observation,
32
- Reward,
33
- )
34
  from environment.env import ClinicalNoteScribeEnv
35
 
36
  logger = logging.getLogger("clinical_note_scribe.server")
37
-
38
-
39
- # ---------------------------------------------------------------------------
40
- # Singleton environment instance
41
- # ---------------------------------------------------------------------------
42
-
43
  _env = ClinicalNoteScribeEnv()
 
 
44
 
 
 
45
 
46
- # ---------------------------------------------------------------------------
47
- # Request / response schemas
48
- # ---------------------------------------------------------------------------
49
 
50
  class ResetRequest(BaseModel):
51
- task_id: Optional[str] = Field(
52
- default=None,
53
- description=(
54
- "Task to load. One of: easy_routine_checkup, "
55
- "medium_chronic_disease_followup, hard_complex_er_visit. "
56
- "Defaults to the first registered task."
57
- ),
58
- )
59
 
60
 
61
  class StepResponse(BaseModel):
@@ -69,114 +37,47 @@ class HealthResponse(BaseModel):
69
  status: str = "ok"
70
 
71
 
72
- # ---------------------------------------------------------------------------
73
- # Helpers
74
- # ---------------------------------------------------------------------------
75
-
76
- def _log(event: str, **kwargs: Any) -> None:
77
- """Emit a structured JSON log line to stdout."""
78
- payload: dict[str, Any] = {"event": event, "timestamp": time.time()}
79
- payload.update(kwargs)
80
- logger.info(json.dumps(payload, default=str))
81
-
82
-
83
- # ---------------------------------------------------------------------------
84
- # Router
85
- # ---------------------------------------------------------------------------
86
-
87
- router = APIRouter()
88
-
89
-
90
- @router.post(
91
- "/reset",
92
- response_model=Observation,
93
- summary="Reset the environment and start a new episode",
94
- )
95
  async def reset(body: Optional[ResetRequest] = None) -> Observation:
96
- """Load a task and return the initial ``Observation``.
97
-
98
- The underlying environment emits a ``[START]`` log event.
99
- """
100
  task_id = body.task_id if body else None
101
  _log("START", endpoint="/reset", task_id=task_id)
102
  try:
103
- obs = _env.reset(task_id=task_id)
104
  except ValueError as exc:
105
  raise HTTPException(status_code=400, detail=str(exc))
106
- return obs
107
 
108
 
109
- @router.post(
110
- "/step",
111
- response_model=StepResponse,
112
- summary="Submit an action and advance the environment by one step",
113
- )
114
  async def step(payload: dict[str, Any]) -> StepResponse:
115
- """Execute an action in the current episode.
116
-
117
- Accepts a raw JSON body and validates it into an ``Action``.
118
- If validation fails, the error is recorded in the environment
119
- instead of returning an HTTP 422.
120
- """
121
- from pydantic import ValidationError
122
- from environment.models import Reward
123
-
124
  try:
125
  action = Action(**payload)
126
  except (ValidationError, TypeError) as exc:
127
- # Gracefully absorb bad payloads instead of crashing with HTTP 422
128
  _log("STEP", endpoint="/step", action_type="invalid", error=str(exc))
129
  error_msg = f"Invalid action payload: {exc}"
130
  _env._errors_so_far.append(error_msg)
131
  _env._step_count += 1
132
-
133
- obs = _env._build_observation()
134
- reward = Reward(
135
- value=0.0,
136
- signals={"error": 1.0},
137
- done=False,
138
- info={"error": error_msg},
139
- )
140
  return StepResponse(
141
- observation=obs,
142
- reward=reward,
143
- done=False,
144
- info={"error": error_msg},
145
  )
146
 
147
  _log("STEP", endpoint="/step", action_type=action.action_type)
148
  try:
149
  obs, reward, done, info = _env.step(action)
150
  except RuntimeError as exc:
151
- # e.g. stepping after episode is done without reset
152
  raise HTTPException(status_code=409, detail=str(exc))
153
 
154
  if done:
155
  _log("END", endpoint="/step", final_score=reward.value)
156
-
157
- return StepResponse(
158
- observation=obs,
159
- reward=reward,
160
- done=done,
161
- info=info,
162
- )
163
 
164
 
165
- @router.get(
166
- "/state",
167
- response_model=EnvironmentState,
168
- summary="Return the full internal environment state",
169
- )
170
  async def state() -> EnvironmentState:
171
- """Inspect the environment without mutating it."""
172
  return _env.state()
173
 
174
 
175
- @router.get(
176
- "/health",
177
- response_model=HealthResponse,
178
- summary="Liveness probe",
179
- )
180
  async def health() -> HealthResponse:
181
- """Returns HTTP 200 with ``{"status": "ok"}``."""
182
  return HealthResponse()
 
1
+ """FastAPI routes for the Clinical Note Scribe environment."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
5
  import json
6
+ import logging
7
  import time
8
  from typing import Any, Optional
9
 
10
  from fastapi import APIRouter, HTTPException
11
+ from pydantic import BaseModel, Field, ValidationError
12
+
13
+ from environment.models import Action, EnvironmentState, Observation, Reward
 
 
 
 
 
14
  from environment.env import ClinicalNoteScribeEnv
15
 
16
  logger = logging.getLogger("clinical_note_scribe.server")
 
 
 
 
 
 
17
  _env = ClinicalNoteScribeEnv()
18
+ router = APIRouter()
19
+
20
 
21
def _log(event: str, **kw: Any) -> None:
    """Emit one structured JSON log line: event name, timestamp, then extra fields."""
    payload: dict[str, Any] = {"event": event, "timestamp": time.time()}
    payload.update(kw)
    # default=str keeps non-serializable values (e.g. exceptions) loggable.
    logger.info(json.dumps(payload, default=str))
23
 
 
 
 
24
 
25
class ResetRequest(BaseModel):
    """Request body accepted by ``POST /reset``."""

    # A ``None`` task_id lets the environment fall back to its default task.
    task_id: Optional[str] = Field(
        default=None,
        description="Task to load. Defaults to first registered task.",
    )
 
 
 
 
 
 
 
27
 
28
 
29
  class StepResponse(BaseModel):
 
37
  status: str = "ok"
38
 
39
 
40
@router.post("/reset", response_model=Observation, summary="Reset and start a new episode")
async def reset(body: Optional[ResetRequest] = None) -> Observation:
    """Load a task and return the initial ``Observation``.

    A request-level START line is logged here; the underlying environment
    emits its own ``[START]`` event as well.

    Raises:
        HTTPException: 400 when ``task_id`` does not name a registered task
            (surfaced by the environment as a ``ValueError``).
    """
    task_id = body.task_id if body else None
    _log("START", endpoint="/reset", task_id=task_id)
    try:
        return _env.reset(task_id=task_id)
    except ValueError as exc:
        # Chain the cause (B904) so server tracebacks show the original error.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
 
48
 
49
 
50
@router.post("/step", response_model=StepResponse, summary="Submit an action")
async def step(payload: dict[str, Any]) -> StepResponse:
    """Execute one action in the current episode.

    The raw JSON body is validated into an ``Action`` manually (rather than in
    the route signature) so malformed payloads are absorbed as an in-episode
    error with zero reward instead of surfacing as an HTTP 422.

    Raises:
        HTTPException: 409 when stepping a finished episode without a reset.
    """
    try:
        action = Action(**payload)
    except (ValidationError, TypeError) as exc:
        _log("STEP", endpoint="/step", action_type="invalid", error=str(exc))
        error_msg = f"Invalid action payload: {exc}"
        # NOTE(review): reaches into private env state; consider exposing a
        # public ``record_error()`` API on ClinicalNoteScribeEnv instead.
        _env._errors_so_far.append(error_msg)
        _env._step_count += 1
        return StepResponse(
            observation=_env._obs(),
            reward=Reward(value=0.0, signals={"error": 1.0}, done=False, info={"error": error_msg}),
            done=False,
            info={"error": error_msg},
        )

    _log("STEP", endpoint="/step", action_type=action.action_type)
    try:
        obs, reward, done, info = _env.step(action)
    except RuntimeError as exc:
        # e.g. stepping after the episode is done without reset; chain the
        # cause (B904) so the 409 traceback keeps the original RuntimeError.
        raise HTTPException(status_code=409, detail=str(exc)) from exc

    if done:
        _log("END", endpoint="/step", final_score=reward.value)
    return StepResponse(observation=obs, reward=reward, done=done, info=info)
 
 
 
 
 
 
74
 
75
 
76
@router.get("/state", response_model=EnvironmentState, summary="Inspect environment state")
async def state() -> EnvironmentState:
    """Return the environment's full internal state without mutating it."""
    snapshot = _env.state()
    return snapshot
79
 
80
 
81
@router.get("/health", response_model=HealthResponse, summary="Liveness probe")
async def health() -> HealthResponse:
    """Liveness probe; reports that the server process is up."""
    response = HealthResponse()
    return response