ronitraj committed on
Commit 195f87e · verified · 1 Parent(s): 1d9e50c

Real env: openenv-core wrapped DecoderEnvironment + /healthz + /decode
Dockerfile CHANGED
@@ -1,25 +1,61 @@
- FROM python:3.11-slim
-
- ENV PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1

  RUN useradd -m -u 1000 user
  USER user
  ENV PATH="/home/user/.local/bin:$PATH"

  WORKDIR /app

- # Day-0 deployment-substrate dependencies (Section 11 of the plan):
- # stim + pymatching + fastapi + openenv-core, plus uvicorn for the server.
- RUN pip install --user --no-cache-dir \
-     "stim>=1.13,<2.0" \
-     "pymatching>=2.2,<3.0" \
-     "fastapi>=0.110" \
-     "uvicorn[standard]>=0.27" \
-     "openenv-core>=0.2.1"

- COPY --chown=user app.py /app/app.py

  EXPOSE 7860
- CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

+ # Qubit-Medic OpenEnv server container.
+ #
+ # This image ships ONLY the env-server code:
+ #   * stim + pymatching - quantum simulation + matching baseline
+ #   * fastapi + uvicorn - HTTP transport
+ #   * openenv-core      - canonical OpenEnv contract (/reset, /step,
+ #                         /state, /health, /schema, /metadata, /mcp, /docs)
+ #
+ # Heavy ML training deps (torch, transformers, trl, unsloth) are
+ # deliberately NOT installed - they live in requirements-train.txt and
+ # are installed only by the Colab training notebook. Keeping the Spaces
+ # image lean avoids the ~10 GB CUDA wheel that would blow the free tier.

+ FROM python:3.11-slim AS base
+
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
      PIP_NO_CACHE_DIR=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1

+ # Stim and PyMatching ship manylinux wheels - no system C++ deps needed
+ # beyond libstdc++. We keep build-essential as a safety net for any
+ # unexpected source-fallback path on the build host.
+ RUN apt-get update \
+     && apt-get install -y --no-install-recommends \
+         build-essential \
+         ca-certificates \
+         curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # HF Spaces best-practice: run as non-root user with UID 1000.
  RUN useradd -m -u 1000 user
  USER user
  ENV PATH="/home/user/.local/bin:$PATH"

  WORKDIR /app

+ COPY --chown=user requirements.txt /app/requirements.txt
+ RUN pip install --user --upgrade pip \
+     && pip install --user -r /app/requirements.txt
+
+ COPY --chown=user qubit_medic /app/qubit_medic
+ COPY --chown=user README.md /app/README.md

+ # Pre-warm Stim/PyMatching caches at build time so the first request
+ # after `docker run` has near-zero latency (Section 9.1 of the plan).
+ RUN python -c "from qubit_medic.server.environment import DecoderEnvironment; \
+     e = DecoderEnvironment(); \
+     e._cache_for('L1_warmup'); \
+     e._cache_for('L2_target')"

  EXPOSE 7860
+
+ ENV LOG_LEVEL=INFO \
+     QUBIT_MEDIC_HOST=0.0.0.0 \
+     QUBIT_MEDIC_PORT=7860
+
+ # Boots the FastAPI app (qubit_medic.server.app) which is built on top
+ # of openenv.core.create_fastapi_app.
+ CMD ["python", "-m", "qubit_medic.server.app"]
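The three runtime variables set at the bottom of the new stage would be consumed roughly like this (a sketch only; how the real `qubit_medic.server.app` entry point reads them is not shown in this diff):

```python
import os


def server_config() -> tuple[str, int, str]:
    """Read host/port/log-level, falling back to the Dockerfile defaults."""
    host = os.getenv("QUBIT_MEDIC_HOST", "0.0.0.0")
    port = int(os.getenv("QUBIT_MEDIC_PORT", "7860"))
    log_level = os.getenv("LOG_LEVEL", "INFO").upper()
    return host, port, log_level


# With no overrides set, the defaults line up with EXPOSE 7860 above.
host, port, log_level = server_config()
```

Keeping the defaults in both the `ENV` layer and the reader means a plain `docker run` works with no flags, while Spaces (or a local override) can still rebind the port.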
README.md CHANGED
@@ -7,43 +7,118 @@ sdk: docker
  app_port: 7860
  pinned: false
  license: mit
- short_description: Day-0 placeholder for the Qubit-Medic OpenEnv server.
  tags:
  - openenv
  - reinforcement-learning
  - quantum-error-correction
  - stim
  - pymatching
  ---

- # QuantumScribe — Day-0 placeholder

- This Space is the **deployment-substrate placeholder** for the
- **Qubit-Medic** OpenEnv server, an RL training environment that teaches
- an LLM to decode errors on the rotated surface code.

  ## Try it

- * `GET /` — root metadata.
- * `GET /healthz` — liveness probe; returns `{"ok": true, "stim_version": "...", ...}`.

  ```bash
  curl https://ronitraj-quantumscribe.hf.space/healthz
  ```

- ## What's coming

- This placeholder will be replaced by the real **Qubit-Medic** environment:

- | Endpoint | What it does |
- |---|---|
- | `POST /reset` | Sample a fresh syndrome + observation for the LLM. |
- | `POST /step` | Score the LLM's predicted Pauli frame with five independent rewards (logical correction, syndrome consistency, Hamming overlap, format compliance, PyMatching beat-rate). |
- | `GET /health` | Curriculum + episode statistics. |
- | `GET /healthz` | Liveness (already live on this placeholder). |

  ## Stack

- * [Stim](https://github.com/quantumlib/Stim) — Clifford circuit simulator.
- * [PyMatching](https://github.com/oscarhiggott/PyMatching) — minimum-weight matching baseline.
- * [FastAPI](https://fastapi.tiangolo.com/) + [openenv-core](https://pypi.org/project/openenv-core/) server + RL contract.

  app_port: 7860
  pinned: false
  license: mit
+ short_description: OpenEnv RL env that teaches an LLM to decode quantum errors.
  tags:
  - openenv
  - reinforcement-learning
  - quantum-error-correction
  - stim
  - pymatching
+ - grpo
+ - trl
+ - llm
  ---

+ # QuantumScribe — Qubit-Medic OpenEnv

+ > An [OpenEnv](https://meta-pytorch.github.io/OpenEnv/) reinforcement-learning
+ > environment that teaches a Large Language Model to decode errors on the
+ > rotated surface code, using **Stim** for physics-accurate noise sampling
+ > and **PyMatching** as the classical baseline to beat.
+
+ This Space hosts the **environment server only**. Training (SFT warm-up
+ + GRPO RL) runs on a separate Colab T4; the trained LoRA adapter is
+ loaded client-side, not on this Space.
+
+ ## Endpoints
+
+ This is the canonical **OpenEnv contract** registered by
+ `openenv.core.create_fastapi_app`, plus two extras of our own:
+
+ | Method | Path | Purpose |
+ |---|---|---|
+ | `POST` | `/reset` | Sample a fresh syndrome + observation. Body `{"seed": int?, "episode_id": str?}`. Optional `?forced_level=L1_warmup\|L2_target\|L3_stretch`. |
+ | `POST` | `/step` | Score the LLM's prediction with five independent rewards. Body `{"action": {"raw_response": "...", "episode_id": int}, "timeout_s": float?, "request_id": str?}`. |
+ | `GET` | `/state` | Curriculum + episode counters (no physics-truth fields). |
+ | `GET` | `/health` | OpenEnv liveness response. |
+ | `GET` | `/healthz` | Lightweight Day-0 probe — Stim/PyMatching/openenv versions. |
+ | `GET` | `/schema` | JSON Schema for `QubitMedicAction` / `QubitMedicObservation`. |
+ | `GET` | `/metadata` | Environment metadata (name, description, version). |
+ | `POST` | `/mcp` | Model Context Protocol endpoint. |
+ | `POST` | `/decode` | PyMatching baseline demo: pass a hand-crafted syndrome, get the matching-decoder result. |
+ | `GET` | `/docs` | Swagger UI for everything above. |

  ## Try it

+ Curl from anywhere:

  ```bash
  curl https://ronitraj-quantumscribe.hf.space/healthz
  ```

+ Use it from Python with the OpenEnv client:
+
+ ```python
+ from openenv.core import GenericEnvClient
+
+ with GenericEnvClient(base_url="https://ronitraj-quantumscribe.hf.space").sync() as env:
+     obs = env.reset(seed=42)
+     print(obs.observation["prompt"][:200])
+     result = env.step({"raw_response": "<answer>X: 0,3 | Z:</answer>", "episode_id": 1})
+     print("reward:", result.reward, "rewards breakdown:", result.observation["info"]["rewards"])
+ ```
+
+ Or hit the env directly with `httpx`:
+
+ ```python
+ import httpx
+
+ url = "https://ronitraj-quantumscribe.hf.space"
+ obs = httpx.post(f"{url}/reset", json={"seed": 42},
+                  params={"forced_level": "L2_target"}).json()["observation"]
+ print(obs["prompt"][:200])
+ res = httpx.post(f"{url}/step", json={
+     "action": {"raw_response": "<answer>X: | Z:</answer>",
+                "episode_id": obs["episode_id"]}
+ }).json()
+ print("reward =", res["reward"], "rewards =", res["observation"]["info"]["rewards"])
+ ```
+
+ ## What the rewards mean
+
+ Each `step` returns five *independent, verifiable* reward components:

+ | Reward | Weight | What it measures |
+ |---|---:|---|
+ | `logical_correction` | 0.40 | 1 if the predicted Pauli frame preserves the logical-Z observable. |
+ | `syndrome_consistency` | 0.20 | Hamming similarity over final-round detector parities. |
+ | `hamming_overlap` | 0.20 | Mean Jaccard similarity vs. the PyMatching reference Pauli frame. |
+ | `format_compliance` | 0.10 | 1 / 0.5 / 0 for full / partial / unparseable LLM output. |
+ | `pymatching_beat` | 0.10 | 1 iff PyMatching is wrong on this syndrome **and** the model is right. |

+ All five are computed from the same `(prompt, completion, syndrome)` tuple on every step (no redundant sampling — see the architecture notes in the GitHub README).

  ## Stack

+ * [openenv-core](https://pypi.org/project/openenv-core/) `>=0.2.1` — environment contract, FastAPI scaffolding, MCP, WebSocket sessions.
+ * [Stim](https://github.com/quantumlib/Stim) — Clifford circuit simulator and detector-error-model generator.
+ * [PyMatching](https://github.com/oscarhiggott/PyMatching) `>=2.2` — minimum-weight matching baseline (and ground truth for the `pymatching_beat` reward).
+ * FastAPI + Uvicorn — HTTP transport.
+
+ ## Curriculum
+
+ | Level | Distance | Rounds | Physical error rate | Promotion threshold |
+ |---|---:|---:|---:|---:|
+ | `L1_warmup` | 3 | 1 | 1e-4 | logical_correction ≥ 0.80 |
+ | `L2_target` | 3 | 3 | 1e-3 | logical_correction ≥ 0.70 |
+ | `L3_stretch` | 5 | 5 | 1e-3 | logical_correction ≥ 0.30 |
+
+ The server's curriculum scheduler tracks a moving average of `logical_correction` per level and promotes the agent when it crosses the threshold.
+
+ ## See also
+
+ * OpenEnv documentation — <https://meta-pytorch.github.io/OpenEnv/>
+ * TRL `environment_factory=` integration — <https://huggingface.co/docs/trl/main/openenv>
+
+ ---
+
+ *Built for the META RL hackathon. Source code, training notebook, and reproduction instructions: see the [GitHub repository](https://github.com/ronitraj/qubit-medic) (link to be updated).*
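The promotion rule the new README describes — a moving average of `logical_correction` per level, promoting when it crosses the threshold — can be sketched as follows (illustrative only; the server's actual scheduler keeps more state, e.g. the per-level `eval_size`):

```python
from collections import deque


class PromotionTracker:
    """Track a moving average of logical_correction for one curriculum level."""

    def __init__(self, threshold: float, window: int = 100) -> None:
        self.threshold = threshold
        self.scores: deque[float] = deque(maxlen=window)

    def record(self, logical_correction: float) -> bool:
        """Record one episode's score; True once a full window's average
        reaches the level's promotion threshold."""
        self.scores.append(logical_correction)
        avg = sum(self.scores) / len(self.scores)
        return len(self.scores) == self.scores.maxlen and avg >= self.threshold


# L1_warmup promotes at 0.80; early weak episodes hold the agent back,
# a sustained run of correct decodes promotes it.
tracker = PromotionTracker(threshold=0.80, window=10)
promoted = [tracker.record(s) for s in [0.6] * 10 + [1.0] * 10]
```

Requiring a full window before promoting prevents a lucky first episode from skipping a level.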
app.py DELETED
@@ -1,78 +0,0 @@
- """QuantumScribe - Day-0 deployment-substrate placeholder.
-
- This minimal FastAPI app proves the Hugging Face Spaces deployment
- substrate works for the Qubit-Medic / QuantumScribe project:
-
- * Stim + PyMatching + FastAPI + openenv-core install cleanly in HF's
-   build environment (Day-0 step 2).
- * GET /healthz returns {"ok": true, "stim_version": "..."}, proving the
-   server boots and the heavy quantum dependency loads (Day-0 step 3).
- * The endpoint is reachable from a browser (Day-0 step 4) and from a
-   Colab `requests.get(...)` call (Day-0 step 5).
-
- Once all Day-0 gates pass, this file is replaced by the real Qubit-Medic
- OpenEnv server (qubit_medic.server.app) at the same Space URL, inheriting
- the warm build cache.
- """
- from __future__ import annotations
-
- import sys
-
- import stim
- from fastapi import FastAPI
-
-
- app = FastAPI(
-     title="QuantumScribe (Qubit-Medic) - Hello Space",
-     description="Day-0 deployment-substrate placeholder for the Qubit-Medic "
-                 "OpenEnv server. Will be replaced by the real env shortly.",
-     version="0.0.1-placeholder",
- )
-
-
- _PYMATCHING_VERSION: str
- try:
-     import pymatching as _pm
-     _PYMATCHING_VERSION = getattr(_pm, "__version__", "unknown")
- except Exception as exc:
-     _PYMATCHING_VERSION = f"import-error: {exc}"
-
- _OPENENV_VERSION: str
- try:
-     import openenv as _oe
-     _OPENENV_VERSION = getattr(_oe, "__version__", "unknown")
- except Exception as exc:
-     _OPENENV_VERSION = f"import-error: {exc}"
-
-
- @app.get("/")
- def root() -> dict:
-     return {
-         "service": "QuantumScribe (Qubit-Medic)",
-         "status": "Day-0 placeholder live",
-         "next": "POST /reset and /step will become available once the real "
-                 "DecoderEnvironment is pushed.",
-         "endpoints": ["/", "/healthz"],
-         "links": {
-             "github": "https://github.com/ronitraj (replace with repo URL once public)",
-             "openenv_docs": "https://meta-pytorch.org/OpenEnv/",
-         },
-     }
-
-
- @app.get("/healthz")
- def healthz() -> dict:
-     """Day-0 liveness probe.
-
-     Returns the Stim version - so curl-ing this in a browser or from Colab
-     proves both that networking works AND that the heavy quantum deps
-     actually loaded. This is the literal endpoint the plan calls for.
-     """
-     return {
-         "ok": True,
-         "stim_version": stim.__version__,
-         "pymatching_version": _PYMATCHING_VERSION,
-         "openenv_version": _OPENENV_VERSION,
-         "python_version": sys.version.split()[0],
-         "service": "QuantumScribe",
-     }
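The Day-0 gate this deleted placeholder enforced — `/healthz` must return real version strings, not import errors — can be checked client-side with a small validator (a sketch; the payload shape is taken from the deleted `healthz` handler above, and the sample values below are made up):

```python
def healthz_ok(payload: dict) -> bool:
    """Accept only a payload shaped like the placeholder's /healthz response,
    rejecting any field that recorded an import failure."""
    required = {"ok", "stim_version", "pymatching_version",
                "openenv_version", "python_version", "service"}
    return (payload.get("ok") is True
            and required <= payload.keys()
            and not any(str(payload[k]).startswith("import-error")
                        for k in required - {"ok"}))


sample = {
    "ok": True, "stim_version": "1.13.0", "pymatching_version": "2.2.1",
    "openenv_version": "0.2.1", "python_version": "3.11.9",
    "service": "QuantumScribe",
}
```

Because the handler swallows import errors into strings instead of crashing, a probe that only checks HTTP 200 would miss a broken dependency; checking for the `import-error:` prefix closes that gap.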
qubit_medic/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """Qubit-Medic: An LLM-trained quantum error-correction decoder.
+
+ The package is split into three layers (Section 0 of the plan):
+
+ * ``qubit_medic.config`` - the locked experiment configuration.
+ * ``qubit_medic.server`` - Stim physics, rewards, curriculum, FastAPI app.
+ * ``qubit_medic.client`` - the lightweight HTTP stub the trainer imports.
+
+ ``qubit_medic.models`` and ``qubit_medic.prompts`` are the contract both sides
+ agree on: what the LLM sees and what the LLM is allowed to emit.
+ """
+
+ from qubit_medic import config, models, prompts  # noqa: F401
+
+ __version__ = "1.0.0"
qubit_medic/client/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Qubit-Medic client - the lightweight HTTP stub the trainer imports."""
+
+ from qubit_medic.client.client import DecoderClient, LocalDecoderClient
+
+ __all__ = ["DecoderClient", "LocalDecoderClient"]
qubit_medic/client/client.py ADDED
@@ -0,0 +1,132 @@
+ """Two equivalent client implementations:
+
+ * :class:`DecoderClient` - hits an HTTP endpoint (HF Spaces deployment).
+   Speaks the **OpenEnv** wire format:
+     - ``POST /reset`` body ``{"seed": int?, "episode_id": str?}``
+     - ``POST /step`` body ``{"action": {"raw_response": "...", ...},
+       "timeout_s": float?, "request_id": str?}``
+ * :class:`LocalDecoderClient` - calls the in-process env directly. Use this
+   in tests, in CI, and during local Colab runs to skip HTTP overhead.
+
+ Both expose the same ``reset`` / ``step`` API so the training scripts can
+ swap between them via a single env var (``QUBIT_MEDIC_URL``).
+ """
+ from __future__ import annotations
+
+ import os
+ from typing import Optional, Protocol
+
+ import httpx
+
+ from qubit_medic.models import (
+     DecoderObservation,
+     StepResult,
+ )
+
+
+ class _ClientProtocol(Protocol):
+     def reset(self, *, seed: Optional[int] = None,
+               forced_level: Optional[str] = None) -> DecoderObservation: ...
+     def step(self, *, raw_response: str, episode_id: int) -> StepResult: ...
+     def health(self) -> dict: ...
+     def close(self) -> None: ...
+
+
+ def _obs_from_openenv(payload: dict) -> DecoderObservation:
+     """Re-hydrate our internal :class:`DecoderObservation` from the
+     OpenEnv response body. The OpenEnv wrapper inlines all our fields
+     onto the observation, so this is a 1-1 field mapping."""
+     return DecoderObservation(
+         prompt=payload.get("prompt", ""),
+         syndrome_bits=list(payload.get("syndrome_bits", [])),
+         distance=int(payload.get("distance", 0)),
+         rounds=int(payload.get("rounds", 0)),
+         p=float(payload.get("p", 0.0)),
+         curriculum_level=payload.get("curriculum_level", ""),
+         episode_id=int(payload.get("episode_id", 0)),
+         dem_digest=payload.get("dem_digest", ""),
+     )
+
+
+ class DecoderClient:
+     """HTTP client targeting a deployed FastAPI server (OpenEnv shape)."""
+
+     def __init__(self, base_url: str, *, timeout: float = 60.0) -> None:
+         self._client = httpx.Client(base_url=base_url.rstrip("/"), timeout=timeout)
+
+     def reset(self, *, seed: Optional[int] = None,
+               forced_level: Optional[str] = None) -> DecoderObservation:
+         # OpenEnv's ResetRequest only accepts seed + episode_id. We pass
+         # forced_level via the URL query string so adapters that honour
+         # it (our QubitMedicEnvironment via **kwargs) pick it up; servers
+         # that ignore it just get a default level.
+         body: dict = {}
+         if seed is not None:
+             body["seed"] = seed
+         params = {"forced_level": forced_level} if forced_level else None
+         r = self._client.post("/reset", json=body, params=params)
+         r.raise_for_status()
+         payload = r.json()
+         # OpenEnv returns {observation: {...}, reward, done}.
+         return _obs_from_openenv(payload.get("observation", payload))
+
+     def step(self, *, raw_response: str, episode_id: int) -> StepResult:
+         body = {
+             "action": {
+                 "raw_response": raw_response,
+                 "episode_id": episode_id,
+             },
+         }
+         r = self._client.post("/step", json=body)
+         r.raise_for_status()
+         payload = r.json()
+         obs_payload = payload.get("observation", {})
+         return StepResult(
+             observation=_obs_from_openenv(obs_payload),
+             reward=float(payload.get("reward", 0.0) or 0.0),
+             done=bool(payload.get("done", True)),
+             truncated=bool(obs_payload.get("info", {}).get("timed_out", False)),
+             info=dict(obs_payload.get("info", {})),
+         )
+
+     def health(self) -> dict:
+         r = self._client.get("/health")
+         r.raise_for_status()
+         return r.json()
+
+     def healthz(self) -> dict:
+         r = self._client.get("/healthz")
+         r.raise_for_status()
+         return r.json()
+
+     def close(self) -> None:
+         self._client.close()
+
+
+ class LocalDecoderClient:
+     """In-process client - calls :class:`DecoderEnvironment` directly."""
+
+     def __init__(self, env=None) -> None:
+         from qubit_medic.server.environment import DecoderEnvironment
+         self._env = env if env is not None else DecoderEnvironment()
+
+     def reset(self, *, seed: Optional[int] = None,
+               forced_level: Optional[str] = None) -> DecoderObservation:
+         return self._env.reset(seed=seed, forced_level=forced_level)
+
+     def step(self, *, raw_response: str, episode_id: int) -> StepResult:
+         return self._env.step(raw_response=raw_response, episode_id=episode_id)
+
+     def health(self) -> dict:
+         return self._env.health()
+
+     def close(self) -> None:  # nothing to clean up
+         pass
+
+
+ def make_default_client() -> _ClientProtocol:
+     """Return :class:`DecoderClient` if ``QUBIT_MEDIC_URL`` is set, else local."""
+     url = os.getenv("QUBIT_MEDIC_URL")
+     if url:
+         return DecoderClient(url)
+     return LocalDecoderClient()
qubit_medic/config.py ADDED
@@ -0,0 +1,254 @@
+ """Locked experiment configuration (Section 1.4 of the plan).
+
+ Every magic number in the project lives here. Do not hard-code circuit
+ parameters, noise rates, or model identifiers anywhere else; import them
+ from this module instead.
+
+ Cited literature
+ ----------------
+ Bausch et al., AlphaQubit, *Nature* 635:834 (2024)
+     DOI: 10.1038/s41586-024-08148-8
+     https://www.nature.com/articles/s41586-024-08148-8
+ Acharya et al. (Google QAI), *Willow*, arXiv:2408.13687 (2024)
+     https://arxiv.org/abs/2408.13687
+ Gidney & Fowler, *SI1000*, arXiv:2108.10457 (2021)
+     https://arxiv.org/abs/2108.10457
+ Higgott & Gidney, *PyMatching v2*, arXiv:2303.15933 (2023)
+     https://arxiv.org/abs/2303.15933
+ Shao et al., *DeepSeekMath / GRPO*, arXiv:2402.03300 (2024)
+     https://arxiv.org/abs/2402.03300
+ """
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Mapping
+
+
+ # --------------------------------------------------------------------------- #
+ # Quantum code geometry                                                       #
+ # --------------------------------------------------------------------------- #
+
+ CODE_TASK = "surface_code:rotated_memory_z"
+ """Stim task identifier. We always use the rotated surface code with a Z
+ memory experiment - same family AlphaQubit and Willow report on."""
+
+ DISTANCE_PRIMARY: int = 3
+ """Distance-3 is the primary benchmark configuration (AlphaQubit Fig. 2b)."""
+
+ DISTANCE_STRETCH: int = 5
+ """Distance-5 is the stretch-goal configuration for Section 4.3."""
+
+ ROUNDS_FACTOR: int = 1
+ """rounds = ROUNDS_FACTOR * distance. Value 1 matches the AlphaQubit
+ distance-equals-rounds protocol."""
+
+
+ # --------------------------------------------------------------------------- #
+ # Noise model: SI1000 sub-rates (Gidney & Fowler 2021, Table 1)               #
+ # --------------------------------------------------------------------------- #
+ # SI1000 maps a single physical error budget ``p`` to four operation-specific
+ # sub-rates. The factors below come from arXiv:2108.10457 Table 1 and are the
+ # *same* values Google's QAI uses in their Willow analyses.
+ #
+ # Stim's surface_code:rotated_memory_z generator accepts four matching knobs:
+ #     after_clifford_depolarization     (two-qubit gate noise)
+ #     before_round_data_depolarization  (idle data-qubit noise per round)
+ #     before_measure_flip_probability   (measurement noise)
+ #     after_reset_flip_probability      (reset noise)
+
+
+ @dataclass(frozen=True)
+ class SI1000Rates:
+     """Per-operation error rates derived from a single budget ``p``."""
+
+     after_clifford_depolarization: float
+     before_round_data_depolarization: float
+     before_measure_flip_probability: float
+     after_reset_flip_probability: float
+
+     @classmethod
+     def from_p(cls, p: float) -> "SI1000Rates":
+         """Build SI1000 sub-rates from the headline budget ``p``.
+
+         The factors are exactly Gidney & Fowler 2021 Table 1.
+         """
+         return cls(
+             after_clifford_depolarization=p,
+             before_round_data_depolarization=p / 10.0,
+             before_measure_flip_probability=p * 5.0,
+             after_reset_flip_probability=p * 2.0,
+         )
+
+     def as_stim_kwargs(self) -> Mapping[str, float]:
+         """Return the kwargs dict accepted by ``stim.Circuit.generated``."""
+         return {
+             "after_clifford_depolarization": self.after_clifford_depolarization,
+             "before_round_data_depolarization": self.before_round_data_depolarization,
+             "before_measure_flip_probability": self.before_measure_flip_probability,
+             "after_reset_flip_probability": self.after_reset_flip_probability,
+         }
+
+
+ # --------------------------------------------------------------------------- #
+ # Curriculum levels (Section 4)                                               #
+ # --------------------------------------------------------------------------- #
+
+
+ @dataclass(frozen=True)
+ class CurriculumLevel:
+     """One rung on the difficulty ladder."""
+
+     name: str
+     distance: int
+     rounds: int
+     p: float
+     promotion_threshold: float  # logical-correction rate at which we move on
+     eval_size: int              # held-out shots used to test promotion
+
+
+ CURRICULUM: tuple[CurriculumLevel, ...] = (
+     CurriculumLevel(
+         name="L1_warmup",
+         distance=DISTANCE_PRIMARY,
+         rounds=1,
+         p=0.0001,
+         promotion_threshold=0.80,
+         eval_size=100,
+     ),
+     CurriculumLevel(
+         name="L2_target",
+         distance=DISTANCE_PRIMARY,
+         rounds=DISTANCE_PRIMARY,
+         p=0.001,
+         promotion_threshold=0.70,
+         eval_size=200,
+     ),
+     CurriculumLevel(
+         name="L3_stretch",
+         distance=DISTANCE_STRETCH,
+         rounds=DISTANCE_STRETCH,
+         p=0.001,
+         promotion_threshold=0.30,  # stretch goal - even partial counts
+         eval_size=200,
+     ),
+ )
+
+
+ # --------------------------------------------------------------------------- #
+ # Reward weights (Section 3) - sum to 1.0 by construction                     #
+ # --------------------------------------------------------------------------- #
+
+ REWARD_WEIGHTS: dict[str, float] = {
+     "logical_correction": 0.40,    # Reward 1 - the unfakeable ground truth
+     "syndrome_consistency": 0.20,  # Reward 2 - prevents lucky-guess attacks
+     "hamming_overlap": 0.20,       # Reward 3 - dense partial credit
+     "format_compliance": 0.10,     # Reward 4 - parser must succeed
+     "pymatching_beat": 0.10,       # Reward 5 - the headline metric
+ }
+ assert abs(sum(REWARD_WEIGHTS.values()) - 1.0) < 1e-9, "reward weights must sum to 1"
+
+
+ # --------------------------------------------------------------------------- #
+ # Reproducibility                                                             #
+ # --------------------------------------------------------------------------- #
+
+ SEEDS: tuple[int, ...] = (42, 1337, 2024)
+ """Three seeds for error bars - never run with anything else."""
+
+ PRIMARY_SEED: int = SEEDS[0]
+
+
+ # --------------------------------------------------------------------------- #
+ # Model + training                                                            #
+ # --------------------------------------------------------------------------- #
+
+ MODEL_ID: str = "Qwen/Qwen2.5-3B-Instruct"
+ """3B params, 4-bit quantised + LoRA fits in a Colab T4."""
+
+ LORA_R: int = 16
+ LORA_ALPHA: int = 32
+ LORA_TARGET_MODULES: tuple[str, ...] = ("q_proj", "k_proj", "v_proj", "o_proj")
+
+ SFT_EPOCHS: int = 1
+ SFT_BATCH_SIZE: int = 4
+ SFT_GRAD_ACCUM: int = 4
+ SFT_LR: float = 2e-4
+ SFT_DATASET_SIZE: int = 5_000
+ SFT_MAX_SEQ_LEN: int = 2048
+
+ GRPO_STEPS: int = 2_000
+ GRPO_GEN_PER_PROMPT: int = 4
+ GRPO_LR: float = 1e-5
+ GRPO_KL_COEF: float = 0.04
+ GRPO_MAX_PROMPT_LEN: int = 512
+ GRPO_MAX_COMPLETION_LEN: int = 256
+ GRPO_CHECKPOINT_EVERY: int = 250
+ GRPO_LOG_EVERY: int = 50
+
+ # Decoding sampler defaults at evaluation/format-test time.
+ SAMPLE_TEMPERATURE: float = 0.7
+ SAMPLE_TOP_P: float = 0.95
+
+
+ # --------------------------------------------------------------------------- #
+ # Server / deployment                                                         #
+ # --------------------------------------------------------------------------- #
+
+ EPISODE_TIMEOUT_SECONDS: float = 30.0
+ """Wall-clock budget per episode (Section 2.6)."""
+
+ DEFAULT_HOST: str = "0.0.0.0"
+ DEFAULT_PORT: int = 7860  # Hugging Face Spaces' default exposed port
+
+
+ # --------------------------------------------------------------------------- #
+ # Weights & Biases                                                            #
+ # --------------------------------------------------------------------------- #
+ # Centralised so the SFT trainer, GRPO trainer, eval script, and notebook
+ # all log to the same project / dashboard. Override per-run on the CLI.
+ import os as _os  # noqa: E402 (local import to keep top of module clean)
+
+ WANDB_PROJECT: str = _os.environ.get("WANDB_PROJECT", "qubit-medic")
+ """Default W&B project name. Override with ``WANDB_PROJECT=...``."""
+
+ WANDB_ENTITY: str | None = _os.environ.get("WANDB_ENTITY") or None
+ """W&B team or username. ``None`` -> wandb's default entity for the user."""
+
+ WANDB_DEFAULT_TAGS: tuple[str, ...] = (
+     "qubit-medic",
+     "quantum-error-correction",
+     "openenv",
+     f"distance-{DISTANCE_PRIMARY}",
+     "si1000",
+ )
+ """Tags applied to every W&B run (per-script tags appended on top)."""
+
+ WANDB_LOG_GENERATIONS_EVERY: int = 50
+ """Log a sample-completion table every N GRPO steps."""
+
+ WANDB_SAMPLE_GENERATIONS: int = 8
+ """Number of generations included in each sample-completion table."""
+
+ WANDB_INLOOP_EVAL_EVERY: int = 200
+ """Run an in-loop evaluation pass (deterministic, ``WANDB_INLOOP_EVAL_EPISODES``
+ syndromes) every N GRPO steps. Set to 0 to disable."""
+
+ WANDB_INLOOP_EVAL_EPISODES: int = 50
+ """Number of held-out syndromes per in-loop eval pass (kept small for speed)."""
+
+
+ # --------------------------------------------------------------------------- #
+ # Convenience accessors                                                       #
+ # --------------------------------------------------------------------------- #
+
+
+ def level_by_name(name: str) -> CurriculumLevel:
+     for lvl in CURRICULUM:
+         if lvl.name == name:
+             return lvl
+     raise KeyError(f"unknown curriculum level {name!r}")
+
+
+ def primary_level() -> CurriculumLevel:
+     """The L2 target benchmark - what the headline numbers come from."""
+     return level_by_name("L2_target")
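A quick numeric check of the Table-1 mapping that `SI1000Rates.from_p` encodes: for the L2 budget p = 1e-3, the four sub-rates come out as below (a standalone restatement of the factors, not an import of the module):

```python
def si1000(p: float) -> dict[str, float]:
    """Gidney & Fowler 2021 Table-1 factors, as in SI1000Rates.from_p."""
    return {
        "after_clifford_depolarization": p,          # two-qubit gates: 1x
        "before_round_data_depolarization": p / 10.0,  # idling: x/10
        "before_measure_flip_probability": p * 5.0,    # measurement: 5x
        "after_reset_flip_probability": p * 2.0,       # reset: 2x
    }


rates = si1000(1e-3)
# Measurement is the noisiest operation (5x the budget), idling the quietest.
```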
qubit_medic/models.py ADDED
@@ -0,0 +1,143 @@
+ """Pydantic data classes shared by client and server (Section 2.2 of the plan).
+
+ Three classes draw the trust boundary:
+
+ * ``DecoderObservation`` - what the LLM sees on each step.
+ * ``DecoderAction`` - what the LLM emits (after parsing).
+ * ``DecoderState`` - server-side state, never serialised to the client.
+
+ Keeping the wire schema explicit is what closes off reward-hacking attacks:
+ the LLM literally cannot reach into the ``true_error_pattern`` because that
+ field is not in any class it ever receives.
+ """
+ from __future__ import annotations
+
+ from typing import Any, Optional
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+
+ # --------------------------------------------------------------------------- #
+ # Wire types - sent across the OpenEnv HTTP boundary                          #
+ # --------------------------------------------------------------------------- #
+
+
+ class DecoderObservation(BaseModel):
+     """The view the LLM (and only the LLM) sees on each step."""
+
+     model_config = ConfigDict(frozen=True)
+
+     prompt: str = Field(
+         ...,
+         description=(
+             "Pre-formatted prompt string. This is exactly what the trainer "
+             "passes to the policy - it appears verbatim in training logs."
+         ),
+     )
+     syndrome_bits: list[int] = Field(
+         ...,
+         description=(
+             "Raw detector activations (0/1). Provided for debugging and "
+             "reward-hacking audits; the LLM should be reading the prompt, not "
+             "this array."
+         ),
+     )
+     distance: int = Field(..., description="Code distance for this episode.")
+     rounds: int = Field(..., description="Number of stabiliser rounds.")
+     p: float = Field(..., description="Physical error budget (SI1000 base).")
+     curriculum_level: str = Field(..., description="Curriculum level name.")
+     episode_id: int = Field(..., description="Monotonic episode counter.")
+     dem_digest: str = Field(
+         ...,
+         description=(
+             "Short hash of the detector error model used this episode. The "
+             "trainer logs this so we can group rollouts by physics config."
+         ),
+     )
+
+
+ class DecoderAction(BaseModel):
+     """Action emitted by the LLM after parsing.
+
+     ``raw_response`` is preserved exactly so we can satisfy the participant
+     guide's *inspect generations* mandate (Section 2.5 of the plan).
+     """
+
+     model_config = ConfigDict(frozen=True)
+
+     x_error_qubits: list[int] = Field(default_factory=list)
+     z_error_qubits: list[int] = Field(default_factory=list)
+     raw_response: str = ""
+     parse_success: bool = True
+
+
+ class StepResult(BaseModel):
+     """Standard env step return (mirrors OpenEnv core/Gymnasium)."""
+
+     observation: DecoderObservation
+     reward: float
+     done: bool
+     truncated: bool = False
+     info: dict[str, Any] = Field(default_factory=dict)
+
+
+ class ResetRequest(BaseModel):
+     """Optional knobs the trainer can pass to ``reset``."""
+
+     seed: Optional[int] = None
+     forced_level: Optional[str] = Field(
+         default=None,
+         description=(
+             "Override the curriculum scheduler. Used by eval scripts that "
+             "want a specific (distance, rounds, p) configuration."
+         ),
+     )
+
+
+ class StepRequest(BaseModel):
+     """The trainer sends the LLM's raw text; the server parses + scores."""
+
+     raw_response: str
+     episode_id: int
+
+
+ # --------------------------------------------------------------------------- #
+ # Server-only state - intentionally NOT a wire type                           #
+ # --------------------------------------------------------------------------- #
+
+
+ class DecoderState(BaseModel):
+     """Per-episode state kept on the server; never sent to the client.
+
+     Pydantic ``arbitrary_types_allowed`` is on so this class can hold
+     non-JSON server-side objects if needed. The state is not serialised
+     over HTTP - it lives in the server's per-episode dict and is discarded
+     on ``done``.
+     """
+
+     model_config = ConfigDict(arbitrary_types_allowed=True, frozen=False)
+
+     episode_id: int
+     seed: int
+     curriculum_level: str
+     distance: int
+     rounds: int
+     p: float
+
+     syndrome_bits: list[int]
+     true_x_errors: list[int]
+     true_z_errors: list[int]
+     actual_observable_flip: int  # 0 or 1; the unfakeable ground truth
+     pymatching_observable_pred: int  # 0 or 1; baseline's prediction
+
+     # Pre-computed quantities the reward functions need.
+     x_observable_support: list[int]  # data qubits whose Z error flips X obs
+     z_observable_support: list[int]  # data qubits whose X error flips Z obs
+     num_data_qubits: int
+     num_stabilizers: int
+
+     # Stim/PyMatching inputs - stored as text so the state stays picklable.
+     circuit_text: str
+     dem_text: str
+
+     # Reward audit log.
+     last_reward_breakdown: dict[str, float] = Field(default_factory=dict)
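The trust boundary described above is structural rather than procedural: the ground truth simply has no field in any wire type. A minimal self-contained sketch of that property, using stdlib dataclasses as stand-ins for the pydantic models (the class and field names mirror the diff; the dataclass machinery is illustrative only):

```python
import dataclasses
import json

# Simplified stand-ins for the pydantic wire/server classes above.
@dataclasses.dataclass(frozen=True)
class Observation:              # wire type - crosses the HTTP boundary
    prompt: str
    syndrome_bits: list

@dataclasses.dataclass
class ServerState:              # server-only - never serialised to the client
    syndrome_bits: list
    actual_observable_flip: int  # ground truth the LLM must never see

state = ServerState(syndrome_bits=[0, 1, 0], actual_observable_flip=1)
obs = Observation(prompt="...", syndrome_bits=state.syndrome_bits)

wire_payload = json.dumps(dataclasses.asdict(obs))
# The ground truth is simply absent from anything that goes over the wire.
assert "actual_observable_flip" not in wire_payload
```

No audit of the serializer is needed: a field that does not exist on the wire type cannot leak.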
qubit_medic/prompts.py ADDED
@@ -0,0 +1,230 @@
+ """Prompt formatter and action parser (Section 2.3 + Section 2.5 of the plan).
+
+ The prompt is engineered around five sections:
+
+ 1. Role declaration
+ 2. Physics summary (~50 tokens, plain English)
+ 3. Syndrome data (round-by-round, labelled)
+ 4. Output format spec (one example included)
+ 5. Reasoning trigger ("think step by step ...")
+
+ Total budget ~250-300 tokens for the prompt; ~150 for the response.
+
+ The parser is deliberately permissive on whitespace and bracket style but
+ strict on the existence of the two key tokens ``X_ERRORS`` and ``Z_ERRORS``.
+ A partial-credit hook is exposed so Reward 4 can hand out 0.5 for "partly
+ parseable".
+ """
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass
+ from typing import Iterable
+
+
+ # --------------------------------------------------------------------------- #
+ # Prompt formatting                                                           #
+ # --------------------------------------------------------------------------- #
+
+ _ROLE = (
+     "You are a quantum error-correction decoder. You are decoding errors in "
+     "a distance-{distance} rotated surface code memory experiment."
+ )
+
+ _PHYSICS_SUMMARY = (
+     "Stabilizers are parity checks measured every round. A *syndrome bit* "
+     "is 1 when a stabilizer's measurement disagrees with its previous round, "
+     "indicating a nearby physical error. Your job is to look at the syndrome "
+     "history and output the smallest physical error pattern (X-flips and "
+     "Z-flips on data qubits, identified by integer IDs) that explains it."
+ )
+
+ _OUTPUT_SPEC = (
+     "Output format (REQUIRED, exact):\n"
+     "  X_ERRORS=[id1,id2,...] Z_ERRORS=[id1,id2,...]\n"
+     "Use empty lists when no errors of that type. Example with no errors:\n"
+     "  X_ERRORS=[] Z_ERRORS=[]"
+ )
+
+ _REASONING_TRIGGER = (
+     "Think step by step about which qubits could have caused this syndrome, "
+     "then output your prediction in the required format."
+ )
+
+
+ def format_syndrome_block(
+     syndrome_bits: Iterable[int],
+     rounds: int,
+     num_x_stabilizers: int,
+     num_z_stabilizers: int,
+ ) -> str:
+     """Render the detector activations round-by-round.
+
+     Stim emits detectors in a flat row-major order: round 0 stabilisers first,
+     then round 1, and so on. We label by round and stabiliser type so the LLM
+     can read the temporal structure.
+     """
+     bits = list(syndrome_bits)
+     per_round = num_x_stabilizers + num_z_stabilizers
+     lines = ["Syndrome (round-by-round):"]
+     if per_round == 0 or rounds == 0 or len(bits) == 0:
+         lines.append("  (no detectors fired)")
+         return "\n".join(lines)
+
+     for r in range(rounds):
+         offset = r * per_round
+         if offset >= len(bits):
+             break
+         chunk = bits[offset : offset + per_round]
+         x_chunk = chunk[:num_x_stabilizers]
+         z_chunk = chunk[num_x_stabilizers : num_x_stabilizers + num_z_stabilizers]
+         lines.append(
+             f"  Round {r + 1} X-stabilizers: "
+             + " ".join(str(b) for b in x_chunk)
+         )
+         lines.append(
+             f"  Round {r + 1} Z-stabilizers: "
+             + " ".join(str(b) for b in z_chunk)
+         )
+     # Trailing block for the final destructive measurement, if any extras.
+     used = rounds * per_round
+     if used < len(bits):
+         tail = bits[used:]
+         lines.append("  Final-round detectors: " + " ".join(str(b) for b in tail))
+     return "\n".join(lines)
+
+
+ def build_prompt(
+     *,
+     distance: int,
+     rounds: int,
+     p: float,
+     syndrome_bits: list[int],
+     num_x_stabilizers: int,
+     num_z_stabilizers: int,
+     num_data_qubits: int,
+ ) -> str:
+     """Assemble the full prompt the LLM sees on each step.
+
+     Keeping this function pure (no I/O, no globals) means the SFT pipeline
+     and the GRPO rollout use byte-identical inputs - a critical invariant.
+     """
+     syndrome_block = format_syndrome_block(
+         syndrome_bits=syndrome_bits,
+         rounds=rounds,
+         num_x_stabilizers=num_x_stabilizers,
+         num_z_stabilizers=num_z_stabilizers,
+     )
+     return (
+         _ROLE.format(distance=distance)
+         + "\n\n"
+         + _PHYSICS_SUMMARY
+         + "\n\n"
+         + f"Code parameters: distance={distance}, rounds={rounds}, "
+         + f"physical_error_rate={p:g}, data_qubits=0..{num_data_qubits - 1}.\n\n"
+         + syndrome_block
+         + "\n\n"
+         + _OUTPUT_SPEC
+         + "\n\n"
+         + _REASONING_TRIGGER
+     )
+
+
+ # --------------------------------------------------------------------------- #
+ # Output parsing                                                              #
+ # --------------------------------------------------------------------------- #
+
+ _X_PATTERN = re.compile(r"X_ERRORS\s*=\s*\[([^\]]*)\]", re.IGNORECASE)
+ _Z_PATTERN = re.compile(r"Z_ERRORS\s*=\s*\[([^\]]*)\]", re.IGNORECASE)
+
+
+ @dataclass(frozen=True)
+ class ParseResult:
+     x_errors: list[int]
+     z_errors: list[int]
+     parse_success: bool  # True iff BOTH X_ERRORS and Z_ERRORS parsed cleanly
+     parse_partial: bool  # True iff exactly one of the two parsed cleanly
+     raw_response: str
+
+     @property
+     def format_score(self) -> float:
+         """Score for Reward 4 (format compliance)."""
+         if self.parse_success:
+             return 1.0
+         if self.parse_partial:
+             return 0.5
+         return 0.0
+
+
+ def _parse_int_list(s: str, max_qubit: int) -> tuple[list[int], bool]:
+     """Parse a comma/space-separated integer list. Drops out-of-range and dups.
+
+     Returns ``(qubits_sorted_unique, all_tokens_were_valid)``.
+     """
+     if not s.strip():
+         return [], True
+     raw_tokens = re.split(r"[\s,]+", s.strip())
+     out: list[int] = []
+     all_clean = True
+     for tok in raw_tokens:
+         if not tok:
+             continue
+         try:
+             v = int(tok)
+         except ValueError:
+             all_clean = False
+             continue
+         if 0 <= v < max_qubit:
+             out.append(v)
+         else:
+             all_clean = False
+     return sorted(set(out)), all_clean
+
+
+ def parse_action(raw_response: str, num_data_qubits: int) -> ParseResult:
+     """Convert the LLM's raw text to a ``ParseResult``.
+
+     Tolerant of trailing chain-of-thought, surrounding code fences, and
+     casing, but strict on the existence of both ``X_ERRORS`` and ``Z_ERRORS``.
+     """
+     if not isinstance(raw_response, str):
+         return ParseResult([], [], False, False, raw_response="")
+
+     # If the model wrapped its answer in ```...``` blocks, focus on the last one.
+     fenced = re.findall(r"```(?:[^\n]*)\n(.*?)```", raw_response, re.DOTALL)
+     search_text = fenced[-1] if fenced else raw_response
+
+     x_match = _X_PATTERN.search(search_text)
+     z_match = _Z_PATTERN.search(search_text)
+
+     x_errors: list[int] = []
+     z_errors: list[int] = []
+     x_clean = z_clean = False
+
+     if x_match is not None:
+         x_errors, x_clean = _parse_int_list(x_match.group(1), num_data_qubits)
+     if z_match is not None:
+         z_errors, z_clean = _parse_int_list(z_match.group(1), num_data_qubits)
+
+     x_present = x_match is not None and x_clean
+     z_present = z_match is not None and z_clean
+     parse_success = x_present and z_present
+     parse_partial = (x_present ^ z_present) or (
+         # Both keys present but at least one had garbage tokens.
+         (x_match is not None and z_match is not None) and not parse_success
+     )
+
+     return ParseResult(
+         x_errors=x_errors,
+         z_errors=z_errors,
+         parse_success=parse_success,
+         parse_partial=parse_partial,
+         raw_response=raw_response,
+     )
+
+
+ def format_completion(x_errors: Iterable[int], z_errors: Iterable[int]) -> str:
+     """The canonical SFT target string. Inverse of :func:`parse_action`."""
+     x_str = ",".join(str(q) for q in sorted(set(x_errors)))
+     z_str = ",".join(str(q) for q in sorted(set(z_errors)))
+     return f"X_ERRORS=[{x_str}] Z_ERRORS=[{z_str}]"
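The "permissive on whitespace and casing, strict on the key tokens" behaviour can be sanity-checked in isolation. A standalone sketch using the same two regexes as the file above, with a simplified integer parser (range checks and partial-credit bookkeeping omitted):

```python
import re

# Same key-token regexes as in qubit_medic/prompts.py above.
X_PATTERN = re.compile(r"X_ERRORS\s*=\s*\[([^\]]*)\]", re.IGNORECASE)
Z_PATTERN = re.compile(r"Z_ERRORS\s*=\s*\[([^\]]*)\]", re.IGNORECASE)

def parse_ints(group: str) -> list[int]:
    # Permissive on separators, like _parse_int_list (range checks omitted).
    return sorted({int(t) for t in re.split(r"[\s,]+", group.strip()) if t})

completion = "X_ERRORS=[3,1] Z_ERRORS=[]"  # canonical SFT target shape
noisy = "I think qubit 3 flipped.\nx_errors = [ 3 , 1 ]  z_errors=[]"

for text in (completion, noisy):
    x = parse_ints(X_PATTERN.search(text).group(1))
    z = parse_ints(Z_PATTERN.search(text).group(1))
    assert (x, z) == ([1, 3], [])
```

Both the canonical completion and a lower-cased, whitespace-mangled variant with leading chain-of-thought resolve to the same sorted, deduplicated qubit lists.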
qubit_medic/server/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Server-side modules: physics, rewards, curriculum, FastAPI app.
+
+ Sub-modules are imported lazily on first attribute access to avoid
+ circular imports during partial initialisation.
+ """
qubit_medic/server/app.py ADDED
@@ -0,0 +1,169 @@
+ """Qubit-Medic FastAPI server.
+
+ Built on **openenv-core** ``create_fastapi_app`` so the canonical OpenEnv
+ routes (``/reset``, ``/step``, ``/state``, ``/health``, ``/schema``,
+ ``/metadata``, ``/mcp``) are wired automatically by the framework.
+
+ We add a few extras on top:
+
+ * ``GET /healthz`` - the Day-0 deployment-substrate liveness probe
+   (returns Stim/PyMatching/openenv versions). Used by the recurring
+   4-hour HF Spaces wakeup ping.
+ * ``POST /decode`` - PyMatching baseline demo: takes a hand-crafted
+   syndrome and returns the matching-decoder's prediction. Useful for
+   the Gradio playground.
+
+ Run with ``python -m qubit_medic.server.app`` or
+ ``uvicorn qubit_medic.server.app:app --host 0.0.0.0 --port 7860``.
+ """
+ from __future__ import annotations
+
+ import logging
+ import os
+ import sys
+ from typing import Optional
+
+ from fastapi import Body, HTTPException
+ from openenv.core import create_fastapi_app
+
+ from qubit_medic.config import DEFAULT_HOST, DEFAULT_PORT
+ from qubit_medic.server.environment import DecoderEnvironment
+ from qubit_medic.server.openenv_adapter import (
+     QubitMedicAction,
+     QubitMedicEnvironment,
+     QubitMedicObservation,
+ )
+
+
+ logger = logging.getLogger("qubit_medic.server")
+ logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
+
+
+ # --------------------------------------------------------------------------- #
+ # Build the OpenEnv-compliant FastAPI app                                     #
+ # --------------------------------------------------------------------------- #
+
+ app = create_fastapi_app(
+     env=QubitMedicEnvironment,
+     action_cls=QubitMedicAction,
+     observation_cls=QubitMedicObservation,
+ )
+ app.title = "Qubit-Medic OpenEnv"
+ app.version = os.getenv("QUBIT_MEDIC_VERSION", "1.0.0")
+ app.description = (
+     "RL training environment for LLM-based quantum error-correction "
+     "decoders. Built on Stim + PyMatching with five independent verifiable "
+     "rewards (logical correction, syndrome consistency, Hamming overlap, "
+     "format compliance, PyMatching beat-rate). Wraps "
+     "qubit_medic.server.environment.DecoderEnvironment in "
+     "openenv.core.Environment - see /metadata, /schema, /docs."
+ )
+
+
+ # --------------------------------------------------------------------------- #
+ # Day-0 + demo extras                                                         #
+ # --------------------------------------------------------------------------- #
+
+ # Lazy-built *legacy* DecoderEnvironment for /decode demos. The OpenEnv
+ # adapter has its own per-instance DecoderEnvironment; we keep this one
+ # around for the simple synchronous `/decode` baseline endpoint.
+ _legacy_env: Optional[DecoderEnvironment] = None
+
+
+ def _get_legacy_env() -> DecoderEnvironment:
+     global _legacy_env
+     if _legacy_env is None:
+         _legacy_env = DecoderEnvironment()
+         _legacy_env._cache_for("L1_warmup")  # noqa: SLF001
+         _legacy_env._cache_for("L2_target")  # noqa: SLF001
+     return _legacy_env
+
+
+ @app.get("/healthz")
+ def healthz() -> dict:
+     """Lightweight liveness probe (Day-0 deployment-substrate test).
+
+     Returns the versions of Stim, PyMatching, and openenv so curl-ing
+     this in a browser or from Colab proves both that networking works
+     AND that the heavy quantum + RL deps actually loaded. Used by the
+     recurring 4-hour HF Spaces wakeup ping.
+     """
+     import stim
+     try:
+         import pymatching as _pm
+         pm_v = getattr(_pm, "__version__", "unknown")
+     except Exception as exc:  # pragma: no cover - defensive
+         pm_v = f"import-error: {exc}"
+     try:
+         import openenv as _oe
+         oe_v = getattr(_oe, "__version__", "unknown")
+     except Exception as exc:  # pragma: no cover - defensive
+         oe_v = f"import-error: {exc}"
+     return {
+         "ok": True,
+         "service": "qubit-medic",
+         "version": app.version,
+         "stim_version": stim.__version__,
+         "pymatching_version": pm_v,
+         "openenv_version": oe_v,
+         "python_version": sys.version.split()[0],
+     }
+
+
+ @app.post("/decode")
+ def decode(
+     syndrome: list[int] = Body(..., embed=True),
+     level: str = Body("L2_target", embed=True),
+ ) -> dict:
+     """Decode an arbitrary syndrome with PyMatching (baseline) and return
+     its predicted Pauli frame and observable flip.
+
+     Intended for the live Gradio demo: a notebook or web page can POST a
+     hand-crafted syndrome here and visualise the matching-decoder result.
+     """
+     import numpy as np
+
+     env = _get_legacy_env()
+     cache = env._cache_for(level)  # noqa: SLF001
+     arr = np.asarray(syndrome, dtype=np.uint8)
+     if arr.shape[0] != cache.layout.num_detectors:
+         raise HTTPException(
+             status_code=400,
+             detail=f"syndrome length {arr.shape[0]} != "
+             f"{cache.layout.num_detectors} expected for {level}",
+         )
+     from qubit_medic.server.physics import (
+         predicted_observable_flip,
+         pymatching_predicted_pauli_frame,
+     )
+     pm_obs = int(cache.matching.decode(arr)[0])
+     px, pz = pymatching_predicted_pauli_frame(cache.matching, arr, cache.layout)
+     return {
+         "level": level,
+         "syndrome": syndrome,
+         "pymatching_observable_flip": pm_obs,
+         "pymatching_x_errors": px,
+         "pymatching_z_errors": pz,
+         "implied_observable_from_x_errors": predicted_observable_flip(
+             px, cache.layout
+         ),
+     }
+
+
+ # --------------------------------------------------------------------------- #
+ # Local entry point                                                           #
+ # --------------------------------------------------------------------------- #
+
+ def _main() -> None:
+     import uvicorn
+
+     uvicorn.run(
+         "qubit_medic.server.app:app",
+         host=os.getenv("QUBIT_MEDIC_HOST", DEFAULT_HOST),
+         port=int(os.getenv("QUBIT_MEDIC_PORT", str(DEFAULT_PORT))),
+         log_level=os.getenv("LOG_LEVEL", "info").lower(),
+     )
+
+
+ if __name__ == "__main__":
+     _main()
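Because `/decode` declares its parameters with `Body(..., embed=True)`, FastAPI expects one JSON object whose top-level keys are the parameter names. A sketch of building such a payload client-side; the detector count here is a placeholder (the real value must equal `cache.layout.num_detectors` for the chosen level, or the endpoint returns a 400):

```python
import json

# Request body shape implied by Body(..., embed=True) in /decode above:
# each parameter becomes a top-level key of one JSON object.
num_detectors = 24  # placeholder; must match the level's detector count
payload = {
    "syndrome": [0] * num_detectors,
    "level": "L2_target",
}
body = json.dumps(payload)

# e.g. POST this to https://<space-host>/decode with
# Content-Type: application/json
decoded = json.loads(body)
assert set(decoded) == {"syndrome", "level"}
```

Without `embed=True` a single body parameter would instead be sent as the bare JSON value; embedding keeps the all-zero-syndrome example above self-describing.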
qubit_medic/server/curriculum.py ADDED
@@ -0,0 +1,104 @@
+ """Adaptive curriculum scheduler (Section 4.4 of the plan).
+
+ Maintains a moving-average logical-correction rate per level and promotes
+ the agent to harder levels once the threshold is met. Implements the
+ Section 4.4 mixing rules:
+
+ * Stay at L1 until L1 hits 80%.
+ * Then mix L1/L2 with weights 30/70 until L2 hits 70%.
+ * Then unlock L3 at 30% weight (with L1/L2 sharing the remaining 70%).
+
+ The scheduler is *override-able* - eval scripts pass ``forced_level`` to
+ hold one configuration steady.
+ """
+ from __future__ import annotations
+
+ import random
+ from collections import deque
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from qubit_medic.config import CURRICULUM, CurriculumLevel, level_by_name
+
+
+ # --------------------------------------------------------------------------- #
+ # Per-level moving average                                                    #
+ # --------------------------------------------------------------------------- #
+
+
+ @dataclass
+ class _MovingWindow:
+     window_size: int = 100
+     history: deque[float] = field(default_factory=deque)
+
+     def push(self, value: float) -> None:
+         self.history.append(value)
+         while len(self.history) > self.window_size:
+             self.history.popleft()
+
+     def mean(self) -> float:
+         return sum(self.history) / len(self.history) if self.history else 0.0
+
+     def __len__(self) -> int:
+         return len(self.history)
+
+
+ # --------------------------------------------------------------------------- #
+ # Scheduler                                                                   #
+ # --------------------------------------------------------------------------- #
+
+
+ @dataclass
+ class CurriculumScheduler:
+     """Picks a curriculum level for each new episode."""
+
+     rng: random.Random = field(default_factory=lambda: random.Random(42))
+     windows: dict[str, _MovingWindow] = field(default_factory=dict)
+
+     def __post_init__(self) -> None:
+         for lvl in CURRICULUM:
+             self.windows.setdefault(lvl.name, _MovingWindow())
+
+     # ----- public API -----------------------------------------------------
+
+     def update(self, level_name: str, logical_correction: float) -> None:
+         """Record one episode's logical-correction outcome."""
+         self.windows[level_name].push(float(logical_correction))
+
+     def sample(self, forced_level: Optional[str] = None) -> CurriculumLevel:
+         """Return the level to use for the next episode."""
+         if forced_level is not None:
+             return level_by_name(forced_level)
+
+         l1, l2, l3 = (level_by_name(n) for n in ("L1_warmup", "L2_target", "L3_stretch"))
+         l1_rate = self.windows["L1_warmup"].mean()
+         l2_rate = self.windows["L2_target"].mean()
+         l1_n = len(self.windows["L1_warmup"])
+         l2_n = len(self.windows["L2_target"])
+
+         # Phase A: still working on L1.
+         if l1_n < 30 or l1_rate < l1.promotion_threshold:
+             return l1
+
+         # Phase B: L1 unlocked, mixing L1 (30%) and L2 (70%).
+         if l2_n < 30 or l2_rate < l2.promotion_threshold:
+             return l1 if self.rng.random() < 0.30 else l2
+
+         # Phase C: L3 unlocked, splits 20% L1, 50% L2, 30% L3.
+         roll = self.rng.random()
+         if roll < 0.20:
+             return l1
+         if roll < 0.70:
+             return l2
+         return l3
+
+     # ----- introspection (used by /state endpoint and logs) ---------------
+
+     def stats(self) -> dict[str, dict[str, float]]:
+         return {
+             name: {
+                 "moving_mean": w.mean(),
+                 "samples": float(len(w)),
+             }
+             for name, w in self.windows.items()
+         }
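The Phase C threshold arithmetic above (`roll < 0.20`, `roll < 0.70`, else L3) is easy to misread as cumulative weights, so it is worth checking empirically. A standalone re-creation of just that roll logic, confirming the documented 20/50/30 split:

```python
import random

# Standalone copy of the Phase C roll from CurriculumScheduler.sample above.
def phase_c(roll: float) -> str:
    if roll < 0.20:
        return "L1_warmup"
    if roll < 0.70:
        return "L2_target"
    return "L3_stretch"

rng = random.Random(42)
counts = {"L1_warmup": 0, "L2_target": 0, "L3_stretch": 0}
n = 100_000
for _ in range(n):
    counts[phase_c(rng.random())] += 1

# Empirical fractions approach the documented 20/50/30 split.
assert abs(counts["L1_warmup"] / n - 0.20) < 0.01
assert abs(counts["L2_target"] / n - 0.50) < 0.01
assert abs(counts["L3_stretch"] / n - 0.30) < 0.01
```

The second guard uses `roll < 0.70` precisely because 0.20 + 0.50 = 0.70: each branch owns a disjoint interval of the unit line.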
qubit_medic/server/environment.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DecoderEnvironment: the OpenEnv-style env that the LLM trainer talks to.
2
+
3
+ This is the heart of the server (Sections 2.4 + 2.5 of the plan):
4
+
5
+ * ``reset()``: pick a curriculum level, build a circuit, sample a syndrome,
6
+ return a :class:`DecoderObservation`.
7
+ * ``step(raw_response)``: parse the LLM's text, score five rewards, return
8
+ a :class:`StepResult` whose ``info`` dict carries the per-component
9
+ breakdown.
10
+
11
+ Episodes are single-step (Section 2.5): the LLM emits one prediction and
12
+ the episode ends.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import threading
17
+ import time
18
+ from dataclasses import dataclass, field
19
+ from typing import Optional
20
+
21
+ import pymatching
22
+
23
+ from qubit_medic.config import (
24
+ EPISODE_TIMEOUT_SECONDS,
25
+ PRIMARY_SEED,
26
+ REWARD_WEIGHTS,
27
+ )
28
+ from qubit_medic.models import (
29
+ DecoderAction,
30
+ DecoderObservation,
31
+ DecoderState,
32
+ StepResult,
33
+ )
34
+ from qubit_medic.prompts import build_prompt, parse_action
35
+ from qubit_medic.server import physics
36
+ from qubit_medic.server.curriculum import CurriculumScheduler
37
+ from qubit_medic.server.physics import (
38
+ CircuitLayout,
39
+ SyndromeSample,
40
+ build_circuit,
41
+ build_dem,
42
+ dem_digest,
43
+ extract_layout,
44
+ per_round_x_z_counts,
45
+ sample_episode,
46
+ )
47
+ from qubit_medic.server.rewards import (
48
+ RewardBreakdown,
49
+ compute_all_rewards,
50
+ compute_final_detector_supports,
51
+ )
52
+
53
+
54
+ # --------------------------------------------------------------------------- #
55
+ # Per-level cached compilation - building Stim/PyMatching is the slow step #
56
+ # --------------------------------------------------------------------------- #
57
+
58
+
59
+ @dataclass
60
+ class _LevelCache:
61
+ """Compiled Stim/PyMatching artefacts for one curriculum level."""
62
+ circuit: object
63
+ dem: object
64
+ matching: pymatching.Matching
65
+ layout: CircuitLayout
66
+ final_detector_supports: dict
67
+ dem_digest: str
68
+
69
+ @classmethod
70
+ def build(cls, level) -> "_LevelCache":
71
+ c = build_circuit(level)
72
+ d = build_dem(c)
73
+ m = pymatching.Matching.from_detector_error_model(d)
74
+ layout = extract_layout(c)
75
+ supports = compute_final_detector_supports(layout)
76
+ return cls(
77
+ circuit=c,
78
+ dem=d,
79
+ matching=m,
80
+ layout=layout,
81
+ final_detector_supports=supports,
82
+ dem_digest=dem_digest(d),
83
+ )
84
+
85
+
86
+ # --------------------------------------------------------------------------- #
87
+ # DecoderEnvironment #
88
+ # --------------------------------------------------------------------------- #
89
+
90
+
91
+ @dataclass
92
+ class _ActiveEpisode:
93
+ """In-flight episode bookkeeping."""
94
+ state: DecoderState
95
+ sample: SyndromeSample
96
+ layout: CircuitLayout
97
+ final_detector_supports: dict
98
+ started_at: float
99
+
100
+
101
+ class DecoderEnvironment:
102
+ """OpenEnv-style env for surface-code decoding.
103
+
104
+ Thread-safe by virtue of a single ``_lock``: the FastAPI server is
105
+ expected to be I/O bound, and per-call latency is well under a
106
+ millisecond, so a coarse lock is fine and dramatically simplifies the
107
+ state machine.
108
+ """
109
+
110
+ def __init__(self, *, base_seed: int = PRIMARY_SEED) -> None:
111
+ self._lock = threading.Lock()
112
+ self._scheduler = CurriculumScheduler(rng=__import__("random").Random(base_seed))
113
+ self._caches: dict[str, _LevelCache] = {}
114
+ self._episode_counter = 0
115
+ self._base_seed = base_seed
116
+ self._active: dict[int, _ActiveEpisode] = {}
117
+
118
+ # ----- cache helpers --------------------------------------------------
119
+
120
+ def _cache_for(self, level_name: str):
121
+ cache = self._caches.get(level_name)
122
+ if cache is not None:
123
+ return cache
124
+ from qubit_medic.config import level_by_name
125
+ cache = _LevelCache.build(level_by_name(level_name))
126
+ self._caches[level_name] = cache
127
+ return cache
128
+
129
+ # ----- public API -----------------------------------------------------
130
+
131
+ def reset(
132
+ self,
133
+ *,
134
+ seed: Optional[int] = None,
135
+ forced_level: Optional[str] = None,
136
+ ) -> DecoderObservation:
137
+ with self._lock:
138
+ self._episode_counter += 1
139
+ ep_id = self._episode_counter
140
+ shot_seed = seed if seed is not None else self._base_seed + ep_id
141
+ level = self._scheduler.sample(forced_level=forced_level)
142
+ cache = self._cache_for(level.name)
143
+
144
+ sample = sample_episode(
145
+ circuit=cache.circuit,
146
+ matching=cache.matching,
147
+ layout=cache.layout,
148
+ seed=shot_seed,
149
+ )
150
+
151
+ state = DecoderState(
152
+ episode_id=ep_id,
153
+ seed=shot_seed,
154
+ curriculum_level=level.name,
155
+ distance=level.distance,
156
+ rounds=level.rounds,
157
+ p=level.p,
158
+ syndrome_bits=sample.syndrome_bits,
159
+ true_x_errors=sample.pymatching_x_errors,
160
+ true_z_errors=sample.pymatching_z_errors,
161
+ actual_observable_flip=sample.actual_observable_flip,
162
+ pymatching_observable_pred=sample.pymatching_observable_pred,
163
+ x_observable_support=[], # memory_z task: no X observable
164
+ z_observable_support=list(cache.layout.z_observable_support),
165
+ num_data_qubits=cache.layout.num_data_qubits,
166
+ num_stabilizers=cache.layout.num_ancilla_qubits,
167
+ circuit_text=str(cache.circuit),
168
+ dem_text=str(cache.dem),
169
+ )
170
+ self._active[ep_id] = _ActiveEpisode(
171
+ state=state,
172
+ sample=sample,
173
+ layout=cache.layout,
174
+ final_detector_supports=cache.final_detector_supports,
175
+ started_at=time.monotonic(),
176
+ )
177
+
178
+ n_x, n_z = per_round_x_z_counts(cache.layout)
179
+ prompt = build_prompt(
180
+ distance=level.distance,
181
+ rounds=level.rounds,
182
+ p=level.p,
183
+ syndrome_bits=sample.syndrome_bits,
184
+ num_x_stabilizers=n_x,
185
+ num_z_stabilizers=n_z,
186
+ num_data_qubits=cache.layout.num_data_qubits,
187
+ )
188
+
189
+ return DecoderObservation(
190
+ prompt=prompt,
191
+ syndrome_bits=sample.syndrome_bits,
192
+ distance=level.distance,
193
+ rounds=level.rounds,
194
+ p=level.p,
195
+ curriculum_level=level.name,
196
+ episode_id=ep_id,
197
+ dem_digest=cache.dem_digest,
198
+ )
199
+
200
+ def step(self, raw_response: str, episode_id: int) -> StepResult:
201
+ with self._lock:
202
+ episode = self._active.pop(episode_id, None)
203
+ if episode is None:
204
+ # Calling step() on an unknown episode ID is a hard error -
+ # the trainer didn't follow reset/step pairing.
+ raise KeyError(f"unknown or already-finished episode {episode_id}")
+
+ elapsed = time.monotonic() - episode.started_at
+ timed_out = elapsed > EPISODE_TIMEOUT_SECONDS
+
+ parsed = parse_action(
+ raw_response=raw_response,
+ num_data_qubits=episode.layout.num_data_qubits,
+ )
+
+ if timed_out:
+ # Hard timeout: zero reward, mark format compliance as zero,
+ # close the episode cleanly (Section 2.6).
+ breakdown = RewardBreakdown(
+ logical_correction=0.0,
+ syndrome_consistency=0.0,
+ hamming_overlap=0.0,
+ format_compliance=0.0,
+ pymatching_beat=0.0,
+ total=0.0,
+ )
+ action = DecoderAction(
+ raw_response=raw_response,
+ parse_success=False,
+ )
+ else:
+ # Convert LLM-space qubit IDs (0..N-1) to Stim IDs before
+ # scoring; rewards operate in the Stim coordinate system.
+ from qubit_medic.prompts import ParseResult
+ parsed_stim = ParseResult(
+ x_errors=episode.layout.llm_to_stim(parsed.x_errors),
+ z_errors=episode.layout.llm_to_stim(parsed.z_errors),
+ parse_success=parsed.parse_success,
+ parse_partial=parsed.parse_partial,
+ raw_response=parsed.raw_response,
+ )
+ breakdown = compute_all_rewards(
+ parsed=parsed_stim,
+ sample=episode.sample,
+ layout=episode.layout,
+ final_detector_supports=episode.final_detector_supports,
+ weights=REWARD_WEIGHTS,
+ )
+ action = DecoderAction(
+ x_error_qubits=parsed.x_errors,
+ z_error_qubits=parsed.z_errors,
+ raw_response=raw_response,
+ parse_success=parsed.parse_success,
+ )
+
+ self._scheduler.update(
+ episode.state.curriculum_level,
+ logical_correction=breakdown.logical_correction,
+ )
+
+ episode.state.last_reward_breakdown = breakdown.as_dict()
+
+ n_x, n_z = per_round_x_z_counts(episode.layout)
+ prompt = build_prompt(
+ distance=episode.state.distance,
+ rounds=episode.state.rounds,
+ p=episode.state.p,
+ syndrome_bits=episode.state.syndrome_bits,
+ num_x_stabilizers=n_x,
+ num_z_stabilizers=n_z,
+ num_data_qubits=episode.layout.num_data_qubits,
+ )
+ obs = DecoderObservation(
+ prompt=prompt,
+ syndrome_bits=episode.state.syndrome_bits,
+ distance=episode.state.distance,
+ rounds=episode.state.rounds,
+ p=episode.state.p,
+ curriculum_level=episode.state.curriculum_level,
+ episode_id=episode.state.episode_id,
+ dem_digest=episode.state.dem_text[:8],
+ )
+
+ info = {
+ "rewards": breakdown.as_dict(),
+ "parsed_action": action.model_dump(),
+ "actual_observable_flip": episode.sample.actual_observable_flip,
+ "pymatching_observable_pred": episode.sample.pymatching_observable_pred,
+ "pymatching_x_errors": episode.sample.pymatching_x_errors,
+ "pymatching_z_errors": episode.sample.pymatching_z_errors,
+ "elapsed_seconds": elapsed,
+ "timed_out": timed_out,
+ "curriculum_stats": self._scheduler.stats(),
+ }
+
+ return StepResult(
+ observation=obs,
+ reward=breakdown.total,
+ done=True, # single-step episodes
+ truncated=timed_out,
+ info=info,
+ )
+
+ # ----- introspection --------------------------------------------------
+
+ def health(self) -> dict:
+ with self._lock:
+ return {
+ "ok": True,
+ "episodes_started": self._episode_counter,
+ "active_episodes": len(self._active),
+ "curriculum": self._scheduler.stats(),
+ "cached_levels": list(self._caches.keys()),
+ }
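The step() path above enforces a strict one-reset-one-step contract: reset() opens an episode, exactly one step() scores it and returns done=True, and a second step() on the same ID raises KeyError. A minimal standalone sketch of that contract (the MiniEnv class is a hypothetical stand-in, not the real DecoderEnvironment):

```python
# Toy illustration of the reset()/step() pairing the server enforces.
class MiniEnv:
    def __init__(self) -> None:
        self._counter = 0
        self._active: dict[int, dict] = {}

    def reset(self) -> int:
        # Each reset opens a fresh episode and hands its id to the trainer.
        self._counter += 1
        self._active[self._counter] = {"open": True}
        return self._counter

    def step(self, episode_id: int) -> dict:
        if episode_id not in self._active:
            raise KeyError(f"unknown or already-finished episode {episode_id}")
        self._active.pop(episode_id)  # single-step: the episode closes here
        return {"done": True}


env = MiniEnv()
ep = env.reset()
assert env.step(ep)["done"] is True
try:
    env.step(ep)  # stepping the same episode twice is a hard error
    raise AssertionError("expected KeyError")
except KeyError:
    pass
```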
qubit_medic/server/openenv_adapter.py ADDED
@@ -0,0 +1,289 @@
+ """OpenEnv-compliant adapter around :class:`DecoderEnvironment`.
+
+ This wrapper satisfies the submission requirement *"Use OpenEnv (latest
+ release). Build on top of the framework; don't reinvent the wheel."* by
+ exposing our underlying :class:`qubit_medic.server.environment.DecoderEnvironment`
+ through the official ``openenv.core.Environment`` base class.
+
+ The adapter is intentionally thin: it just translates between OpenEnv's
+ ``Action`` / ``Observation`` / ``State`` Pydantic shapes and our internal
+ ``DecoderObservation`` / ``DecoderAction`` / ``StepResult``. All the
+ physics, reward scoring, curriculum, and episode bookkeeping continue to
+ live in :class:`DecoderEnvironment` - that code is *the* tested,
+ production path.
+
+ Usage
+ -----
+
+ The OpenEnv-compliant FastAPI app is created with::
+
+ from openenv.core import create_fastapi_app
+ from qubit_medic.server.openenv_adapter import (
+ QubitMedicEnvironment, QubitMedicAction, QubitMedicObservation,
+ )
+
+ app = create_fastapi_app(
+ env=QubitMedicEnvironment,
+ action_cls=QubitMedicAction,
+ observation_cls=QubitMedicObservation,
+ )
+
+ This registers the canonical OpenEnv routes:
+
+ * ``POST /reset`` - body ``{"seed": int?, "episode_id": str?}``
+ * ``POST /step`` - body ``{"action": {...QubitMedicAction...},
+ "timeout_s": float?, "request_id": str?}``
+ * ``GET /state`` - returns the current :class:`QubitMedicState`
+ * ``GET /health`` - liveness probe
+ * ``GET /schema`` - JSON Schema for the action/observation models
+ * ``GET /metadata`` - environment metadata
+ * ``POST /mcp`` - Model Context Protocol endpoint
+ * ``GET /docs`` - Swagger UI (auto-generated by FastAPI)
+
+ We additionally mount our own ``/healthz`` (Day-0 contract) and
+ ``/decode`` (PyMatching baseline demo) on the returned app from
+ ``qubit_medic.server.app``.
+ """
+ from __future__ import annotations
+
+ from typing import Any, Optional
+
+ from openenv.core import Action, Environment, Observation, State
+ from openenv.core.env_server.types import EnvironmentMetadata
+ from pydantic import ConfigDict, Field
+
+ from qubit_medic.server.environment import DecoderEnvironment
+
+
+ # --------------------------------------------------------------------------- #
+ # Process-wide singleton                                                      #
+ # --------------------------------------------------------------------------- #
+ # OpenEnv's HTTP server (simulation mode) instantiates a *fresh* Environment
+ # via the factory on every /reset and /step call. Our episode bookkeeping
+ # (the `_active` dict) lives inside DecoderEnvironment, so we route every
+ # QubitMedicEnvironment instance through the same DecoderEnvironment.
+ # This keeps reset() -> step() pairing intact across stateless HTTP calls
+ # while remaining fully compatible with OpenEnv's WebSocket session model
+ # (each WS session still gets its own QubitMedicEnvironment wrapper).
+
+ _INNER_SINGLETON: Optional[DecoderEnvironment] = None
+
+
+ def _get_shared_inner() -> DecoderEnvironment:
+ """Return the process-wide DecoderEnvironment, building it lazily."""
+ global _INNER_SINGLETON
+ if _INNER_SINGLETON is None:
+ env = DecoderEnvironment()
+ env._cache_for("L1_warmup")  # noqa: SLF001 - intentional pre-warm
+ env._cache_for("L2_target")  # noqa: SLF001
+ _INNER_SINGLETON = env
+ return _INNER_SINGLETON
+
+
+ # --------------------------------------------------------------------------- #
+ # OpenEnv-flavoured Action / Observation / State                              #
+ # --------------------------------------------------------------------------- #
+
+
+ class QubitMedicAction(Action):
+ """LLM-emitted action: the raw text the model generated.
+
+ The server parses this into ``x_error_qubits`` / ``z_error_qubits`` via
+ :func:`qubit_medic.prompts.parse_action`. We keep the wire format
+ *just the raw string* so the server retains full control over parsing
+ (and so the trainer's reward function can audit unparseable outputs).
+
+ The trainer is also free to populate ``parsed_x_errors`` /
+ ``parsed_z_errors`` directly when it wants to bypass the LLM (useful
+ for baseline policies and unit tests).
+ """
+
+ # Inherit Action.model_config (extra='forbid', validate_assignment=True).
+ raw_response: str = Field(
+ default="",
+ description="Raw LLM completion text. Server parses to x/z error lists.",
+ )
+ parsed_x_errors: Optional[list[int]] = Field(
+ default=None,
+ description="Optional pre-parsed X-error qubit ids (LLM-space). "
+ "When provided, the server skips text parsing.",
+ )
+ parsed_z_errors: Optional[list[int]] = Field(
+ default=None,
+ description="Optional pre-parsed Z-error qubit ids (LLM-space).",
+ )
+ episode_id: Optional[int] = Field(
+ default=None,
+ description="Server-assigned episode id from the matching reset(). "
+ "If omitted, the most-recent active episode is used.",
+ )
+
+
+ class QubitMedicObservation(Observation):
+ """OpenEnv observation - mirrors :class:`DecoderObservation` plus the
+ standard OpenEnv ``done`` / ``reward`` fields.
+
+ The ``info`` dict (returned by ``step``) carries the per-component
+ reward breakdown, the ground-truth observable flip, and the PyMatching
+ baseline prediction so the trainer can score auxiliary metrics.
+ """
+
+ model_config = ConfigDict(extra="forbid", validate_assignment=True,
+ arbitrary_types_allowed=True)
+
+ prompt: str = Field(default="", description="Pre-formatted LLM prompt.")
+ syndrome_bits: list[int] = Field(default_factory=list,
+ description="Detector activations (0/1).")
+ distance: int = Field(default=0, description="Code distance for this episode.")
+ rounds: int = Field(default=0, description="Number of stabilizer rounds.")
+ p: float = Field(default=0.0, description="SI1000 base error rate.")
+ curriculum_level: str = Field(default="",
+ description="Curriculum level name.")
+ episode_id: int = Field(default=0,
+ description="Server-assigned episode counter.")
+ dem_digest: str = Field(default="",
+ description="Short hash of the detector error model.")
+ info: dict[str, Any] = Field(default_factory=dict,
+ description="Per-step extras (reward "
+ "breakdown, ground-truth flip, "
+ "PyMatching baseline, etc.).")
+
+
+ class QubitMedicState(State):
+ """Externally-visible state. We expose only the curriculum + episode
+ counters; physics-truth fields stay server-side to prevent reward
+ hacking (see :mod:`qubit_medic.models.DecoderState` doc-comment)."""
+
+ model_config = ConfigDict(extra="allow", validate_assignment=True,
+ arbitrary_types_allowed=True)
+
+ episodes_started: int = 0
+ active_episodes: int = 0
+ cached_levels: list[str] = Field(default_factory=list)
+ curriculum: dict[str, Any] = Field(default_factory=dict)
+ last_reward_breakdown: Optional[dict[str, float]] = None
+
+
+ # --------------------------------------------------------------------------- #
+ # Environment wrapper                                                         #
+ # --------------------------------------------------------------------------- #
+
+
+ class QubitMedicEnvironment(Environment[QubitMedicAction,
+ QubitMedicObservation,
+ QubitMedicState]):
+ """OpenEnv-compliant view of :class:`DecoderEnvironment`.
+
+ Single-step episodes (``done=True`` after every ``step``). The OpenEnv
+ HTTP server gets a fresh instance per WebSocket session if
+ ``SUPPORTS_CONCURRENT_SESSIONS=True``; we set it to ``False`` because
+ our DecoderEnvironment uses a single Stim cache + a coarse lock, which
+ is simpler than per-session state and good enough for the GRPO
+ training loop.
+ """
+
+ SUPPORTS_CONCURRENT_SESSIONS: bool = False
+
+ def __init__(self) -> None:
+ super().__init__()
+ # Share the underlying DecoderEnvironment across every wrapper
+ # instance the HTTP server creates - see _get_shared_inner.
+ self._inner = _get_shared_inner()
+ self._last_episode_id: Optional[int] = None
+ self._last_reward_breakdown: Optional[dict[str, float]] = None
+
+ # ----- abstract API --------------------------------------------------- #
+
+ def reset(
+ self,
+ seed: Optional[int] = None,
+ episode_id: Optional[str] = None,
+ **kwargs: Any,
+ ) -> QubitMedicObservation:
+ forced_level = kwargs.get("forced_level")
+ obs = self._inner.reset(seed=seed, forced_level=forced_level)
+ self._last_episode_id = obs.episode_id
+ self._last_reward_breakdown = None
+ return QubitMedicObservation(
+ prompt=obs.prompt,
+ syndrome_bits=list(obs.syndrome_bits),
+ distance=obs.distance,
+ rounds=obs.rounds,
+ p=obs.p,
+ curriculum_level=obs.curriculum_level,
+ episode_id=obs.episode_id,
+ dem_digest=obs.dem_digest,
+ done=False,
+ reward=None,
+ info={"event": "reset"},
+ )
+
+ def step(
+ self,
+ action: QubitMedicAction,
+ timeout_s: Optional[float] = None,
+ **kwargs: Any,
+ ) -> QubitMedicObservation:
+ ep = action.episode_id if action.episode_id is not None else self._last_episode_id
+ if ep is None:
+ raise RuntimeError(
+ "step() called before reset(); no active episode to score."
+ )
+
+ # If the trainer pre-parsed the action, format a synthetic raw
+ # response in the canonical "X: ... | Z: ..." shape so the server's
+ # parser produces the same x/z lists.
+ if action.parsed_x_errors is not None or action.parsed_z_errors is not None:
+ xs = action.parsed_x_errors or []
+ zs = action.parsed_z_errors or []
+ raw = f"<answer>X: {','.join(map(str, xs))} | Z: {','.join(map(str, zs))}</answer>"
+ else:
+ raw = action.raw_response
+
+ result = self._inner.step(raw_response=raw, episode_id=ep)
+ self._last_reward_breakdown = result.info.get("rewards")
+
+ return QubitMedicObservation(
+ prompt=result.observation.prompt,
+ syndrome_bits=list(result.observation.syndrome_bits),
+ distance=result.observation.distance,
+ rounds=result.observation.rounds,
+ p=result.observation.p,
+ curriculum_level=result.observation.curriculum_level,
+ episode_id=result.observation.episode_id,
+ dem_digest=result.observation.dem_digest,
+ done=result.done,
+ reward=float(result.reward),
+ info=result.info,
+ )
+
+ @property
+ def state(self) -> QubitMedicState:
+ h = self._inner.health()
+ return QubitMedicState(
+ episode_id=str(self._last_episode_id)
+ if self._last_episode_id is not None else None,
+ step_count=int(h.get("episodes_started", 0)),
+ episodes_started=int(h.get("episodes_started", 0)),
+ active_episodes=int(h.get("active_episodes", 0)),
+ cached_levels=list(h.get("cached_levels", [])),
+ curriculum=dict(h.get("curriculum", {})),
+ last_reward_breakdown=self._last_reward_breakdown,
+ )
+
+ # ----- nice-to-haves -------------------------------------------------- #
+
+ def get_metadata(self) -> EnvironmentMetadata:
+ return EnvironmentMetadata(
+ name="QubitMedicEnvironment",
+ description=(
+ "RL training environment for LLM-based quantum error-"
+ "correction decoders. Built on Stim + PyMatching. Five "
+ "verifiable rewards (logical correction, syndrome consistency, "
+ "Hamming overlap, format compliance, PyMatching beat-rate)."
+ ),
+ version="1.0.0",
+ )
+
+ def close(self) -> None:  # nothing to clean up
+ return None
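When the trainer supplies pre-parsed error lists, step() above synthesises a raw response in the canonical answer shape before handing it to the server's parser. A quick standalone check of that formatting (the helper name `format_answer` is ours, for illustration only; it mirrors the f-string in QubitMedicEnvironment.step()):

```python
def format_answer(xs: list[int], zs: list[int]) -> str:
    # Same "<answer>X: ... | Z: ...</answer>" shape the adapter builds
    # for pre-parsed actions, so the server-side parser sees one format.
    return f"<answer>X: {','.join(map(str, xs))} | Z: {','.join(map(str, zs))}</answer>"


assert format_answer([0, 4], [2]) == "<answer>X: 0,4 | Z: 2</answer>"
```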
qubit_medic/server/physics.py ADDED
@@ -0,0 +1,466 @@
+ """Stim + PyMatching wrapper - the physics engine (Section 2.4 of the plan).
2
+
3
+ This module never makes decoding decisions: it builds circuits, samples
4
+ syndromes, computes baselines, and exposes the observable's support on the
5
+ data qubits so the reward functions can score predictions deterministically.
6
+
7
+ Two design choices worth flagging up-front:
8
+
9
+ * The LLM's action is a **terminal Pauli frame** on data qubits (the X and Z
10
+ errors on each data qubit at the moment of final measurement). This
11
+ representation is exact for the rotated memory_z task and lets us reuse
12
+ Stim/PyMatching ground-truth machinery. The trade-off is documented in
13
+ ``rewards.py``: the syndrome-consistency reward (Reward 2) only constrains
14
+ the *final-round* detectors. Earlier rounds are silent w.r.t. an
15
+ end-of-circuit Pauli frame; that is intentional and made explicit in the
16
+ reward's docstring.
17
+
18
+ * "Ground-truth error pattern" for Reward 3 is taken to be the
19
+ **PyMatching-most-probable error pattern** explaining the syndrome
20
+ (extracted via ``Matching.decode_to_edges_array``). This is the
21
+ near-optimal canonical choice and matches what the AlphaQubit baseline
22
+ comparison uses. The README's *honesty note* repeats this.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import hashlib
27
+ from dataclasses import dataclass
28
+ from typing import Optional
29
+
30
+ import numpy as np
31
+ import pymatching
32
+ import stim
33
+
34
+ from qubit_medic.config import (
35
+ CODE_TASK,
36
+ CurriculumLevel,
37
+ SI1000Rates,
38
+ )
39
+
40
+
41
+ # --------------------------------------------------------------------------- #
42
+ # Circuit + DEM construction #
43
+ # --------------------------------------------------------------------------- #
44
+
45
+
46
+ def build_circuit(level: CurriculumLevel) -> stim.Circuit:
47
+ """Generate a Stim ``rotated_memory_z`` circuit at the given level."""
48
+ rates = SI1000Rates.from_p(level.p)
49
+ return stim.Circuit.generated(
50
+ CODE_TASK,
51
+ distance=level.distance,
52
+ rounds=level.rounds,
53
+ **rates.as_stim_kwargs(),
54
+ )
55
+
56
+
57
+ def build_dem(circuit: stim.Circuit) -> stim.DetectorErrorModel:
58
+ """Decompose-errors=True is mandatory for PyMatching."""
59
+ return circuit.detector_error_model(decompose_errors=True)
60
+
61
+
62
+ def dem_digest(dem: stim.DetectorErrorModel) -> str:
63
+ """8-char digest of the DEM, useful for grouping training logs."""
64
+ return hashlib.sha256(str(dem).encode("utf-8")).hexdigest()[:8]
65
+
66
+
67
+ # --------------------------------------------------------------------------- #
68
+ # Layout introspection - figure out data qubits, ancillas, observable support #
69
+ # --------------------------------------------------------------------------- #
70
+
71
+
72
+ @dataclass(frozen=True)
73
+ class CircuitLayout:
74
+ """Static facts about a circuit, computed once per episode.
75
+
76
+ Two indexings coexist:
77
+
78
+ * **Stim IDs** (``data_qubits``) are the physical qubit IDs Stim emits
79
+ (e.g. ``(1, 3, 5, 8, 10, 12, 15, 17, 19)`` for distance-3). These are
80
+ what Stim/PyMatching speak.
81
+ * **LLM IDs** are consecutive ``0..num_data_qubits-1``. These are what
82
+ the prompt advertises and what the LLM emits, because consecutive
83
+ small ints are dramatically easier for a language model to handle.
84
+
85
+ :meth:`llm_to_stim` and :meth:`stim_to_llm` perform the remap. *All*
86
+ server-internal scoring uses Stim IDs; the boundary at the prompt
87
+ formatter / parser converts.
88
+ """
89
+
90
+ data_qubits: tuple[int, ...]
91
+ """Stim IDs of data qubits (measured by terminal ``M``), sorted."""
92
+
93
+ data_qubit_coords: tuple[tuple[float, float], ...]
94
+ """(x, y) coordinate of each data qubit, in the same order as
95
+ ``data_qubits``. Used by Reward 3 to snap PyMatching edges to qubits."""
96
+
97
+ ancilla_qubits: tuple[int, ...]
98
+ """Physical qubit IDs that hold stabiliser measurements (``MR``)."""
99
+
100
+ z_observable_support: tuple[int, ...]
101
+ """Data qubits whose Z value is XOR'd into the logical Z observable.
102
+ An X error on any of these flips the observable."""
103
+
104
+ detector_round: tuple[int, ...]
105
+ """For each detector index, the round it nominally belongs to (0-based,
106
+ extracted from the ``DETECTOR(x, y, t)`` coordinate)."""
107
+
108
+ detector_coords: tuple[tuple[float, float], ...]
109
+ """(x, y) coordinate of each detector, used by Reward 3."""
110
+
111
+ detector_is_x_type: tuple[bool, ...]
112
+ """Whether the detector watches an X-stabiliser. For the rotated surface
113
+ code Stim places X-stabilisers at coordinates with ``(x + y) mod 4 == 2``
114
+ and Z-stabilisers at ``(x + y) mod 4 == 0`` (verified empirically against
115
+ Stim 1.15's ``surface_code:rotated_memory_z``)."""
116
+
117
+ final_detectors: tuple[int, ...]
118
+ """Indices of detectors that correspond to the *last* timeslice - those
119
+ are the only detectors a terminal Pauli frame can affect (Reward 2)."""
120
+
121
+ num_data_qubits: int
122
+ num_ancilla_qubits: int
123
+ num_detectors: int
124
+ num_observables: int
125
+
126
+ # ----- LLM <-> Stim qubit-ID remapping ---------------------------------
127
+
128
+ def llm_to_stim(self, llm_ids: list[int]) -> list[int]:
129
+ """Convert consecutive LLM IDs to physical Stim IDs.
130
+
131
+ Out-of-range IDs are silently dropped (the parser already enforces
132
+ the upper bound, but we double-check here as a defence-in-depth).
133
+ """
134
+ out: list[int] = []
135
+ n = len(self.data_qubits)
136
+ for i in llm_ids:
137
+ if 0 <= i < n:
138
+ out.append(self.data_qubits[i])
139
+ return out
140
+
141
+ def stim_to_llm(self, stim_ids: list[int]) -> list[int]:
142
+ """Inverse of :meth:`llm_to_stim` - used to render targets in the
143
+ SFT data and the imitator policy."""
144
+ lookup = {q: i for i, q in enumerate(self.data_qubits)}
145
+ return [lookup[q] for q in stim_ids if q in lookup]
146
+
147
+
148
+ def _walk_measurement_records(
149
+ circuit: stim.Circuit,
150
+ ) -> tuple[list[int], list[Optional[str]]]:
151
+ """Replay the circuit (no sampling) to map each measurement record to a
152
+ qubit. Returns parallel lists: qubits[i] = qubit id, instr[i] = gate."""
153
+ qubits: list[int] = []
154
+ instrs: list[Optional[str]] = []
155
+
156
+ def _walk(c: stim.Circuit, repeats: int = 1) -> None:
157
+ for _ in range(repeats):
158
+ for inst in c:
159
+ if isinstance(inst, stim.CircuitRepeatBlock):
160
+ _walk(inst.body_copy(), inst.repeat_count)
161
+ continue
162
+ name = inst.name
163
+ if name in {
164
+ "M", "MX", "MY", "MZ",
165
+ "MR", "MRX", "MRY", "MRZ",
166
+ "MPP",
167
+ }:
168
+ for t in inst.targets_copy():
169
+ if t.is_qubit_target:
170
+ qubits.append(t.qubit_value)
171
+ instrs.append(name)
172
+
173
+ _walk(circuit)
174
+ return qubits, instrs
175
+
176
+
177
+ def extract_layout(circuit: stim.Circuit) -> CircuitLayout:
178
+ """Walk the circuit once to build a full :class:`CircuitLayout`."""
179
+ flat = circuit.flattened()
180
+ measurement_qubits, measurement_instrs = _walk_measurement_records(circuit)
181
+
182
+ # Data qubits = those measured by terminal ``M`` (destructive, no reset).
183
+ data_qubits_in_order: list[int] = []
184
+ seen_data = set()
185
+ for q, instr in zip(measurement_qubits, measurement_instrs):
186
+ if instr == "M" and q not in seen_data:
187
+ data_qubits_in_order.append(q)
188
+ seen_data.add(q)
189
+ data_qubits = tuple(sorted(seen_data))
190
+
191
+ # Ancilla qubits = everything measured by MR (reset after measurement).
192
+ ancilla_qubits = tuple(
193
+ sorted({q for q, instr in zip(measurement_qubits, measurement_instrs)
194
+ if instr == "MR"})
195
+ )
196
+
197
+ # Observable support: walk OBSERVABLE_INCLUDE entries and resolve their
198
+ # rec[-k] back to qubit IDs via the measurement record table.
199
+ obs_support: dict[int, set[int]] = {}
200
+ for inst in flat:
201
+ if inst.name == "OBSERVABLE_INCLUDE":
202
+ args = inst.gate_args_copy()
203
+ obs_idx = int(args[0]) if args else 0
204
+ for t in inst.targets_copy():
205
+ if t.is_measurement_record_target:
206
+ actual = len(measurement_qubits) + t.value # value is negative
207
+ if 0 <= actual < len(measurement_qubits):
208
+ obs_support.setdefault(obs_idx, set()).add(
209
+ measurement_qubits[actual]
210
+ )
211
+ z_obs = tuple(sorted(obs_support.get(0, set())))
212
+
213
+ # Qubit coordinates from QUBIT_COORDS instructions.
214
+ qubit_coords: dict[int, tuple[float, float]] = {}
215
+ for inst in flat:
216
+ if inst.name == "QUBIT_COORDS":
217
+ args = inst.gate_args_copy()
218
+ x = float(args[0]) if len(args) >= 1 else 0.0
219
+ y = float(args[1]) if len(args) >= 2 else 0.0
220
+ for t in inst.targets_copy():
221
+ if t.is_qubit_target:
222
+ qubit_coords[t.qubit_value] = (x, y)
223
+ data_qubit_coords = tuple(qubit_coords.get(q, (0.0, 0.0)) for q in data_qubits)
224
+
225
+ # Detector coordinates - last value of the tuple is the round index.
226
+ det_coords_raw = circuit.get_detector_coordinates()
227
+ num_dets = circuit.num_detectors
228
+ rounds_per_det: list[int] = []
229
+ is_x_type: list[bool] = []
230
+ detector_coords: list[tuple[float, float]] = []
231
+ for i in range(num_dets):
232
+ c = det_coords_raw.get(i, ())
233
+ if not c:
234
+ rounds_per_det.append(0)
235
+ is_x_type.append(False)
236
+ detector_coords.append((0.0, 0.0))
237
+ continue
238
+ round_idx = int(c[-1]) if len(c) >= 3 else 0
239
+ rounds_per_det.append(round_idx)
240
+ x = float(c[0]) if len(c) >= 1 else 0.0
241
+ y = float(c[1]) if len(c) >= 2 else 0.0
242
+ detector_coords.append((x, y))
243
+ # X-stabilisers sit at (x + y) % 4 == 2 in Stim's generator.
244
+ is_x_type.append((int(x + y) % 4) == 2)
245
+
246
+ final_round = max(rounds_per_det) if rounds_per_det else 0
247
+ final_dets = tuple(i for i, r in enumerate(rounds_per_det) if r == final_round)
248
+
249
+ return CircuitLayout(
250
+ data_qubits=data_qubits,
251
+ data_qubit_coords=data_qubit_coords,
252
+ ancilla_qubits=ancilla_qubits,
253
+ z_observable_support=z_obs,
254
+ detector_round=tuple(rounds_per_det),
255
+ detector_coords=tuple(detector_coords),
256
+ detector_is_x_type=tuple(is_x_type),
257
+ final_detectors=final_dets,
258
+ num_data_qubits=len(data_qubits),
259
+ num_ancilla_qubits=len(ancilla_qubits),
260
+ num_detectors=num_dets,
261
+ num_observables=circuit.num_observables,
262
+ )
263
+
264
+
265
+ # --------------------------------------------------------------------------- #
266
+ # Sampling and decoding #
267
+ # --------------------------------------------------------------------------- #
268
+
269
+
270
+ @dataclass(frozen=True)
271
+ class SyndromeSample:
272
+ """One noisy episode: detector activations, ground-truth observable
273
+ flip, and PyMatching's prediction (used by Rewards 1 and 5)."""
274
+
275
+ syndrome_bits: list[int]
276
+ actual_observable_flip: int
277
+ pymatching_observable_pred: int
278
+ pymatching_x_errors: list[int] # Pauli frame at end of circuit (X part)
279
+ pymatching_z_errors: list[int] # Pauli frame at end of circuit (Z part)
280
+
281
+
282
+ def sample_episode(
283
+ circuit: stim.Circuit,
284
+ matching: pymatching.Matching,
285
+ layout: CircuitLayout,
286
+ seed: Optional[int] = None,
287
+ ) -> SyndromeSample:
288
+ """Sample one shot, decode it with PyMatching, and bundle the result."""
289
+ sampler = circuit.compile_detector_sampler(seed=seed)
290
+ detection, observables = sampler.sample(1, separate_observables=True)
291
+ detection_row = detection[0].astype(np.uint8)
292
+ observable_flip = int(observables[0, 0]) if observables.shape[1] else 0
293
+
294
+ # PyMatching's prediction (observable level).
295
+ pred_obs = int(matching.decode(detection_row)[0])
296
+
297
+ # PyMatching's predicted physical Pauli frame on data qubits.
298
+ pred_x, pred_z = pymatching_predicted_pauli_frame(
299
+ matching=matching, syndrome=detection_row, layout=layout,
300
+ )
301
+
302
+ return SyndromeSample(
303
+ syndrome_bits=detection_row.tolist(),
304
+ actual_observable_flip=observable_flip,
305
+ pymatching_observable_pred=pred_obs,
306
+ pymatching_x_errors=pred_x,
307
+ pymatching_z_errors=pred_z,
308
+ )
309
+
310
+
311
+ def pymatching_predicted_pauli_frame(
312
+ matching: pymatching.Matching,
313
+ syndrome: np.ndarray,
314
+ layout: CircuitLayout,
315
+ ) -> tuple[list[int], list[int]]:
316
+ """Convert PyMatching's per-edge prediction into a data-qubit Pauli frame.
317
+
318
+ The matching graph's edges correspond to error mechanisms in the DEM.
319
+ Each edge connects two detectors (or a detector and a boundary). The
320
+ data qubit responsible for the edge sits geometrically between the two
321
+ detectors on the surface-code grid - we recover it by snapping the
322
+ midpoint of the detector coordinates to the nearest data qubit.
323
+
324
+ This frame is used as ground-truth for Reward 3 (Hamming overlap).
325
+ Z-stabiliser endpoints (``(x+y) mod 4 == 0``) catch X errors on data
326
+ qubits; X-stabiliser endpoints catch Z errors. Boundary edges are
327
+ snapped to the unique data qubit adjacent to that boundary.
328
+ """
329
+ try:
330
+ edges = matching.decode_to_edges_array(syndrome)
331
+ except Exception:
332
+ return [], []
333
+
334
+ if edges is None or len(edges) == 0:
335
+ return [], []
336
+
337
+ data_qubits = layout.data_qubits
338
+ data_coords = layout.data_qubit_coords
339
+ det_coords = layout.detector_coords
340
+ det_is_x = layout.detector_is_x_type
341
+ n_dets = len(det_coords)
342
+
343
+ def _snap(x: float, y: float) -> int:
344
+ best_q = data_qubits[0]
345
+ best_d = float("inf")
346
+ for q, (qx, qy) in zip(data_qubits, data_coords):
347
+ d = (qx - x) ** 2 + (qy - y) ** 2
348
+ if d < best_d:
349
+ best_d = d
350
+ best_q = q
351
+ return best_q
352
+
353
+ x_errs: set[int] = set()
354
+ z_errs: set[int] = set()
355
+ for edge in edges:
356
+ a, b = int(edge[0]), int(edge[1])
357
+ ca = det_coords[a] if 0 <= a < n_dets else None
358
+ cb = det_coords[b] if 0 <= b < n_dets else None
359
+ if ca is None and cb is None:
360
+ continue
361
+ if cb is None:
362
+ mid_x, mid_y = ca
363
+ ref_is_x = det_is_x[a]
364
+ elif ca is None:
365
+ mid_x, mid_y = cb
366
+ ref_is_x = det_is_x[b]
367
+ else:
368
+ mid_x = (ca[0] + cb[0]) / 2.0
369
+ mid_y = (ca[1] + cb[1]) / 2.0
370
+ ref_is_x = det_is_x[a] if 0 <= a < n_dets else det_is_x[b]
371
+ snap = _snap(mid_x, mid_y)
372
+ if ref_is_x:
373
+ z_errs.add(snap)
374
+ else:
375
+ x_errs.add(snap)
376
+
377
+ return sorted(x_errs), sorted(z_errs)
378
+
379
+
380
+# --------------------------------------------------------------------------- #
+# Predicted-observable computation (used by Reward 1)                         #
+# --------------------------------------------------------------------------- #
+
+
+def predicted_observable_flip(
+    predicted_x_qubits: list[int],
+    layout: CircuitLayout,
+) -> int:
+    """Compute the implied logical-Z flip from a predicted Pauli frame.
+
+    Only X errors on data qubits in ``z_observable_support`` matter for the
+    Z observable - Z errors on data qubits commute with the destructive Z
+    measurement and so cannot flip the observable.
+    """
+    support = set(layout.z_observable_support)
+    parity = 0
+    for q in predicted_x_qubits:
+        if q in support:
+            parity ^= 1
+    return parity
+
+
+def rectify_pauli_frame_to_observable(
+    x_errors: list[int],
+    z_errors: list[int],
+    target_observable_flip: int,
+    layout: CircuitLayout,
+) -> tuple[list[int], list[int]]:
+    """Adjust a predicted X-error frame so its implied observable matches.
+
+    Used by the SFT data generator and the PyMatching imitator policy: the
+    snap-to-data-qubit edge mapping (:func:`pymatching_predicted_pauli_frame`)
+    is only ~95% faithful, but PyMatching's *observable* prediction is exact.
+    We patch the X frame by toggling the smallest-index data qubit on the
+    observable support whenever the implied parity disagrees with the
+    target. Z errors are untouched because they don't affect a Z observable.
+    """
+    implied = predicted_observable_flip(x_errors, layout)
+    if implied == target_observable_flip:
+        return list(x_errors), list(z_errors)
+
+    support = list(layout.z_observable_support)
+    if not support:
+        return list(x_errors), list(z_errors)
+
+    x_set = set(x_errors)
+    intersect = sorted(x_set & set(support))
+    if intersect:
+        # Remove the smallest one currently flipping the observable.
+        x_set.discard(intersect[0])
+    else:
+        # Add the smallest support qubit to introduce a flip.
+        x_set.add(support[0])
+    return sorted(x_set), list(z_errors)
+
+
+# --------------------------------------------------------------------------- #
+# Stabiliser counts - derived from layout                                     #
+# --------------------------------------------------------------------------- #
+
+
+def detector_round_split(layout: CircuitLayout, syndrome_bits: list[int]) -> dict[int, list[int]]:
+    """Group detector bits by their nominal round (used for prompt formatting)."""
+    out: dict[int, list[int]] = {}
+    for idx, bit in enumerate(syndrome_bits):
+        r = layout.detector_round[idx] if idx < len(layout.detector_round) else 0
+        out.setdefault(r, []).append(bit)
+    return out
+
+
+def per_round_x_z_counts(layout: CircuitLayout) -> tuple[int, int]:
+    """Best-effort count of X-type and Z-type stabiliser detectors per round.
+
+    For a rotated surface code at distance d there are (d^2 - 1)/2 of each
+    type. We compute that from the layout to be robust.
+    """
+    # Take one fully-populated round (the one with the most detectors).
+    round_counts: dict[int, list[bool]] = {}
+    for idx, r in enumerate(layout.detector_round):
+        round_counts.setdefault(r, []).append(layout.detector_is_x_type[idx])
+    if not round_counts:
+        return 0, 0
+    full_round = max(round_counts.values(), key=len)
+    n_x = sum(1 for v in full_round if v)
+    n_z = sum(1 for v in full_round if not v)
+    return n_x, n_z
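The parity rule behind ``predicted_observable_flip`` is easy to exercise in isolation. The sketch below uses a hypothetical ``StubLayout`` stand-in carrying only the ``z_observable_support`` attribute (the real ``CircuitLayout`` carries much more), but the XOR-parity logic mirrors the function above:

```python
from dataclasses import dataclass, field


@dataclass
class StubLayout:
    """Hypothetical stand-in for CircuitLayout: only the Z-observable support."""
    z_observable_support: list[int] = field(default_factory=list)


def predicted_observable_flip(predicted_x_qubits: list[int], layout: StubLayout) -> int:
    # XOR-parity of the predicted X errors restricted to the observable support;
    # X errors outside the support (and all Z errors) are ignored.
    support = set(layout.z_observable_support)
    parity = 0
    for q in predicted_x_qubits:
        if q in support:
            parity ^= 1
    return parity


layout = StubLayout(z_observable_support=[0, 2, 4])
print(predicted_observable_flip([0, 2, 7], layout))  # 0 - two support hits cancel
print(predicted_observable_flip([4, 9], layout))     # 1 - one support hit flips
```

An even number of X errors on the support cancels out, which is exactly why only the parity (not the exact frame) determines the logical outcome.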
qubit_medic/server/rewards.py ADDED
@@ -0,0 +1,264 @@
+"""The five reward functions (Section 3 of the plan).
+
+Design contract (from Section 3.6):
+
+* Each reward is a pure function ``(action, state, layout) -> float in [0, 1]``.
+* Rewards never observe each other - they're independent by construction, so
+  the LLM can't satisfy one at the expense of another without genuine task
+  understanding.
+* The combined reward is a weighted sum (weights in :mod:`qubit_medic.config`)
+  clamped to ``[0, 1]``.
+* Every per-component score is reported in the ``info`` dict so logs can
+  surface reward-hacking early (Section 3.7).
+
+A note on Reward 2 and Reward 3 ground truth - see ``physics.py``: the LLM
+predicts a *terminal Pauli frame*, which fully determines the logical-Z
+observable but only constrains the *final-round* detectors. Earlier rounds'
+detectors are intentionally unscored. Reward 3 compares against PyMatching's
+near-optimal Pauli-frame prediction (the canonical decoder reference used in
+AlphaQubit's Nature paper).
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from qubit_medic.config import REWARD_WEIGHTS
+from qubit_medic.prompts import ParseResult
+from qubit_medic.server.physics import (
+    CircuitLayout,
+    SyndromeSample,
+    predicted_observable_flip,
+)
+
+
+# --------------------------------------------------------------------------- #
+# Reward 1: logical correction success                                        #
+# --------------------------------------------------------------------------- #
+
+
+def reward_logical_correction(
+    parsed: ParseResult,
+    sample: SyndromeSample,
+    layout: CircuitLayout,
+) -> float:
+    """Did the predicted correction preserve the logical state?
+
+    Apply the predicted X errors as a Pauli frame at end-of-circuit and
+    compute the implied observable flip. If this matches the actual
+    observable flip recorded by Stim, the logical state was preserved.
+    Outputs 1.0 if so, else 0.0.
+
+    This is the unfakeable reward - it depends only on Stim's ground truth.
+    """
+    implied = predicted_observable_flip(parsed.x_errors, layout)
+    return 1.0 if implied == sample.actual_observable_flip else 0.0
+
+
+# --------------------------------------------------------------------------- #
+# Reward 2: syndrome consistency                                              #
+# --------------------------------------------------------------------------- #
+
+
+def _syndrome_from_pauli_frame(
+    x_errors: list[int],
+    layout: CircuitLayout,
+    final_detector_supports: dict[int, frozenset[int]],
+) -> dict[int, int]:
+    """Compute the implied bits for FINAL-round detectors only.
+
+    A terminal X error on data qubit ``q`` flips a final-round Z-stabiliser
+    detector iff ``q`` is in that detector's support.
+    """
+    out: dict[int, int] = {}
+    x_set = set(x_errors)
+    for det_idx, support in final_detector_supports.items():
+        out[det_idx] = 1 if len(x_set & support) % 2 == 1 else 0
+    return out
+
+
+def reward_syndrome_consistency(
+    parsed: ParseResult,
+    sample: SyndromeSample,
+    layout: CircuitLayout,
+    final_detector_supports: dict[int, frozenset[int]],
+) -> float:
+    """How well does the predicted Pauli frame reproduce the FINAL detectors?
+
+    Computes Hamming similarity between ``predicted_final_bits`` (induced by
+    the predicted X errors) and ``observed_final_bits``. Returns
+    ``1 - hamming_distance / num_final_detectors``.
+
+    Rationale (Section 3.2): without this term, an LLM that guesses the
+    right qubits by luck could get Reward 1 occasionally; this signal forces
+    it to also explain the data the syndrome carries.
+    """
+    final_dets = layout.final_detectors
+    if not final_dets:
+        return 0.0
+    implied = _syndrome_from_pauli_frame(
+        parsed.x_errors, layout, final_detector_supports
+    )
+    distance = 0
+    for det_idx in final_dets:
+        observed = sample.syndrome_bits[det_idx]
+        predicted = implied.get(det_idx, 0)
+        if observed != predicted:
+            distance += 1
+    return 1.0 - distance / len(final_dets)
+
+
+def compute_final_detector_supports(
+    layout: CircuitLayout,
+    syndrome_bits_unused: list[int] | None = None,  # API symmetry
+    *,
+    detector_to_data_qubits: dict[int, frozenset[int]] | None = None,
+) -> dict[int, frozenset[int]]:
+    """Map each final-round detector to the set of data qubits whose
+    terminal X error flips it.
+
+    For the rotated memory_z code, each Z-stabiliser final detector watches
+    the four (or two/one on the boundary) data qubits adjacent to it on the
+    grid. We compute adjacency by Euclidean distance; data qubits at
+    distance ``sqrt(2)`` from a Z-stabiliser ancilla coordinate are
+    incident.
+    """
+    if detector_to_data_qubits is not None:
+        return detector_to_data_qubits
+
+    out: dict[int, frozenset[int]] = {}
+    for det_idx in layout.final_detectors:
+        dx, dy = layout.detector_coords[det_idx]
+        adj: set[int] = set()
+        for q, (qx, qy) in zip(layout.data_qubits, layout.data_qubit_coords):
+            if abs((qx - dx) ** 2 + (qy - dy) ** 2 - 2.0) < 1e-6:
+                adj.add(q)
+        out[det_idx] = frozenset(adj)
+    return out
+
+
+# --------------------------------------------------------------------------- #
+# Reward 3: Hamming overlap with reference Pauli frame                        #
+# --------------------------------------------------------------------------- #
+
+
+def _jaccard(a: list[int], b: list[int]) -> float:
+    """Jaccard index. Returns 1.0 when both sets are empty (perfect agreement)."""
+    sa, sb = set(a), set(b)
+    if not sa and not sb:
+        return 1.0
+    inter = len(sa & sb)
+    union = len(sa | sb)
+    return inter / union if union else 1.0
+
+
+def reward_hamming_overlap(
+    parsed: ParseResult,
+    sample: SyndromeSample,
+    layout: CircuitLayout,
+) -> float:
+    """Average of Jaccard(X) and Jaccard(Z) against the reference frame.
+
+    Reference is PyMatching's per-edge predicted Pauli frame
+    (``sample.pymatching_x_errors`` / ``..._z_errors``). This is the dense
+    partial-credit signal of Section 3.3 - even if Reward 1 scores zero,
+    being *close* to the canonical solution still earns credit, smoothing
+    the reward landscape during early training.
+    """
+    jx = _jaccard(parsed.x_errors, sample.pymatching_x_errors)
+    jz = _jaccard(parsed.z_errors, sample.pymatching_z_errors)
+    return 0.5 * (jx + jz)
+
+
+# --------------------------------------------------------------------------- #
+# Reward 4: format compliance                                                 #
+# --------------------------------------------------------------------------- #
+
+
+def reward_format_compliance(parsed: ParseResult) -> float:
+    """1.0 if both keys parsed, 0.5 if exactly one, 0.0 if neither."""
+    return parsed.format_score
+
+
+# --------------------------------------------------------------------------- #
+# Reward 5: PyMatching beat-rate bonus                                        #
+# --------------------------------------------------------------------------- #
+
+
+def reward_pymatching_beat(
+    parsed: ParseResult,
+    sample: SyndromeSample,
+    layout: CircuitLayout,
+) -> float:
+    """1.0 iff PyMatching got this syndrome wrong AND the LLM got it right.
+
+    This is the headline metric (Section 3.5). For most of training it will
+    be near zero; the trajectory of its mean over steps is the proof we've
+    moved past pure imitation.
+    """
+    pm_correct = sample.pymatching_observable_pred == sample.actual_observable_flip
+    if pm_correct:
+        return 0.0
+    llm_implied = predicted_observable_flip(parsed.x_errors, layout)
+    return 1.0 if llm_implied == sample.actual_observable_flip else 0.0
+
+
+# --------------------------------------------------------------------------- #
+# Combined reward                                                             #
+# --------------------------------------------------------------------------- #
+
+
+@dataclass(frozen=True)
+class RewardBreakdown:
+    """Per-component scores plus the weighted total."""
+
+    logical_correction: float
+    syndrome_consistency: float
+    hamming_overlap: float
+    format_compliance: float
+    pymatching_beat: float
+    total: float
+
+    def as_dict(self) -> dict[str, float]:
+        return {
+            "logical_correction": self.logical_correction,
+            "syndrome_consistency": self.syndrome_consistency,
+            "hamming_overlap": self.hamming_overlap,
+            "format_compliance": self.format_compliance,
+            "pymatching_beat": self.pymatching_beat,
+            "total": self.total,
+        }
+
+
+def compute_all_rewards(
+    parsed: ParseResult,
+    sample: SyndromeSample,
+    layout: CircuitLayout,
+    final_detector_supports: dict[int, frozenset[int]],
+    weights: dict[str, float] = REWARD_WEIGHTS,
+) -> RewardBreakdown:
+    """Compute all five rewards and the weighted total.
+
+    Returns a :class:`RewardBreakdown` whose ``as_dict`` is what the env's
+    ``info`` payload contains. The trainer logs each component separately.
+    """
+    r1 = reward_logical_correction(parsed, sample, layout)
+    r2 = reward_syndrome_consistency(parsed, sample, layout, final_detector_supports)
+    r3 = reward_hamming_overlap(parsed, sample, layout)
+    r4 = reward_format_compliance(parsed)
+    r5 = reward_pymatching_beat(parsed, sample, layout)
+    total = (
+        weights["logical_correction"] * r1
+        + weights["syndrome_consistency"] * r2
+        + weights["hamming_overlap"] * r3
+        + weights["format_compliance"] * r4
+        + weights["pymatching_beat"] * r5
+    )
+    total = max(0.0, min(1.0, total))
+    return RewardBreakdown(
+        logical_correction=r1,
+        syndrome_consistency=r2,
+        hamming_overlap=r3,
+        format_compliance=r4,
+        pymatching_beat=r5,
+        total=total,
+    )
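The partial-credit behaviour of the Jaccard overlap used by Reward 3 can be checked standalone. This sketch re-implements the same set-overlap metric as ``_jaccard`` (the names below are illustrative, not the module's private API):

```python
def jaccard(a: list[int], b: list[int]) -> float:
    """Jaccard index over qubit-index sets; empty-vs-empty counts as full agreement."""
    sa, sb = set(a), set(b)
    if not sa and not sb:
        # Both frames predict "no errors" - that's a perfect match, not 0/0.
        return 1.0
    union = len(sa | sb)
    return len(sa & sb) / union if union else 1.0


print(jaccard([], []))          # 1.0 - agreeing on "no errors" earns full credit
print(jaccard([1, 2], [2, 3]))  # |{2}| / |{1, 2, 3}| = 1/3
print(jaccard([5], [5]))        # 1.0 - exact match
```

Reward 3 then averages the X-frame and Z-frame scores, so a prediction that nails one Pauli type but misses the other still lands at 0.5 rather than 0, which is what smooths the early-training reward landscape.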
qubit_medic/wandb_utils.py ADDED
@@ -0,0 +1,482 @@
+"""Central Weights & Biases integration for Qubit-Medic.
+
+Design goals
+------------
+1. **Single source of truth** for the W&B project name, default tags, and
+   the ``config`` dump that every run logs. Trainers, eval scripts, and
+   notebooks all funnel through :func:`init_run` so dashboards always
+   line up.
+
+2. **Safe to import without wandb installed.** The package's training
+   deps (``wandb``) live in ``requirements-train.txt`` and are absent on
+   the lean Spaces image. Everything in this module degrades gracefully
+   when the import fails - the rest of the project keeps working.
+
+3. **Disable-by-env-var.** Set ``WANDB_DISABLED=1`` (or
+   ``QUBIT_MEDIC_WANDB=0``) and every helper here becomes a no-op,
+   regardless of whether the package is installed. CI runs and offline
+   testing rely on this.
+
+4. **Rich first-class logging.** We expose dedicated helpers for the
+   things this project cares about:
+
+   * Per-reward component scalars (5 lines per step, not just the total)
+   * Curriculum-level moving averages (one line per level)
+   * Parse success / partial / failure rates
+   * Generation sample tables (prompt / completion / per-reward)
+   * Eval summary tables (one row per (policy, level))
+
+   The trainers and eval script only have to call these helpers; the
+   Pythonic context manager handles init, summary, and finish.
+"""
+from __future__ import annotations
+
+import contextlib
+import dataclasses
+import os
+import socket
+import sys
+import time
+from typing import Any, Iterable, Mapping, Optional, Sequence
+
+from qubit_medic.config import (
+    CURRICULUM, GRPO_GEN_PER_PROMPT, GRPO_KL_COEF, GRPO_LR, GRPO_MAX_COMPLETION_LEN,
+    GRPO_MAX_PROMPT_LEN, GRPO_STEPS, LORA_ALPHA, LORA_R, LORA_TARGET_MODULES,
+    MODEL_ID, PRIMARY_SEED, REWARD_WEIGHTS, SFT_BATCH_SIZE, SFT_EPOCHS,
+    SFT_GRAD_ACCUM, SFT_LR, SFT_MAX_SEQ_LEN, WANDB_DEFAULT_TAGS, WANDB_ENTITY,
+    WANDB_LOG_GENERATIONS_EVERY, WANDB_PROJECT, WANDB_SAMPLE_GENERATIONS,
+)
+
+
+# --------------------------------------------------------------------------- #
+# Lazy import + on/off toggle                                                 #
+# --------------------------------------------------------------------------- #
+
+
+_WANDB_MODULE = None
+_RUN: Any = None
+
+
+def _import_wandb():
+    """Import wandb on first use. Returns ``None`` if it isn't installed."""
+    global _WANDB_MODULE
+    if _WANDB_MODULE is None:
+        try:
+            import wandb  # type: ignore[import-not-found]
+            _WANDB_MODULE = wandb
+        except ImportError:
+            _WANDB_MODULE = False  # sentinel: "tried, failed"
+    return _WANDB_MODULE if _WANDB_MODULE is not False else None
+
+
+def is_disabled() -> bool:
+    """Honours ``WANDB_DISABLED`` and ``QUBIT_MEDIC_WANDB=0``."""
+    if os.environ.get("WANDB_DISABLED", "").lower() in {"1", "true", "yes"}:
+        return True
+    if os.environ.get("QUBIT_MEDIC_WANDB", "1").lower() in {"0", "false", "no"}:
+        return True
+    return False
+
+
+def is_available() -> bool:
+    """``True`` iff wandb is importable AND not disabled by env var."""
+    return _import_wandb() is not None and not is_disabled()
+
+
+def get_run():
+    """Return the active W&B run object, or ``None`` if not initialised."""
+    return _RUN
+
+
+# --------------------------------------------------------------------------- #
+# Init / finish                                                               #
+# --------------------------------------------------------------------------- #
+
+
+def _system_metadata() -> dict:
+    """Static metadata that's helpful on the dashboard but isn't a hyperparam."""
+    info = {
+        "python_version": sys.version.split()[0],
+        "hostname": socket.gethostname(),
+        "argv": " ".join(sys.argv),
+        "pid": os.getpid(),
+    }
+    try:
+        import torch
+        info["torch_version"] = torch.__version__
+        info["cuda_available"] = bool(torch.cuda.is_available())
+        if torch.cuda.is_available():
+            info["cuda_device"] = torch.cuda.get_device_name(0)
+    except Exception:
+        pass
+    try:
+        import stim
+        info["stim_version"] = stim.__version__
+    except Exception:
+        pass
+    try:
+        import pymatching
+        info["pymatching_version"] = pymatching.__version__
+    except Exception:
+        pass
+    try:
+        import trl, transformers, peft
+        info["trl_version"] = trl.__version__
+        info["transformers_version"] = transformers.__version__
+        info["peft_version"] = peft.__version__
+    except Exception:
+        pass
+    return info
+
+
+def _build_default_config(extra: Optional[Mapping[str, Any]] = None) -> dict:
+    """The config every run logs - hyperparameters + reward weights + curriculum."""
+    cfg: dict[str, Any] = {
+        "model_id": MODEL_ID,
+        "primary_seed": PRIMARY_SEED,
+        "lora_r": LORA_R,
+        "lora_alpha": LORA_ALPHA,
+        "lora_target_modules": list(LORA_TARGET_MODULES),
+        "sft": {
+            "epochs": SFT_EPOCHS,
+            "batch_size": SFT_BATCH_SIZE,
+            "grad_accum": SFT_GRAD_ACCUM,
+            "lr": SFT_LR,
+            "max_seq_len": SFT_MAX_SEQ_LEN,
+        },
+        "grpo": {
+            "steps": GRPO_STEPS,
+            "gen_per_prompt": GRPO_GEN_PER_PROMPT,
+            "lr": GRPO_LR,
+            "kl_coef": GRPO_KL_COEF,
+            "max_prompt_len": GRPO_MAX_PROMPT_LEN,
+            "max_completion_len": GRPO_MAX_COMPLETION_LEN,
+        },
+        "reward_weights": dict(REWARD_WEIGHTS),
+        "curriculum": [
+            {
+                "name": lvl.name, "distance": lvl.distance, "rounds": lvl.rounds,
+                "p": lvl.p, "promotion_threshold": lvl.promotion_threshold,
+            }
+            for lvl in CURRICULUM
+        ],
+        "system": _system_metadata(),
+    }
+    if extra:
+        cfg.update(extra)
+    return cfg
+
+
+def init_run(
+    run_name: str,
+    job_type: str,
+    *,
+    tags: Optional[Sequence[str]] = None,
+    extra_config: Optional[Mapping[str, Any]] = None,
+    notes: Optional[str] = None,
+    group: Optional[str] = None,
+):
+    """Initialise (or no-op) a W&B run.
+
+    Parameters
+    ----------
+    run_name:
+        Human-readable run name (e.g. ``"sft-warmup-2026-04-25"``).
+    job_type:
+        One of ``"sft"``, ``"grpo"``, ``"eval"``, ``"format-test"``,
+        ``"baseline"``. Used to group runs on the dashboard.
+    tags:
+        Extra tags appended to :data:`qubit_medic.config.WANDB_DEFAULT_TAGS`.
+    extra_config:
+        Hyperparameters specific to this run (e.g. SFT epochs override).
+    notes:
+        Free-text notes shown on the dashboard.
+    group:
+        Optional W&B group (used to bundle SFT + GRPO + eval runs of the
+        same experiment).
+
+    Returns
+    -------
+    The wandb Run object, or ``None`` if W&B is unavailable / disabled.
+    """
+    global _RUN
+    wandb = _import_wandb()
+    if wandb is None or is_disabled():
+        if wandb is None:
+            print("[wandb] not installed; skipping init "
+                  "(install with `pip install wandb` to enable logging)",
+                  file=sys.stderr)
+        else:
+            print("[wandb] disabled by env var; skipping init", file=sys.stderr)
+        _RUN = None
+        return None
+
+    all_tags = list(WANDB_DEFAULT_TAGS) + list(tags or [])
+    cfg = _build_default_config(extra=extra_config)
+    cfg["job_type"] = job_type
+
+    _RUN = wandb.init(
+        project=WANDB_PROJECT,
+        entity=WANDB_ENTITY,
+        name=run_name,
+        job_type=job_type,
+        tags=all_tags,
+        config=cfg,
+        notes=notes,
+        group=group,
+        reinit=True,
+    )
+    print(f"[wandb] run live at {_RUN.url}", file=sys.stderr)
+    return _RUN
+
+
+def finish_run() -> None:
+    """Cleanly close the current W&B run, if any."""
+    global _RUN
+    wandb = _import_wandb()
+    if wandb is None or _RUN is None:
+        _RUN = None
+        return
+    try:
+        wandb.finish()
+    finally:
+        _RUN = None
+
+
+@contextlib.contextmanager
+def run_context(run_name: str, job_type: str, **kwargs):
+    """Context-manager wrapper around :func:`init_run` / :func:`finish_run`."""
+    init_run(run_name, job_type, **kwargs)
+    try:
+        yield get_run()
+    finally:
+        finish_run()
+
+
+# --------------------------------------------------------------------------- #
+# Generic logging helpers                                                     #
+# --------------------------------------------------------------------------- #
+
+
+def log(metrics: Mapping[str, Any], *, step: Optional[int] = None,
+        commit: bool = True) -> None:
+    """No-op-safe ``wandb.log`` wrapper."""
+    wandb = _import_wandb()
+    if wandb is None or _RUN is None:
+        return
+    try:
+        wandb.log(dict(metrics), step=step, commit=commit)
+    except Exception as exc:  # pragma: no cover - defensive
+        print(f"[wandb] log failed: {exc}", file=sys.stderr)
+
+
+def update_summary(values: Mapping[str, Any]) -> None:
+    """Write to ``run.summary`` (the run's headline numbers)."""
+    if _RUN is None:
+        return
+    try:
+        for k, v in values.items():
+            _RUN.summary[k] = v
+    except Exception as exc:  # pragma: no cover
+        print(f"[wandb] summary update failed: {exc}", file=sys.stderr)
+
+
+# --------------------------------------------------------------------------- #
+# Project-specific helpers                                                    #
+# --------------------------------------------------------------------------- #
+
+
+_REWARD_KEYS = (
+    "logical_correction",
+    "syndrome_consistency",
+    "hamming_overlap",
+    "format_compliance",
+    "pymatching_beat",
+    "total",
+)
+
+
+def log_reward_breakdown(
+    breakdowns: Sequence[Mapping[str, float]],
+    *,
+    step: Optional[int] = None,
+    prefix: str = "rl",
+) -> None:
+    """Log mean / std / min / max for each reward component (plus the total).
+
+    ``breakdowns`` is a list of dicts, one per generation in the most-recent
+    GRPO step (length = ``GRPO_GEN_PER_PROMPT * batch``). We log the mean
+    alongside the spread statistics so the dashboard has both signal and
+    noise.
+    """
+    if not breakdowns or _RUN is None:
+        return
+    out: dict[str, float] = {}
+    for k in _REWARD_KEYS:
+        vals = [float(b.get(k, 0.0)) for b in breakdowns]
+        n = max(1, len(vals))
+        mean = sum(vals) / n
+        var = sum((v - mean) ** 2 for v in vals) / n
+        out[f"{prefix}/reward/{k}_mean"] = mean
+        out[f"{prefix}/reward/{k}_std"] = var ** 0.5
+        out[f"{prefix}/reward/{k}_max"] = max(vals)
+        out[f"{prefix}/reward/{k}_min"] = min(vals)
+    log(out, step=step)
+
+
+def log_parse_stats(
+    parse_results: Iterable,  # iterable of qubit_medic.prompts.ParseResult
+    *,
+    step: Optional[int] = None,
+    prefix: str = "rl",
+) -> None:
+    """Log parse-success / partial / failure rates for the most-recent batch."""
+    if _RUN is None:
+        return
+    parse_results = list(parse_results)
+    n = max(1, len(parse_results))
+    success = sum(1 for r in parse_results if getattr(r, "parse_success", False))
+    partial = sum(1 for r in parse_results
+                  if not getattr(r, "parse_success", False)
+                  and getattr(r, "parse_partial", False))
+    log({
+        f"{prefix}/parse/success_rate": success / n,
+        f"{prefix}/parse/partial_rate": partial / n,
+        f"{prefix}/parse/failure_rate": (n - success - partial) / n,
+        f"{prefix}/parse/sample_count": n,
+    }, step=step)
+
+
+def log_curriculum(
+    curriculum_stats: Mapping[str, Mapping[str, float]],
+    *,
+    step: Optional[int] = None,
+    prefix: str = "rl",
+) -> None:
+    """Log the per-level moving averages from the env health endpoint.
+
+    ``curriculum_stats`` is what
+    :meth:`qubit_medic.server.curriculum.CurriculumScheduler.snapshot`
+    returns; one inner dict per level with keys ``moving_mean`` / ``samples``.
+    """
+    if _RUN is None or not curriculum_stats:
+        return
+    out: dict[str, float] = {}
+    for level_name, stats in curriculum_stats.items():
+        out[f"{prefix}/curriculum/{level_name}_mean"] = float(stats.get("moving_mean", 0.0))
+        out[f"{prefix}/curriculum/{level_name}_samples"] = float(stats.get("samples", 0.0))
+    log(out, step=step)
+
+
+def log_generation_table(
+    rows: Sequence[Mapping[str, Any]],
+    *,
+    step: Optional[int],
+    table_name: str = "rl/generations",
+    columns: Optional[Sequence[str]] = None,
+) -> None:
+    """Log a wandb.Table of (prompt, completion, reward, ...) rows.
+
+    Each row is a flat dict; the column set is the union of all keys (or
+    the explicit ``columns`` arg). Used to surface qualitative samples
+    in addition to the scalar curves.
+    """
+    wandb = _import_wandb()
+    if wandb is None or _RUN is None or not rows:
+        return
+    cols = list(columns) if columns is not None else sorted(
+        {k for row in rows for k in row.keys()}
+    )
+    try:
+        table = wandb.Table(columns=cols)
+        for row in rows:
+            table.add_data(*[row.get(c, None) for c in cols])
+        log({table_name: table}, step=step)
+    except Exception as exc:  # pragma: no cover
+        print(f"[wandb] table log failed: {exc}", file=sys.stderr)
+
+
+def log_eval_summary(
+    summary: Mapping[str, Any],
+    *,
+    step: Optional[int] = None,
+    prefix: str = "eval",
+) -> None:
+    """Log the dict produced by ``scripts/eval._summary`` as scalars."""
+    if _RUN is None:
+        return
+    out: dict[str, Any] = {}
+    for k, v in summary.items():
+        if isinstance(v, (int, float)):
+            out[f"{prefix}/{k}"] = v
+    log(out, step=step)
+    update_summary({f"{prefix}/{k}": v for k, v in summary.items()
+                    if isinstance(v, (int, float, str, bool))})
+
+
+def log_artifact(
+    path: str, *, name: str, artifact_type: str,
+    description: Optional[str] = None,
+) -> None:
+    """Save a file or directory as a W&B artifact."""
+    wandb = _import_wandb()
+    if wandb is None or _RUN is None:
+        return
+    try:
+        art = wandb.Artifact(name, type=artifact_type, description=description)
+        if os.path.isdir(path):
+            art.add_dir(path)
+        else:
+            art.add_file(path)
+        _RUN.log_artifact(art)
+    except Exception as exc:  # pragma: no cover
+        print(f"[wandb] artifact log failed: {exc}", file=sys.stderr)
+
+
+# --------------------------------------------------------------------------- #
+# CLI integration helpers                                                     #
+# --------------------------------------------------------------------------- #
+
+
+def derive_report_to(report_to: str) -> str:
+    """Translate the user-facing ``--report-to`` flag.
+
+    If the user passes ``"wandb"`` but wandb is unavailable, fall back to
+    ``"none"`` rather than crashing the trainer. Lets the same script run
+    on Colab (with wandb) and CI (without).
+    """
+    if report_to == "wandb" and not is_available():
+        print("[wandb] requested via --report-to but unavailable; falling back to 'none'",
+              file=sys.stderr)
+        return "none"
+    return report_to
+
+
+def make_run_name(prefix: str, suffix: Optional[str] = None) -> str:
+    """Build a default run name like ``sft-warmup-20260425-2105``."""
+    stamp = time.strftime("%Y%m%d-%H%M%S")
+    bits = [prefix, stamp]
+    if suffix:
+        bits.append(suffix)
+    return "-".join(bits)
+
+
+__all__ = [
+    "derive_report_to",
+    "finish_run",
+    "get_run",
+    "init_run",
+    "is_available",
+    "is_disabled",
+    "log",
+    "log_artifact",
+    "log_curriculum",
+    "log_eval_summary",
+    "log_generation_table",
+    "log_parse_stats",
+    "log_reward_breakdown",
+    "make_run_name",
+    "run_context",
+    "update_summary",
+    "WANDB_LOG_GENERATIONS_EVERY",
+    "WANDB_SAMPLE_GENERATIONS",
+]
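The disable-by-env-var contract (``WANDB_DISABLED`` wins outright; ``QUBIT_MEDIC_WANDB`` defaults to enabled) can be sketched standalone. Unlike the module's ``is_disabled``, this illustrative version takes the environment as an explicit dict so it is testable without touching ``os.environ``:

```python
def wandb_disabled(env: dict[str, str]) -> bool:
    # WANDB_DISABLED set to any truthy string disables logging outright ...
    if env.get("WANDB_DISABLED", "").lower() in {"1", "true", "yes"}:
        return True
    # ... while QUBIT_MEDIC_WANDB defaults to "1" (enabled) unless set falsy.
    if env.get("QUBIT_MEDIC_WANDB", "1").lower() in {"0", "false", "no"}:
        return True
    return False


print(wandb_disabled({}))                           # False - enabled by default
print(wandb_disabled({"WANDB_DISABLED": "true"}))   # True
print(wandb_disabled({"QUBIT_MEDIC_WANDB": "0"}))   # True
```

Having two variables means CI can export one global kill switch (``WANDB_DISABLED``) while per-project opt-out stays available without clobbering other tools that also read ``WANDB_DISABLED``.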
requirements.txt ADDED
@@ -0,0 +1,35 @@
+# Pin the versions called out in the plan (Section 1.1).
+# These versions are known compatible with Python 3.11 + CUDA 12.x.
+# DO NOT bump without re-running scripts/validate_env.py.
+
+# --- Quantum simulation ---
+stim>=1.13,<2.0
+pymatching>=2.2,<3.0
+
+# --- Environment / serving ---
+fastapi>=0.110
+uvicorn[standard]>=0.27
+pydantic>=2.5,<3.0
+httpx>=0.27
+numpy>=1.26,<2.1
+
+# --- Plotting (used by scripts/plot_results.py) ---
+matplotlib>=3.8
+pillow>=10
+
+# --- Test runner ---
+pytest>=8
+
+# --- OpenEnv (HuggingFace's RL-env framework). Required by the submission
+#     guidelines ("Use OpenEnv (latest release). Build on top of the
+#     framework; don't reinvent the wheel."). Our server wraps the
+#     DecoderEnvironment with `openenv.core.Environment`-compatible
+#     Action/Observation/State models so TRL can drive it via
+#     `environment_factory=` (see qubit_medic/server/openenv_adapter.py
+#     and scripts/train_grpo.py). Lightweight; safe in the Spaces image.
+openenv-core>=0.2.1
+
+# --- Training (omit on CPU-only deploy) ---
+# Heavy ML deps live in requirements-train.txt; the env server itself does
+# not import any of them. Keeping them out of the Docker image keeps the
+# Spaces upload under the free-tier image-size limit.