Spaces:

ronitraj
/

QuantumScribe

Sleeping

App Files Files Community

ronitraj committed 12 days ago

Commit

fa68719

verified ·

1 Parent(s): 1c5a86c

deploy via scripts/deploy_to_space.py

Browse files

Files changed (25) hide show

qubit_medic/__pycache__/__init__.cpython-312.pyc +0 -0
qubit_medic/__pycache__/__init__.cpython-314.pyc +0 -0
qubit_medic/__pycache__/config.cpython-312.pyc +0 -0
qubit_medic/__pycache__/config.cpython-314.pyc +0 -0
qubit_medic/__pycache__/models.cpython-312.pyc +0 -0
qubit_medic/__pycache__/models.cpython-314.pyc +0 -0
qubit_medic/__pycache__/prompts.cpython-312.pyc +0 -0
qubit_medic/__pycache__/training_stack.cpython-312.pyc +0 -0
qubit_medic/__pycache__/wandb_utils.cpython-312.pyc +0 -0
qubit_medic/client/__pycache__/__init__.cpython-312.pyc +0 -0
qubit_medic/client/__pycache__/client.cpython-312.pyc +0 -0
qubit_medic/client/client.py +40 -2
qubit_medic/config.py +201 -28
qubit_medic/prompts.py +190 -87
qubit_medic/server/__pycache__/__init__.cpython-312.pyc +0 -0
qubit_medic/server/__pycache__/app.cpython-312.pyc +0 -0
qubit_medic/server/__pycache__/curriculum.cpython-312.pyc +0 -0
qubit_medic/server/__pycache__/environment.cpython-312.pyc +0 -0
qubit_medic/server/__pycache__/openenv_adapter.cpython-312.pyc +0 -0
qubit_medic/server/__pycache__/physics.cpython-312.pyc +0 -0
qubit_medic/server/__pycache__/rewards.cpython-312.pyc +0 -0
qubit_medic/server/app.py +76 -0
qubit_medic/server/environment.py +44 -3
qubit_medic/server/rewards.py +76 -23
qubit_medic/wandb_utils.py +12 -2

qubit_medic/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (818 Bytes). View file

qubit_medic/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (815 Bytes). View file

qubit_medic/__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (9.72 kB). View file

qubit_medic/__pycache__/config.cpython-314.pyc ADDED Viewed

Binary file (10.4 kB). View file

qubit_medic/__pycache__/models.cpython-312.pyc ADDED Viewed

Binary file (5.47 kB). View file

qubit_medic/__pycache__/models.cpython-314.pyc ADDED Viewed

Binary file (5.62 kB). View file

qubit_medic/__pycache__/prompts.cpython-312.pyc ADDED Viewed

Binary file (13.6 kB). View file

qubit_medic/__pycache__/training_stack.cpython-312.pyc ADDED Viewed

Binary file (6.59 kB). View file

qubit_medic/__pycache__/wandb_utils.cpython-312.pyc ADDED Viewed

Binary file (20.3 kB). View file

qubit_medic/client/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (414 Bytes). View file

qubit_medic/client/__pycache__/client.cpython-312.pyc ADDED Viewed

Binary file (9.23 kB). View file

qubit_medic/client/client.py CHANGED Viewed

@@ -28,6 +28,10 @@ class _ClientProtocol(Protocol):
     def reset(self, *, seed: Optional[int] = None,
               forced_level: Optional[str] = None) -> DecoderObservation: ...
     def step(self, *, raw_response: str, episode_id: int) -> StepResult: ...
     def health(self) -> dict: ...
     def close(self) -> None: ...
@@ -89,6 +93,20 @@ class DecoderClient:
             info=dict(obs_payload.get("info", {})),
         )
     def health(self) -> dict:
         r = self._client.get("/health")
         r.raise_for_status()
@@ -100,6 +118,14 @@ class DecoderClient:
         return r.json()
     def close(self) -> None:
         self._client.close()
@@ -117,11 +143,23 @@ class LocalDecoderClient:
     def step(self, *, raw_response: str, episode_id: int) -> StepResult:
         return self._env.step(raw_response=raw_response, episode_id=episode_id)
     def health(self) -> dict:
         return self._env.health()
-    def close(self) -> None:  # nothing to clean up
-        pass
 def make_default_client() -> _ClientProtocol:

     def reset(self, *, seed: Optional[int] = None,
               forced_level: Optional[str] = None) -> DecoderObservation: ...
     def step(self, *, raw_response: str, episode_id: int) -> StepResult: ...
+    # Compliance Section 3 (audit, 2026-04): the client surface must
+    # mirror the server endpoints. state() returns a JSON-serialisable
+    # snapshot; close() releases per-episode bookkeeping.
+    def state(self) -> dict: ...
     def health(self) -> dict: ...
     def close(self) -> None: ...
             info=dict(obs_payload.get("info", {})),
         )
+    def state(self) -> dict:
+        """GET /state on the OpenEnv server.
+        Compliance Section 3 (audit, 2026-04): the client must mirror
+        the server endpoints. We use GET (the OpenEnv canonical method)
+        first, then fall back to POST (the audit-required method we
+        also mounted) if some server build only exposes one of them.
+        """
+        r = self._client.get("/state")
+        if r.status_code == 405:  # method not allowed -> try POST
+            r = self._client.post("/state")
+        r.raise_for_status()
+        return r.json()
     def health(self) -> dict:
         r = self._client.get("/health")
         r.raise_for_status()
         return r.json()
     def close(self) -> None:
+        # Best-effort: tell the server we're done (the POST /close route
+        # is mounted by qubit_medic.server.app) and then release the
+        # local httpx connection pool. If the server doesn't expose
+        # /close, the error response is simply ignored (no
+        # raise_for_status) and transport failures are swallowed - this
+        # remains an idempotent cleanup.
+        try:
+            self._client.post("/close")
+        except Exception:
+            pass
         self._client.close()
     def step(self, *, raw_response: str, episode_id: int) -> StepResult:
         return self._env.step(raw_response=raw_response, episode_id=episode_id)
+    def state(self) -> dict:
+        """Compliance Section 3 (audit, 2026-04): expose env state via
+        the same client surface as the HTTP variant. Delegates to the
+        in-process :meth:`DecoderEnvironment.state`."""
+        return self._env.state()
     def health(self) -> dict:
         return self._env.health()
+    def close(self) -> None:
+        # Compliance Section 3 (audit, 2026-04): close releases any
+        # per-episode bookkeeping on the inner DecoderEnvironment so a
+        # subsequent reset() starts from a clean active-episode dict.
+        try:
+            self._env.close()
+        except Exception:
+            pass
 def make_default_client() -> _ClientProtocol:

qubit_medic/config.py CHANGED Viewed

@@ -111,7 +111,12 @@ CURRICULUM: tuple[CurriculumLevel, ...] = (
         name="L1_warmup",
         distance=DISTANCE_PRIMARY,
         rounds=1,
-        p=0.0001,
         promotion_threshold=0.80,
         eval_size=100,
     ),
@@ -139,9 +144,9 @@ CURRICULUM: tuple[CurriculumLevel, ...] = (
 # --------------------------------------------------------------------------- #
 REWARD_WEIGHTS: dict[str, float] = {
-    "logical_correction": 0.40,    # Reward 1 - the unfakeable ground truth
     "syndrome_consistency": 0.20,  # Reward 2 - prevents lucky-guess attacks
-    "hamming_overlap": 0.20,       # Reward 3 - dense partial credit
     "format_compliance": 0.10,     # Reward 4 - parser must succeed
     "pymatching_beat": 0.10,       # Reward 5 - the headline metric
 }
@@ -163,29 +168,185 @@ PRIMARY_SEED: int = SEEDS[0]
 # --------------------------------------------------------------------------- #
 MODEL_ID: str = "Qwen/Qwen2.5-3B-Instruct"
-"""3B params, 4-bit quantised + LoRA fits in a Colab T4."""
 LORA_R: int = 16
-LORA_ALPHA: int = 32
 LORA_TARGET_MODULES: tuple[str, ...] = ("q_proj", "k_proj", "v_proj", "o_proj")
 SFT_EPOCHS: int = 1
 SFT_BATCH_SIZE: int = 4
-SFT_GRAD_ACCUM: int = 4
-SFT_LR: float = 2e-4
-SFT_DATASET_SIZE: int = 5_000
-SFT_MAX_SEQ_LEN: int = 2048
-GRPO_STEPS: int = 2_000
-GRPO_GEN_PER_PROMPT: int = 4
-GRPO_LR: float = 1e-5
-GRPO_KL_COEF: float = 0.04
-GRPO_MAX_PROMPT_LEN: int = 512
-GRPO_MAX_COMPLETION_LEN: int = 256
-GRPO_CHECKPOINT_EVERY: int = 250
-GRPO_LOG_EVERY: int = 50
 # Decoding sampler defaults at evaluation/format-test time.
 SAMPLE_TEMPERATURE: float = 0.7
 SAMPLE_TOP_P: float = 0.95
@@ -208,10 +369,14 @@ DEFAULT_PORT: int = 7860  # Hugging Face Spaces' default exposed port
 # all log to the same project / dashboard. Override per-run on the CLI.
 import os as _os  # noqa: E402  (local import to keep top of module clean)
-WANDB_PROJECT: str = _os.environ.get("WANDB_PROJECT", "qubit-medic")
-"""Default W&B project name. Override with ``WANDB_PROJECT=...``."""
-WANDB_ENTITY: str | None = _os.environ.get("WANDB_ENTITY") or None
 """W&B team or username. ``None`` -> wandb's default entity for the user."""
 WANDB_DEFAULT_TAGS: tuple[str, ...] = (
@@ -224,17 +389,25 @@ WANDB_DEFAULT_TAGS: tuple[str, ...] = (
 """Tags applied to every W&B run (per-script tags appended on top)."""
 WANDB_LOG_GENERATIONS_EVERY: int = 50
-"""Log a sample-completion table every N GRPO steps."""
-WANDB_SAMPLE_GENERATIONS: int = 8
-"""Number of generations included in each sample-completion table."""
-WANDB_INLOOP_EVAL_EVERY: int = 200
 """Run an in-loop evaluation pass (deterministic, ``WANDB_INLOOP_EVAL_EPISODES``
-syndromes) every N GRPO steps. Set to 0 to disable."""
-WANDB_INLOOP_EVAL_EPISODES: int = 50
-"""Number of held-out syndromes per in-loop eval pass (kept small for speed)."""
 # --------------------------------------------------------------------------- #

         name="L1_warmup",
         distance=DISTANCE_PRIMARY,
         rounds=1,
+        # 0.0005 (was 0.0001) — at the original budget, L1 syndromes were
+        # almost always trivial, dragging the SFT class balance down even
+        # under per-level rejection sampling. Bumping to 0.0005 keeps L1
+        # strictly easier than L2 (p=0.001) while giving the model real
+        # non-empty examples to learn from at the warmup stage.
+        p=0.0005,
         promotion_threshold=0.80,
         eval_size=100,
     ),
 # --------------------------------------------------------------------------- #
 REWARD_WEIGHTS: dict[str, float] = {
+    "logical_correction": 0.35,    # Reward 1 - the unfakeable ground truth
+    "hamming_overlap": 0.25,       # Reward 3 - dense partial credit
     "syndrome_consistency": 0.20,  # Reward 2 - prevents lucky-guess attacks
     "format_compliance": 0.10,     # Reward 4 - parser must succeed
     "pymatching_beat": 0.10,       # Reward 5 - the headline metric
 }
 # --------------------------------------------------------------------------- #
 MODEL_ID: str = "Qwen/Qwen2.5-3B-Instruct"
+"""Locked primary model. 3B params, 4-bit quantised + LoRA fits in a Colab T4.
+Backup is ``Qwen/Qwen2.5-7B-Instruct`` - only swap if format-test < 30%."""
+MODEL_BACKUP_ID: str = "Qwen/Qwen2.5-7B-Instruct"
+"""Only swap to this if the pre-onsite format test fails."""
+# ---- LoRA (shared SFT + GRPO) -------------------------------------------- #
 LORA_R: int = 16
+LORA_ALPHA: int = 32  # 2x rank, standard ratio
+LORA_DROPOUT: float = 0.10
+"""Bumped 0.05 -> 0.10 (2026-04 SFT regularisation) because the prior
+SFT runs converged to a single-output mode (every checkpoint reported
+output_diversity=1) which left GRPO unable to compute non-zero
+within-group reward variance. 0.10 is the spec's first-pass dropout;
+the post-SFT diversity preflight will bump to 0.15 if needed."""
 LORA_TARGET_MODULES: tuple[str, ...] = ("q_proj", "k_proj", "v_proj", "o_proj")
+# ---- SFT warmup phase (master spec, section 1; 2026-04 regularisation) -- #
+# 2026-04 changes (diversity-preserving regularisation): SFT collapsed to
+# a constant-output model under the prior settings (LR=2e-4 + dropout=0.05
+# + max_steps=200 left every checkpoint at output_diversity=1). New
+# defaults trade some ceiling LCR for diversity headroom so GRPO has a
+# reward signal to climb.
 SFT_EPOCHS: int = 1
 SFT_BATCH_SIZE: int = 4
+SFT_GRAD_ACCUM: int = 4              # effective batch = 16
+SFT_LR: float = 1e-4
+"""Halved 2e-4 -> 1e-4 to slow the slide into mode collapse."""
+SFT_LR_SCHEDULER: str = "constant_with_warmup"  # 20-step warmup then constant
+SFT_WARMUP_STEPS: int = 20
+SFT_WEIGHT_DECAY: float = 0.01
+SFT_LABEL_SMOOTHING: float = 0.05
+"""TrainingArguments.label_smoothing_factor; spreads the loss across
+non-target tokens so the model is less rewarded for memorising the
+single highest-likelihood completion."""
+SFT_OPTIMIZER: str = "adamw_8bit"
+SFT_DATASET_SIZE: int = 3_000        # 3,000 train + 100 held-out validation
+SFT_VAL_HOLDOUT: int = 100
+SFT_MAX_SEQ_LEN: int = 1024          # ~300 prompt + ~80 completion + headroom
+SFT_MAX_STEPS: int = 50
+"""Cut 200 -> 50 so SFT stops well before the model can grind itself
+into a single-output mode. The format-only knowledge fits in <50
+steps and post-SFT diversity preflight is the gate to GRPO."""
+SFT_EVAL_EVERY: int = 25             # legacy fallback if no schedule given
+SFT_SAVE_EVERY: int = 25
+SFT_LOG_EVERY: int = 10
+SFT_PREFLIGHT_DIVERSITY_FLOOR: int = 2
+"""eval/output_diversity threshold. If two consecutive evals both report
+output_diversity below this floor, the diversity-collapse early stop
+fires and SFT exits with reason=diversity_collapse."""
+SFT_DIVERSITY_COLLAPSE_RUN_LEN: int = 2
+"""Number of consecutive sub-floor evals required before stopping."""
+SFT_MAX_NEW_TOKENS: int = 200        # generation cap during eval
+# Was 128; bumped to 200 because Qwen2.5-Instruct's cold-start reasoning
+# (### Analysis: 1. ... 2. ... 3. ...) regularly runs to 100+ tokens
+# before reaching the format line in early SFT steps. With 128, every
+# step-5 sample truncated mid-reasoning and format_compliance read 0.
+# 200 gives ~70 tokens of headroom past a typical reasoning + format
+# completion (~70 tokens total) so truncation never masks the model's
+# real behaviour.
+# --- Variable eval cadence ------------------------------------------------- #
+# Early evals are quick sanity checks (small sample, format-only) so a
+# broken parser / generation drift gets caught before ~10 min of compute is
+# burned. Late evals are real measurements with the full sample size.
+# Catching format-compliance failure at step 15 instead of step 50 saves
+# ~7 minutes per fire.
+#
+# Each entry: (step, sample_size, mode) where mode is "format_only" or
+# "full". format_only skips the diversity probe and the physics-heavy
+# logical_correction / hamming / syndrome metrics, so the eval costs
+# ~30 seconds instead of ~2 minutes.
+SFT_EVAL_SCHEDULE: tuple[tuple[int, int, str], ...] = (
+    # 2026-04: schedule rebuilt to fit the SFT_MAX_STEPS=50 budget. One
+    # fast format probe (step 5) followed by four full evals gives the
+    # diversity-collapse early-stop at least two consecutive full-eval
+    # data points before the run ends, which is the minimum to fire the
+    # new run-length-2 stop rule (format_only evals skip the diversity probe).
+    (5,   30,  "format_only"),
+    (15,  50,  "full"),
+    (25,  100, "full"),
+    (40,  100, "full"),
+    (50,  100, "full"),
+)
+SFT_PRINT_SAMPLE_OUTPUTS: int = 5    # raw outputs printed at each eval
+# Early-stop thresholds (master spec, section 3).
+SFT_EARLY_STOP_FORMAT: float = 0.95
+SFT_EARLY_STOP_CORRECTION: float = 0.80
+SFT_EARLY_STOP_DIVERSITY: int = 3
+SFT_MAX_WALL_SECONDS: float = 30 * 60.0  # 30-minute hard ceiling
+# HuggingFace Trainer subfolder (step-50 save) used to initialise GRPO.
+# ``python -m scripts.train_grpo`` defaults to this path; pipeline scripts
+# also pass it explicitly.
+SFT_CHECKPOINT_PATH_FOR_GRPO: str = "checkpoints/sft_warmup/checkpoint-50"
+# ---- GRPO RL phase (master spec, section 5; 2026-04 spec rewrite) -------- #
+# All numbers below were re-pinned by the 2026-04 GRPO spec. The previous
+# defaults (GRPO_STEPS=2000, LR=1e-5, KL=0.04, max_prompt=512,
+# max_completion=256, temperature=0.7) produced a degenerate "always say
+# []" policy in <100 steps because reward variance collapsed and KL
+# saturated the loss. The new defaults emphasise diversity:
+#
+#   - higher temperature (1.2) + top_k + repetition_penalty -> non-collapsed rollouts
+#   - shorter max_completion_length (50) -> the answer is one short line anyway
+#   - longer max_prompt_length (1500) -> distance-3 syndromes already use
+#     ~280 tokens; distance-5 / curriculum L3 needs the headroom
+#   - lower KL coefficient (0.02) -> reward signal not dominated by KL drift
+#   - 1500 steps -> wall-clock fits the 13h cap with margin
+GRPO_STEPS: int = 1_500
+GRPO_GEN_PER_PROMPT: int = 4         # GRPO needs >=2 for advantage
+GRPO_BATCH_SIZE: int = 1             # per-device prompts per step
+GRPO_GRAD_ACCUM: int = 8             # effective batch = 8 prompts
+GRPO_LR: float = 2e-5                # bumped from 1e-5; reward signal is sparse
+GRPO_LR_SCHEDULER: str = "constant"  # no warmup, no decay
+GRPO_KL_COEF: float = 0.02           # half the TRL default; alarm if KL > 0.3
+GRPO_MAX_PROMPT_LEN: int = 1_500     # surface-code prompts can run long
+GRPO_MAX_COMPLETION_LEN: int = 50    # answer is one line: X_ERRORS=[..] Z_ERRORS=[..]
+# ---- Diversity-focused rollout sampling (critical) ----------------------- #
+# These apply to GRPO ROLLOUT generation only. Eval uses temperature=0
+# (greedy) regardless of these. The combination temperature=1.2 + top_p=0.95
+# + top_k=50 + repetition_penalty=1.1 was selected because:
+#   * temperature=1.2 broadens the per-token distribution past the SFT
+#     mode-collapsed favourite ("X_ERRORS=[] Z_ERRORS=[]").
+#   * top_p=0.95 keeps tail tokens in but truncates the long tail.
+#   * top_k=50 caps the candidate set so we don't sample garbage.
+#   * repetition_penalty=1.1 discourages the model from repeating the
+#     exact same byte sequence within a 4-completion group (reduces
+#     "all 4 generations identical" rate, which kills GRPO's gradient).
+GRPO_TEMPERATURE: float = 1.2
+GRPO_TOP_P: float = 0.95
+GRPO_TOP_K: int = 50
+GRPO_REPETITION_PENALTY: float = 1.1
+GRPO_DO_SAMPLE: bool = True
+# ---- Checkpoint cadence + retention -------------------------------------- #
+GRPO_CHECKPOINT_EVERY: int = 100
+GRPO_SAVE_TOTAL_LIMIT: int = 3       # keep 3 most recent rolling checkpoints
+GRPO_LOG_EVERY: int = 5              # real-time visibility (every 5 steps)
+GRPO_OPTIMIZER: str = "adamw_8bit"
+GRPO_KL_ALARM: float = 0.3           # >this triggers manual triage
+GRPO_KL_HARD_CEIL: float = 0.5       # >this -> kill the run
+# ---- Wall-clock safety --------------------------------------------------- #
+GRPO_WALL_SECONDS: float = 46_800.0   # 13 hours. Save+exit if exceeded.
+# ---- Frozen eval set ----------------------------------------------------- #
+# The 200-syndrome eval set is regenerated from the env at GRPO start with
+# this seed. Same seed as SFT validation (sft_validation.jsonl) so eval
+# distributions are comparable across SFT and GRPO. The set is cached on
+# disk under data/grpo_validation.jsonl so reruns hit identical syndromes.
+GRPO_VAL_SEED: int = 4_284
+GRPO_VAL_EPISODES: int = 200
+GRPO_VAL_PATH: str = "data/grpo_validation.jsonl"
+# ---- Sample-table logging ------------------------------------------------ #
+GRPO_SAMPLE_LOG_EVERY: int = 50
+GRPO_SAMPLE_LOG_N: int = 5
+# ---- Anti-hacking: mode-collapse inspection hook ------------------------- #
+# Every N steps, we sample the most-recent N rollouts and check what
+# fraction of prompts had ALL 4 generations identical. If too many
+# prompts collapsed, raise the rollout temperature by a fixed step.
+GRPO_INSPECTION_HOOK_EVERY: int = 100
+GRPO_INSPECTION_SAMPLE_N: int = 10
+GRPO_INSPECTION_COLLAPSE_THRESHOLD: int = 7  # "> 7 of 10"
+GRPO_TEMP_BUMP_ON_COLLAPSE: float = 0.2
+# ---- Decision-rule thresholds (warnings only; no auto-action) ----------- #
+GRPO_DECISION_REWARD_STD_FLOOR: float = 0.03
+GRPO_DECISION_REWARD_STD_CHECK_STEP: int = 50
+GRPO_DECISION_BEAT_RATE_CHECK_STEP: int = 500
+GRPO_DECISION_FORMAT_FLOOR: float = 0.95
+GRPO_DECISION_GRAD_NORM_CEIL: float = 50.0
+GRPO_DECISION_GRAD_NORM_RUN_LEN: int = 3       # consecutive logs
 # Decoding sampler defaults at evaluation/format-test time.
+# (Used by greedy eval paths: temp/top_p only matter when do_sample=True.)
 SAMPLE_TEMPERATURE: float = 0.7
 SAMPLE_TOP_P: float = 0.95
 # all log to the same project / dashboard. Override per-run on the CLI.
 import os as _os  # noqa: E402  (local import to keep top of module clean)
+WANDB_PROJECT: str = _os.environ.get("WANDB_PROJECT", "QuantumScribe-GRPO")
+"""Default W&B project name. Override with ``WANDB_PROJECT=...``.
+Changed 2026-04 from ``"QuantumScribe"`` to ``"QuantumScribe-GRPO"`` per
+the GRPO spec rewrite. SFT runs that should land in the original project
+should set ``WANDB_PROJECT=QuantumScribe`` at the shell."""
+WANDB_ENTITY: str | None = _os.environ.get("WANDB_ENTITY", "ronitraj") or None
 """W&B team or username. Defaults to ``"ronitraj"`` when ``WANDB_ENTITY`` is unset; an empty ``WANDB_ENTITY`` yields ``None`` -> wandb's default entity for the user."""
 WANDB_DEFAULT_TAGS: tuple[str, ...] = (
 """Tags applied to every W&B run (per-script tags appended on top)."""
 WANDB_LOG_GENERATIONS_EVERY: int = 50
+"""Log a sample-completion table every N GRPO steps (master spec sec. 7)."""
+WANDB_SAMPLE_GENERATIONS: int = 5
+"""Number of generations included in each sample-completion table.
+Master spec, section 7: 'Save 5 randomly sampled rollouts ... and their rewards.'"""
+WANDB_INLOOP_EVAL_EVERY: int = 100
 """Run an in-loop evaluation pass (deterministic, ``WANDB_INLOOP_EVAL_EPISODES``
+syndromes) every N GRPO steps. Tightened from 200 -> 100 by the 2026-04 GRPO
+spec rewrite so collapse / drift gets caught within a 5-minute window
+instead of a 15-minute window."""
+WANDB_INLOOP_EVAL_EPISODES: int = 200
+"""Held-out syndromes per in-loop eval pass. Bumped from 50 -> 200 by the
+2026-04 spec rewrite so eval-stat error bars are tight enough to read
+pymatching_beat_rate movement (which is sub-5% in early training)."""
+WANDB_COMPARE_EVERY: int = 500
+"""Run the PyMatching head-to-head comparison every N steps (master spec sec. 7)."""
 # --------------------------------------------------------------------------- #

qubit_medic/prompts.py CHANGED Viewed

@@ -1,19 +1,23 @@
-"""Prompt formatter and action parser (Section 2.3 + Section 2.5 of the plan).
-The prompt is engineered around five sections:
-    1. Role declaration
-    2. Physics summary (~50 tokens, plain English)
-    3. Syndrome data (round-by-round, labelled)
-    4. Output format spec (one example included)
-    5. Reasoning trigger ("think step by step ...")
-Total budget ~250-300 tokens for the prompt; ~150 for the response.
-The parser is deliberately permissive on whitespace and bracket style but
-strict on the existence of the two key tokens ``X_ERRORS`` and ``Z_ERRORS``.
-A partial-credit hook is exposed so Reward 4 can hand out 0.5 for "partly
-parseable".
 """
 from __future__ import annotations
@@ -23,33 +27,30 @@ from typing import Iterable
 # --------------------------------------------------------------------------- #
-# Prompt formatting                                                            #
 # --------------------------------------------------------------------------- #
-_ROLE = (
-    "You are a quantum error-correction decoder. You are decoding errors in "
-    "a distance-{distance} rotated surface code memory experiment."
-)
-_PHYSICS_SUMMARY = (
-    "Stabilizers are parity checks measured every round. A *syndrome bit* "
-    "is 1 when a stabilizer's measurement disagrees with its previous round, "
-    "indicating a nearby physical error. Your job is to look at the syndrome "
-    "history and output the smallest physical error pattern (X-flips and "
-    "Z-flips on data qubits, identified by integer IDs) that explains it."
-)
-_OUTPUT_SPEC = (
-    "Output format (REQUIRED, exact):\n"
-    "    X_ERRORS=[id1,id2,...] Z_ERRORS=[id1,id2,...]\n"
-    "Use empty lists when no errors of that type. Example with no errors:\n"
-    "    X_ERRORS=[] Z_ERRORS=[]"
-)
-_REASONING_TRIGGER = (
-    "Think step by step about which qubits could have caused this syndrome, "
-    "then output your prediction in the required format."
-)
 def format_syndrome_block(
@@ -58,19 +59,33 @@ def format_syndrome_block(
     num_x_stabilizers: int,
     num_z_stabilizers: int,
 ) -> str:
-    """Render the detector activations round-by-round.
-    Stim emits detectors in a flat row-major order: round 0 stabilisers first,
-    then round 1, and so on. We label by round and stabiliser type so the LLM
-    can read the temporal structure.
     """
     bits = list(syndrome_bits)
     per_round = num_x_stabilizers + num_z_stabilizers
-    lines = ["Syndrome (round-by-round):"]
     if per_round == 0 or rounds == 0 or len(bits) == 0:
-        lines.append("    (no detectors fired)")
-        return "\n".join(lines)
     for r in range(rounds):
         offset = r * per_round
         if offset >= len(bits):
@@ -79,18 +94,15 @@ def format_syndrome_block(
         x_chunk = chunk[:num_x_stabilizers]
         z_chunk = chunk[num_x_stabilizers : num_x_stabilizers + num_z_stabilizers]
         lines.append(
-            f"    Round {r + 1} X-stabilizers: "
-            + " ".join(str(b) for b in x_chunk)
         )
         lines.append(
-            f"    Round {r + 1} Z-stabilizers: "
-            + " ".join(str(b) for b in z_chunk)
         )
-    # Trailing block for the final destructive measurement, if any extras.
     used = rounds * per_round
     if used < len(bits):
         tail = bits[used:]
-        lines.append("    Final-round detectors: " + " ".join(str(b) for b in tail))
     return "\n".join(lines)
@@ -104,10 +116,10 @@ def build_prompt(
     num_z_stabilizers: int,
     num_data_qubits: int,
 ) -> str:
-    """Assemble the full prompt the LLM sees on each step.
-    Keeping this function pure (no I/O, no globals) means the SFT pipeline
-    and the GRPO rollout use byte-identical inputs - a critical invariant.
     """
     syndrome_block = format_syndrome_block(
         syndrome_bits=syndrome_bits,
@@ -115,27 +127,53 @@ def build_prompt(
         num_x_stabilizers=num_x_stabilizers,
         num_z_stabilizers=num_z_stabilizers,
     )
-    return (
-        _ROLE.format(distance=distance)
-        + "\n\n"
-        + _PHYSICS_SUMMARY
-        + "\n\n"
-        + f"Code parameters: distance={distance}, rounds={rounds}, "
-        + f"physical_error_rate={p:g}, data_qubits=0..{num_data_qubits - 1}.\n\n"
-        + syndrome_block
-        + "\n\n"
-        + _OUTPUT_SPEC
-        + "\n\n"
-        + _REASONING_TRIGGER
     )
 # --------------------------------------------------------------------------- #
-# Output parsing                                                               #
 # --------------------------------------------------------------------------- #
-_X_PATTERN = re.compile(r"X_ERRORS\s*=\s*\[([^\]]*)\]", re.IGNORECASE)
-_Z_PATTERN = re.compile(r"Z_ERRORS\s*=\s*\[([^\]]*)\]", re.IGNORECASE)
 @dataclass(frozen=True)
@@ -145,13 +183,19 @@ class ParseResult:
     parse_success: bool        # True iff BOTH X_ERRORS and Z_ERRORS parsed cleanly
     parse_partial: bool        # True iff exactly one of the two parsed cleanly
     raw_response: str
     @property
     def format_score(self) -> float:
-        """Score for Reward 4 (format compliance)."""
-        if self.parse_success:
             return 1.0
-        if self.parse_partial:
             return 0.5
         return 0.0
@@ -160,6 +204,8 @@ def _parse_int_list(s: str, max_qubit: int) -> tuple[list[int], bool]:
     """Parse a comma/space-separated integer list. Drops out-of-range and dups.
     Returns ``(qubits_sorted_unique, all_tokens_were_valid)``.
     """
     if not s.strip():
         return [], True
@@ -182,25 +228,77 @@ def _parse_int_list(s: str, max_qubit: int) -> tuple[list[int], bool]:
 def parse_action(raw_response: str, num_data_qubits: int) -> ParseResult:
-    """Convert the LLM's raw text to a ``ParseResult``.
-    Tolerant of trailing chain-of-thought, surrounding code fences, and
-    casing, but strict on the existence of both ``X_ERRORS`` and ``Z_ERRORS``.
     """
     if not isinstance(raw_response, str):
-        return ParseResult([], [], False, False, raw_response="")
-    # If the model wrapped its answer in ```...``` blocks, focus on the last one.
-    fenced = re.findall(r"```(?:[^\n]*)\n(.*?)```", raw_response, re.DOTALL)
-    search_text = fenced[-1] if fenced else raw_response
-    x_match = _X_PATTERN.search(search_text)
-    z_match = _Z_PATTERN.search(search_text)
     x_errors: list[int] = []
     z_errors: list[int] = []
     x_clean = z_clean = False
     if x_match is not None:
         x_errors, x_clean = _parse_int_list(x_match.group(1), num_data_qubits)
     if z_match is not None:
@@ -214,12 +312,17 @@ def parse_action(raw_response: str, num_data_qubits: int) -> ParseResult:
         (x_match is not None and z_match is not None) and not parse_success
     )
     return ParseResult(
         x_errors=x_errors,
         z_errors=z_errors,
         parse_success=parse_success,
         parse_partial=parse_partial,
         raw_response=raw_response,
     )

+"""Locked prompt template + parser (master spec, sections 4 + parser).
+This module is the *single source of truth* for what the LLM sees during
+SFT and GRPO. The exact wording is fixed: anything that drifts the prompt
+between phases throws away the SFT investment because RL builds on the
+format SFT taught.
+Spec sections honoured:
+* Section 4 - "The exact prompt template (locked, for both SFT and RL)"
+* Section 4 - "The {syndrome_block} format" (round-by-round, X first then Z)
+* Section 4 - "The parser specification (critical)"
+Parser highlights
+-----------------
+* Case-insensitive on ``X_ERRORS``/``Z_ERRORS`` keys.
+* Tolerant of trailing chain-of-thought, code fences, and whitespace.
+* **Takes the LAST occurrence** of ``X_ERRORS`` so the literal example
+  inside the prompt's "Examples:" block is never confused for the answer.
+* Validates each id against ``[0, max_qubit_id]`` and dedups within a list.
+* Returns a partial-credit score (1.0 / 0.5 / 0.0) for Reward 4.
 """
 from __future__ import annotations
 # --------------------------------------------------------------------------- #
+# Prompt template (LOCKED - see master spec, section 4)                       #
 # --------------------------------------------------------------------------- #
+_PROMPT_TEMPLATE = """You are an expert quantum error correction decoder. Your job is to identify which data qubits experienced errors based on syndrome measurements.
+A surface code protects 1 logical qubit using {num_data_qubits} data qubits arranged in a {distance}x{distance} grid. Stabilizer measurements detect errors: a '1' means that stabilizer fired (detected something wrong nearby); a '0' means it looks fine. Errors must be deduced from the pattern of stabilizers that fired.
+Code distance: {distance}
+Number of stabilizer rounds: {rounds}
+Physical error rate: {p}
+X-stabilizer count per round: {num_x_stabilizers}
+Z-stabilizer count per round: {num_z_stabilizers}
+{syndrome_block}
+Identify which data qubits (numbered 0-{max_qubit_id}) had X-errors and Z-errors. Most syndromes have 0-2 errors; an empty list means no errors of that type.
+Output exactly ONE line and nothing else. Do not write reasoning, markdown, bullets, analysis, or explanations. Your entire response must match this exact format:
+X_ERRORS=[qubit_ids] Z_ERRORS=[qubit_ids]
+Valid one-line examples:
+X_ERRORS=[] Z_ERRORS=[]
+X_ERRORS=[] Z_ERRORS=[4]
+X_ERRORS=[2] Z_ERRORS=[5,6]"""
 def format_syndrome_block(
     num_x_stabilizers: int,
     num_z_stabilizers: int,
 ) -> str:
+    """Render detector activations round-by-round, exactly per the spec.
+    Format example for distance-3, rounds=3:
+        Round 1 X-stabilizers: 0 0 1 0
+        Round 1 Z-stabilizers: 0 0 0 0
+        Round 2 X-stabilizers: 0 0 1 0
+        Round 2 Z-stabilizers: 0 0 0 0
+        Round 3 X-stabilizers: 0 0 0 0
+        Round 3 Z-stabilizers: 0 0 0 0
+    Every round on its own line, X first then Z, space-separated bits, no
+    indent, no commas. Rounds are always emitted in full even when all
+    bits are zero so the LLM sees consistent shape.
+    Stim's detector layout for the rotated-memory experiment is row-major:
+    round 0 stabilizers first, then round 1, and so on. For each round it
+    interleaves the per-type detectors in the order Stim's circuit was
+    generated (we treat the first ``num_x_stabilizers`` per round as X
+    and the rest as Z, matching ``per_round_x_z_counts``).
     """
     bits = list(syndrome_bits)
     per_round = num_x_stabilizers + num_z_stabilizers
     if per_round == 0 or rounds == 0 or len(bits) == 0:
+        return "(no detectors fired)"
+    lines: list[str] = []
     for r in range(rounds):
         offset = r * per_round
         if offset >= len(bits):
         x_chunk = chunk[:num_x_stabilizers]
         z_chunk = chunk[num_x_stabilizers : num_x_stabilizers + num_z_stabilizers]
         lines.append(
+            f"Round {r + 1} X-stabilizers: " + " ".join(str(int(b)) for b in x_chunk)
         )
         lines.append(
+            f"Round {r + 1} Z-stabilizers: " + " ".join(str(int(b)) for b in z_chunk)
         )
     used = rounds * per_round
     if used < len(bits):
         tail = bits[used:]
+        lines.append("Final-round detectors: " + " ".join(str(int(b)) for b in tail))
     return "\n".join(lines)
     num_z_stabilizers: int,
     num_data_qubits: int,
 ) -> str:
+    """Assemble the locked prompt the LLM sees on each step.
+    Pure function (no I/O, no globals) so the SFT pipeline and GRPO
+    rollout produce byte-identical prompt strings - a critical invariant.
     """
     syndrome_block = format_syndrome_block(
         syndrome_bits=syndrome_bits,
         num_x_stabilizers=num_x_stabilizers,
         num_z_stabilizers=num_z_stabilizers,
     )
+    return _PROMPT_TEMPLATE.format(
+        num_data_qubits=num_data_qubits,
+        distance=distance,
+        rounds=rounds,
+        p=p,
+        num_x_stabilizers=num_x_stabilizers,
+        num_z_stabilizers=num_z_stabilizers,
+        syndrome_block=syndrome_block,
+        max_qubit_id=num_data_qubits - 1,
     )
 # --------------------------------------------------------------------------- #
+# Output parsing (LOCKED - see master spec, section 4 "Parser specification") #
 # --------------------------------------------------------------------------- #
+#
+# Two-tier parser:
+#   * STRICT  - canonical "X_ERRORS=[...] Z_ERRORS=[...]". Only this form
+#     scores 1.0 on Reward 4 (format_compliance), so the GRPO signal still
+#     pushes the model toward the locked spec wording.
+#   * LENIENT - also accepts ":" instead of "=", "()" instead of "[]",
+#               "X-ERRORS" / "X ERRORS" key spellings, and tolerates
+#               \boxed{...} / **...** wrapping. Used so eval/metrics see
+#               the model's actual *answer* whenever it is extractable,
+#               instead of silently treating parse failures as
+#               "predict no errors" (which hides the bug at p=0.001 where
+#               ~95% of syndromes are trivial and an empty prediction is
+#               accidentally correct).
+# Strict canonical form: "=" + "[]" - required for Reward 4 = 1.0.
+_X_PATTERN_STRICT = re.compile(r"X_ERRORS\s*=\s*\[([^\]]*)\]", re.IGNORECASE)
+_Z_PATTERN_STRICT = re.compile(r"Z_ERRORS\s*=\s*\[([^\]]*)\]", re.IGNORECASE)
+# Lenient form: "=" or ":" separator, "[]" or "()" brackets, and the key may
+# be spelt "X_ERRORS" / "X-ERRORS" / "X ERRORS" / "XERRORS".
+_X_PATTERN_LENIENT = re.compile(
+    r"X[\s_\-]*ERRORS\s*[=:]\s*[\[\(]([^\]\)]*)[\]\)]",
+    re.IGNORECASE,
+)
+_Z_PATTERN_LENIENT = re.compile(
+    r"Z[\s_\-]*ERRORS\s*[=:]\s*[\[\(]([^\]\)]*)[\]\)]",
+    re.IGNORECASE,
+)
+# Key locator (lenient) - finds where any X-errors keyword starts so we can
+# slice past in-prompt examples and home in on the model's actual answer.
+_X_KEY = re.compile(r"X[\s_\-]*ERRORS", re.IGNORECASE)
 @dataclass(frozen=True)
     parse_success: bool        # True iff BOTH X_ERRORS and Z_ERRORS parsed cleanly
     parse_partial: bool        # True iff exactly one of the two parsed cleanly
     raw_response: str
+    strict_format: bool = False  # True iff matched the canonical "=" + "[]" form
     @property
     def format_score(self) -> float:
+        """Score for Reward 4 (format compliance).
+        Only the canonical strict form earns 1.0, so the GRPO reward stays
+        anchored to the locked spec wording. Lenient parses or partials
+        score 0.5; total miss scores 0.0.
+        """
+        if self.parse_success and self.strict_format:
             return 1.0
+        if self.parse_success or self.parse_partial:
             return 0.5
         return 0.0
     """Parse a comma/space-separated integer list. Drops out-of-range and dups.
     Returns ``(qubits_sorted_unique, all_tokens_were_valid)``.
+    A token is "invalid" if it isn't an integer or falls outside ``[0, max_qubit)``.
+    Duplicates within a list count as silently de-duped, not invalid.
     """
     if not s.strip():
         return [], True
 def parse_action(raw_response: str, num_data_qubits: int) -> ParseResult:
+    """Convert the LLM's raw text to a :class:`ParseResult`.
+    Algorithm (strict pattern first, lenient pattern as fallback):
+      1. Receive the full model response string; normalise common LaTeX/
+         markdown wrappers (``\\boxed{...}``, ``**bold**``).
+      2. If the model wrapped output in fenced code blocks, focus on the
+         LAST fenced block.
+      3. Locate all X-errors keys; slice forward from the LAST one (so the
+         example block in the prompt never wins).
+      4. Try the STRICT pattern (``X_ERRORS=[...]``) first. If both X and Z
+         lists match, ``strict_format=True``.
+      5. Otherwise try the LENIENT pattern (``=`` or ``:``, ``[]`` or ``()``)
+         so a near-miss like ``X_ERRORS: [1]`` still surfaces the model's
+         intended prediction.
+      6. Validate every parsed integer is in ``[0, max_qubit_id]``;
+         duplicates within a list are silently de-duplicated and do not
+         make the list invalid.
+      7. ``parse_success`` requires BOTH lists to parse cleanly;
+         ``parse_partial`` is set when exactly one parsed cleanly OR both
+         keys appear but tokens were dirty.
+    The lenient fallback exists for *eval/diagnostic honesty*, not to
+    weaken the training signal: ``format_score`` (Reward 4) only returns
+    1.0 when ``strict_format`` is also True.
     """
     if not isinstance(raw_response, str):
+        return ParseResult([], [], False, False, raw_response="", strict_format=False)
+    # 1: normalise common wrappers so the regex sees the inner content.
+    normalised = raw_response
+    # Strip \boxed{...} (LaTeX) - keep inner text.
+    normalised = re.sub(r"\\boxed\{([^{}]*)\}", r"\1", normalised)
+    # Strip surrounding **bold** / *italic* markers around the format block.
+    normalised = re.sub(r"\*+([A-Za-z_][^*]{0,40})\*+", r"\1", normalised)
+    # 2: fence handling - prefer last fenced block if present.
+    fenced = re.findall(r"```(?:[^\n]*)\n(.*?)```", normalised, re.DOTALL)
+    search_text = fenced[-1] if fenced else normalised
+    # 3: find the LAST X-errors key occurrence.
+    x_keys = list(_X_KEY.finditer(search_text))
+    if x_keys:
+        last_x_pos = x_keys[-1].start()
+        slice_text = search_text[last_x_pos:]
+        # If the last key has no payload (truncated), fall back one.
+        if (
+            not _X_PATTERN_STRICT.search(slice_text)
+            and not _X_PATTERN_LENIENT.search(slice_text)
+            and len(x_keys) > 1
+        ):
+            last_x_pos = x_keys[-2].start()
+            slice_text = search_text[last_x_pos:]
+    else:
+        slice_text = search_text
+    # 4-5: try strict, then lenient.
+    x_match = _X_PATTERN_STRICT.search(slice_text)
+    z_matches_strict = list(_Z_PATTERN_STRICT.finditer(slice_text))
+    z_match = z_matches_strict[-1] if z_matches_strict else None
+    strict_x = x_match is not None
+    strict_z = z_match is not None
+    if x_match is None:
+        x_match = _X_PATTERN_LENIENT.search(slice_text)
+    if z_match is None:
+        z_matches_lenient = list(_Z_PATTERN_LENIENT.finditer(slice_text))
+        z_match = z_matches_lenient[-1] if z_matches_lenient else None
+    # 6: extract + validate qubit IDs.
     x_errors: list[int] = []
     z_errors: list[int] = []
     x_clean = z_clean = False
     if x_match is not None:
         x_errors, x_clean = _parse_int_list(x_match.group(1), num_data_qubits)
     if z_match is not None:
         (x_match is not None and z_match is not None) and not parse_success
     )
+    # strict_format is true only when BOTH X and Z hit the canonical pattern
+    # cleanly (no garbage tokens, no out-of-range qubits).
+    strict_format = bool(strict_x and strict_z and parse_success)
     return ParseResult(
         x_errors=x_errors,
         z_errors=z_errors,
         parse_success=parse_success,
         parse_partial=parse_partial,
         raw_response=raw_response,
+        strict_format=strict_format,
     )

qubit_medic/server/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (393 Bytes). View file

qubit_medic/server/__pycache__/app.cpython-312.pyc ADDED Viewed

Binary file (9.51 kB). View file

qubit_medic/server/__pycache__/curriculum.cpython-312.pyc ADDED Viewed

Binary file (5.55 kB). View file

qubit_medic/server/__pycache__/environment.cpython-312.pyc ADDED Viewed

Binary file (14.2 kB). View file

qubit_medic/server/__pycache__/openenv_adapter.cpython-312.pyc ADDED Viewed

Binary file (12.4 kB). View file

qubit_medic/server/__pycache__/physics.cpython-312.pyc ADDED Viewed

Binary file (19.9 kB). View file

qubit_medic/server/__pycache__/rewards.cpython-312.pyc ADDED Viewed

Binary file (12.7 kB). View file

qubit_medic/server/app.py CHANGED Viewed

@@ -6,6 +6,8 @@ routes (``/reset``, ``/step``, ``/state``, ``/health``, ``/schema``,
 We add a few extras on top:
 * ``GET  /healthz``   - the Day-0 deployment-substrate liveness probe
   (returns Stim/PyMatching/openenv versions). Used by the recurring
   4-hour HF Spaces wakeup ping.
@@ -24,6 +26,7 @@ import sys
 from typing import Optional
 from fastapi import Body, HTTPException
 from openenv.core import create_fastapi_app
 from qubit_medic.config import DEFAULT_HOST, DEFAULT_PORT
@@ -60,6 +63,44 @@ app.description = (
 )
 # --------------------------------------------------------------------------- #
 # Day-0 + demo extras                                                          #
 # --------------------------------------------------------------------------- #
@@ -79,6 +120,41 @@ def _get_legacy_env() -> DecoderEnvironment:
     return _legacy_env
 @app.get("/healthz")
 def healthz() -> dict:
     """Lightweight liveness probe (Day-0 deployment-substrate test).

 We add a few extras on top:
+* ``GET  /``          - HTML landing page (HF Spaces **App** tab); links to
+  ``/docs``, ``/healthz``, ``/metadata`` (avoids 404 on the root URL).
 * ``GET  /healthz``   - the Day-0 deployment-substrate liveness probe
   (returns Stim/PyMatching/openenv versions). Used by the recurring
   4-hour HF Spaces wakeup ping.
 from typing import Optional
 from fastapi import Body, HTTPException
+from fastapi.responses import HTMLResponse
 from openenv.core import create_fastapi_app
 from qubit_medic.config import DEFAULT_HOST, DEFAULT_PORT
 )
+@app.get("/", response_class=HTMLResponse, include_in_schema=False)
+def root() -> str:
+    """Space + browser landing page (HF opens ``/`` in the App tab).
+    The OpenEnv API lives under ``/reset``, ``/step``, etc.; there was no
+    root handler, so visitors saw 404. This page links to docs and health.
+    """
+    return """<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8"/>
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <title>Qubit-Medic OpenEnv</title>
+  <style>
+    body { font-family: system-ui, sans-serif; max-width: 40rem; margin: 2rem auto; padding: 0 1rem; line-height: 1.5; color: #1e293b; }
+    h1 { font-size: 1.5rem; }
+    ul { padding-left: 1.2rem; }
+    a { color: #2563eb; }
+    code { background: #f1f5f9; padding: 0.1em 0.3em; border-radius: 4px; }
+  </style>
+</head>
+<body>
+  <h1>Qubit-Medic — OpenEnv server</h1>
+  <p>This Space exposes a <strong>JSON API</strong> for the quantum error-decoding
+  environment (Stim + PyMatching, OpenEnv contract). There is no full-page
+  Gradio UI here; use the links below.</p>
+  <ul>
+    <li><a href="/docs">Interactive API docs (Swagger)</a></li>
+    <li><a href="/redoc">ReDoc</a></li>
+    <li><a href="/healthz">Liveness <code>GET /healthz</code></a> — versions probe</li>
+    <li><a href="/metadata">OpenEnv <code>GET /metadata</code></a></li>
+  </ul>
+  <p>Typical flow: <code>POST /reset</code> then <code>POST /step</code> with
+  the model&rsquo;s text action — see the schema in <code>/docs</code>.</p>
+</body>
+</html>"""
 # --------------------------------------------------------------------------- #
 # Day-0 + demo extras                                                          #
 # --------------------------------------------------------------------------- #
     return _legacy_env
+# --------------------------------------------------------------------------- #
+# Compliance Section 2 (audit 2026-04): POST /state and POST /close.          #
+# --------------------------------------------------------------------------- #
+# OpenEnv's create_fastapi_app already mounts GET /state and (via the
+# canonical contract) does not expose /close at all. The participant-guide
+# audit explicitly requires POST /state and POST /close, so we surface
+# both as additional routes that delegate to the legacy DecoderEnvironment
+# singleton (the same one /decode already uses). The OpenEnv-canonical
+# GET /state route is preserved untouched.
+# --------------------------------------------------------------------------- #
+@app.post("/state")
+def post_state() -> dict:
+    """POST mirror of the OpenEnv GET /state route.
+    Returns a JSON-serialisable snapshot of env state. Uses the inner
+    :meth:`DecoderEnvironment.state` (added in Section 1 compliance work)
+    which excludes ground-truth fields by construction.
+    """
+    return _get_legacy_env().state()
+@app.post("/close")
+def post_close() -> dict:
+    """POST /close: drop in-flight episodes on the legacy env singleton.
+    Only the env's active-episode table is cleared; the singleton itself
+    stays alive and can be re-used, so calling /close repeatedly is
+    idempotent. Returns a small JSON dict so the caller can confirm the
+    request landed.
+    """
+    _get_legacy_env().close()
+    return {"ok": True, "closed": True}
 @app.get("/healthz")
 def healthz() -> dict:
     """Lightweight liveness probe (Day-0 deployment-substrate test).

qubit_medic/server/environment.py CHANGED Viewed

@@ -201,9 +201,17 @@ class DecoderEnvironment:
         with self._lock:
             episode = self._active.pop(episode_id, None)
             if episode is None:
-                # Calling step() on an unknown episode ID is a hard error -
-                # the trainer didn't follow reset/step pairing.
-                raise KeyError(f"unknown or already-finished episode {episode_id}")
             elapsed = time.monotonic() - episode.started_at
             timed_out = elapsed > EPISODE_TIMEOUT_SECONDS
@@ -312,3 +320,36 @@ class DecoderEnvironment:
                 "curriculum": self._scheduler.stats(),
                 "cached_levels": list(self._caches.keys()),
             }

         with self._lock:
             episode = self._active.pop(episode_id, None)
             if episode is None:
+                # Calling step() on an unknown episode ID is a clean
+                # ValueError (compliance Section 1 of the participant-guide
+                # audit: the env must "raise a clean ValueError, not a
+                # Python traceback"). The trainer didn't follow reset/step
+                # pairing, or the episode already ended; either way we
+                # surface a typed exception so the FastAPI layer can turn
+                # it into a 400 response instead of a 500.
+                raise ValueError(
+                    f"unknown or already-finished episode {episode_id}; "
+                    f"call reset() before step()."
+                )
             elapsed = time.monotonic() - episode.started_at
             timed_out = elapsed > EPISODE_TIMEOUT_SECONDS
                 "curriculum": self._scheduler.stats(),
                 "cached_levels": list(self._caches.keys()),
             }
+    def state(self) -> dict:
+        """Return a JSON-serialisable snapshot of the env's externally-
+        visible state (compliance Section 1 of the participant-guide
+        audit: ``state()`` returns a JSON-serialisable object, not a raw
+        Python object).
+        Crucially this never includes the ground-truth fields stored on
+        the per-episode :class:`DecoderState` (true error patterns,
+        actual_observable_flip, pymatching_observable_pred, circuit_text,
+        dem_text). Those stay in ``self._active[ep].state`` and are only
+        consumed by the reward functions.
+        """
+        with self._lock:
+            return {
+                "episodes_started": int(self._episode_counter),
+                "active_episodes": int(len(self._active)),
+                "active_episode_ids": [int(ep) for ep in self._active.keys()],
+                "cached_levels": list(self._caches.keys()),
+                "curriculum": self._scheduler.stats(),
+                "base_seed": int(self._base_seed),
+            }
+    def close(self) -> None:
+        """Drop any in-flight episodes (cached levels are left untouched).
+        Compliance Section 1: the gym-style API requires ``close()``.
+        After ``close()`` the env can still be re-used by calling
+        ``reset()`` again - we don't tear down the curriculum scheduler
+        or release the lock; we only release per-episode bookkeeping.
+        """
+        with self._lock:
+            self._active.clear()

qubit_medic/server/rewards.py CHANGED Viewed

@@ -84,13 +84,21 @@ def reward_syndrome_consistency(
 ) -> float:
     """How well does the predicted Pauli frame reproduce the FINAL detectors?
-    Computes Hamming similarity between ``predicted_final_bits`` (induced by
-    the predicted X errors) and ``observed_final_bits``. Returns
     ``1 - hamming_distance / num_final_detectors``.
     Rationale (Section 3.2): without this term, an LLM that lucky-guesses
-    the right qubits could get Reward 1 occasionally; this signal forces it
-    to also explain the data the syndrome carries.
     """
     final_dets = layout.final_detectors
     if not final_dets:
@@ -104,7 +112,17 @@ def reward_syndrome_consistency(
         predicted = implied.get(det_idx, 0)
         if observed != predicted:
             distance += 1
-    return 1.0 - distance / len(final_dets)
 def compute_final_detector_supports(
@@ -141,13 +159,37 @@ def compute_final_detector_supports(
 # --------------------------------------------------------------------------- #
-def _jaccard(a: list[int], b: list[int]) -> float:
-    """Jaccard index. Returns 1.0 when both sets are empty (perfect agreement)."""
-    sa, sb = set(a), set(b)
-    if not sa and not sb:
-        return 1.0
-    inter = len(sa & sb)
-    union = len(sa | sb)
     return inter / union if union else 1.0
@@ -156,16 +198,19 @@ def reward_hamming_overlap(
     sample: SyndromeSample,
     layout: CircuitLayout,
 ) -> float:
-    """Average of Jaccard(X) and Jaccard(Z) against the reference frame.
-    Reference is PyMatching's per-edge predicted Pauli frame
-    (``sample.pymatching_x_errors`` / ``..._z_errors``). This is the dense
-    partial-credit signal of Section 3.3 - even if Reward 1 fires zero,
-    being *close* to the canonical solution still gets credit, smoothing
-    the reward landscape during early training.
     """
-    jx = _jaccard(parsed.x_errors, sample.pymatching_x_errors)
-    jz = _jaccard(parsed.z_errors, sample.pymatching_z_errors)
     return 0.5 * (jx + jz)
@@ -175,8 +220,16 @@ def reward_hamming_overlap(
 def reward_format_compliance(parsed: ParseResult) -> float:
-    """1.0 if both keys parsed, 0.5 if exactly one, 0.0 if neither."""
-    return parsed.format_score
 # --------------------------------------------------------------------------- #

 ) -> float:
     """How well does the predicted Pauli frame reproduce the FINAL detectors?
+    Computes Hamming similarity between ``predicted_final_bits`` (induced
+    by the predicted X errors) and ``observed_final_bits``. Returns
     ``1 - hamming_distance / num_final_detectors``.
     Rationale (Section 3.2): without this term, an LLM that lucky-guesses
+    the right qubits could get Reward 1 occasionally; this signal forces
+    it to also explain the data the syndrome carries.
+    2026-04 anti-collapse cap (FIX 1, RL spec rewrite): if the prediction
+    is empty AND the observed syndrome is non-empty (at least one
+    detector fired), cap the score at 0.5. Without this cap, the
+    "always predict empty" policy can still pull a high syndrome-
+    consistency score on the prompts where the implied final-round bits
+    happen to coincide with zeros, which kept GRPO trapped in the
+    constant-empty mode.
     """
     final_dets = layout.final_detectors
     if not final_dets:
         predicted = implied.get(det_idx, 0)
         if observed != predicted:
             distance += 1
+    base = 1.0 - distance / len(final_dets)
+    # Anti-collapse cap: empty prediction + non-empty observed syndrome
+    # is a "did nothing while alarms were firing" failure mode. Cap at
+    # 0.5 so the empty policy can never approach the full 1.0 even when
+    # the implied final-round bits happen to coincide.
+    pred_is_empty = (not parsed.x_errors) and (not parsed.z_errors)
+    has_active_syndrome = any(int(b) != 0 for b in sample.syndrome_bits)
+    if pred_is_empty and has_active_syndrome:
+        return min(base, 0.5)
+    return base
 def compute_final_detector_supports(
 # --------------------------------------------------------------------------- #
+def _set_aware_jaccard(true_set: list[int], pred_set: list[int]) -> float:
+    """Set-aware Jaccard: penalises BOTH false alarms and missed errors.
+    2026-04 spec rewrite (FIX 1). The four-case rule is what makes
+    "predict empty everywhere" stop being a near-optimal strategy:
+    +-------------+-----------+-----------------------------------------+
+    | true_set    | pred_set  | score                                   |
+    +-------------+-----------+-----------------------------------------+
+    | empty       | empty     | 1.0   (perfect, "no errors -> no edit") |
+    | empty       | non-empty | 0.0   false alarm                       |
+    | non-empty   | empty     | 0.0   missed errors  <-- the key change |
+    | non-empty   | non-empty | |inter| / |union|  (standard Jaccard)   |
+    +-------------+-----------+-----------------------------------------+
+    Note: a plain Jaccard index already evaluates to 0.0 in the second
+    and third cases (empty intersection over a non-empty union), so the
+    explicit branches below spell the rule out rather than change the
+    numeric result of the previous ``_jaccard`` helper. Making the cases
+    explicit documents the guarantee that a missed-error answer scores
+    0.0, so the empty-everywhere policy earns nothing on this term
+    whenever the reference set is non-empty.
+    """
+    sa, sp = set(true_set), set(pred_set)
+    if not sa and not sp:
+        return 1.0  # perfect agreement: no true errors AND no claimed errors
+    if not sa and sp:
+        return 0.0  # false alarm: claimed errors that were not there
+    if sa and not sp:
+        return 0.0  # missed errors: alarms fired but model said nothing
+    inter = len(sa & sp)
+    union = len(sa | sp)
     return inter / union if union else 1.0
     sample: SyndromeSample,
     layout: CircuitLayout,
 ) -> float:
+    """Average of set-aware Jaccard(X) and set-aware Jaccard(Z) against
+    the reference Pauli frame carried by ``SyndromeSample``.
+    The reference frame lives on
+    ``sample.pymatching_x_errors`` / ``sample.pymatching_z_errors`` —
+    in this codebase that frame is treated as the ground-truth target
+    (the SFT/GRPO dataset builders fill it from the same source as the
+    JSONL ``true_x_errors`` / ``true_z_errors`` fields). Per-axis score
+    uses the set-aware rule (see :func:`_set_aware_jaccard`), so an empty
+    prediction scores 0.0 on any axis whose reference set is non-empty.
     """
+    jx = _set_aware_jaccard(sample.pymatching_x_errors, parsed.x_errors)
+    jz = _set_aware_jaccard(sample.pymatching_z_errors, parsed.z_errors)
     return 0.5 * (jx + jz)
 def reward_format_compliance(parsed: ParseResult) -> float:
+    """Binary {0.0, 1.0}: 1.0 iff the parser fully extracted both lists.
+    2026-04 spec rewrite (FIX 1): partial credit (0.5) is removed. With
+    partial credit on, the model could still earn ~half the format
+    weight on garbage outputs that resembled the canonical form, which
+    is part of what kept the reward landscape too flat for GRPO to
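+    escape the empty-everywhere mode. The new rule rewards only a
+    cleanly-parsed answer.
+    NOTE(review): this checks ``parse_success`` only and ignores
+    ``ParseResult.strict_format``, so a lenient-form parse also earns
+    1.0 here, whereas ``ParseResult.format_score`` reserves 1.0 for the
+    strict canonical form - confirm which rule Reward 4 should follow.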
+    """
+    return 1.0 if parsed.parse_success else 0.0
 # --------------------------------------------------------------------------- #

qubit_medic/wandb_utils.py CHANGED Viewed

@@ -260,12 +260,22 @@ def run_context(run_name: str, job_type: str, **kwargs):
 def log(metrics: Mapping[str, Any], *, step: Optional[int] = None,
         commit: bool = True) -> None:
-    """No-op-safe ``wandb.log`` wrapper."""
     wandb = _import_wandb()
     if wandb is None or _RUN is None:
         return
     try:
-        wandb.log(dict(metrics), step=step, commit=commit)
     except Exception as exc:  # pragma: no cover - defensive
         print(f"[wandb] log failed: {exc}", file=sys.stderr)

 def log(metrics: Mapping[str, Any], *, step: Optional[int] = None,
         commit: bool = True) -> None:
+    """No-op-safe ``wandb.log`` wrapper.
+    We store training-step alignment as an explicit scalar
+    ``train/global_step`` instead of passing W&B's reserved ``step=`` value.
+    HuggingFace/TRL may advance W&B's internal step before our callback logs,
+    which otherwise produces "Tried to log to step N that is less than the
+    current step N+1" and drops eval metrics.
+    """
     wandb = _import_wandb()
     if wandb is None or _RUN is None:
         return
     try:
+        payload = dict(metrics)
+        if step is not None and "train/global_step" not in payload:
+            payload["train/global_step"] = int(step)
+        wandb.log(payload, commit=commit)
     except Exception as exc:  # pragma: no cover - defensive
         print(f"[wandb] log failed: {exc}", file=sys.stderr)