anugrah55 committed
Commit 5f78183 · verified · 1 Parent(s): ada94e4

Update CERNenv Space

Files changed (49)
  1. .dockerignore +9 -0
  2. .gitignore +21 -0
  3. .python-version +1 -0
  4. Dockerfile +31 -0
  5. README.md +58 -4
  6. [External] Apr ‘26 OpenEnv Hackathon Themes & Judging Criteria.txt +190 -0
  7. [External] Meta OpenEnv Hackathon Participant Help Guide.txt +291 -0
  8. client.py +37 -0
  9. models.py +600 -0
  10. openenv.yaml +6 -0
  11. pyproject.toml +61 -0
  12. scripts/__init__.py +0 -0
  13. scripts/_build_spaces.py +135 -0
  14. scripts/baseline_agents.py +305 -0
  15. scripts/push_to_hub.py +247 -0
  16. scripts/run_agent.py +129 -0
  17. server/Dockerfile +50 -0
  18. server/__init__.py +1 -0
  19. server/app.py +52 -0
  20. server/environment.py +363 -0
  21. server/requirements.txt +6 -0
  22. server/rewards/__init__.py +19 -0
  23. server/rewards/reward_function.py +283 -0
  24. server/rules/__init__.py +5 -0
  25. server/rules/engine.py +203 -0
  26. server/simulator/__init__.py +31 -0
  27. server/simulator/latent_state.py +171 -0
  28. server/simulator/noise.py +161 -0
  29. server/simulator/output_generator.py +586 -0
  30. server/simulator/transition.py +197 -0
  31. server/tasks/__init__.py +9 -0
  32. server/tasks/scenarios.py +422 -0
  33. space/__init__.py +0 -0
  34. space/env/Dockerfile +24 -0
  35. space/env/README.md +51 -0
  36. space/env/requirements.txt +6 -0
  37. space/training/Dockerfile +31 -0
  38. space/training/README.md +64 -0
  39. space/training/__init__.py +0 -0
  40. space/training/app.py +412 -0
  41. space/training/requirements.txt +18 -0
  42. training/__init__.py +1 -0
  43. training/colab_train_unsloth.ipynb +260 -0
  44. training/evaluate.py +152 -0
  45. training/llm_agent.py +227 -0
  46. training/plots.py +93 -0
  47. training/rollouts.py +160 -0
  48. training/training_script.py +211 -0
  49. training/training_unsloth.py +130 -0
.dockerignore ADDED
@@ -0,0 +1,9 @@
+ .git
+ .venv
+ __pycache__
+ *.pyc
+ .pytest_cache
+ training/runs
+ training/grpo-output
+ training/rollouts
+ notebooks/.ipynb_checkpoints
.gitignore ADDED
@@ -0,0 +1,21 @@
+ __pycache__/
+ *.pyc
+ .venv/
+ .env
+ .pytest_cache/
+ .coverage
+ htmlcov/
+ .DS_Store
+ training/runs/
+ training/grpo-output/
+ training/rollouts/
+ training/plots/
+ *.png
+ !docs/*.png
+ !assets/*.png
+ .ipynb_checkpoints/
+ .uv/
+ uv.lock
+ dist/
+ build/
+ *.egg-info/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.11
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ # CERNenv trainer Space (Docker, A100)
+ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+
+ ENV DEBIAN_FRONTEND=noninteractive \
+     PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     HF_HOME=/home/user/.cache/huggingface \
+     TRANSFORMERS_CACHE=/home/user/.cache/huggingface/transformers \
+     PYTHONPATH=/home/user/app
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     python3.11 python3.11-venv python3.11-dev python3-pip \
+     git curl ca-certificates build-essential \
+     && rm -rf /var/lib/apt/lists/* \
+     && ln -sf /usr/bin/python3.11 /usr/local/bin/python \
+     && ln -sf /usr/bin/python3.11 /usr/local/bin/python3
+
+ RUN useradd -ms /bin/bash user
+ USER user
+ ENV PATH="/home/user/.local/bin:${PATH}"
+ WORKDIR /home/user/app
+
+ COPY --chown=user:user space/training/requirements.txt /tmp/requirements.txt
+ RUN python -m pip install --upgrade pip && \
+     python -m pip install --user -r /tmp/requirements.txt
+
+ COPY --chown=user:user . /home/user/app
+
+ EXPOSE 7860
+
+ CMD ["python", "-m", "uvicorn", "space.training.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,64 @@
  ---
- title: Cernenv Trainer
- emoji: 🚀
+ title: CERNenv Trainer
+ emoji: ⚛️
  colorFrom: indigo
- colorTo: red
+ colorTo: pink
  sdk: docker
+ suggested_hardware: a100-large
+ suggested_storage: medium
  pinned: false
+ license: bsd-3-clause
+ short_description: GRPO trainer for CERNenv (Unsloth + LoRA, A100)
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # CERNenv Trainer (Hugging Face Space, A100)
+
+ Fine-tunes a small instruction-tuned LLM (Large Language Model) to act as
+ an LHC (Large Hadron Collider) physicist inside the **CERNenv** OpenEnv
+ environment using **GRPO** (Group-Relative Policy Optimization),
+ **Unsloth**, and **LoRA** (Low-Rank Adaptation).
+
+ ## Hardware
+ - Recommended: **A100 large (80 GB)**
+ - Minimum: T4 / L4 (uses a smaller model and fewer episodes)
+
+ ## Required Space secrets
+ | Secret | Purpose |
+ | --- | --- |
+ | `HF_TOKEN` | Hugging Face token with `write` access for model push |
+ | `HF_USERNAME` | Hub username, used as the default model-repo owner |
+
+ ## Optional environment variables
+ | Variable | Default | Notes |
+ | --- | --- | --- |
+ | `MODEL_NAME` | `unsloth/Qwen2.5-3B-Instruct` | Any chat model Unsloth supports |
+ | `TOTAL_EPISODES` | `400` | Prompts × generations rollouts |
+ | `DIFFICULTY` | `easy` | `easy` / `medium` / `hard` |
+ | `MAX_STEPS` | `18` | Steps per episode |
+ | `NUM_GENERATIONS` | `4` | GRPO group size |
+ | `OUTPUT_DIR` | `runs/unsloth-grpo` | LoRA adapter output |
+ | `PUSH_REPO` | `${HF_USERNAME}/cernenv-grpo-qwen2.5-3b` | Hub repo for adapters |
+ | `AUTOSTART` | `0` | Set to `1` to start training on Space boot |
+
+ ## How to use
+
+ This Space exposes a tiny FastAPI control panel:
+ - `GET /` — status + current run info
+ - `POST /train` — start / restart a training run
+ - `GET /logs` — live tail of `training.log`
+ - `GET /metrics` — reward + success-rate snapshots
+
+ Click **"Start training"** in the UI, or set `AUTOSTART=1` in the Space variables to kick off training immediately on boot.
+
+ When training finishes, the LoRA adapters are pushed to `PUSH_REPO`.
+
+ ## Local equivalent
+
+ The same training run is reproducible locally with:
+
+ ```bash
+ PYTHONPATH=. python -m training.training_unsloth \
+   --model_name unsloth/Qwen2.5-3B-Instruct \
+   --difficulty easy --total_episodes 400 --max_steps 18 \
+   --output_dir runs/unsloth-grpo
+ ```
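A quick way to drive the control panel remotely is a minimal `requests` sketch; the Space URL below is a placeholder, and only the endpoints themselves come from the README above:

```python
# Hypothetical smoke test for the control-panel endpoints listed above.
# BASE is a placeholder; substitute your own Space URL.
import requests

BASE = "https://<user>-cernenv-trainer.hf.space"

requests.post(f"{BASE}/train")                 # start / restart a training run
print(requests.get(f"{BASE}/logs").text)       # live tail of training.log
print(requests.get(f"{BASE}/metrics").json())  # reward + success-rate snapshots
```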
[External] Apr ‘26 OpenEnv Hackathon Themes & Judging Criteria.txt ADDED
@@ -0,0 +1,190 @@
+ Theme #1 - Multi-Agent Interactions
+ Environments for this theme involve cooperation, competition, negotiation, and coalition formation. Learning from these environments will enable agents to model the beliefs and incentives of others in partially observable settings. This drives theory-of-mind reasoning and emergent strategic behavior.
+ Expected Outcome: an environment that can be used to train multi-agent task handling in an LLM
+ Example environments: Market simulations, compute-allocation negotiations, collaborative puzzle worlds, mixed cooperative/competitive strategy games.
+ Theme #2 - (Super) Long-Horizon Planning & Instruction Following
+ You will build environments that require deep, multi-step reasoning with sparse or delayed rewards. After using these environments, the goal is to enable agents to decompose goals, track state over extended trajectories, and recover from early mistakes. The aim is to push beyond shallow next-token reasoning toward structured planning and durable internal representations.
+ Expected Outcome: an environment that can capture and improve LLM behaviour on challenging long-horizon tasks that need long-running sessions beyond context-memory limits.
+ Example environments: (Think of OpenClaw workflows with multi-turn tasks.) Research-planning simulators, large-scale codebase refactoring tasks, strategic resource management worlds, long-horizon logistics optimization, extremely complicated long-horizon instruction following (e.g., 300 instructions scattered around).
+ Theme #3 - World Modeling
+ #3.1 Professional Tasks
+ Here you will develop environments that require real interaction with tools, APIs, or dynamic systems, where the model is expected to do real hard work instead of exploiting shortcuts to arrive at the desired outcome. Learning from these environments will enable agents to maintain consistent internal state, update beliefs based on outcomes, and orchestrate multi-step workflows. The goal is to strengthen causal reasoning and persistent world models.
+ Expected Outcome: an environment capturing the nuances of a defined partially observable world and improving LLM interaction with it
+ Example environments: Dynamic browser/API ecosystems, enterprise applications, scientific workflow loops (papers → code → experiments), economic simulations with feedback, tool-discovery benchmarks.
+
+ #3.2 Personalized Tasks
+ Here we will develop an environment that offers real personalized task handling: imagine replying to personal messages, handling dinner conflicts caused by work commitments, or replying to tough emails. Think of any personal-assistant task.
+
+ Expected Outcome: An environment that gives the model a realistic simulation of handling personal tasks and conflicts and managing them as delegations
+
+ Example environments: Executive-assistant meeting planning, dinner and drive planning, email and message replying, shopping, etc.
+
+ Theme #4 - Self-Improvement
+ The focus here is to create environments where agents can learn to generate new challenges, escalate difficulty, and improve through self-play or adaptive curricula. Rather than optimizing fixed tasks, the goal is for agents to learn to drive their own capability growth. The objective is recursive skill amplification.
+ Expected Outcome: an environment for improving self-play of an LLM over a defined set of tasks
+ Example environments: Self-play negotiation arenas, auto-generated math/proof tasks, evolving coding competitions, adaptive RL curricula.
+
+ Theme #5: Wild Card - Impress Us!
+ We do not want to limit your focus if your idea doesn't fit the boxes above. We want and WILL reward out-of-the-box tasks. Please be creative, but remember that submissions should meaningfully add value to LLM training on a certain task.
+ Guidelines for Problem Statement
+ * It is NOT mandatory to choose the same problem statement as Round 1. Only choose the same problem statement if it aligns with the Hackathon themes provided above.
+ * You can start working on your problem statement once you have finalized it. Post-training can be done onsite on the 25th & 26th, when you receive compute credits for Hugging Face.
+ * Before the onsite, we suggest you work on building the environment, agent behaviours, and reward model, and evaluate whether your work aligns with the judging criteria given below.
+
+ Judging Criteria
+ Minimum requirements:
+ * Usage of OpenEnv (latest release)
+ * Show a minimal training script for your environment using Unsloth or HF TRL in Colab
+ * Write a mini-blog on Hugging Face or a mini-video on YouTube (<2 minutes) about your submission
+ * Your OpenEnv-compliant environment should be hosted on Hugging Face Spaces.
+
+ Judging Overview
+ * Evaluation: Teams will be scored based on the following criteria:
+ 1. Environment Innovation (40%): Is the environment novel, creative, or challenging? Does it meaningfully test the agent's behavior?
+ 2. Storytelling (30%): Does the team clearly explain the problem, environment, and agent behavior? Is the demo engaging and easy to follow?
+ 3. Showing Improvement in Rewards (20%): Does the demo provide observable evidence of training progress (reward curves, metrics, or before/after behavior)?
+ 4. Reward and Training Script/Pipeline Setup (10%): Is the reward logic coherent, and does the pipeline produce meaningful improvement in the agent's inference (how it acts in the environment)?
+
+ OpenEnv Hackathon - What Judges Look For
+
+ This guide tells you what makes a strong submission for the OpenEnv Hackathon (India 2026). Read it before you start building, and again before you submit.
+
+ For the list of themes and example problems, refer to the top sections.
+
+ NOTE: Please remember, only one submission per team. If you have multiple ideas, pick the best one and go for it. Please make sure that the URL of your environment is submitted, as judges will pull the environment from that URL to evaluate it. Changes or commits after the submission deadline will not be considered.
+
+ TL;DR
+
+ Build an environment that an LLM could actually be trained on to get measurably better at something interesting. Then show that training. Then tell the story.
+
+ A messy but ambitious environment with real training evidence beats a polished but boring one. Pick a problem that excites you (that energy comes through in the pitch).
+
+ Judging Criteria
+
+ Criterion: Environment Innovation
+ Weight: 40%
+ What it means:
+ Is the environment novel, creative, or genuinely challenging? Does it meaningfully test agent behavior in a way that hasn't been done before?
+
+ Criterion: Storytelling & Presentation
+ Weight: 30%
+ What it means:
+ Can you clearly explain the problem, the environment, and what the agent learned? Is the demo engaging and easy to follow for a non-technical audience?
+
+ Criterion: Showing Improvement in Rewards
+ Weight: 20%
+ What it means:
+ Is there observable evidence of training progress? Reward curves, before/after behavior, comparison against a baseline -- anything that proves the agent learned something.
+
+ Criterion: Reward & Training Pipeline
+ Weight: 10%
+ What it means:
+ Is the reward logic coherent? Does the pipeline produce meaningful improvement in the trained agent's behavior?
+
+ Minimum Submission Requirements
+
+ NOTE: These are non-negotiable. Submissions missing any of these are at a serious disadvantage.
+ * Use OpenEnv (latest release). Build on top of the framework; don't reinvent the wheel.
+ * A working training script using Unsloth or Hugging Face TRL, ideally as a Colab notebook so judges can re-run it.
+ * Evidence that you actually trained; at minimum, loss and reward plots from a real run.
+ * A short writeup: a mini-blog on Hugging Face, a <2-minute video on YouTube explaining what your environment does and what you trained, or a short slide deck. Please make sure that all materials are linked from your README file so that judges can access them easily.
+ * Push your environment to a Hugging Face Space so it's discoverable and runnable.
+ * A README that motivates the problem, explains how the env works, and shows results.
+ * The README should link to the environment in the Hugging Face Space. It should also include references to any additional materials (e.g. videos, blog posts, slides, presentations) that you want to share.
+ * Please do not include big video files in your env submission on HF Hub, as we would like each env to stay small (use URLs as reference links to additional materials).
+
+ What Makes a Submission Stand Out
+
+ Pick an ambitious, original problem
+ The themes (problems) are deliberately open. Use them as launching pads, not boxes. Judges have seen a lot of chess, snake, tic-tac-toe, and grid-world clones. To score well on innovation, you need a genuinely fresh angle. Some questions to ask yourself:
+ * Does this environment exist to teach an LLM something it currently can't do well?
+ * Is the domain underexplored in RL/LLM training?
+ * Could a researcher write a paper about training on this?
+
+ Design a reward signal that actually teaches
+ A great environment has a reward function that:
+ * Provides a rich, informative signal (not just 0/1 at the end)
+ * Captures something hard to measure in a clever way
+ * Uses OpenEnv's Rubric system thoughtfully (composable rubrics > monolithic scoring)
+ * Is hard to game; an agent that exploits the reward without solving the task should not get high scores
+
+ Show real training, end to end
+ The bar isn't "training script exists." The bar is "training script runs against the environment, the agent learns, and you can show it." Concretely:
+ * Your training loop should connect to your environment (not a static dataset)
+ * Train long enough that the curves mean something
+ * Compare a trained agent vs. a random/untrained baseline; quantitative and/or qualitative
+ * Include the plots and numbers in your README and writeup
+
+ Make your plots readable
+ Reviewers spend seconds, not minutes, on each plot. Help them out:
+ * Label both axes (e.g. "training step" / "episode" on x, "reward" / "loss" on y) and include units where they apply
+ * Save plots as .png or .jpg and commit them to the repo; don't leave them only in a Colab cell or a deleted Wandb run (if you ran via Wandb, please include the link to the specific run)
+ * Embed the key plots in your README with a one-line caption explaining what each one shows. If you have multiple runs (baseline vs. trained, ablations, etc.), put them on the same axes so the comparison is obvious
+
+ Tell a story, not an API doc
+ Your README, blog, and pitch should answer:
+ 1. Problem: what capability gap or interesting domain are you targeting?
+ 2. Environment: what does the agent see, do, and get rewarded for?
+ 3. Results: what changed after training? Show it.
+ 4. Why it matters: who would care, and why?
+
+ A reviewer should be able to read your README in 3–5 minutes and want to try your environment.
+
+ NOTE: If you have a video, HF post, or anything else interesting, please make sure that it's linked from your README.
+
+ Engineer it cleanly (table stakes)
+ Engineering quality matters less than ambition, but sloppy work hurts. Make sure you:
+ * Use OpenEnv's Environment / MCPEnvironment base classes properly
+ * Respect the client / server separation (clients should never import server internals)
+ * Follow the standard Gym-style API (reset, step, state)
+ * Have a valid openenv.yaml manifest
+ * Don't use reserved tool names (reset, step, state, close) for MCP tools
+
+ Final Note
+
+ Judges are looking for environments that push the frontier of what we can train LLMs to do. Be ambitious. Pick a problem you find genuinely interesting; that almost always produces better work than chasing what you think judges want. Good luck.
[External] Meta OpenEnv Hackathon Participant Help Guide.txt ADDED
@@ -0,0 +1,291 @@
+ Hackathon Self-Serve Guide: Build an RL Environment, Train an LLM, Ship a Demo
+ 0) What you are building
+ The core idea is not just to fine-tune a text model, but to build a specialized LLM system that can act inside an environment, get feedback, and improve through reinforcement learning. The practical stack discussed here is:
+ Environment → verifier/reward functions → TRL trainer → Unsloth for efficiency → deployment on OpenEnv / Spaces.
+ A strong project usually looks like one of the hackathon themes; please refer to [External] Apr ‘26 OpenEnv Hackathon Themes for theme guidelines on selecting & forming problem statements.
+ 1) Start with the right project idea
+ Pick a task that has all three of these properties:
+ 1. The model can act step by step
+ 2. You can verify success programmatically
+ 3. The task is hard enough to be interesting, but not so hard that the model never succeeds
+ This last point matters a lot. RL only works if the probability of getting a good answer is greater than zero. If your task is so hard that the model never gets any reward, you will burn compute and learn nothing.
+ A useful rule: prefer tasks with crisp verification over tasks that only “look good” to a human. RL gets easier when the reward is objective.
+ 2) Understand the minimum RL loop before you build
+ At a high level, your loop is:
+ 1. Give the model a prompt
+ 2. Let it generate an action, strategy, answer, or code
+ 3. Execute that output in an environment or verifier
+ 4. Convert the result into a reward
+ 5. Update the model so higher-reward behavior becomes more likely
+ That is the practical mental model for RL here. The system samples many outputs, scores them, and shifts probability mass away from bad outputs and toward better ones.
+ One especially useful framing is that RL is like a more efficient version of repeated in-context improvement. Instead of repeatedly stuffing previous examples into the context, you let backpropagation store what worked into the weights.
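A minimal sketch of that five-step loop in Python; `generate`, `run_in_env`, and `update_policy` are hypothetical stand-ins for your sampler, your environment or verifier, and your trainer update:

```python
# Sketch of the five-step RL loop described above. The three helpers are
# hypothetical stand-ins: generate() is your sampler, run_in_env() your
# environment/verifier, update_policy() one trainer update (e.g. a GRPO step).

def generate(policy, prompt: str) -> str: ...
def run_in_env(prompt: str, completion: str) -> float: ...
def update_policy(policy, batch) -> None: ...

def rl_training_loop(policy, prompts, group_size: int = 4) -> None:
    for _ in range(100):                     # training iterations
        batch = []
        for prompt in prompts:
            # 1-2) prompt the model and sample a group of candidate outputs
            completions = [generate(policy, prompt) for _ in range(group_size)]
            # 3-4) execute each output and convert the result into a reward
            rewards = [run_in_env(prompt, c) for c in completions]
            batch.append((prompt, completions, rewards))
        # 5) shift probability mass toward higher-reward completions
        update_policy(policy, batch)
```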
+ 3) Decide whether you need SFT first
+ Use this simple rule:
+ * If you have a lot of good data, use SFT
+ * If you do not have data but can verify outputs, use RL
+ * In many practical cases, do a little SFT first, then RL
+ Why this matters:
+ * SFT is generally more sample-efficient
+ * RL is useful when you can test outcomes but cannot cheaply author ideal traces
+ * RL often needs some warm start, formatting priming, or easy tasks first so that good rollouts happen at all
+ For hackathon teams, the best path is usually:
+ 1. Start from a capable base/instruct model
+ 2. Add light formatting or task scaffolding if needed
+ 3. Use RL for improvement, not as magic from scratch
+ 4) Design the environment before you design the trainer
+ Treat the environment as a first-class artifact. It should define:
+ * reset(): start a fresh episode
+ * step(action): apply an action and return the next result
+ * state() / observation: what the agent sees
+ * reward: what counts as progress or success
+ OpenEnv standardizes this so the same training code can work across many environments, instead of every team inventing a different API. That is one of the main reasons to use it in a hackathon.
+ Think about your environment in this order:
+ 1. What does the agent observe?
+ 2. What actions can it take?
+ 3. What ends an episode?
+ 4. How do you compute reward?
+ 5. How do you stop abuse, infinite loops, or cheating?
+ 5) Build the environment using OpenEnv
+ The intended workflow is to bootstrap an environment skeleton and then fill in the behavior. OpenEnv’s CLI creates the scaffolding for you. The environment is implemented as a Python package and exposed via a FastAPI app.
+ Your implementation typically defines (see the sketch at the end of this section):
+ * action dataclass
+ * observation dataclass
+ * state representation
+ * environment methods like reset and step
+ * FastAPI wrapper / client-server interface
+ That gives you a clean separation:
+ * the environment handles world dynamics and scoring,
+ * the trainer handles optimization,
+ * and the model just learns to act inside the interface.
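A framework-agnostic sketch of those pieces, assuming a toy word-guessing task; the class and field names here are placeholders, not the exact OpenEnv base-class API (use the `openenv init` scaffolding for that):

```python
# Placeholder sketch of the action/observation/env trio listed above.
# Not the real OpenEnv base classes; only the reset/step shape matters.
from dataclasses import dataclass

@dataclass
class GuessAction:
    text: str                       # what the model outputs each turn

@dataclass
class GuessObservation:
    feedback: str                   # what the agent sees next turn
    reward: float = 0.0
    done: bool = False

class ToyEnv:
    def reset(self) -> GuessObservation:
        self.secret, self.turns = "openenv", 0
        return GuessObservation(feedback="guess the word")

    def step(self, action: GuessAction) -> GuessObservation:
        self.turns += 1
        hit = action.text.strip().lower() == self.secret
        return GuessObservation(
            feedback="correct" if hit else "wrong",
            reward=1.0 if hit else 0.0,
            done=hit or self.turns >= 6,   # episode-end condition
        )
```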
+ 6) Keep the task simple at first
+ Do not begin with your hardest benchmark. Start with the easiest version of your environment that still proves the concept. This is where curriculum learning helps.
+ A good progression:
+ 1. easy tasks with short horizons,
+ 2. medium tasks with a little more branching,
+ 3. harder tasks only after the model starts getting non-zero reward.
+ The principle is simple: make success possible early. If the model never sees successful trajectories, learning stalls.
+ 7) Design rewards carefully
+ Your reward function is your task specification. If it is weak, incomplete, or easy to exploit, the model will optimize the wrong thing very efficiently.
+ A strong reward design usually includes multiple components, for example:
+ * execution success,
+ * correctness,
+ * format compliance,
+ * timeouts,
+ * resource usage,
+ * safety constraints,
+ * and anti-cheating checks.
+ One explicit recommendation was to use multiple independent reward functions, not just one. If you only have a single reward signal, it is easier for the model to hack it. Multiple independent checks reduce that risk.
+ For example, for a coding environment (sketched after this list):
+ * reward passing tests,
+ * penalize timeouts,
+ * reward format compliance,
+ * reject use of forbidden globals,
+ * and separately verify the function contract.
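A sketch of that multi-component design for a coding task; `run_tests` and `uses_forbidden_globals` are hypothetical helpers standing in for your test harness and static checks:

```python
# Multiple independent reward functions, so no single signal can be gamed.
# run_tests() and uses_forbidden_globals() are hypothetical helpers.

def run_tests(code: str) -> bool: ...            # your test harness
def uses_forbidden_globals(code: str) -> bool: ...  # your static check

def reward_tests_pass(code: str) -> float:
    return 1.0 if run_tests(code) else 0.0

def reward_format(completion: str) -> float:
    # e.g. the completion must wrap its code in a ```python fence
    return 0.2 if completion.strip().startswith("```python") else 0.0

def penalty_forbidden_globals(code: str) -> float:
    return -1.0 if uses_forbidden_globals(code) else 0.0

def total_reward(completion: str, code: str) -> float:
    # sum independent components; monitor each column during training
    return (reward_tests_pass(code)
            + reward_format(completion)
            + penalty_forbidden_globals(code))
```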
+ 8) Protect yourself against reward hacking
+ Reward hacking is one of the biggest practical failure modes. The model may learn shortcuts that maximize your reward without solving the real task. Examples mentioned include:
+ * editing timers,
+ * caching results,
+ * abusing globals,
+ * mutating protected state,
+ * or exploiting environment bugs.
+ What to do:
+ 1. Use multiple independent reward functions
+ 2. Lock down execution where possible
+ 3. Add time limits
+ 4. Avoid unrestricted global state
+ 5. Sample outputs frequently and inspect them
+ 6. Terminate or roll back runs if behavior drifts badly
+ A particularly practical recommendation was to use a locked-down function or restricted execution approach so the model cannot rely on undeclared globals or hidden cached state. A minimal sketch follows below.
+ Also, do not just let training run forever without checking generations. Periodic human inspection is still necessary.
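One minimal sketch of that locked-down execution idea, assuming candidate code assigns its answer to a `result` variable; a real sandbox needs much more (filesystem/network isolation, resource limits), so this only shows the shape:

```python
# Run candidate code with a tiny builtins allowlist and a hard wall-clock
# limit, in a subprocess so a hang can be terminated. On spawn-based
# platforms this must live in a module guarded by `if __name__ == "__main__"`.
import multiprocessing

def _worker(code: str, q) -> None:
    safe_globals = {"__builtins__": {"range": range, "len": len}}
    try:
        exec(code, safe_globals)              # no undeclared globals available
        q.put(("ok", safe_globals.get("result")))
    except Exception as exc:
        q.put(("error", repr(exc)))

def run_locked_down(code: str, timeout_s: float = 2.0):
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=_worker, args=(code, q))
    p.start()
    p.join(timeout_s)                         # enforce the time limit
    if p.is_alive():
        p.terminate()                         # timed out: penalize upstream
        return ("timeout", None)
    return q.get() if not q.empty() else ("error", "no result")
```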
+ 9) Use process-aware feedback when you can
+ Naively assigning the same final reward to every token is inefficient. If possible, use richer supervision that distinguishes good intermediate steps from bad ones. That is the idea behind process supervision.
+ In practice, this can be approximated by:
+ * line-by-line checks,
+ * step-level verifiers,
+ * program trace analysis,
+ * or LLM-as-a-judge for intermediate reasoning.
+ But be careful: LLM-as-a-judge can itself be gamed. Use it as one signal, not the only signal.
+ For a hackathon, outcome-based verification plus a few lightweight process checks is usually the sweet spot.
+ 10) Pick the right training stack
+ The intended stack here is:
+ * TRL for RL training algorithms
+ * Unsloth to make RL training and inference more efficient
+ * OpenEnv to standardize environment interaction
+ This combination works because:
+ * OpenEnv gives you a common environment interface
+ * TRL gives you RL trainers like GRPO
+ * Unsloth reduces memory use and improves efficiency on top of TRL
+ One of the practical examples used the same prompt repeated many times, routed through an environment, with TRL driving training and Unsloth helping with performance.
+ 11) Prefer GRPO / RLVR-style training for verifiable tasks
+ The RL setup discussed here leans toward RL with verifiable rewards:
+ * instead of a learned reward model,
+ * use a verifier, test harness, regex check, executor, or environment.
+ GRPO was described as a more efficient evolution relative to older PPO-style setups, especially by simplifying away parts like the value model.
+ For hackathon purposes, the key practical takeaway is:
+ * if the task is verifiable,
+ * build the verifier first,
+ * then plug that verifier into RL training (see the sketch below).
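A minimal sketch of plugging a verifier into TRL's `GRPOTrainer`; the toy `verify` check and the prompt dataset are placeholders, and you should confirm the exact `GRPOTrainer`/`GRPOConfig` options against the TRL release you install:

```python
# Verifier-as-reward with TRL's GRPO trainer (hedged sketch).
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer

def verify(completion: str) -> bool:
    # Toy placeholder check; replace with tests / an environment episode.
    return "def " in completion

def verifier_reward(completions, **kwargs):
    # One scalar per sampled completion in each GRPO group.
    return [1.0 if verify(c) else 0.0 for c in completions]

dataset = Dataset.from_dict({"prompt": ["Write a Python function that ..."] * 64})

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",   # any small instruct model
    reward_funcs=verifier_reward,
    args=GRPOConfig(output_dir="grpo-out", num_generations=4),
    train_dataset=dataset,
)
trainer.train()
```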
+ 12) Keep inference fast
+ One important point: in RL for LLMs, inference can dominate total runtime. Over time, rollout generation often becomes the bottleneck, not the optimizer step.
+ That means your project speed depends heavily on:
+ * fast sampling,
+ * tight environment loops,
+ * low-overhead execution,
+ * and efficient model runtime.
+ This is one reason Unsloth matters in the stack, and another reason to avoid overly heavy environments early in the hackathon.
+ 13) Deploy your environment early
+ OpenEnv environments are designed to be deployed as Hugging Face Spaces, which provide:
+ * a running server,
+ * a Git repository,
+ * and a container registry.
+ That gives you several ways to work:
+ * interact with the remote Space directly,
+ * install the client code from the repo,
+ * pull and run the container locally,
+ * or run the FastAPI app locally via Python/Uvicorn.
+ Why this is good for a hackathon:
+ * one shared source of truth,
+ * easier collaboration,
+ * easier demos,
+ * easier switching between local and remote execution.
+ A good habit is to deploy an early version of the environment before training seriously. That catches API and packaging issues early.
+ 14) Scale only after the environment is stable
+ There was a dedicated tutorial flow around:
+ 1. environment,
+ 2. deployment,
+ 3. scaling,
+ 4. training with TRL and Wordle.
+ Follow the same order.
+ Do not start with scale. First confirm:
+ * reset works,
+ * step works,
+ * rewards are sensible,
+ * timeouts work,
+ * logs are visible,
+ * and the environment can be run locally and remotely.
+ Only then:
+ * increase batch sizes,
+ * duplicate prompts or tasks,
+ * expand task diversity,
+ * and benchmark throughput.
+ 15) Monitor the right things during training
+ Do not watch only one scalar. Monitor:
+ * overall reward,
+ * individual reward-function columns,
+ * success indicators,
+ * timeout frequency,
+ * and generated strategies over time.
+ A very concrete suggestion was:
+ * watch whether the reward is going up,
+ * and separately watch critical columns like “function works.”
+ Also inspect actual generations during training. A rising reward is not enough if the model is learning to exploit bugs.
+ 16) Save models correctly
+ If you use QLoRA / LoRA-style training, be careful when saving. One explicit warning was:
+ Do not upcast a 4-bit model to 16-bit and then merge the LoRA weights naively. That can badly damage model quality. Instead, use the proper merged-save path, or use the adapters directly (a sketch follows below).
+ For participants, that means:
+ * keep your training save path simple,
+ * test post-training inference immediately,
+ * and do not leave export until the end.
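A hedged sketch of the two safe export paths, assuming `model` and `tokenizer` come from an earlier Unsloth `FastLanguageModel` training run; the merged-save helper name follows Unsloth's documented API, but verify it against the version you installed:

```python
# `model` and `tokenizer` are assumed to exist from your Unsloth training run.

# Option 1: save only the LoRA adapters (small, safest default).
model.save_pretrained("runs/lora_adapters")
tokenizer.save_pretrained("runs/lora_adapters")

# Option 2: use Unsloth's merged-save helper instead of manually upcasting
# a 4-bit base model and merging (which degrades quality).
model.save_pretrained_merged(
    "runs/merged_16bit", tokenizer, save_method="merged_16bit",
)

# Then immediately sanity-check inference on whichever artifact you exported.
```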
+ 17) How to structure your team over the hackathon
+ A very effective team split is:
+ Person A: Environment
+ * builds reset/step/state
+ * adds timeouts and safety constraints
+ * makes local and remote execution work
+ Person B: Verifier / Rewards
+ * writes multiple reward functions
+ * adds anti-hacking checks
+ * makes failure cases visible
+ Person C: Training
+ * sets up TRL + Unsloth
+ * runs experiments
+ * tracks metrics and generations
+ Person D: Demo / Product
+ * prepares the Space demo
+ * creates a simple interface
+ * records examples and final benchmarks
+ This split matches the way the stack naturally decomposes in practice.
+ 18) A practical 1-day execution plan
+ Phase 1: Pick a narrow task
+ Choose a small, verifiable environment. Avoid huge long-horizon tasks at first.
+ Phase 2: Build the environment
+ Use OpenEnv init, implement reset/step/state, and get a local loop working.
+ Phase 3: Build rewards
+ Add at least 2–4 independent reward checks, plus timeout and anti-cheat logic.
+ Phase 4: Deploy
+ Push to a Space or run locally via container/Uvicorn so teammates can use the same environment.
+ Phase 5: Train small
+ Run a tiny TRL + Unsloth experiment first. Look at outputs, not just metrics.
+ Phase 6: Inspect for hacking
+ Sample generations. Check for globals, hacks, environment abuse, or suspicious shortcuts.
+ Phase 7: Add curriculum
+ If the model gets zero reward too often, simplify tasks or add easier start states.
+ Phase 8: Train bigger
+ Only after the loop is stable should you increase scale, batch size, or environment diversity.
+ Phase 9: Save and demo
+ Export the trained model correctly, test inference, and show before/after behavior.
+ 19) What judges or reviewers will likely find compelling
+ The strongest hackathon projects usually show:
+ * a clear environment design,
+ * objective reward functions,
+ * evidence that the model improved,
+ * prevention against reward hacking,
+ * a reproducible deployment story,
+ * and a sharp demo.
+ A simple but strong demo format is:
+ 1. baseline model attempt,
+ 2. reward/verifier output,
+ 3. trained model attempt,
+ 4. measurable improvement,
+ 5. short explanation of safeguards.
+ 20) Suggested problem statement theme directions
+ Please refer to [External] Apr ‘26 OpenEnv Hackathon Themes.
+ 21) Common mistakes to avoid
+ * Picking a task so hard that success probability is zero
+ * Using only one reward function
+ * Not checking for reward hacking
+ * Training before the environment is stable
+ * Relying only on average reward and not inspecting outputs
+ * Forgetting timeouts and sandbox limits
+ * Saving LoRA/QLoRA models incorrectly
+
+ 22) Learning Resources
+
+ (Recommended) RL Environment Lecture Chapters:
+ RL Mega Lecture
+
+ Module 1: Why OpenEnv? (~7 min)
+ ▸ Workshop 8:02–15:05 — https://www.youtube.com/watch?v=1jU05MlENOI&t=482s
+ ▸ Sanyam: RL loop, fragmented env APIs, OpenEnv as universal interface, Gymnasium spec + Docker
+ ▸ Alt: Mega Lecture 40:01–46:00 — https://www.youtube.com/watch?v=Jew4lhAiqnw&t=2401s
+
+ Module 2: Using Existing Envs (~7.5 min)
+ ▸ Workshop 35:33–43:05 — https://www.youtube.com/watch?v=1jU05MlENOI&t=2133s
+ ▸ Ben: Hub org, env collections, 3 Space interfaces (server/repo/registry), from_hub
+ ▸ Alt: Mega Lecture 1:24:11–1:30:00 — https://www.youtube.com/watch?v=Jew4lhAiqnw&t=5051s
+
+ Module 3: Deploying Envs (~9 min)
+ ▸ Mega Lecture 1:30:00–1:39:07 — https://www.youtube.com/watch?v=Jew4lhAiqnw&t=5400s
+ ▸ Ben: live openenv init, scaffold, running locally, openenv push, Docker run from Space
+ ▸ Alt: Workshop 43:05–48:30 — https://www.youtube.com/watch?v=1jU05MlENOI&t=2585s
+
+ Module 4: Building Your Own (~6.5 min)
+ ▸ Workshop 43:45–50:20 — https://www.youtube.com/watch?v=1jU05MlENOI&t=2625s
+ ▸ Ben: scaffold files, business logic (reset/step), models, client, publishing
+ ▸ Alt: Mega Lecture 1:33:30–1:39:07 — https://www.youtube.com/watch?v=Jew4lhAiqnw&t=5610s
+
+ Module 5: Training + TRL (~14 min)
+ ▸ Mega Lecture 1:53:20–2:07:12 — https://www.youtube.com/watch?v=Jew4lhAiqnw&t=6800s
+ ▸ Lewis: Wordle GRPO walkthrough — rollout function, reward shaping, GRPOTrainer, live training
+ ▸ Alt: Workshop 22:24–34:12 — https://www.youtube.com/watch?v=1jU05MlENOI&t=1344s
client.py ADDED
@@ -0,0 +1,37 @@
+ """WebSocket client for CERNenv.
+
+ Wraps OpenEnv's ``EnvClient`` so users can ``await client.reset()`` and
+ ``await client.step(action)`` against a running CERNenv server.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Dict
+
+ from openenv.core import EnvClient
+ from openenv.core.client_types import StepResult
+
+ from models import CollisionObservation, ExperimentAction
+ from server.environment import CernState
+
+
+ class CernEnv(EnvClient[ExperimentAction, CollisionObservation, CernState]):
+     """Async WebSocket client for the CERN environment."""
+
+     def _step_payload(self, action: ExperimentAction) -> Dict[str, Any]:
+         return action.model_dump()
+
+     def _parse_result(self, payload: Dict[str, Any]) -> StepResult[CollisionObservation]:
+         obs_data = payload.get("observation", payload)
+         observation = CollisionObservation(**obs_data)
+         return StepResult(
+             observation=observation,
+             reward=payload.get("reward", observation.reward),
+             done=payload.get("done", observation.done),
+         )
+
+     def _parse_state(self, payload: Dict[str, Any]) -> CernState:
+         return CernState(**payload)
+
+
+ __all__ = ["CernEnv"]
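A hypothetical driver for this client: only the awaitable `reset()`/`step()` usage comes from the docstring above, while the constructor argument is an assumption to check against OpenEnv's `EnvClient` signature:

```python
# Hypothetical CernEnv driver. The base_url kwarg is an assumption; the
# reset()/step() calls follow the client docstring above.
import asyncio

from client import CernEnv
from models import ActionType, ExperimentAction

async def main() -> None:
    client = CernEnv(base_url="http://localhost:8000")  # assumed kwarg
    result = await client.reset()
    action = ExperimentAction(
        action_type=ActionType.CONFIGURE_BEAM,
        parameters={"beam_energy": "13TeV"},
        justification="Start at the standard Run-2 energy.",
    )
    result = await client.step(action)
    print(result.reward, result.done)

asyncio.run(main())
```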
models.py ADDED
@@ -0,0 +1,600 @@
+ """
+ Data models for CERNenv: an LHC (Large Hadron Collider) style particle
+ physics discovery POMDP (Partially Observable Markov Decision Process).
+
+ The agent is a Large Language Model (LLM) acting as a high-energy physicist.
+ Each step it picks one structured action (configure beams, allocate
+ luminosity, run a trigger, fit a spectrum, request systematics, submit a
+ discovery claim, etc.) and receives a noisy detector-style observation.
+ The latent particle and detector parameters are the hidden ground truth.
+ """
+
+ from __future__ import annotations
+
+ from enum import Enum
+ from typing import Any, Dict, List, Optional
+
+ from pydantic import BaseModel, Field
+
+ from openenv.core.env_server.types import Action, Observation
+
+
+ # ── Action vocabulary ───────────────────────────────────────────────────────
+
+
+ class ActionType(str, Enum):
+     # ── Beam & data acquisition (DAQ) ─────────────────────────────────
+     CONFIGURE_BEAM = "configure_beam"
+     ALLOCATE_LUMINOSITY = "allocate_luminosity"
+     SET_TRIGGER = "set_trigger"
+     COLLECT_COLLISIONS = "collect_collisions"
+
+     # ── Reconstruction & calibration ─────────────────────────────────
+     CALIBRATE_DETECTOR = "calibrate_detector"
+     RECONSTRUCT_TRACKS = "reconstruct_tracks"
+     SELECT_CHANNEL = "select_channel"
+
+     # ── Analysis ──────────────────────────────────────────────────────
+     BUILD_INVARIANT_MASS = "build_invariant_mass"
+     SUBTRACT_BACKGROUND = "subtract_background"
+     FIT_RESONANCE = "fit_resonance"
+     SCAN_BUMP = "scan_bump"
+     MEASURE_ANGULAR = "measure_angular"
+     ESTIMATE_SIGNIFICANCE = "estimate_significance"
+
+     # ── Systematics & meta ───────────────────────────────────────────
+     REQUEST_SYSTEMATICS = "request_systematics"
+     REQUEST_THEORY_REVIEW = "request_theory_review"
+
+     # ── Final ─────────────────────────────────────────────────────────
+     SUBMIT_DISCOVERY_CLAIM = "submit_discovery_claim"
+
+
+ DAQ_ACTIONS = frozenset({
+     ActionType.CONFIGURE_BEAM,
+     ActionType.ALLOCATE_LUMINOSITY,
+     ActionType.SET_TRIGGER,
+     ActionType.COLLECT_COLLISIONS,
+ })
+
+ RECO_ACTIONS = frozenset({
+     ActionType.CALIBRATE_DETECTOR,
+     ActionType.RECONSTRUCT_TRACKS,
+     ActionType.SELECT_CHANNEL,
+ })
+
+ ANALYSIS_ACTIONS = frozenset({
+     ActionType.BUILD_INVARIANT_MASS,
+     ActionType.SUBTRACT_BACKGROUND,
+     ActionType.FIT_RESONANCE,
+     ActionType.SCAN_BUMP,
+     ActionType.MEASURE_ANGULAR,
+     ActionType.ESTIMATE_SIGNIFICANCE,
+ })
+
+ META_ACTIONS = frozenset({
+     ActionType.REQUEST_SYSTEMATICS,
+     ActionType.REQUEST_THEORY_REVIEW,
+     ActionType.SUBMIT_DISCOVERY_CLAIM,
+ })
+
+
+ # ── Detector channels & physics primitives ────────────────────────────────
+
+
+ class DetectorChannel(str, Enum):
+     """Final-state decay channel the agent reconstructs in.
+
+     Channels affect signal acceptance and background composition. Picking a
+     channel where the true particle does not decay yields low signal yield
+     no matter how much luminosity is collected — this is intentional.
+     """
+
+     DIPHOTON = "diphoton"            # γγ
+     DILEPTON_EE = "dilepton_ee"      # e+ e-
+     DILEPTON_MUMU = "dilepton_mumu"  # μ+ μ-
+     DIJET = "dijet"                  # jj
+     FOUR_LEPTON = "four_lepton"      # 4ℓ
+     BB = "bb"                        # b b-bar
+
+
+ class TriggerType(str, Enum):
+     """Hardware-level event selection."""
+
+     LOW_PT = "low_pt"    # broad acceptance, lots of background
+     HIGH_PT = "high_pt"  # high-mass focus, lower QCD
+     DIPHOTON_HLT = "diphoton_hlt"
+     DILEPTON_HLT = "dilepton_hlt"
+     JET_HLT = "jet_hlt"
+
+
+ class BeamEnergy(str, Enum):
+     """LHC-style center-of-mass energies (TeV)."""
+
+     E_7 = "7TeV"
+     E_8 = "8TeV"
+     E_13 = "13TeV"
+     E_14 = "14TeV"
+
+
+ # ── Tool / instrument registry (for prompts and tool-fit reward) ──────────
+
+
+ class ToolCategory(str, Enum):
+     DAQ = "daq"
+     RECONSTRUCTION = "reconstruction"
+     CALIBRATION = "calibration"
+     ANALYSIS = "analysis"
+     STATISTICS = "statistics"
+     SYSTEMATICS = "systematics"
+
+
+ class ToolSpec(BaseModel):
+     name: str
+     category: ToolCategory
+     description: str = ""
+     typical_runtime_hours: float = 0.5
+     typical_cost_musd: float = 0.0  # in millions of USD (compute / beam time proxy)
+     requires_gpu: bool = False
+     channels: List[str] = Field(default_factory=list)
+
+
+ TOOL_REGISTRY: Dict[str, ToolSpec] = {
+     "ATLAS_HLT": ToolSpec(
+         name="ATLAS_HLT",
+         category=ToolCategory.DAQ,
+         description="ATLAS High-Level Trigger system for online event selection",
+         typical_runtime_hours=0.0,
+         channels=["diphoton", "dilepton_ee", "dilepton_mumu", "four_lepton", "dijet", "bb"],
+     ),
+     "CMS_HLT": ToolSpec(
+         name="CMS_HLT",
+         category=ToolCategory.DAQ,
+         description="CMS High-Level Trigger system",
+         typical_runtime_hours=0.0,
+         channels=["diphoton", "dilepton_ee", "dilepton_mumu", "four_lepton", "dijet", "bb"],
+     ),
+     "GEANT4": ToolSpec(
+         name="GEANT4",
+         category=ToolCategory.RECONSTRUCTION,
+         description="Detector simulation toolkit for full event reconstruction",
+         typical_runtime_hours=1.0,
+         typical_cost_musd=0.05,
+         requires_gpu=False,
+     ),
+     "Athena": ToolSpec(
+         name="Athena",
+         category=ToolCategory.RECONSTRUCTION,
+         description="ATLAS reconstruction framework",
+         typical_runtime_hours=0.8,
+     ),
+     "CMSSW": ToolSpec(
+         name="CMSSW",
+         category=ToolCategory.RECONSTRUCTION,
+         description="CMS reconstruction software",
+         typical_runtime_hours=0.8,
+     ),
+     "ECAL_calibration": ToolSpec(
+         name="ECAL_calibration",
+         category=ToolCategory.CALIBRATION,
+         description="Electromagnetic calorimeter energy-scale calibration",
+         typical_runtime_hours=0.3,
+     ),
+     "Tracker_alignment": ToolSpec(
+         name="Tracker_alignment",
+         category=ToolCategory.CALIBRATION,
+         description="Inner tracker alignment for momentum precision",
+         typical_runtime_hours=0.4,
+     ),
+     "ROOT_RooFit": ToolSpec(
+         name="ROOT_RooFit",
+         category=ToolCategory.ANALYSIS,
+         description="Maximum-likelihood spectrum fitting toolkit",
+         typical_runtime_hours=0.2,
+     ),
+     "MadGraph": ToolSpec(
+         name="MadGraph",
+         category=ToolCategory.ANALYSIS,
+         description="Matrix-element generator for signal+background templates",
+         typical_runtime_hours=1.5,
+         typical_cost_musd=0.02,
+     ),
+     "Pythia8": ToolSpec(
+         name="Pythia8",
+         category=ToolCategory.ANALYSIS,
+         description="Parton-shower and hadronisation generator",
+         typical_runtime_hours=0.5,
+     ),
+     "BumpHunter": ToolSpec(
+         name="BumpHunter",
+         category=ToolCategory.STATISTICS,
+         description="Sliding-window local-significance bump-hunting algorithm",
+         typical_runtime_hours=0.1,
+     ),
+     "CLs_fit": ToolSpec(
+         name="CLs_fit",
+         category=ToolCategory.STATISTICS,
+         description="Modified-frequentist CLs limits and significance",
+         typical_runtime_hours=0.1,
+     ),
+     "Asimov_significance": ToolSpec(
+         name="Asimov_significance",
+         category=ToolCategory.STATISTICS,
+         description="Asymptotic significance from Asimov dataset",
+         typical_runtime_hours=0.05,
+     ),
+     "JES_systematics": ToolSpec(
+         name="JES_systematics",
+         category=ToolCategory.SYSTEMATICS,
+         description="Jet energy-scale systematic study",
+         typical_runtime_hours=0.4,
+     ),
+     "Luminosity_calibration": ToolSpec(
+         name="Luminosity_calibration",
+         category=ToolCategory.SYSTEMATICS,
+         description="Van der Meer scan luminosity calibration",
+         typical_runtime_hours=0.3,
+     ),
+ }
+
+
+ # ── Action schema ──────────────────────────────────────────────────────────
+
+
+ class ExperimentAction(Action):
+     """One structured experimental step at the LHC."""
+
+     action_type: ActionType = Field(
+         ...,
+         description=(
+             "Discrete LHC pipeline step. The environment enforces physics "
+             "prerequisites: you cannot fit a spectrum before collecting data, "
+             "or claim a discovery before estimating significance."
+         ),
+     )
+     method: Optional[str] = Field(
+         None,
+         description=(
+             "Optional named instrument or framework (e.g. 'ROOT_RooFit', "
+             "'BumpHunter', 'Pythia8'). Affects cost, runtime, and tool-fit reward."
+         ),
+     )
+     parameters: Dict[str, Any] = Field(
+         default_factory=dict,
+         description=(
+             "Action-specific settings such as beam energy, integrated luminosity "
+             "(fb^-1), trigger selection, decay channel, mass window, fit model."
+         ),
+     )
+     justification: Optional[str] = Field(
+         None,
+         description="Short scientific rationale for picking this step now.",
+     )
+     confidence: float = Field(
+         0.5, ge=0.0, le=1.0,
+         description="Agent confidence in the chosen step.",
+     )
+
+
+ # ── Outputs ────────────────────────────────────────────────────────────────
+
+
+ class OutputType(str, Enum):
+     BEAM_CONFIG = "beam_config"
+     LUMINOSITY_LOG = "luminosity_log"
+     TRIGGER_REPORT = "trigger_report"
+     COLLISION_BATCH = "collision_batch"
+     CALIBRATION_REPORT = "calibration_report"
+     RECONSTRUCTION = "reconstruction"
+     CHANNEL_SELECTION = "channel_selection"
+     INVARIANT_MASS_HIST = "invariant_mass_hist"
+     BACKGROUND_SUBTRACTION = "background_subtraction"
+     FIT_RESULT = "fit_result"
+     BUMP_SCAN = "bump_scan"
+     ANGULAR_RESULT = "angular_result"
+     SIGNIFICANCE = "significance"
+     SYSTEMATICS_REPORT = "systematics_report"
+     THEORY_REVIEW = "theory_review"
+     DISCOVERY_CLAIM = "discovery_claim"
+     FAILURE_REPORT = "failure_report"
+
+
+ class IntermediateOutput(BaseModel):
+     """A single noisy detector or analysis artifact."""
+
+     output_type: OutputType
+     step_index: int
+     success: bool = True
+     quality_score: float = Field(1.0, ge=0.0, le=1.0)
+     summary: str = ""
+     data: Dict[str, Any] = Field(default_factory=dict)
+     uncertainty: float = Field(0.0, ge=0.0, le=1.0)
+     warnings: List[str] = Field(default_factory=list)
+     artifacts_available: List[str] = Field(default_factory=list)
+
+
+ # ── Observable state components ───────────────────────────────────────────
+
+
+ class ResourceUsage(BaseModel):
+     """Agent-visible resource counters."""
+
+     budget_used_musd: float = 0.0
+     budget_remaining_musd: float = 100.0
+     luminosity_used_fb: float = 0.0
+     luminosity_remaining_fb: float = 300.0
+     time_used_days: float = 0.0
+     time_remaining_days: float = 365.0
+     compute_hours_used: float = 0.0
+
+
+ class PipelineStepRecord(BaseModel):
+     step_index: int
+     action_type: ActionType
+     method: Optional[str] = None
+     parameters: Dict[str, Any] = Field(default_factory=dict)
+     output_summary: str = ""
+     output_type: OutputType
+     success: bool = True
+     quality_score: float = 1.0
+     cost_musd: float = 0.0
+     luminosity_cost_fb: float = 0.0
+     time_cost_days: float = 0.0
+
+
+ class PaperReference(BaseModel):
+     title: str
+     citation: Optional[str] = None
+     doi: Optional[str] = None
+     arxiv_id: Optional[str] = None
+     url: Optional[str] = None
+
+
+ class ExpectedFinding(BaseModel):
+     finding: str
+     category: str = "claim"
+     keywords: List[str] = Field(default_factory=list)
+
+
+ class TaskSpec(BaseModel):
+     """The physics question the agent is given for this episode."""
+
+     problem_statement: str = "Discover and characterise an unknown resonance."
+     target_collider: str = "LHC"
+     beam_energy_options: List[str] = Field(
+         default_factory=lambda: [e.value for e in BeamEnergy],
+     )
+     available_channels: List[str] = Field(
+         default_factory=lambda: [c.value for c in DetectorChannel],
+     )
+     available_triggers: List[str] = Field(
+         default_factory=lambda: [t.value for t in TriggerType],
+     )
+     available_tools: List[str] = Field(
+         default_factory=lambda: list(TOOL_REGISTRY.keys()),
+     )
+     mass_search_window_gev: List[float] = Field(default_factory=lambda: [50.0, 1000.0])
+     budget_limit_musd: float = 100.0
+     luminosity_budget_fb: float = 300.0
+     time_limit_days: float = 365.0
+     prior_observations: List[str] = Field(default_factory=list)
+     success_criteria: List[str] = Field(default_factory=list)
+     paper_references: List[PaperReference] = Field(default_factory=list)
+     expected_findings: List[ExpectedFinding] = Field(default_factory=list)
+     difficulty: str = "medium"
+
+
+ class DiscoveryClaim(BaseModel):
+     """Structured final claim graded against hidden truth."""
+
+     claim: str = ""
+     mass_estimate_gev: Optional[float] = None
+     mass_uncertainty_gev: Optional[float] = None
+     width_estimate_gev: Optional[float] = None
+     significance_sigma: Optional[float] = None
+     decay_channel: Optional[str] = None
+     spin_hypothesis: Optional[int] = None  # 0, 1, 2
+     parity: Optional[str] = None  # "+", "-"
+     cross_section_fb: Optional[float] = None
+     confidence: float = Field(0.5, ge=0.0, le=1.0)
+     evidence_steps: List[int] = Field(default_factory=list)
+
+
+ class CollisionObservation(Observation):
+     """Full observable state returned to the agent each step.
+
+     Excludes the hidden particle truth and hidden detector systematics.
+     """
+
+     task: TaskSpec = Field(default_factory=TaskSpec)
+     step_index: int = 0
+     pipeline_history: List[PipelineStepRecord] = Field(default_factory=list)
+     available_channels: List[str] = Field(default_factory=list)
+     available_triggers: List[str] = Field(default_factory=list)
+     available_tools: List[str] = Field(default_factory=list)
+     resource_usage: ResourceUsage = Field(default_factory=ResourceUsage)
+     latest_output: Optional[IntermediateOutput] = None
+     all_outputs: List[IntermediateOutput] = Field(default_factory=list)
+     candidate_masses_gev: List[float] = Field(default_factory=list)
+     candidate_significances: List[float] = Field(default_factory=list)
+     selected_channel: Optional[str] = None
+     selected_beam_energy: Optional[str] = None
+     cumulative_significance: float = 0.0
+     uncertainty_summary: Dict[str, float] = Field(default_factory=dict)
+     rule_violations: List[str] = Field(default_factory=list)
+     step_reward_breakdown: Dict[str, float] = Field(default_factory=dict)
+
+
+ # ── Agent-facing prompt helpers ───────────────────────────────────────────
+
+
+ AGENT_ACTION_GUIDANCE: Dict[ActionType, str] = {
+     ActionType.CONFIGURE_BEAM: (
+         "Pick the LHC center-of-mass energy. Higher energy reaches heavier "
+         "resonances but costs more per fb^-1. Required before collecting data."
+     ),
+     ActionType.ALLOCATE_LUMINOSITY: (
+         "Schedule a chunk of integrated luminosity (fb^-1). More luminosity "
+         "means more events but uses budget and time. Required before collecting."
+     ),
+     ActionType.SET_TRIGGER: (
+         "Choose a hardware/HLT trigger. Match the trigger to the channel of "
+         "interest; mismatched triggers throw away signal."
+     ),
+     ActionType.COLLECT_COLLISIONS: (
+         "Run the experiment. Returns a noisy raw event count plus background "
+         "estimate, conditioned on beam, luminosity, trigger, and channel."
+     ),
+     ActionType.CALIBRATE_DETECTOR: (
+         "Apply ECAL/tracker calibration. Reduces systematic uncertainty; "
+         "neglecting it inflates fit uncertainty later."
+     ),
+     ActionType.RECONSTRUCT_TRACKS: (
+         "Reconstruct charged-particle tracks and physics objects. Required "
+         "before any analysis-level step."
+     ),
+     ActionType.SELECT_CHANNEL: (
+         "Pick the decay channel to study (γγ, ℓℓ, jj, 4ℓ, bb). Wrong channel "
+         "= small signal acceptance regardless of luminosity."
+     ),
+     ActionType.BUILD_INVARIANT_MASS: (
+         "Construct the invariant-mass histogram in the chosen channel and "
+         "mass window."
+     ),
+     ActionType.SUBTRACT_BACKGROUND: (
+         "Fit a smooth background model and subtract it to expose any peak."
+     ),
+     ActionType.FIT_RESONANCE: (
+         "Fit a Breit-Wigner / Crystal Ball line shape. Returns mass, width, "
+         "and statistical uncertainty."
+     ),
+     ActionType.SCAN_BUMP: (
+         "Run a sliding-window bump hunt over the mass window. Reports the "
+         "most-significant candidate region."
+     ),
+     ActionType.MEASURE_ANGULAR: (
+         "Measure decay angular distribution to constrain spin/parity. "
+         "Useful only after a peak is identified."
+     ),
+     ActionType.ESTIMATE_SIGNIFICANCE: (
+         "Compute the statistical significance of a candidate signal in σ. "
+         "Required before claiming a discovery."
+     ),
+     ActionType.REQUEST_SYSTEMATICS: (
+         "Run a systematics study (JES, luminosity, calibration). Improves "
+         "uncertainty estimates and reduces overconfidence penalty."
+     ),
+     ActionType.REQUEST_THEORY_REVIEW: (
+         "Ask a theorist sub-agent to review the evidence; small extra signal "
+         "but not a substitute for missing data."
+     ),
+     ActionType.SUBMIT_DISCOVERY_CLAIM: (
+         "Submit a structured discovery claim. Graded on mass calibration, "
+         "significance, channel, spin hypothesis, and overconfidence."
+     ),
+ }
+
+
+ AGENT_ENVIRONMENT_RULES: List[str] = [
+     "Each successful action returns summarized evidence; do not repeat steps.",
+     "Hard prerequisites are enforced: data collection requires beam+luminosity+trigger; "
+     "analysis requires reconstruction and a chosen channel.",
+     "A discovery claim requires a fitted resonance and an estimated significance.",
+     "Tools listed in available_tools are pre-filtered for this episode; prefer them.",
+     "Submitting an overconfident wrong claim is heavily penalised.",
+ ]
+
+
+ def build_agent_system_prompt() -> str:
+     lines = [
+         "You are an expert high-energy physicist running an analysis at the LHC.",
+         "",
+         "At each turn you observe the experiment state and pick one structured next step",
+         "to maximise the probability of correctly characterising a hidden resonance.",
+         "",
+         "Environment rules:",
+     ]
+     lines.extend(f" - {rule}" for rule in AGENT_ENVIRONMENT_RULES)
+     lines.append("")
+     lines.append("Action guidance:")
+     lines.extend(
+         f" - {a.value}: {AGENT_ACTION_GUIDANCE[a]}" for a in ActionType
+     )
+     lines.extend([
+         "",
+         "Respond with ONLY a single valid JSON object, no extra prose:",
+         '{"action_type": "...", "method": null, "parameters": {}, "justification": "...", "confidence": 0.8}',
+         "",
+         "For submit_discovery_claim, structure parameters['claim'] as:",
+         '{"mass_estimate_gev": 125.0, "mass_uncertainty_gev": 0.5, "width_estimate_gev": 0.004,'
+         ' "significance_sigma": 5.2, "decay_channel": "diphoton", "spin_hypothesis": 0,'
+         ' "parity": "+", "cross_section_fb": 50.0, "confidence": 0.9}',
+     ])
+     return "\n".join(lines)
+
+
+ def build_agent_observation_context(
+     obs: CollisionObservation,
+     *,
+     max_tools: int = 6,
+     max_channels: int = 4,
+ ) -> str:
+     parts: List[str] = []
+
+     parts.append(
+         f"Mass search window: [{obs.task.mass_search_window_gev[0]:.0f}, "
+         f"{obs.task.mass_search_window_gev[1]:.0f}] GeV; "
+         f"difficulty={obs.task.difficulty}."
+     )
+
+     chans = list(dict.fromkeys(obs.available_channels or obs.task.available_channels))
+     if chans:
+         parts.append("Available channels: " + ", ".join(chans[:max_channels]))
+
+     tools = list(dict.fromkeys(obs.available_tools or obs.task.available_tools))
+     if tools:
+         parts.append("Available tools: " + ", ".join(tools[:max_tools]))
+
+     if obs.selected_channel:
+         parts.append(f"Selected channel: {obs.selected_channel}")
+     if obs.selected_beam_energy:
+         parts.append(f"Beam energy: {obs.selected_beam_energy}")
+
+     if obs.candidate_masses_gev:
+         masses = [f"{m:.1f}" for m in obs.candidate_masses_gev[:3]]
+         sigmas = [f"{s:.1f}" for s in obs.candidate_significances[:3]]
+         parts.append(
+             "Candidate peaks (GeV / σ): "
+             + ", ".join(f"{m}/{s}" for m, s in zip(masses, sigmas))
+         )
+
+     return "\n".join(parts)
+
+
+ __all__ = [
+     "ActionType",
576
+ "DAQ_ACTIONS",
577
+ "RECO_ACTIONS",
578
+ "ANALYSIS_ACTIONS",
579
+ "META_ACTIONS",
580
+ "DetectorChannel",
581
+ "TriggerType",
582
+ "BeamEnergy",
583
+ "ToolCategory",
584
+ "ToolSpec",
585
+ "TOOL_REGISTRY",
586
+ "ExperimentAction",
587
+ "OutputType",
588
+ "IntermediateOutput",
589
+ "ResourceUsage",
590
+ "PipelineStepRecord",
591
+ "PaperReference",
592
+ "ExpectedFinding",
593
+ "TaskSpec",
594
+ "DiscoveryClaim",
595
+ "CollisionObservation",
596
+ "AGENT_ACTION_GUIDANCE",
597
+ "AGENT_ENVIRONMENT_RULES",
598
+ "build_agent_system_prompt",
599
+ "build_agent_observation_context",
600
+ ]
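The prompt helpers above are pure functions over the observation, so they can be exercised without a running server. A minimal sketch, assuming the repo root is on `PYTHONPATH` so `models` imports flat:

```python
# Render the fixed system prompt; the per-turn context comes from
# build_agent_observation_context(obs) once an episode is running.
from models import build_agent_system_prompt

prompt = build_agent_system_prompt()
print(prompt.splitlines()[0])  # the role line
# The body contains one rule line per AGENT_ENVIRONMENT_RULES entry
# and one guidance line per ActionType member.
```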
openenv.yaml ADDED
@@ -0,0 +1,6 @@
+ spec_version: 1
+ name: cernenv
+ type: space
+ runtime: fastapi
+ app: server.app:app
+ port: 8000
pyproject.toml ADDED
@@ -0,0 +1,61 @@
+ [build-system]
+ requires = ["setuptools>=45", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-cernenv"
+ version = "0.1.0"
+ description = "RL environment for autonomous particle physics agents at the LHC"
+ requires-python = ">=3.10,<3.13"
+ dependencies = [
+     "openenv-core[core]>=0.2.0",
+     "numpy>=1.24.0",
+     "scipy>=1.10.0",
+     "pydantic>=2.0.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=8.0.0",
+     "pytest-cov>=4.0.0",
+ ]
+ train = [
+     "accelerate>=1.0.0",
+     "datasets>=2.18.0",
+     "matplotlib>=3.8.0",
+     "peft>=0.10.0",
+     "torch>=2.2.0",
+     "transformers>=4.44.0",
+     "trl>=0.9.0",
+ ]
+
+ [project.scripts]
+ cernenv-server = "server.app:main"
+
+ [tool.uv]
+ package = false
+
+ [tool.setuptools]
+ include-package-data = true
+ packages = [
+     "cernenv",
+     "cernenv.server",
+     "cernenv.server.simulator",
+     "cernenv.server.rules",
+     "cernenv.server.rewards",
+     "cernenv.server.tasks",
+     "cernenv.server.physics",
+     "cernenv.training",
+     "cernenv.tests",
+ ]
+
+ [tool.setuptools.package-dir]
+ cernenv = "."
+ "cernenv.server" = "server"
+ "cernenv.server.simulator" = "server/simulator"
+ "cernenv.server.rules" = "server/rules"
+ "cernenv.server.rewards" = "server/rewards"
+ "cernenv.server.tasks" = "server/tasks"
+ "cernenv.server.physics" = "server/physics"
+ "cernenv.training" = "training"
+ "cernenv.tests" = "tests"
scripts/__init__.py ADDED
File without changes
scripts/_build_spaces.py ADDED
@@ -0,0 +1,135 @@
+ """Stage env- and trainer-Space directories from the repo root.
+
+ Each Space needs a *single* directory containing the full repo plus the
+ right Dockerfile + README front-matter at its root. This script copies
+ the repo into a staging directory, drops in the Space-specific
+ ``Dockerfile`` / ``README.md``, and prints the staging path.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import shutil
+ from pathlib import Path
+
+
+ REPO_ROOT = Path(__file__).resolve().parent.parent
+
+ EXCLUDES = {
+     ".venv",
+     "__pycache__",
+     ".git",
+     ".cursor",
+     ".DS_Store",
+     "runs",
+     "wandb",
+     "node_modules",
+     ".pytest_cache",
+     ".mypy_cache",
+ }
+
+
+ def _ignore(_dir: str, names):
+     return [n for n in names if n in EXCLUDES or n.endswith((".pyc", ".log"))]
+
+
+ def _stage(stage_dir: Path) -> None:
+     if stage_dir.exists():
+         shutil.rmtree(stage_dir)
+     shutil.copytree(REPO_ROOT, stage_dir, ignore=_ignore, symlinks=False)
+
+
+ def build_env_space(stage_dir: Path) -> None:
+     _stage(stage_dir)
+
+     dockerfile = """\
+ # CERNenv environment Space (Docker, CPU)
+ FROM python:3.11-slim
+
+ ENV PYTHONUNBUFFERED=1 \\
+     PIP_NO_CACHE_DIR=1 \\
+     PYTHONPATH=/home/user/app
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \\
+     git curl ca-certificates build-essential \\
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -ms /bin/bash user
+ USER user
+ WORKDIR /home/user/app
+
+ COPY --chown=user:user space/env/requirements.txt /tmp/requirements.txt
+ RUN python -m pip install --upgrade pip && \\
+     python -m pip install --user -r /tmp/requirements.txt
+
+ COPY --chown=user:user . /home/user/app
+
+ EXPOSE 7860
+
+ CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
+ """
+     (stage_dir / "Dockerfile").write_text(dockerfile)
+
+     readme = (stage_dir / "space" / "env" / "README.md").read_text()
+     (stage_dir / "README.md").write_text(readme)
+
+
+ def build_trainer_space(stage_dir: Path) -> None:
+     _stage(stage_dir)
+
+     dockerfile = """\
+ # CERNenv trainer Space (Docker, A100)
+ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+
+ ENV DEBIAN_FRONTEND=noninteractive \\
+     PYTHONUNBUFFERED=1 \\
+     PIP_NO_CACHE_DIR=1 \\
+     HF_HOME=/home/user/.cache/huggingface \\
+     TRANSFORMERS_CACHE=/home/user/.cache/huggingface/transformers \\
+     PYTHONPATH=/home/user/app
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \\
+     python3.11 python3.11-venv python3.11-dev python3-pip \\
+     git curl ca-certificates build-essential \\
+     && rm -rf /var/lib/apt/lists/* \\
+     && ln -sf /usr/bin/python3.11 /usr/local/bin/python \\
+     && ln -sf /usr/bin/python3.11 /usr/local/bin/python3
+
+ RUN useradd -ms /bin/bash user
+ USER user
+ ENV PATH="/home/user/.local/bin:${PATH}"
+ WORKDIR /home/user/app
+
+ COPY --chown=user:user space/training/requirements.txt /tmp/requirements.txt
+ RUN python -m pip install --upgrade pip && \\
+     python -m pip install --user -r /tmp/requirements.txt
+
+ COPY --chown=user:user . /home/user/app
+
+ EXPOSE 7860
+
+ CMD ["python", "-m", "uvicorn", "space.training.app:app", "--host", "0.0.0.0", "--port", "7860"]
+ """
+     (stage_dir / "Dockerfile").write_text(dockerfile)
+
+     readme = (stage_dir / "space" / "training" / "README.md").read_text()
+     (stage_dir / "README.md").write_text(readme)
+
+
+ def main() -> None:  # pragma: no cover
+     parser = argparse.ArgumentParser()
+     parser.add_argument("kind", choices=["env", "trainer"])
+     parser.add_argument("--stage_dir", required=True)
+     args = parser.parse_args()
+
+     stage_dir = Path(args.stage_dir).resolve()
+     if args.kind == "env":
+         build_env_space(stage_dir)
+     else:
+         build_trainer_space(stage_dir)
+     print(stage_dir)
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     main()
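For reference, the stager can also be driven programmatically instead of via the CLI; a minimal sketch (the `/tmp` paths are illustrative):

```python
# Stage both Spaces into throwaway directories. Each call wipes and
# recreates the target, then drops in the right Dockerfile + README.
from pathlib import Path

from scripts._build_spaces import build_env_space, build_trainer_space

build_env_space(Path("/tmp/cernenv-env-space"))          # CPU env Space
build_trainer_space(Path("/tmp/cernenv-trainer-space"))  # A100 trainer Space
```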
scripts/baseline_agents.py ADDED
@@ -0,0 +1,305 @@
+ """Built-in agents for evaluating CERNenv.
+
+ These do **not** use any neural model — they are deterministic / random
+ policies you can use as baselines and oracles. They consume a
+ ``CollisionObservation`` and return an ``ExperimentAction``.
+ """
+
+ from __future__ import annotations
+
+ import random
+ from dataclasses import dataclass
+ from typing import List, Optional, Protocol
+
+ from models import ActionType, CollisionObservation, ExperimentAction
+
+
+ class CernAgent(Protocol):
+     name: str
+
+     def reset(self) -> None: ...
+
+     def act(self, obs: CollisionObservation) -> ExperimentAction: ...
+
+
+ # ── Random agent ─────────────────────────────────────────────────────────
+
+
+ @dataclass
+ class RandomAgent:
+     """Picks a uniformly random action (not necessarily a legal one);
+     useful as a worst-case baseline."""
+
+     name: str = "random"
+     seed: int = 0
+
+     def __post_init__(self) -> None:
+         self._rng = random.Random(self.seed)
+
+     def reset(self) -> None:
+         self._rng = random.Random(self.seed)
+
+     def act(self, obs: CollisionObservation) -> ExperimentAction:
+         action_type = self._rng.choice(list(ActionType))
+         params: dict = {}
+         if action_type == ActionType.CONFIGURE_BEAM:
+             params = {"beam_energy": self._rng.choice(obs.task.beam_energy_options or ["13TeV"])}
+         elif action_type == ActionType.SELECT_CHANNEL:
+             params = {"channel": self._rng.choice(obs.task.available_channels or ["diphoton"])}
+         elif action_type == ActionType.SET_TRIGGER:
+             params = {"trigger": self._rng.choice(obs.task.available_triggers or ["high_pt"])}
+         elif action_type == ActionType.ALLOCATE_LUMINOSITY:
+             params = {"luminosity_fb": self._rng.uniform(20.0, 100.0)}
+         elif action_type == ActionType.COLLECT_COLLISIONS:
+             params = {"luminosity_fb": self._rng.uniform(20.0, 100.0)}
+         elif action_type == ActionType.BUILD_INVARIANT_MASS:
+             lo, hi = obs.task.mass_search_window_gev
+             params = {"mass_window_gev": [lo, hi]}
+         elif action_type == ActionType.SUBMIT_DISCOVERY_CLAIM:
+             mass = obs.candidate_masses_gev[-1] if obs.candidate_masses_gev else (
+                 0.5 * (obs.task.mass_search_window_gev[0] + obs.task.mass_search_window_gev[1])
+             )
+             params = {
+                 "claim": {
+                     "mass_estimate_gev": mass,
+                     "mass_uncertainty_gev": 5.0,
+                     "significance_sigma": obs.cumulative_significance,
+                     "decay_channel": obs.selected_channel or "diphoton",
+                     "spin_hypothesis": int(self._rng.choice([0, 1, 2])),
+                     "parity": self._rng.choice(["+", "-"]),
+                     "confidence": self._rng.uniform(0.4, 0.9),
+                 }
+             }
+         return ExperimentAction(
+             action_type=action_type,
+             parameters=params,
+             confidence=0.4,
+             justification="random baseline",
+         )
+
+
+ # ── Heuristic agent ──────────────────────────────────────────────────────
+
+
+ @dataclass
+ class HeuristicAgent:
+     """A scripted analysis-flow agent using high-yield channels and
+     sensible default parameters. Acts as the strong non-LLM baseline.
+     """
+
+     name: str = "heuristic"
+
+     def __post_init__(self) -> None:
+         self._reset_plan()
+
+     def reset(self) -> None:
+         self._reset_plan()
+
+     def _reset_plan(self) -> None:
+         self._plan: List[ExperimentAction] = [
+             ExperimentAction(
+                 action_type=ActionType.CONFIGURE_BEAM,
+                 parameters={"beam_energy": "13TeV"},
+                 confidence=0.9,
+                 justification="13 TeV maximises reach within budget",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.SELECT_CHANNEL,
+                 parameters={"channel": "diphoton"},
+                 confidence=0.7,
+                 justification="diphoton has clean low-background signature",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.SET_TRIGGER,
+                 parameters={"trigger": "diphoton_hlt"},
+                 confidence=0.9,
+                 justification="match trigger to channel",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.ALLOCATE_LUMINOSITY,
+                 parameters={"luminosity_fb": 80.0},
+                 confidence=0.8,
+                 justification="bulk allocation for the first run",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.COLLECT_COLLISIONS,
+                 parameters={"luminosity_fb": 80.0},
+                 confidence=0.8,
+                 justification="run physics",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.RECONSTRUCT_TRACKS,
+                 method="Athena",
+                 confidence=0.9,
+                 justification="reconstruct objects",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.CALIBRATE_DETECTOR,
+                 method="ECAL_calibration",
+                 confidence=0.8,
+                 justification="reduce systematic uncertainty",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.BUILD_INVARIANT_MASS,
+                 parameters={"mass_window_gev": [80.0, 800.0], "n_bins": 60},
+                 confidence=0.8,
+                 justification="broad-window histogram",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.SUBTRACT_BACKGROUND,
+                 confidence=0.7,
+                 justification="smooth-fit subtraction",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.SCAN_BUMP,
+                 method="BumpHunter",
+                 confidence=0.8,
+                 justification="locate candidate peak",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.FIT_RESONANCE,
+                 method="ROOT_RooFit",
+                 confidence=0.85,
+                 justification="fit Breit-Wigner peak",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.REQUEST_SYSTEMATICS,
+                 method="Luminosity_calibration",
+                 confidence=0.7,
+                 justification="pin down dominant systematics",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.ESTIMATE_SIGNIFICANCE,
+                 method="Asimov_significance",
+                 confidence=0.85,
+                 justification="quantify discovery significance",
+             ),
+             ExperimentAction(
+                 action_type=ActionType.MEASURE_ANGULAR,
+                 confidence=0.7,
+                 justification="probe spin",
+             ),
+         ]
+         self._idx = 0
+         self._claim_submitted = False
+
+     def act(self, obs: CollisionObservation) -> ExperimentAction:
+         if self._idx < len(self._plan):
+             a = self._plan[self._idx]
+             self._idx += 1
+             return a
+         if not self._claim_submitted:
+             self._claim_submitted = True
+             mass = obs.candidate_masses_gev[-1] if obs.candidate_masses_gev else 125.0
+             sig = obs.cumulative_significance or 5.0
+             return ExperimentAction(
+                 action_type=ActionType.SUBMIT_DISCOVERY_CLAIM,
+                 parameters={
+                     "claim": {
+                         "mass_estimate_gev": mass,
+                         "mass_uncertainty_gev": 1.0,
+                         "width_estimate_gev": 0.01,
+                         "significance_sigma": sig,
+                         "decay_channel": obs.selected_channel or "diphoton",
+                         "spin_hypothesis": 0,
+                         "parity": "+",
+                         "cross_section_fb": 50.0,
+                         "confidence": 0.8,
+                     }
+                 },
+                 confidence=0.85,
+                 justification="submit best calibrated claim",
+             )
+         return ExperimentAction(
+             action_type=ActionType.REQUEST_THEORY_REVIEW,
+             confidence=0.3,
+             justification="filler step (claim already submitted)",
+         )
+
+
+ # ── Oracle agent ─────────────────────────────────────────────────────────
+
+
+ @dataclass
+ class OracleAgent:
+     """An oracle that *peeks* at the latent particle truth (only available
+     for in-process evaluation; never used remotely). This is the upper bound
+     of what a perfect agent could achieve given the noise budget.
+     """
+
+     name: str = "oracle"
+     truth: Optional[dict] = None  # set externally before the episode
+
+     def reset(self) -> None:
+         self._stage = 0
+         self._claim_submitted = False
+
+     def act(self, obs: CollisionObservation) -> ExperimentAction:
+         truth = self.truth or {}
+         true_channel = truth.get("primary_channel", obs.selected_channel or "diphoton")
+         trigger_for_channel = {
+             "diphoton": "diphoton_hlt",
+             "dilepton_ee": "dilepton_hlt",
+             "dilepton_mumu": "dilepton_hlt",
+             "four_lepton": "dilepton_hlt",
+             "dijet": "jet_hlt",
+             "bb": "jet_hlt",
+         }.get(true_channel, "high_pt")
+
+         plan = [
+             ExperimentAction(action_type=ActionType.CONFIGURE_BEAM, parameters={"beam_energy": "13TeV"}, confidence=0.95),
+             ExperimentAction(action_type=ActionType.SELECT_CHANNEL, parameters={"channel": true_channel}, confidence=0.99),
+             ExperimentAction(action_type=ActionType.SET_TRIGGER, parameters={"trigger": trigger_for_channel}, confidence=0.95),
+             ExperimentAction(action_type=ActionType.ALLOCATE_LUMINOSITY, parameters={"luminosity_fb": 120.0}, confidence=0.9),
+             ExperimentAction(action_type=ActionType.COLLECT_COLLISIONS, parameters={"luminosity_fb": 120.0}, confidence=0.9),
+             ExperimentAction(action_type=ActionType.RECONSTRUCT_TRACKS, method="Athena", confidence=0.95),
+             ExperimentAction(action_type=ActionType.CALIBRATE_DETECTOR, method="ECAL_calibration", confidence=0.9),
+             ExperimentAction(
+                 action_type=ActionType.BUILD_INVARIANT_MASS,
+                 parameters={
+                     "mass_window_gev": [
+                         max(50.0, float(truth.get("mass_gev", 100.0)) - 50.0),
+                         float(truth.get("mass_gev", 100.0)) + 80.0,
+                     ],
+                     "n_bins": 80,
+                 },
+                 confidence=0.95,
+             ),
+             ExperimentAction(action_type=ActionType.SUBTRACT_BACKGROUND, confidence=0.9),
+             ExperimentAction(action_type=ActionType.FIT_RESONANCE, method="ROOT_RooFit", confidence=0.95),
+             ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS, method="Luminosity_calibration", confidence=0.9),
+             ExperimentAction(action_type=ActionType.ESTIMATE_SIGNIFICANCE, method="Asimov_significance", confidence=0.95),
+             ExperimentAction(action_type=ActionType.MEASURE_ANGULAR, confidence=0.85),
+         ]
+         if self._stage < len(plan):
+             a = plan[self._stage]
+             self._stage += 1
+             return a
+
+         if not self._claim_submitted:
+             self._claim_submitted = True
+             return ExperimentAction(
+                 action_type=ActionType.SUBMIT_DISCOVERY_CLAIM,
+                 parameters={
+                     "claim": {
+                         "mass_estimate_gev": float(truth.get("mass_gev", 125.0)),
+                         "mass_uncertainty_gev": 0.5,
+                         "width_estimate_gev": float(truth.get("width_gev", 0.01)),
+                         "significance_sigma": max(obs.cumulative_significance, 5.0),
+                         "decay_channel": true_channel,
+                         "spin_hypothesis": int(truth.get("spin", 0)),
+                         "parity": str(truth.get("parity", "+")),
+                         "cross_section_fb": float(truth.get("cross_section_fb", 50.0)),
+                         "confidence": 0.95,
+                     }
+                 },
+                 confidence=0.95,
+                 justification="oracle claim from hidden truth",
+             )
+         return ExperimentAction(
+             action_type=ActionType.REQUEST_THEORY_REVIEW,
+             confidence=0.5,
+             justification="oracle filler",
+         )
+
+
+ __all__ = ["CernAgent", "RandomAgent", "HeuristicAgent", "OracleAgent"]
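Because `CernAgent` is a `typing.Protocol`, a new baseline needs no inheritance — any object with `name`, `reset`, and `act` type-checks. A hypothetical degenerate policy, useful only as a smoke test:

```python
# Structural typing: this class never imports CernAgent, yet satisfies it.
from models import ActionType, CollisionObservation, ExperimentAction


class CalibrateOnlyAgent:
    name = "calibrate_only"

    def reset(self) -> None:
        pass

    def act(self, obs: CollisionObservation) -> ExperimentAction:
        # Deliberately useless: re-calibrating every turn should let the
        # redundancy penalty in the step reward dominate its return.
        return ExperimentAction(
            action_type=ActionType.CALIBRATE_DETECTOR,
            method="ECAL_calibration",
            confidence=0.5,
            justification="degenerate smoke-test policy",
        )
```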
scripts/push_to_hub.py ADDED
@@ -0,0 +1,247 @@
+ """Push CERNenv artefacts to the Hugging Face Hub.
+
+ Two subcommands:
+
+ * ``model`` — push trained LoRA adapters (output of ``training_unsloth.py``)
+   to a model repo. Generates a model card describing the run.
+
+ * ``space`` — push a directory as a Hugging Face Space
+   (e.g. ``space/training`` for the trainer Space, or the project root
+   to publish the env Space). Front-matter is taken from the README.md
+   inside the directory.
+
+ Usage:
+     python -m scripts.push_to_hub model \\
+         --adapter_dir runs/unsloth-grpo \\
+         --repo_id YOUR_HF_USERNAME/cernenv-grpo-qwen2.5-3b \\
+         --base_model unsloth/Qwen2.5-3B-Instruct
+
+     python -m scripts.push_to_hub space \\
+         --space_dir space/training \\
+         --repo_id YOUR_HF_USERNAME/cernenv-trainer \\
+         --hardware a100-large
+
+     python -m scripts.push_to_hub space \\
+         --space_dir . \\
+         --repo_id YOUR_HF_USERNAME/cernenv \\
+         --include "models.py" "server/**" "openenv.yaml" "pyproject.toml" "client.py" "README.md"
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import logging
+ import os
+ from pathlib import Path
+ from typing import List, Optional
+
+
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+ logger = logging.getLogger(__name__)
+
+
+ DEFAULT_SPACE_EXCLUDES: List[str] = [
+     ".venv/**",
+     "__pycache__/**",
+     "**/__pycache__/**",
+     "*.pyc",
+     ".cursor/**",
+     ".git/**",
+     ".DS_Store",
+     "**/.DS_Store",
+     "runs/**",
+     "training/runs/**",
+     "training/plots/**",
+     "wandb/**",
+     "*.zip",
+     "*.apk",
+     "*.png",
+     "*.jpg",
+     "*.jpeg",
+     "[External]*.txt",
+     "Hackathon FAQs*.txt",
+     "*.log",
+ ]
+
+
+ def _hf_login() -> None:
+     from huggingface_hub import login
+
+     token = os.environ.get("HF_TOKEN")
+     if not token:
+         raise SystemExit(
+             "HF_TOKEN environment variable is required (write-scoped Hugging Face token)."
+         )
+     login(token=token)
+
+
+ def _model_card(*, repo_id: str, base_model: str, run_dir: Path) -> str:
+     return f"""---
+ license: bsd-3-clause
+ library_name: peft
+ base_model: {base_model}
+ tags:
+ - cernenv
+ - openenv
+ - reinforcement-learning
+ - grpo
+ - unsloth
+ - lora
+ - particle-physics
+ ---
+
+ # {repo_id}
+
+ LoRA (Low-Rank Adaptation) adapters trained with **GRPO** (Group-Relative
+ Policy Optimization) inside the **CERNenv** OpenEnv environment — an
+ LHC (Large Hadron Collider) particle-discovery POMDP (Partially Observable
+ Markov Decision Process).
+
+ The agent (this model) plays the role of a high-energy physicist running an
+ analysis: it configures the beam, allocates luminosity, picks decay
+ channels and triggers, reconstructs events, fits resonances, estimates
+ significance, and finally submits a structured discovery claim that is
+ graded against a hidden ground-truth particle.
+
+ * Base model: `{base_model}`
+ * RL framework: TRL (Transformer Reinforcement Learning) GRPO
+ * Acceleration: Unsloth + 4-bit + LoRA
+ * Environment: [CERNenv](https://huggingface.co/spaces/{repo_id.split('/')[0]}/cernenv)
+
+ ## Usage
+
+ ```python
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ base = "{base_model}"
+ adapter = "{repo_id}"
+
+ tokenizer = AutoTokenizer.from_pretrained(base)
+ model = AutoModelForCausalLM.from_pretrained(base, device_map="auto")
+ model = PeftModel.from_pretrained(model, adapter)
+ ```
+
+ See the CERNenv repo for full evaluation, plots, and the `LLMAgent` wrapper.
+ """
+
+
+ def push_model(
+     *,
+     adapter_dir: str,
+     repo_id: str,
+     base_model: str,
+     private: bool,
+ ) -> None:
+     from huggingface_hub import HfApi, create_repo
+
+     _hf_login()
+     api = HfApi()
+
+     run_dir = Path(adapter_dir)
+     if not run_dir.exists():
+         raise SystemExit(f"adapter_dir not found: {run_dir}")
+
+     create_repo(repo_id=repo_id, repo_type="model", private=private, exist_ok=True)
+
+     card_path = run_dir / "README.md"
+     card_path.write_text(_model_card(repo_id=repo_id, base_model=base_model, run_dir=run_dir))
+
+     logger.info("uploading %s → %s", run_dir, repo_id)
+     api.upload_folder(
+         folder_path=str(run_dir),
+         repo_id=repo_id,
+         repo_type="model",
+         commit_message="Upload CERNenv GRPO LoRA adapters",
+     )
+     logger.info("done: https://huggingface.co/%s", repo_id)
+
+
+ def push_space(
+     *,
+     space_dir: str,
+     repo_id: str,
+     hardware: Optional[str],
+     private: bool,
+     include: Optional[List[str]],
+     exclude: Optional[List[str]],
+ ) -> None:
+     from huggingface_hub import HfApi, create_repo
+
+     _hf_login()
+     api = HfApi()
+
+     src = Path(space_dir).resolve()
+     if not src.exists():
+         raise SystemExit(f"space_dir not found: {src}")
+
+     create_repo(
+         repo_id=repo_id,
+         repo_type="space",
+         space_sdk="docker",
+         space_hardware=hardware,
+         private=private,
+         exist_ok=True,
+     )
+
+     effective_exclude = list(DEFAULT_SPACE_EXCLUDES)
+     if exclude:
+         effective_exclude.extend(exclude)
+
+     logger.info("uploading %s → space:%s", src, repo_id)
+     logger.info("ignore patterns: %s", effective_exclude)
+     api.upload_folder(
+         folder_path=str(src),
+         repo_id=repo_id,
+         repo_type="space",
+         commit_message="Update CERNenv Space",
+         allow_patterns=include,
+         ignore_patterns=effective_exclude,
+     )
+     logger.info("done: https://huggingface.co/spaces/%s", repo_id)
+
+
+ def main() -> None:  # pragma: no cover
+     parser = argparse.ArgumentParser()
+     sub = parser.add_subparsers(dest="cmd", required=True)
+
+     m = sub.add_parser("model", help="push trained LoRA adapters to the Hub")
+     m.add_argument("--adapter_dir", required=True)
+     m.add_argument("--repo_id", required=True)
+     m.add_argument("--base_model", required=True)
+     m.add_argument("--private", action="store_true")
+
+     s = sub.add_parser("space", help="push a directory as an HF Space")
+     s.add_argument("--space_dir", required=True)
+     s.add_argument("--repo_id", required=True)
+     s.add_argument("--hardware", default=None,
+                    help="e.g. a100-large, t4-small, l4-medium")
+     s.add_argument("--private", action="store_true")
+     s.add_argument("--include", nargs="*", default=None,
+                    help="glob patterns to include")
+     s.add_argument("--exclude", nargs="*", default=None,
+                    help="glob patterns to exclude")
+
+     args = parser.parse_args()
+
+     if args.cmd == "model":
+         push_model(
+             adapter_dir=args.adapter_dir,
+             repo_id=args.repo_id,
+             base_model=args.base_model,
+             private=args.private,
+         )
+     elif args.cmd == "space":
+         push_space(
+             space_dir=args.space_dir,
+             repo_id=args.repo_id,
+             hardware=args.hardware,
+             private=args.private,
+             include=args.include,
+             exclude=args.exclude,
+         )
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     main()
scripts/run_agent.py ADDED
@@ -0,0 +1,129 @@
+ """Run a (non-LLM) baseline agent against the in-process environment.
+
+ Usage:
+     python -m scripts.run_agent --agent heuristic --scenario easy_diphoton_160 --seed 7
+     python -m scripts.run_agent --agent oracle --difficulty hard --episodes 5
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ from typing import Any, Dict, List
+
+ from server.environment import CERNCollisionEnvironment
+ from scripts.baseline_agents import (
+     HeuristicAgent,
+     OracleAgent,
+     RandomAgent,
+ )
+
+
+ AGENT_REGISTRY = {
+     "random": RandomAgent,
+     "heuristic": HeuristicAgent,
+     "oracle": OracleAgent,
+ }
+
+
+ def run_episode(
+     *,
+     agent_name: str,
+     difficulty: str | None,
+     scenario: str | None,
+     seed: int,
+     max_steps: int,
+     verbose: bool,
+ ) -> Dict[str, Any]:
+     env = CERNCollisionEnvironment(max_steps=max_steps)
+     obs = env.reset(seed=seed, scenario=scenario, difficulty=difficulty)
+
+     agent_cls = AGENT_REGISTRY[agent_name]
+     if agent_name == "random":
+         agent = agent_cls(seed=seed)
+     else:
+         agent = agent_cls()
+     if agent_name == "oracle":
+         agent.truth = env.hidden_truth()
+
+     agent.reset()
+
+     total_reward = 0.0
+     step_log: List[Dict[str, Any]] = []
+     while not obs.done:
+         action = agent.act(obs)
+         obs = env.step(action)
+         total_reward += float(obs.reward or 0.0)
+         if verbose:
+             print(
+                 f" step {obs.step_index:2d} {action.action_type.value:24s} "
+                 f"rew={obs.reward:+.3f} done={obs.done}"
+             )
+         step_log.append(
+             {
+                 "step": obs.step_index,
+                 "action": action.action_type.value,
+                 "reward": float(obs.reward or 0.0),
+                 "violations": obs.rule_violations,
+             }
+         )
+
+     summary = {
+         "agent": agent_name,
+         "scenario": env.state.scenario_name,
+         "difficulty": env.state.difficulty,
+         "seed": seed,
+         "total_reward": total_reward,
+         "cumulative_reward": float(env.state.cumulative_reward),
+         "terminal_reward": env.state.terminal_reward,
+         "discovered": env.state.discovered,
+         "correct_mass": env.state.correct_mass,
+         "correct_channel": env.state.correct_channel,
+         "correct_spin": env.state.correct_spin,
+         "steps": len(step_log),
+         "truth": env.hidden_truth(),
+         "log": step_log,
+     }
+     return summary
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--agent", choices=list(AGENT_REGISTRY), default="heuristic")
+     parser.add_argument("--scenario", default=None)
+     parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default=None)
+     parser.add_argument("--seed", type=int, default=0)
+     parser.add_argument("--episodes", type=int, default=1)
+     parser.add_argument("--max-steps", type=int, default=40)
+     parser.add_argument("--out", default=None, help="Optional path to dump JSON results")
+     parser.add_argument("--quiet", action="store_true")
+     args = parser.parse_args()
+
+     rollouts: List[Dict[str, Any]] = []
+     for ep in range(args.episodes):
+         seed = args.seed + ep
+         summary = run_episode(
+             agent_name=args.agent,
+             difficulty=args.difficulty,
+             scenario=args.scenario,
+             seed=seed,
+             max_steps=args.max_steps,
+             verbose=not args.quiet and args.episodes == 1,
+         )
+         rollouts.append(summary)
+         print(
+             f"[{ep + 1}/{args.episodes}] agent={args.agent} "
+             f"scenario={summary['scenario']} reward={summary['total_reward']:+.3f} "
+             f"discovered={summary['discovered']} correct_mass={summary['correct_mass']} "
+             f"correct_channel={summary['correct_channel']}"
+         )
+
+     if args.out:
+         with open(args.out, "w") as f:
+             json.dump(rollouts, f, indent=2, default=str)
+         print(f"saved → {args.out}")
+
+
+ if __name__ == "__main__":
+     main()
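`run_episode` is also usable programmatically, e.g. from a notebook; a minimal sketch using the keyword-only signature defined above:

```python
from scripts.run_agent import run_episode

summary = run_episode(
    agent_name="heuristic",
    difficulty="easy",
    scenario=None,
    seed=0,
    max_steps=40,
    verbose=False,
)
print(summary["total_reward"], summary["discovered"])
```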
server/Dockerfile ADDED
@@ -0,0 +1,50 @@
+ # CERNenv server: OpenEnv FastAPI image
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+ FROM ${BASE_IMAGE} AS builder
+
+ WORKDIR /app
+
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends git curl && \
+     rm -rf /var/lib/apt/lists/*
+
+ ARG ENV_NAME=cernenv
+
+ COPY . /app/env
+
+ WORKDIR /app/env
+
+ RUN if ! command -v uv >/dev/null 2>&1; then \
+         curl -LsSf https://astral.sh/uv/install.sh | sh && \
+         mv /root/.local/bin/uv /usr/local/bin/uv && \
+         mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+     fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then \
+         uv sync --frozen --no-install-project --no-editable; \
+     else \
+         uv sync --no-install-project --no-editable; \
+     fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then \
+         uv sync --frozen --no-editable; \
+     else \
+         uv sync --no-editable; \
+     fi
+
+ FROM ${BASE_IMAGE}
+
+ WORKDIR /app
+
+ COPY --from=builder /app/env/.venv /app/.venv
+ COPY --from=builder /app/env /app/env
+
+ ENV PATH="/app/.venv/bin:$PATH"
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
+
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+     CMD curl -f http://localhost:8000/health || exit 1
+
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
server/__init__.py ADDED
@@ -0,0 +1 @@
+ """CERNenv server package."""
server/app.py ADDED
@@ -0,0 +1,52 @@
+ """FastAPI app exposing ``CERNCollisionEnvironment`` over the OpenEnv HTTP API."""
+
+ from __future__ import annotations
+
+ import os
+ from typing import Optional
+
+ from openenv.core.env_server import create_fastapi_app
+
+ from models import CollisionObservation, ExperimentAction
+ from server.environment import CERNCollisionEnvironment
+
+
+ def make_env_factory(
+     max_steps: int,
+     default_difficulty: Optional[str],
+ ):
+     def factory() -> CERNCollisionEnvironment:
+         return CERNCollisionEnvironment(
+             max_steps=max_steps,
+             default_difficulty=default_difficulty,
+         )
+
+     return factory
+
+
+ def build_app(
+     *,
+     max_steps: int = 40,
+     default_difficulty: Optional[str] = None,
+ ):
+     """Construct the FastAPI app with a per-session environment factory."""
+     factory = make_env_factory(max_steps=max_steps, default_difficulty=default_difficulty)
+     return create_fastapi_app(factory, ExperimentAction, CollisionObservation)
+
+
+ app = build_app(
+     max_steps=int(os.getenv("CERNENV_MAX_STEPS", "40")),
+     default_difficulty=os.getenv("CERNENV_DEFAULT_DIFFICULTY") or None,
+ )
+
+
+ def main() -> None:  # pragma: no cover - CLI entrypoint
+     import uvicorn
+
+     host = os.getenv("HOST", "0.0.0.0")
+     port = int(os.getenv("PORT", "8000"))
+     uvicorn.run("server.app:app", host=host, port=port, log_level="info")
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     main()
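`build_app` can also be embedded directly, bypassing the `CERNENV_*` environment variables; a sketch for local experimentation (assumes `uvicorn` is installed):

```python
# Serve a harder configuration on a non-default port.
import uvicorn

from server.app import build_app

app = build_app(max_steps=60, default_difficulty="hard")
uvicorn.run(app, host="127.0.0.1", port=8001)
```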
server/environment.py ADDED
@@ -0,0 +1,363 @@
+ """``CERNCollisionEnvironment``: orchestrates simulator + rules + rewards.
+
+ This is the OpenEnv-compatible ``Environment`` that the FastAPI app exposes.
+ It owns one episode at a time:
+
+     reset(seed)  → builds a fresh latent state from a sampled scenario.
+     step(action) → validates → generates noisy output → updates state →
+                    computes reward → builds the agent observation.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ import uuid
+ from typing import Any, List, Optional
+
+ from openenv.core.env_server import Environment, State
+
+ from models import (
+     AGENT_ENVIRONMENT_RULES,
+     ActionType,
+     CollisionObservation,
+     DiscoveryClaim,
+     ExperimentAction,
+     IntermediateOutput,
+     OutputType,
+     PipelineStepRecord,
+     ResourceUsage,
+     TaskSpec,
+     build_agent_system_prompt,
+ )
+
+ from server.rewards import (
+     RewardWeights,
+     compute_step_reward,
+     compute_terminal_reward,
+ )
+ from server.rules import RulesEngine
+ from server.simulator import (
+     NoiseModel,
+     OutputGenerator,
+     TransitionEngine,
+     compute_action_cost,
+ )
+ from server.simulator.latent_state import FullLatentState
+ from server.tasks import sample_scenario, Scenario
+
+
+ logger = logging.getLogger(__name__)
+
+
+ # ── State container ──────────────────────────────────────────────────────
+
+
+ class CernState(State):
+     """OpenEnv State subclass: includes hidden truth & runtime stats."""
+
+     scenario_name: Optional[str] = None
+     difficulty: Optional[str] = None
+     episode_done: bool = False
+     cumulative_reward: float = 0.0
+     terminal_reward: Optional[float] = None
+     discovered: Optional[bool] = None
+     correct_mass: Optional[bool] = None
+     correct_channel: Optional[bool] = None
+     correct_spin: Optional[bool] = None
+     truth_mass_gev: Optional[float] = None
+     truth_channel: Optional[str] = None
+
+
+ # ── Environment ──────────────────────────────────────────────────────────
+
+
+ class CERNCollisionEnvironment(Environment[ExperimentAction, CollisionObservation, CernState]):
+     """LHC particle-discovery POMDP environment."""
+
+     SUPPORTS_CONCURRENT_SESSIONS = True
+
+     def __init__(
+         self,
+         *,
+         max_steps: int = 40,
+         default_difficulty: Optional[str] = None,
+         default_scenario_name: Optional[str] = None,
+         reward_weights: Optional[RewardWeights] = None,
+     ) -> None:
+         super().__init__()
+         self.max_steps = max_steps
+         self.default_difficulty = default_difficulty
+         self.default_scenario_name = default_scenario_name
+         self.reward_weights = reward_weights or RewardWeights()
+
+         self._state = CernState()
+         self._scenario: Optional[Scenario] = None
+         self._latent: Optional[FullLatentState] = None
+         self._task: Optional[TaskSpec] = None
+         self._noise: Optional[NoiseModel] = None
+         self._output_gen: Optional[OutputGenerator] = None
+         self._transition: Optional[TransitionEngine] = None
+         self._rules: Optional[RulesEngine] = None
+         self._history: List[PipelineStepRecord] = []
+         self._all_outputs: List[IntermediateOutput] = []
+
+     # ── Environment API ────────────────────────────────────────────────
+
+     @property
+     def state(self) -> CernState:
+         return self._state
+
+     def reset(
+         self,
+         seed: Optional[int] = None,
+         episode_id: Optional[str] = None,
+         **kwargs: Any,
+     ) -> CollisionObservation:
+         difficulty = kwargs.get("difficulty") or self.default_difficulty
+         scenario_name = kwargs.get("scenario") or self.default_scenario_name
+
+         scenario = sample_scenario(
+             difficulty=difficulty,
+             name=scenario_name,
+             seed=seed,
+         )
+         self._scenario = scenario
+         self._latent = scenario.fresh_latent()
+         self._task = scenario.task
+         if seed is not None:
+             self._latent.rng_seed = int(seed)
+         self._noise = NoiseModel(seed=self._latent.rng_seed)
+         self._output_gen = OutputGenerator(self._noise)
+         self._transition = TransitionEngine()
+         self._rules = RulesEngine(
+             mass_search_window_gev=tuple(self._task.mass_search_window_gev),
+         )
+         self._history = []
+         self._all_outputs = []
+
+         self._state = CernState(
+             episode_id=episode_id or f"ep-{uuid.uuid4().hex[:8]}",
+             step_count=0,
+             scenario_name=scenario.name,
+             difficulty=scenario.difficulty,
+             episode_done=False,
+             cumulative_reward=0.0,
+             truth_mass_gev=self._latent.particle.mass_gev,
+             truth_channel=self._latent.particle.primary_channel,
+         )
+
+         obs = self._build_observation(
+             latest_output=None,
+             done=False,
+             reward=0.0,
+             step_breakdown={},
+             rule_violations=[],
+         )
+         return obs
+
+     def step(
+         self,
+         action: ExperimentAction,
+         timeout_s: Optional[float] = None,
+         **kwargs: Any,
+     ) -> CollisionObservation:
+         if self._latent is None:
+             self.reset()
+         if self._state.episode_done:
+             return self._build_terminal_observation(reason="episode already complete")
+
+         assert self._rules is not None
+         assert self._output_gen is not None
+         assert self._transition is not None
+
+         prev_state = self._latent.model_copy(deep=True)
+         rule_result = self._rules.validate(action, self._latent)
+
+         if not rule_result.allowed:
+             output = IntermediateOutput(
+                 output_type=OutputType.FAILURE_REPORT,
+                 step_index=self._state.step_count,
+                 success=False,
+                 quality_score=0.0,
+                 summary="Action rejected: " + "; ".join(rule_result.messages),
+                 warnings=rule_result.messages,
+             )
+         else:
+             output = self._output_gen.generate(
+                 action=action,
+                 state=self._latent,
+                 step_index=self._state.step_count,
+             )
+
+         # Apply transition (state mutation + cost accounting)
+         if rule_result.allowed:
+             self._transition.step(self._latent, action, output)
+         else:
+             cost = compute_action_cost(action, output)
+             self._latent.resources.budget_used_musd += cost["musd"]
+             self._latent.resources.time_used_days += cost["days"]
+             self._latent.step_count += 1
+
+         self._all_outputs.append(output)
+         cost = compute_action_cost(action, output)
+         record = PipelineStepRecord(
+             step_index=self._state.step_count,
+             action_type=action.action_type,
+             method=action.method,
+             parameters=action.parameters,
+             output_summary=output.summary,
+             output_type=output.output_type,
+             success=output.success,
+             quality_score=float(output.quality_score),
+             cost_musd=float(cost["musd"]),
+             luminosity_cost_fb=float(cost["luminosity_fb"]),
+             time_cost_days=float(cost["days"]),
+         )
+         self._history.append(record)
+
+         step_reward = compute_step_reward(
+             action=action,
+             output=output,
+             state_before=prev_state,
+             state_after=self._latent,
+             rule_result=rule_result,
+             weights=self.reward_weights,
+         )
+
+         self._state.cumulative_reward += step_reward.reward
+         self._state.step_count += 1
+
+         terminal_now = (
+             action.action_type == ActionType.SUBMIT_DISCOVERY_CLAIM
+             and rule_result.allowed
+         )
+         time_up = (
+             self._state.step_count >= self.max_steps
+             or self._latent.resources.budget_exhausted
+             or self._latent.resources.time_exhausted
+         )
+
+         terminal_reward_value = 0.0
+         if terminal_now:
+             claim = self._claim_from_action(action)
+             term = compute_terminal_reward(
+                 state=self._latent,
+                 claim=claim,
+                 weights=self.reward_weights,
+             )
+             terminal_reward_value = term.reward
+             self._state.cumulative_reward += terminal_reward_value
+             self._state.terminal_reward = terminal_reward_value
+             self._state.discovered = term.discovered
+             self._state.correct_mass = term.correct_mass
+             self._state.correct_channel = term.correct_channel
+             self._state.correct_spin = term.correct_spin
+
+         done = terminal_now or time_up
+         if done:
+             self._state.episode_done = True
+
+         observation = self._build_observation(
+             latest_output=output,
+             done=done,
+             reward=step_reward.reward + terminal_reward_value,
+             step_breakdown=step_reward.breakdown.components,
+             rule_violations=[
+                 *(v.value for v in rule_result.violations),
+                 *(v.value for v in rule_result.soft_violations),
+             ],
+         )
+         return observation
+
+     # ── Helpers ────────────────────────────────────────────────────────
+
+     def _claim_from_action(self, action: ExperimentAction) -> DiscoveryClaim:
+         raw = action.parameters.get("claim") or {}
+         try:
+             return DiscoveryClaim(**raw)
+         except Exception as exc:  # pragma: no cover - defensive
+             logger.warning("Malformed claim, defaulting: %s", exc)
+             return DiscoveryClaim()
+
+     def _build_terminal_observation(self, reason: str) -> CollisionObservation:
+         obs = self._build_observation(
+             latest_output=IntermediateOutput(
+                 output_type=OutputType.FAILURE_REPORT,
+                 step_index=self._state.step_count,
+                 success=False,
+                 summary=reason,
+             ),
+             done=True,
+             reward=0.0,
+             step_breakdown={},
+             rule_violations=["episode_terminated"],
+         )
+         return obs
+
+     def _build_observation(
+         self,
+         *,
+         latest_output: Optional[IntermediateOutput],
+         done: bool,
+         reward: float,
+         step_breakdown: dict,
+         rule_violations: list,
+     ) -> CollisionObservation:
+         assert self._latent is not None
+         assert self._task is not None
+
+         res = self._latent.resources
+         usage = ResourceUsage(
+             budget_used_musd=res.budget_used_musd,
+             budget_remaining_musd=res.budget_remaining,
+             luminosity_used_fb=res.luminosity_used_fb,
+             luminosity_remaining_fb=res.luminosity_remaining,
+             time_used_days=res.time_used_days,
+             time_remaining_days=res.time_remaining,
+             compute_hours_used=res.compute_hours_used,
+         )
+
+         obs = CollisionObservation(
+             done=done,
+             reward=float(reward),
+             task=self._task,
+             step_index=self._state.step_count,
+             pipeline_history=list(self._history),
+             available_channels=self._task.available_channels,
+             available_triggers=self._task.available_triggers,
+             available_tools=self._task.available_tools,
+             resource_usage=usage,
+             latest_output=latest_output,
+             all_outputs=list(self._all_outputs),
+             candidate_masses_gev=list(self._latent.candidate_masses_gev),
+             candidate_significances=list(self._latent.candidate_significances),
+             selected_channel=self._latent.selected_channel,
+             selected_beam_energy=self._latent.selected_beam_energy,
+             cumulative_significance=float(
+                 self._latent.progress.best_significance_sigma or 0.0
+             ),
+             uncertainty_summary={
+                 "energy_scale_unc_gev": self._latent.detector.energy_scale_uncertainty,
+                 "luminosity_unc": self._latent.detector.luminosity_uncertainty,
+                 "resolution_gev": self._latent.detector.detector_resolution_gev,
+             },
+             rule_violations=rule_violations,
+             step_reward_breakdown=dict(step_breakdown),
+         )
+         return obs
+
+     # ── Convenience for diagnostics ────────────────────────────────────
+
+     def hidden_truth(self) -> Optional[dict]:
+         """Reveal the hidden particle (debug / evaluation only)."""
+         if self._latent is None:
+             return None
+         return self._latent.particle.model_dump()
+
+
+ __all__ = [
+     "CernState",
+     "CERNCollisionEnvironment",
+     "AGENT_ENVIRONMENT_RULES",
+     "build_agent_system_prompt",
+ ]
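The environment is usable fully in-process (no HTTP layer), which is how the baselines run; a minimal episode loop, assuming the repo root is on `PYTHONPATH`:

```python
from scripts.baseline_agents import HeuristicAgent
from server.environment import CERNCollisionEnvironment

env = CERNCollisionEnvironment(max_steps=40)
obs = env.reset(seed=7, difficulty="easy")

agent = HeuristicAgent()
agent.reset()
while not obs.done:
    obs = env.step(agent.act(obs))

print(env.state.cumulative_reward, env.state.discovered)
print(env.hidden_truth())  # debug/eval only: reveals the latent particle
```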
server/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ openenv-core[core]>=0.2.0
+ numpy>=1.24.0
+ scipy>=1.10.0
+ pydantic>=2.0.0
+ fastapi>=0.110.0
+ uvicorn>=0.27.0
server/rewards/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """Reward components for CERNenv."""
+
+ from .reward_function import (
+     RewardBreakdown,
+     RewardWeights,
+     StepReward,
+     TerminalReward,
+     compute_step_reward,
+     compute_terminal_reward,
+ )
+
+ __all__ = [
+     "RewardBreakdown",
+     "RewardWeights",
+     "StepReward",
+     "TerminalReward",
+     "compute_step_reward",
+     "compute_terminal_reward",
+ ]
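The weights re-exported here are plain dataclasses, so reward shaping can be tuned per run without touching the environment; a sketch using field names from `RewardWeights` below:

```python
# Make overconfident wrong claims costlier and raise the terminal stakes
# relative to per-step shaping.
from server.environment import CERNCollisionEnvironment
from server.rewards import RewardWeights

weights = RewardWeights(terminal_scale=6.0, overconfident_wrong_penalty=5.0)
env = CERNCollisionEnvironment(max_steps=40, reward_weights=weights)
```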
server/rewards/reward_function.py ADDED
@@ -0,0 +1,283 @@
+ """Decomposed reward function.
+
+ Two stages:
+ 1. **Per-step reward** ``compute_step_reward``: shapes behaviour with small
+    incentives (progress, evidence quality, valid prerequisites) and
+    penalties (rule violations, repeated work, wasted resources).
+ 2. **Terminal reward** ``compute_terminal_reward``: graded only when the
+    agent submits a discovery claim or runs out of resources. Compares the
+    submitted claim against the hidden ``LatentParticle`` truth.
+
+ The terminal reward is intentionally dominant so the policy must care about
+ the *correct* discovery, not just looking busy.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional
+
+ import numpy as np
+
+ from models import (
+     ActionType,
+     DiscoveryClaim,
+     ExperimentAction,
+     IntermediateOutput,
+ )
+
+ from server.rules.engine import RuleResult, ViolationCode
+ from server.simulator.latent_state import FullLatentState
+
+
+ # ── Configuration ────────────────────────────────────────────────────────
+
+
+ @dataclass
+ class RewardWeights:
+     # ── per-step shaping ────────────────────────────────────────
+     valid_action: float = 0.05
+     progress_milestone: float = 0.25
+     evidence_quality: float = 0.20
+     tool_fit: float = 0.10
+     soft_violation: float = -0.05
+     hard_violation: float = -0.50
+     redundancy: float = -0.10
+     resource_overspend: float = -0.30
+     failure: float = -0.30
+
+     # ── terminal grading ────────────────────────────────────────
+     terminal_scale: float = 5.0  # multiplied with the convex sum below
+
+     mass_calibration: float = 0.30
+     significance_quality: float = 0.20
+     channel_correctness: float = 0.20
+     spin_correctness: float = 0.10
+     width_calibration: float = 0.05
+     confidence_calibration: float = 0.10
+     efficiency_bonus: float = 0.05
+
+     overconfident_wrong_penalty: float = 4.0  # subtracted from terminal
+
+
+ # ── Outputs ──────────────────────────────────────────────────────────────
+
+
+ @dataclass
+ class RewardBreakdown:
+     components: Dict[str, float] = field(default_factory=dict)
+     total: float = 0.0
+
+     def add(self, key: str, value: float) -> None:
+         self.components[key] = self.components.get(key, 0.0) + value
+         self.total += value
+
+
+ @dataclass
+ class StepReward:
+     reward: float
+     breakdown: RewardBreakdown
+
+
+ @dataclass
+ class TerminalReward:
+     reward: float
+     breakdown: RewardBreakdown
+     discovered: bool
+     correct_mass: bool
+     correct_channel: bool
+     correct_spin: bool
+
+
+ # ── Per-step ─────────────────────────────────────────────────────────────
+
+
+ _PROGRESS_FLAGS = [
+     "beam_configured",
+     "luminosity_allocated",
+     "trigger_set",
+     "collisions_collected",
+     "channel_selected",
+     "tracks_reconstructed",
+     "detector_calibrated",
+     "invariant_mass_built",
+     "background_subtracted",
+     "resonance_fitted",
+     "significance_estimated",
+ ]
+
+
+ def _milestone_progress(state_before: FullLatentState, state_after: FullLatentState) -> int:
+     """Number of new progress milestones unlocked this step."""
+     delta = 0
+     for flag in _PROGRESS_FLAGS:
+         was = getattr(state_before.progress, flag)
+         now = getattr(state_after.progress, flag)
+         if now and not was:
+             delta += 1
+     return delta
+
+
+ def compute_step_reward(
+     *,
+     action: ExperimentAction,
+     output: IntermediateOutput,
+     state_before: FullLatentState,
+     state_after: FullLatentState,
+     rule_result: RuleResult,
+     weights: RewardWeights = RewardWeights(),
+ ) -> StepReward:
+     breakdown = RewardBreakdown()
+
+     if rule_result.allowed and output.success:
+         breakdown.add("valid_action", weights.valid_action)
+     if not output.success:
+         breakdown.add("failure", weights.failure)
+
+     # progress
+     new_milestones = _milestone_progress(state_before, state_after)
+     if new_milestones > 0:
+         breakdown.add("progress", weights.progress_milestone * new_milestones)
+
+     # evidence quality
+     if output.success:
+         breakdown.add("evidence_quality", weights.evidence_quality * float(output.quality_score))
+
+     # tool fit (named method exists in the recommended toolset)
+     if action.method:
+         breakdown.add("tool_fit", weights.tool_fit * 0.5)
+
+     # rule penalties
+     if rule_result.violations:
+         breakdown.add("hard_violation", weights.hard_violation * len(rule_result.violations))
+     if rule_result.soft_violations:
+         soft_redundant = sum(1 for v in rule_result.soft_violations if v == ViolationCode.REDUNDANT)
+         soft_other = len(rule_result.soft_violations) - soft_redundant
+         if soft_redundant:
+             breakdown.add("redundancy", weights.redundancy * soft_redundant)
+         if soft_other:
+             breakdown.add("soft_violation", weights.soft_violation * soft_other)
+
+     # resource overspend
+     res = state_after.resources
+     if res.budget_used_musd > res.budget_total_musd:
+         breakdown.add("budget_overspend", weights.resource_overspend)
+     if res.luminosity_used_fb > res.luminosity_total_fb:
+         breakdown.add("lumi_overspend", weights.resource_overspend)
+     if res.time_used_days > res.time_limit_days:
+         breakdown.add("time_overspend", weights.resource_overspend)
+
+     return StepReward(reward=float(breakdown.total), breakdown=breakdown)
+
+
+ # ── Terminal grading ─────────────────────────────────────────────────────
+
+
+ def _mass_score(true_mass: float, claim_mass: Optional[float], unc: Optional[float]) -> float:
+     """1.0 within the tolerance; decays linearly to 0 by 5× the tolerance
+     (tolerance = max of 1 GeV, 1% of the true mass, and the claimed unc)."""
+     if claim_mass is None or true_mass <= 0:
+         return 0.0
+     err = abs(claim_mass - true_mass)
+     # Tolerance: max(1.0 GeV, 1% of true mass, claimed unc)
+     tol = max(1.0, 0.01 * true_mass)
+     if unc is not None and unc > 0:
+         tol = max(tol, float(unc))
+     if err <= tol:
+         return 1.0
+     if err >= 5 * tol:
+         return 0.0
+     return float(np.clip(1.0 - (err - tol) / (4 * tol), 0.0, 1.0))
+
+
+ def _significance_score(state: FullLatentState, claim_sigma: Optional[float]) -> float:
+     """High score when claimed σ matches measured σ and is ≥ 5."""
+     measured = state.progress.best_significance_sigma or 0.0
+     if claim_sigma is None:
+         return 0.0
+     over_claim = max(0.0, claim_sigma - measured)
+     base = float(np.clip(measured / 5.0, 0.0, 1.0))
+     penalty = float(np.clip(over_claim / 3.0, 0.0, 1.0))
+     return float(np.clip(base - 0.5 * penalty, 0.0, 1.0))
+
+
+ def _confidence_calibration(claim_conf: float, mass_score: float, channel_correct: bool) -> float:
+     """Reward agents whose confidence tracks their actual accuracy."""
+     actual = 0.5 * mass_score + 0.5 * (1.0 if channel_correct else 0.0)
+     err = abs(actual - claim_conf)
+     return float(np.clip(1.0 - err, 0.0, 1.0))
+
+
+ def _efficiency_bonus(state: FullLatentState) -> float:
+     """Reward leftover budget (encourages succinct experiments)."""
+     res = state.resources
+     score = 0.0
+     score += np.clip(res.budget_remaining / res.budget_total_musd, 0.0, 1.0)
+     score += np.clip(res.luminosity_remaining / res.luminosity_total_fb, 0.0, 1.0)
+     score += np.clip(res.time_remaining / res.time_limit_days, 0.0, 1.0)
+     return float(score / 3.0)
+
+
+ def compute_terminal_reward(
+     *,
+     state: FullLatentState,
+     claim: DiscoveryClaim,
+     weights: RewardWeights = RewardWeights(),
+ ) -> TerminalReward:
+     breakdown = RewardBreakdown()
+     truth = state.particle
+
+     mass_score = _mass_score(truth.mass_gev, claim.mass_estimate_gev, claim.mass_uncertainty_gev)
+     breakdown.add("mass_calibration", weights.mass_calibration * mass_score)
+
+     sig_score = _significance_score(state, claim.significance_sigma)
+     breakdown.add("significance_quality", weights.significance_quality * sig_score)
+
+     channel_ok = claim.decay_channel == truth.primary_channel
+     breakdown.add("channel_correctness", weights.channel_correctness * (1.0 if channel_ok else 0.0))
+
+     spin_ok = claim.spin_hypothesis is not None and claim.spin_hypothesis == truth.spin
+     breakdown.add("spin_correctness", weights.spin_correctness * (1.0 if spin_ok else 0.0))
+
+     width_score = 0.0
+     if claim.width_estimate_gev is not None and truth.width_gev > 0:
243
+ rel = abs(claim.width_estimate_gev - truth.width_gev) / max(truth.width_gev, 1e-3)
244
+ width_score = float(np.clip(1.0 - rel, 0.0, 1.0))
245
+ breakdown.add("width_calibration", weights.width_calibration * width_score)
246
+
247
+ conf_score = _confidence_calibration(claim.confidence, mass_score, channel_ok)
248
+ breakdown.add("confidence_calibration", weights.confidence_calibration * conf_score)
249
+
250
+ eff_score = _efficiency_bonus(state)
251
+ breakdown.add("efficiency_bonus", weights.efficiency_bonus * eff_score)
252
+
253
+ discovered = (
254
+ mass_score >= 0.5
255
+ and channel_ok
256
+ and (claim.significance_sigma or 0.0) >= 4.5
257
+ )
258
+
259
+ raw = breakdown.total * weights.terminal_scale
260
+
261
+ # Overconfident-wrong penalty: high confidence but wrong channel & far mass
262
+ if claim.confidence >= 0.8 and (mass_score < 0.2 or not channel_ok):
263
+ raw -= weights.overconfident_wrong_penalty
264
+ breakdown.add("overconfident_wrong", -weights.overconfident_wrong_penalty)
265
+
266
+ return TerminalReward(
267
+ reward=float(raw),
268
+ breakdown=breakdown,
269
+ discovered=discovered,
270
+ correct_mass=mass_score >= 0.5,
271
+ correct_channel=channel_ok,
272
+ correct_spin=spin_ok,
273
+ )
274
+
275
+
276
+ __all__ = [
277
+ "RewardBreakdown",
278
+ "RewardWeights",
279
+ "StepReward",
280
+ "TerminalReward",
281
+ "compute_step_reward",
282
+ "compute_terminal_reward",
283
+ ]
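
Because the seven terminal component weights above form a convex combination (0.30 + 0.20 + 0.20 + 0.10 + 0.05 + 0.10 + 0.05 = 1.00), the maximum terminal reward before penalties is terminal_scale × 1.0 = 5.0. A minimal grading sketch follows; it assumes ``DiscoveryClaim`` in models.py accepts the field names below as keyword arguments (they are inferred from the attribute reads in compute_terminal_reward, since models.py is not part of this hunk):

    from models import DiscoveryClaim
    from server.rewards.reward_function import compute_terminal_reward
    from server.simulator.latent_state import FullLatentState

    state = FullLatentState()                      # default truth: 125 GeV diphoton scalar
    state.progress.best_significance_sigma = 5.2   # pretend significance was estimated

    claim = DiscoveryClaim(                        # field names assumed, see lead-in
        mass_estimate_gev=125.4,
        mass_uncertainty_gev=0.6,
        width_estimate_gev=0.005,
        significance_sigma=5.1,
        decay_channel="diphoton",
        spin_hypothesis=0,
        confidence=0.9,
    )
    result = compute_terminal_reward(state=state, claim=claim)
    print(result.discovered, round(result.reward, 2), result.breakdown.components)
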
server/rules/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Rules engine: prerequisites, resources, redundancy, claim validity."""
2
+
3
+ from .engine import RuleResult, RulesEngine, ViolationCode
4
+
5
+ __all__ = ["RuleResult", "RulesEngine", "ViolationCode"]
server/rules/engine.py ADDED
@@ -0,0 +1,203 @@
+ """RulesEngine for CERNenv.
2
+
3
+ Validates an incoming ``ExperimentAction`` against the current latent state
4
+ *before* it is executed. Rule violations are reported back as warnings on the
5
+ observation and feed into the per-step penalty in the reward function.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from enum import Enum
12
+ from typing import List, Optional
13
+
14
+ from models import (
15
+ ActionType,
16
+ DetectorChannel,
17
+ ExperimentAction,
18
+ TriggerType,
19
+ )
20
+
21
+ from server.simulator.latent_state import FullLatentState
22
+
23
+
24
+ class ViolationCode(str, Enum):
25
+ PREREQ_MISSING = "prerequisite_missing"
26
+ BUDGET_EXHAUSTED = "budget_exhausted"
27
+ LUMI_EXHAUSTED = "luminosity_exhausted"
28
+ TIME_EXHAUSTED = "time_exhausted"
29
+ REDUNDANT = "redundant"
30
+ INVALID_PARAMS = "invalid_parameters"
31
+ INVALID_CLAIM = "invalid_claim"
32
+ CHANNEL_MISMATCH = "channel_mismatch"
33
+ OUT_OF_WINDOW = "out_of_search_window"
34
+
35
+
36
+ @dataclass
37
+ class RuleResult:
38
+ allowed: bool
39
+ violations: List[ViolationCode] = field(default_factory=list)
40
+ messages: List[str] = field(default_factory=list)
41
+ soft_violations: List[ViolationCode] = field(default_factory=list)
42
+
43
+ def add(self, code: ViolationCode, msg: str, soft: bool = False) -> None:
44
+ self.messages.append(msg)
45
+ if soft:
46
+ self.soft_violations.append(code)
47
+ else:
48
+ self.violations.append(code)
49
+ self.allowed = False
50
+
+
+ class RulesEngine:
+     """Stateless validator (state is passed in)."""
+
+     def __init__(
+         self,
+         mass_search_window_gev: tuple[float, float] = (50.0, 1000.0),
+     ) -> None:
+         self.mass_search_window_gev = mass_search_window_gev
+
+     # ── Public API ─────────────────────────────────────────────────────
+
+     def validate(
+         self,
+         action: ExperimentAction,
+         state: FullLatentState,
+     ) -> RuleResult:
+         result = RuleResult(allowed=True)
+
+         # ── resource gating (hard) ────────────────────────────────
+         if state.resources.budget_exhausted:
+             result.add(ViolationCode.BUDGET_EXHAUSTED, "Budget fully spent.")
+         if state.resources.time_exhausted:
+             result.add(ViolationCode.TIME_EXHAUSTED, "Time budget exhausted.")
+         # luminosity exhaustion only blocks DAQ-style actions
+         if (
+             state.resources.luminosity_exhausted
+             and action.action_type in {
+                 ActionType.ALLOCATE_LUMINOSITY,
+                 ActionType.COLLECT_COLLISIONS,
+             }
+         ):
+             result.add(ViolationCode.LUMI_EXHAUSTED, "Integrated luminosity budget spent.")
+
+         if not result.allowed:
+             return result
+
+         a = action.action_type
+         prog = state.progress
+
+         # ── prerequisites ──────────────────────────────────────────
+         if a == ActionType.COLLECT_COLLISIONS:
+             if not prog.beam_configured:
+                 result.add(ViolationCode.PREREQ_MISSING, "Configure the beam first.")
+             if not prog.luminosity_allocated:
+                 result.add(ViolationCode.PREREQ_MISSING, "Allocate luminosity first.")
+             if not prog.trigger_set:
+                 result.add(ViolationCode.PREREQ_MISSING, "Set a trigger first.")
+             if not state.selected_channel:
+                 result.add(ViolationCode.PREREQ_MISSING, "Select a decay channel first.")
+
+         elif a == ActionType.BUILD_INVARIANT_MASS:
+             if not prog.collisions_collected:
+                 result.add(ViolationCode.PREREQ_MISSING, "Collect collisions before building histograms.")
+             if not prog.tracks_reconstructed:
+                 result.add(ViolationCode.PREREQ_MISSING, "Reconstruct tracks before building histograms.")
+
+         elif a == ActionType.SUBTRACT_BACKGROUND:
+             if not prog.invariant_mass_built:
+                 result.add(ViolationCode.PREREQ_MISSING, "Build invariant-mass histogram first.")
+
+         elif a == ActionType.FIT_RESONANCE:
+             if not prog.invariant_mass_built:
+                 result.add(ViolationCode.PREREQ_MISSING, "Build the histogram before fitting.")
+
+         elif a == ActionType.MEASURE_ANGULAR:
+             if not (prog.resonance_fitted or prog.bump_scanned):
+                 result.add(
+                     ViolationCode.PREREQ_MISSING,
+                     "Identify a peak (fit or bump scan) before angular analysis.",
+                 )
+
+         elif a == ActionType.ESTIMATE_SIGNIFICANCE:
+             if not prog.collisions_collected:
+                 result.add(ViolationCode.PREREQ_MISSING, "Collect data before significance estimation.")
+
+         elif a == ActionType.SUBMIT_DISCOVERY_CLAIM:
+             if not prog.resonance_fitted and not prog.bump_scanned:
+                 result.add(ViolationCode.PREREQ_MISSING, "No fitted resonance or bump scan; cannot claim a discovery.")
+             if not prog.significance_estimated:
+                 result.add(ViolationCode.PREREQ_MISSING, "Estimate significance before submitting a claim.")
+
+         # ── parameter & search-window validation (soft) ────────────
+         if a == ActionType.SELECT_CHANNEL:
+             channel = action.parameters.get("channel")
+             if channel:
+                 try:
+                     DetectorChannel(channel)
+                 except ValueError:
+                     result.add(ViolationCode.INVALID_PARAMS, f"Unknown channel '{channel}'.", soft=True)
+
+         if a == ActionType.SET_TRIGGER:
+             trig = action.parameters.get("trigger")
+             if trig:
+                 try:
+                     TriggerType(trig)
+                 except ValueError:
+                     result.add(ViolationCode.INVALID_PARAMS, f"Unknown trigger '{trig}'.", soft=True)
+
+         if a == ActionType.BUILD_INVARIANT_MASS:
+             window = action.parameters.get("mass_window_gev")
+             if window and len(window) == 2:
+                 lo, hi = float(window[0]), float(window[1])
+                 if hi <= lo:
+                     result.add(
+                         ViolationCode.INVALID_PARAMS,
+                         f"Mass window [{lo}, {hi}] is empty (upper edge <= lower edge).",
+                         soft=True,
+                     )
+                 if lo > self.mass_search_window_gev[1] or hi < self.mass_search_window_gev[0]:
+                     result.add(
+                         ViolationCode.OUT_OF_WINDOW,
+                         f"Histogram window [{lo}, {hi}] is outside the task search window "
+                         f"{self.mass_search_window_gev}.",
+                         soft=True,
+                     )
+
+         # ── redundancy (soft) ─────────────────────────────────────
+         if a == ActionType.CONFIGURE_BEAM and prog.beam_configured:
+             result.add(ViolationCode.REDUNDANT, "Beam already configured; reconfiguring wastes budget.", soft=True)
+         if a == ActionType.SELECT_CHANNEL and prog.channel_selected:
+             result.add(ViolationCode.REDUNDANT, "Channel already selected.", soft=True)
+         if a == ActionType.RECONSTRUCT_TRACKS and prog.tracks_reconstructed:
+             result.add(ViolationCode.REDUNDANT, "Tracks already reconstructed.", soft=True)
+         if a == ActionType.CALIBRATE_DETECTOR and prog.detector_calibrated:
+             result.add(ViolationCode.REDUNDANT, "Detector already calibrated.", soft=True)
+
+         # ── claim sanity ──────────────────────────────────────────
+         if a == ActionType.SUBMIT_DISCOVERY_CLAIM:
+             claim = action.parameters.get("claim") or {}
+             mass = claim.get("mass_estimate_gev")
+             if mass is None:
+                 result.add(ViolationCode.INVALID_CLAIM, "Claim missing mass estimate.")
+             else:
+                 try:
+                     m = float(mass)
+                 except Exception:
+                     result.add(ViolationCode.INVALID_CLAIM, "Claim mass is not numeric.")
+                 else:
+                     lo, hi = self.mass_search_window_gev
+                     if not (lo <= m <= hi):
+                         result.add(
+                             ViolationCode.INVALID_CLAIM,
+                             f"Claim mass {m} outside search window [{lo}, {hi}].",
+                             soft=True,
+                         )
+             if claim.get("significance_sigma") is None:
+                 result.add(ViolationCode.INVALID_CLAIM, "Claim missing significance.", soft=True)
+
+         return result
+
+
+ __all__ = ["RuleResult", "RulesEngine", "ViolationCode"]
server/simulator/__init__.py ADDED
@@ -0,0 +1,31 @@
+ """Simulator: latent particle truth, noise model, output generation."""
2
+
3
+ from .latent_state import (
4
+ DetectorState,
5
+ ExperimentProgress,
6
+ FullLatentState,
7
+ LatentParticle,
8
+ ResourceState,
9
+ )
10
+ from .noise import NoiseModel
11
+ from .output_generator import OutputGenerator
12
+ from .transition import (
13
+ ACTION_COSTS,
14
+ TransitionEngine,
15
+ TransitionResult,
16
+ compute_action_cost,
17
+ )
18
+
19
+ __all__ = [
20
+ "ACTION_COSTS",
21
+ "DetectorState",
22
+ "ExperimentProgress",
23
+ "FullLatentState",
24
+ "LatentParticle",
25
+ "NoiseModel",
26
+ "OutputGenerator",
27
+ "ResourceState",
28
+ "TransitionEngine",
29
+ "TransitionResult",
30
+ "compute_action_cost",
31
+ ]
server/simulator/latent_state.py ADDED
@@ -0,0 +1,171 @@
+ """Latent (hidden) state of the LHC simulator.
2
+
3
+ The agent never sees these structures. They define the ground-truth particle
4
+ properties, detector imperfections, experiment progress flags, and the live
5
+ resource budget.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Dict, List, Optional
11
+
12
+ from pydantic import BaseModel, Field
13
+
14
+
15
+ # ── Particle truth ────────────────────────────────────────────────────────
16
+
17
+
18
+ class LatentParticle(BaseModel):
19
+ """The hidden mystery particle that the agent must discover.
20
+
21
+ Defines the true mass, width, decay branching ratios, spin, parity,
22
+ production cross-section, and dominant decay channel. The agent has to
23
+ recover these values from noisy observations.
24
+ """
25
+
26
+ name: str = "X"
27
+ mass_gev: float = 125.0
28
+ width_gev: float = 0.004
29
+ spin: int = 0 # 0, 1, or 2
30
+ parity: str = "+" # "+" or "-"
31
+ cross_section_fb: float = 50.0 # signal cross-section in femtobarns
32
+ decay_branching: Dict[str, float] = Field(
33
+ default_factory=lambda: {
34
+ "diphoton": 0.0023,
35
+ "dilepton_ee": 0.00003,
36
+ "dilepton_mumu": 0.00022,
37
+ "four_lepton": 0.000125,
38
+ "bb": 0.58,
39
+ "dijet": 0.30,
40
+ },
41
+ description="Branching ratio (BR) per decay channel, sums to ~1.",
42
+ )
43
+ primary_channel: str = "diphoton"
44
+
45
+
46
+ # ── Detector & accelerator state ─────────────────────────────────────────
47
+
48
+
49
+ class DetectorState(BaseModel):
50
+ """Hidden detector and accelerator parameters that shape noise.
51
+
52
+ These influence resolution, trigger efficiency, pileup, and systematic
53
+ uncertainties applied to every observation.
54
+ """
55
+
56
+ detector_resolution_gev: float = 1.5 # absolute mass resolution σ_m
57
+ pileup_mu: float = 30.0 # average pileup interactions per crossing
58
+ trigger_efficiency: float = 0.85
59
+ luminosity_uncertainty: float = 0.025 # 2.5% relative uncertainty
60
+ energy_scale_offset: float = 0.0 # systematic shift in GeV
61
+ energy_scale_uncertainty: float = 0.3 # σ on the scale
62
+ background_shape_alpha: float = -2.5 # exponent of background ~ 1/m^|α|
63
+ qcd_background_strength: float = 1.0 # scale factor for hadronic background
64
+ detector_calibrated: bool = False
65
+ tracker_aligned: bool = False
66
+ # Channel-dependent reconstruction efficiency
67
+ channel_efficiency: Dict[str, float] = Field(
68
+ default_factory=lambda: {
69
+ "diphoton": 0.45,
70
+ "dilepton_ee": 0.55,
71
+ "dilepton_mumu": 0.70,
72
+ "four_lepton": 0.40,
73
+ "dijet": 0.80,
74
+ "bb": 0.50,
75
+ }
76
+ )
77
+
78
+
79
+ # ── Experiment progress flags ────────────────────────────────────────────
80
+
81
+
82
+ class ExperimentProgress(BaseModel):
83
+ """Boolean milestones used by rules and reward shaping."""
84
+
85
+ beam_configured: bool = False
86
+ luminosity_allocated: bool = False
87
+ trigger_set: bool = False
88
+ collisions_collected: bool = False
89
+ detector_calibrated: bool = False
90
+ tracks_reconstructed: bool = False
91
+ channel_selected: bool = False
92
+ invariant_mass_built: bool = False
93
+ background_subtracted: bool = False
94
+ resonance_fitted: bool = False
95
+ bump_scanned: bool = False
96
+ angular_measured: bool = False
97
+ significance_estimated: bool = False
98
+ systematics_requested: bool = False
99
+ theory_review_requested: bool = False
100
+ claim_submitted: bool = False
101
+
102
+ n_events_collected: int = 0
103
+ n_signal_candidates: int = 0
104
+ n_background_estimate: int = 0
105
+ best_fit_mass_gev: Optional[float] = None
106
+ best_fit_width_gev: Optional[float] = None
107
+ best_significance_sigma: Optional[float] = None
108
+ best_channel: Optional[str] = None
109
+ best_beam_energy: Optional[str] = None
110
+
111
+
112
+ # ── Resources ─────────────────────────────────────────────────────────────
113
+
114
+
115
+ class ResourceState(BaseModel):
116
+ """Live resource accounting (superset of the agent-visible ResourceUsage)."""
117
+
118
+ budget_total_musd: float = 100.0
119
+ budget_used_musd: float = 0.0
120
+ luminosity_total_fb: float = 300.0
121
+ luminosity_used_fb: float = 0.0
122
+ time_limit_days: float = 365.0
123
+ time_used_days: float = 0.0
124
+ compute_hours_used: float = 0.0
125
+
126
+ @property
127
+ def budget_remaining(self) -> float:
128
+ return max(0.0, self.budget_total_musd - self.budget_used_musd)
129
+
130
+ @property
131
+ def luminosity_remaining(self) -> float:
132
+ return max(0.0, self.luminosity_total_fb - self.luminosity_used_fb)
133
+
134
+ @property
135
+ def time_remaining(self) -> float:
136
+ return max(0.0, self.time_limit_days - self.time_used_days)
137
+
138
+ @property
139
+ def budget_exhausted(self) -> bool:
140
+ return self.budget_remaining <= 0
141
+
142
+ @property
143
+ def luminosity_exhausted(self) -> bool:
144
+ return self.luminosity_remaining <= 0
145
+
146
+ @property
147
+ def time_exhausted(self) -> bool:
148
+ return self.time_remaining <= 0
149
+
150
+
151
+ # ── Aggregate hidden state ───────────────────────────────────────────────
152
+
153
+
154
+ class FullLatentState(BaseModel):
155
+ """Complete hidden state of the simulated LHC analysis world."""
156
+
157
+ particle: LatentParticle = Field(default_factory=LatentParticle)
158
+ detector: DetectorState = Field(default_factory=DetectorState)
159
+ progress: ExperimentProgress = Field(default_factory=ExperimentProgress)
160
+ resources: ResourceState = Field(default_factory=ResourceState)
161
+
162
+ selected_channel: Optional[str] = None
163
+ selected_beam_energy: Optional[str] = None
164
+ selected_trigger: Optional[str] = None
165
+
166
+ candidate_masses_gev: List[float] = Field(default_factory=list)
167
+ candidate_significances: List[float] = Field(default_factory=list)
168
+
169
+ hidden_failure_conditions: List[str] = Field(default_factory=list)
170
+ rng_seed: int = 42
171
+ step_count: int = 0
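
The derived ``ResourceState`` properties clamp at zero and drive the hard gating in the rules engine. A runnable check against the defaults above:

    from server.simulator.latent_state import ResourceState

    res = ResourceState()                 # 100 M$, 300 fb^-1, 365 days
    res.budget_used_musd += 40.0
    res.luminosity_used_fb += 300.0       # spend the full luminosity budget

    print(res.budget_remaining)           # 60.0
    print(res.luminosity_exhausted)       # True
    print(res.time_exhausted)             # False
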
server/simulator/noise.py ADDED
@@ -0,0 +1,161 @@
+ """Stochastic noise model for the LHC simulator.
2
+
3
+ All randomness is funneled through a single seeded ``numpy.Generator`` so
4
+ episodes are reproducible. The methods are physics-flavoured: Poisson event
5
+ counts, Gaussian-smeared masses, log-normal cross-sections, false discovery
6
+ helpers, and quality degradation.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import List
12
+
13
+ import numpy as np
14
+
15
+
16
+ class NoiseModel:
17
+ """Centralised noise generator for the CERN simulator."""
18
+
19
+ def __init__(self, seed: int = 42):
20
+ self.rng = np.random.default_rng(seed)
21
+
22
+ def reseed(self, seed: int) -> None:
23
+ self.rng = np.random.default_rng(seed)
24
+
25
+ # ── counting / Poisson statistics ─────────────────────────────────
26
+
27
+ def poisson(self, lam: float) -> int:
28
+ return int(self.rng.poisson(max(lam, 0.0)))
29
+
30
+ def signal_yield(
31
+ self,
32
+ cross_section_fb: float,
33
+ luminosity_fb: float,
34
+ branching: float,
35
+ efficiency: float,
36
+ trigger_efficiency: float,
37
+ ) -> int:
38
+ """Expected signal events ~ σ × L × BR × ε_reco × ε_trig + Poisson noise.
39
+
40
+ BR = branching ratio of the decay channel.
41
+ ε_reco = channel reconstruction efficiency.
42
+ ε_trig = trigger acceptance.
43
+ """
44
+ mu = cross_section_fb * luminosity_fb * branching * efficiency * trigger_efficiency
45
+ return self.poisson(mu)
46
+
47
+ def background_yield(
48
+ self,
49
+ baseline_per_fb: float,
50
+ luminosity_fb: float,
51
+ qcd_strength: float,
52
+ trigger_efficiency: float,
53
+ ) -> int:
54
+ """Expected background events scale linearly with luminosity."""
55
+ mu = baseline_per_fb * luminosity_fb * qcd_strength * trigger_efficiency
56
+ return self.poisson(mu)
57
+
58
+ # ── mass smearing ──────────────────────────────────────────────────
59
+
60
+ def smear_mass(
61
+ self,
62
+ true_mass_gev: float,
63
+ resolution_gev: float,
64
+ scale_offset_gev: float = 0.0,
65
+ ) -> float:
66
+ return float(self.rng.normal(true_mass_gev + scale_offset_gev, resolution_gev))
67
+
68
+ def fit_mass_estimate(
69
+ self,
70
+ true_mass_gev: float,
71
+ n_signal: int,
72
+ resolution_gev: float,
73
+ scale_offset_gev: float,
74
+ ) -> float:
75
+ """Fitted mass ≈ true mass + Gaussian error scaling like 1/√N_signal."""
76
+ n_eff = max(n_signal, 1)
77
+ sigma = resolution_gev / np.sqrt(n_eff)
78
+ return float(self.rng.normal(true_mass_gev + scale_offset_gev, sigma))
79
+
80
+ def fit_mass_uncertainty(
81
+ self,
82
+ n_signal: int,
83
+ resolution_gev: float,
84
+ ) -> float:
85
+ """Statistical mass uncertainty from a peak with N_signal events."""
86
+ n_eff = max(n_signal, 1)
87
+ return float(resolution_gev / np.sqrt(n_eff))
88
+
89
+ # ── significance ───────────────────────────────────────────────────
90
+
91
+ def asimov_significance(
92
+ self,
93
+ n_signal: int,
94
+ n_background: int,
95
+ nuisance_inflation: float = 0.0,
96
+ ) -> float:
97
+ """Asymptotic Asimov-style significance Z = √(2[(s+b) ln(1+s/b) - s]).
98
+
99
+ A small nuisance_inflation term in [0,1] shrinks Z to mimic systematic
100
+ penalties when calibration / systematics studies are skipped.
101
+ """
102
+ if n_background <= 0:
103
+ return 0.0
104
+ s = float(n_signal)
105
+ b = float(n_background)
106
+ if s <= 0:
107
+ return 0.0
108
+ term = (s + b) * np.log(1.0 + s / b) - s
109
+ z = float(np.sqrt(max(2.0 * term, 0.0)))
110
+ return float(z * (1.0 - nuisance_inflation))
111
+
112
+ # ── helpers ─────────────────────────────────────────────────────────
113
+
114
+ def coin_flip(self, p: float) -> bool:
115
+ return bool(self.rng.random() < p)
116
+
117
+ def jitter(self, mean: float, sigma: float) -> float:
118
+ return float(self.rng.normal(mean, sigma))
119
+
120
+ def quality_degradation(self, base_quality: float, factors: List[float]) -> float:
121
+ q = base_quality
122
+ for f in factors:
123
+ q *= f
124
+ return float(np.clip(q + self.rng.normal(0, 0.02), 0.0, 1.0))
125
+
126
+ def sample_qc_metric(
127
+ self, mean: float, std: float, clip_lo: float = 0.0, clip_hi: float = 1.0
128
+ ) -> float:
129
+ return float(np.clip(self.rng.normal(mean, std), clip_lo, clip_hi))
130
+
131
+ def histogram(
132
+ self,
133
+ n_signal: int,
134
+ n_background: int,
135
+ true_mass_gev: float,
136
+ resolution_gev: float,
137
+ window_lo_gev: float,
138
+ window_hi_gev: float,
139
+ n_bins: int = 40,
140
+ background_alpha: float = -2.5,
141
+ ) -> List[int]:
142
+ """Generate a coarse invariant-mass histogram.
143
+
144
+ Signal is Gaussian around the (smeared) true mass with width
145
+ =resolution; background is a falling power-law shape.
146
+ """
147
+ if window_hi_gev <= window_lo_gev:
148
+ return [0] * n_bins
149
+ edges = np.linspace(window_lo_gev, window_hi_gev, n_bins + 1)
150
+ centers = 0.5 * (edges[:-1] + edges[1:])
151
+
152
+ sig_mu = true_mass_gev
153
+ sig_pdf = np.exp(-0.5 * ((centers - sig_mu) / max(resolution_gev, 1e-3)) ** 2)
154
+ sig_pdf /= max(sig_pdf.sum(), 1e-9)
155
+
156
+ bg_pdf = np.power(np.clip(centers, 1.0, None), background_alpha)
157
+ bg_pdf /= max(bg_pdf.sum(), 1e-9)
158
+
159
+ sig_counts = self.rng.multinomial(max(n_signal, 0), sig_pdf)
160
+ bg_counts = self.rng.multinomial(max(n_background, 0), bg_pdf)
161
+ return (sig_counts + bg_counts).astype(int).tolist()
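
A quick sanity check of the significance helper: for s ≪ b the Asimov formula Z = √(2[(s+b) ln(1+s/b) − s]) approaches the familiar s/√b, and reseeding makes stochastic draws reproducible:

    import numpy as np

    from server.simulator.noise import NoiseModel

    noise = NoiseModel(seed=7)
    z = noise.asimov_significance(n_signal=300, n_background=10_000)
    print(round(z, 2), round(300 / np.sqrt(10_000), 2))   # ~2.99 vs 3.0

    noise.reseed(7)
    first = noise.poisson(50.0)
    noise.reseed(7)
    assert noise.poisson(50.0) == first   # identical draw after reseeding
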
server/simulator/output_generator.py ADDED
@@ -0,0 +1,586 @@
+ """Builds the noisy ``IntermediateOutput`` returned to the agent each step.
2
+
3
+ The OutputGenerator never mutates state; it only inspects the latent state
4
+ plus the action and produces a structured artifact. State changes happen in
5
+ ``TransitionEngine``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ import numpy as np
13
+
14
+ from models import (
15
+ ActionType,
16
+ DetectorChannel,
17
+ ExperimentAction,
18
+ IntermediateOutput,
19
+ OutputType,
20
+ TriggerType,
21
+ )
22
+
23
+ from .latent_state import FullLatentState
24
+ from .noise import NoiseModel
25
+
26
+
27
+ # ── Channel-specific background per fb^-1 (very rough physics-flavoured) ─
28
+ BACKGROUND_PER_FB: Dict[str, float] = {
29
+ "diphoton": 1500.0,
30
+ "dilepton_ee": 8000.0,
31
+ "dilepton_mumu": 9000.0,
32
+ "four_lepton": 80.0,
33
+ "dijet": 250000.0,
34
+ "bb": 50000.0,
35
+ }
36
+
37
+
38
+ # ── Trigger ↔ channel affinity ───────────────────────────────────────────
39
+ TRIGGER_AFFINITY: Dict[str, Dict[str, float]] = {
40
+ "low_pt": {
41
+ "diphoton": 0.5,
42
+ "dilepton_ee": 0.6,
43
+ "dilepton_mumu": 0.6,
44
+ "four_lepton": 0.5,
45
+ "dijet": 0.9,
46
+ "bb": 0.7,
47
+ },
48
+ "high_pt": {
49
+ "diphoton": 0.9,
50
+ "dilepton_ee": 0.8,
51
+ "dilepton_mumu": 0.85,
52
+ "four_lepton": 0.85,
53
+ "dijet": 0.7,
54
+ "bb": 0.55,
55
+ },
56
+ "diphoton_hlt": {
57
+ "diphoton": 1.0,
58
+ "dilepton_ee": 0.05,
59
+ "dilepton_mumu": 0.05,
60
+ "four_lepton": 0.1,
61
+ "dijet": 0.05,
62
+ "bb": 0.05,
63
+ },
64
+ "dilepton_hlt": {
65
+ "diphoton": 0.05,
66
+ "dilepton_ee": 1.0,
67
+ "dilepton_mumu": 1.0,
68
+ "four_lepton": 0.85,
69
+ "dijet": 0.05,
70
+ "bb": 0.05,
71
+ },
72
+ "jet_hlt": {
73
+ "diphoton": 0.1,
74
+ "dilepton_ee": 0.1,
75
+ "dilepton_mumu": 0.1,
76
+ "four_lepton": 0.1,
77
+ "dijet": 1.0,
78
+ "bb": 0.85,
79
+ },
80
+ }
81
+
82
+
83
+ # ── Beam-energy luminosity & cross-section scaling ───────────────────────
84
+ BEAM_SCALING: Dict[str, Dict[str, float]] = {
85
+ "7TeV": {"xsec_scale": 0.45, "cost_per_fb": 0.05, "days_per_fb": 0.6},
86
+ "8TeV": {"xsec_scale": 0.65, "cost_per_fb": 0.08, "days_per_fb": 0.7},
87
+ "13TeV": {"xsec_scale": 1.00, "cost_per_fb": 0.12, "days_per_fb": 0.8},
88
+ "14TeV": {"xsec_scale": 1.15, "cost_per_fb": 0.18, "days_per_fb": 0.9},
89
+ }
90
+
91
+
92
+ def _trigger_efficiency(trigger: Optional[str], channel: Optional[str]) -> float:
93
+ if not trigger or not channel:
94
+ return 0.0
95
+ table = TRIGGER_AFFINITY.get(trigger, {})
96
+ return float(table.get(channel, 0.1))
97
+
98
+
99
+ class OutputGenerator:
100
+ """Translates an action + latent state into a noisy observable artifact."""
101
+
102
+ def __init__(self, noise: NoiseModel):
103
+ self.noise = noise
104
+
105
+ # ── Public API ────────────────────────────────────────────────────
106
+
107
+ def generate(
108
+ self,
109
+ action: ExperimentAction,
110
+ state: FullLatentState,
111
+ step_index: int,
112
+ ) -> IntermediateOutput:
113
+ a = action.action_type
114
+
115
+ if a == ActionType.CONFIGURE_BEAM:
116
+ return self._beam(action, state, step_index)
117
+ if a == ActionType.ALLOCATE_LUMINOSITY:
118
+ return self._luminosity(action, state, step_index)
119
+ if a == ActionType.SET_TRIGGER:
120
+ return self._trigger(action, state, step_index)
121
+ if a == ActionType.COLLECT_COLLISIONS:
122
+ return self._collect(action, state, step_index)
123
+ if a == ActionType.CALIBRATE_DETECTOR:
124
+ return self._calibrate(action, state, step_index)
125
+ if a == ActionType.RECONSTRUCT_TRACKS:
126
+ return self._reconstruct(action, state, step_index)
127
+ if a == ActionType.SELECT_CHANNEL:
128
+ return self._select_channel(action, state, step_index)
129
+ if a == ActionType.BUILD_INVARIANT_MASS:
130
+ return self._invariant_mass(action, state, step_index)
131
+ if a == ActionType.SUBTRACT_BACKGROUND:
132
+ return self._subtract_background(action, state, step_index)
133
+ if a == ActionType.FIT_RESONANCE:
134
+ return self._fit_resonance(action, state, step_index)
135
+ if a == ActionType.SCAN_BUMP:
136
+ return self._scan_bump(action, state, step_index)
137
+ if a == ActionType.MEASURE_ANGULAR:
138
+ return self._measure_angular(action, state, step_index)
139
+ if a == ActionType.ESTIMATE_SIGNIFICANCE:
140
+ return self._estimate_significance(action, state, step_index)
141
+ if a == ActionType.REQUEST_SYSTEMATICS:
142
+ return self._request_systematics(action, state, step_index)
143
+ if a == ActionType.REQUEST_THEORY_REVIEW:
144
+ return self._request_theory(action, state, step_index)
145
+ if a == ActionType.SUBMIT_DISCOVERY_CLAIM:
146
+ return self._submit_claim(action, state, step_index)
147
+
148
+ return self._failure(step_index, f"Unhandled action: {a}")
149
+
150
+ # ── helpers ────────────────────────────────────────────────────────
151
+
152
+ def _failure(self, step_index: int, msg: str) -> IntermediateOutput:
153
+ return IntermediateOutput(
154
+ output_type=OutputType.FAILURE_REPORT,
155
+ step_index=step_index,
156
+ success=False,
157
+ quality_score=0.0,
158
+ summary=msg,
159
+ warnings=[msg],
160
+ )
161
+
162
+ # ── DAQ (Data Acquisition) outputs ────────────────────────────────
163
+
164
+ def _beam(
165
+ self,
166
+ action: ExperimentAction,
167
+ state: FullLatentState,
168
+ step_index: int,
169
+ ) -> IntermediateOutput:
170
+ beam = action.parameters.get("beam_energy") or state.selected_beam_energy or "13TeV"
171
+ scaling = BEAM_SCALING.get(beam, BEAM_SCALING["13TeV"])
172
+ return IntermediateOutput(
173
+ output_type=OutputType.BEAM_CONFIG,
174
+ step_index=step_index,
175
+ success=True,
176
+ quality_score=0.9,
177
+ summary=f"LHC configured at √s={beam}; effective xsec scale={scaling['xsec_scale']:.2f}.",
178
+ data={
179
+ "beam_energy": beam,
180
+ "xsec_scale": scaling["xsec_scale"],
181
+ "cost_per_fb_musd": scaling["cost_per_fb"],
182
+ "days_per_fb": scaling["days_per_fb"],
183
+ },
184
+ )
185
+
186
+ def _luminosity(
187
+ self,
188
+ action: ExperimentAction,
189
+ state: FullLatentState,
190
+ step_index: int,
191
+ ) -> IntermediateOutput:
192
+ requested = float(action.parameters.get("luminosity_fb", 30.0))
193
+ granted = max(0.0, min(requested, state.resources.luminosity_remaining))
194
+ warnings: List[str] = []
195
+ if granted < requested:
196
+ warnings.append(
197
+ f"Luminosity capped: requested {requested:.1f} fb^-1, "
198
+ f"granted {granted:.1f} fb^-1."
199
+ )
200
+ return IntermediateOutput(
201
+ output_type=OutputType.LUMINOSITY_LOG,
202
+ step_index=step_index,
203
+ success=granted > 0,
204
+ quality_score=1.0 if granted > 0 else 0.0,
205
+ summary=f"Allocated {granted:.1f} fb^-1 of integrated luminosity.",
206
+ data={"luminosity_fb": granted, "requested_fb": requested},
207
+ warnings=warnings,
208
+ )
209
+
210
+ def _trigger(
211
+ self,
212
+ action: ExperimentAction,
213
+ state: FullLatentState,
214
+ step_index: int,
215
+ ) -> IntermediateOutput:
216
+ trigger = action.parameters.get("trigger") or state.selected_trigger or "high_pt"
217
+ try:
218
+ TriggerType(trigger)
219
+ except ValueError:
220
+ return self._failure(step_index, f"Unknown trigger: {trigger}")
221
+ eff = state.detector.trigger_efficiency
222
+ return IntermediateOutput(
223
+ output_type=OutputType.TRIGGER_REPORT,
224
+ step_index=step_index,
225
+ success=True,
226
+ quality_score=eff,
227
+ summary=f"Trigger {trigger} armed; ε_trig={eff:.2f}.",
228
+ data={"trigger": trigger, "trigger_efficiency": eff},
229
+ )
230
+
231
+ def _collect(
232
+ self,
233
+ action: ExperimentAction,
234
+ state: FullLatentState,
235
+ step_index: int,
236
+ ) -> IntermediateOutput:
237
+ beam = state.selected_beam_energy or "13TeV"
238
+ scaling = BEAM_SCALING.get(beam, BEAM_SCALING["13TeV"])
239
+ lumi_request = float(action.parameters.get("luminosity_fb", 0.0))
240
+ if lumi_request <= 0:
241
+ lumi_request = max(0.0, state.resources.luminosity_remaining * 0.2)
242
+ lumi = max(0.0, min(lumi_request, state.resources.luminosity_remaining))
243
+ if lumi <= 0:
244
+ return self._failure(step_index, "No luminosity remaining to collect.")
245
+
246
+ channel = state.selected_channel or state.particle.primary_channel
247
+ try:
248
+ DetectorChannel(channel)
249
+ except ValueError:
250
+ return self._failure(step_index, f"Invalid channel: {channel}")
251
+
252
+ trig = state.selected_trigger or "high_pt"
253
+ trig_eff = _trigger_efficiency(trig, channel)
254
+ reco_eff = state.detector.channel_efficiency.get(channel, 0.4)
255
+ if not state.detector.tracker_aligned and channel in {"dilepton_ee", "dilepton_mumu", "four_lepton"}:
256
+ reco_eff *= 0.7
257
+ if not state.detector.detector_calibrated and channel in {"diphoton"}:
258
+ reco_eff *= 0.8
259
+
260
+ br = state.particle.decay_branching.get(channel, 0.0)
261
+ eff_xsec = state.particle.cross_section_fb * scaling["xsec_scale"]
262
+
263
+ n_sig = self.noise.signal_yield(
264
+ cross_section_fb=eff_xsec,
265
+ luminosity_fb=lumi,
266
+ branching=br,
267
+ efficiency=reco_eff,
268
+ trigger_efficiency=trig_eff,
269
+ )
270
+ n_bg = self.noise.background_yield(
271
+ baseline_per_fb=BACKGROUND_PER_FB.get(channel, 1000.0),
272
+ luminosity_fb=lumi,
273
+ qcd_strength=state.detector.qcd_background_strength,
274
+ trigger_efficiency=trig_eff,
275
+ )
276
+
277
+ cost = lumi * scaling["cost_per_fb"]
278
+ days = lumi * scaling["days_per_fb"]
279
+
280
+ return IntermediateOutput(
281
+ output_type=OutputType.COLLISION_BATCH,
282
+ step_index=step_index,
283
+ success=True,
284
+ quality_score=float(np.clip(reco_eff * trig_eff + 0.1, 0.0, 1.0)),
285
+ summary=(
286
+ f"Collected {lumi:.1f} fb^-1 in {channel} with trigger {trig}: "
287
+ f"~{n_sig + n_bg} reconstructed events."
288
+ ),
289
+ data={
290
+ "luminosity_fb": lumi,
291
+ "beam_energy": beam,
292
+ "channel": channel,
293
+ "trigger": trig,
294
+ "n_signal_candidates": int(n_sig),
295
+ "n_background_estimate": int(n_bg),
296
+ "cost_musd": cost,
297
+ "time_days": days,
298
+ "trigger_efficiency": trig_eff,
299
+ "reco_efficiency": reco_eff,
300
+ },
301
+ uncertainty=float(np.clip(0.05 + (1.0 - reco_eff) * 0.2, 0.0, 0.5)),
302
+ )
303
+
304
+ # ── Reconstruction outputs ────────────────────────────────────────
305
+
306
+ def _calibrate(
307
+ self,
308
+ action: ExperimentAction,
309
+ state: FullLatentState,
310
+ step_index: int,
311
+ ) -> IntermediateOutput:
312
+ method = action.method or "ECAL_calibration"
313
+ improvement = self.noise.sample_qc_metric(0.5, 0.1, 0.0, 0.95)
314
+ return IntermediateOutput(
315
+ output_type=OutputType.CALIBRATION_REPORT,
316
+ step_index=step_index,
317
+ success=True,
318
+ quality_score=0.9,
319
+ summary=f"Detector calibrated using {method}; resolution improved by {improvement*100:.1f}%.",
320
+ data={
321
+ "method": method,
322
+ "resolution_improvement": improvement,
323
+ },
324
+ uncertainty=0.05,
325
+ )
326
+
327
+ def _reconstruct(
328
+ self,
329
+ action: ExperimentAction,
330
+ state: FullLatentState,
331
+ step_index: int,
332
+ ) -> IntermediateOutput:
333
+ method = action.method or "Athena"
334
+ return IntermediateOutput(
335
+ output_type=OutputType.RECONSTRUCTION,
336
+ step_index=step_index,
337
+ success=True,
338
+ quality_score=0.85,
339
+ summary=f"Tracks and physics objects reconstructed via {method}.",
340
+ data={"method": method},
341
+ uncertainty=0.05,
342
+ )
343
+
344
+ def _select_channel(
345
+ self,
346
+ action: ExperimentAction,
347
+ state: FullLatentState,
348
+ step_index: int,
349
+ ) -> IntermediateOutput:
350
+ channel = action.parameters.get("channel") or state.selected_channel
351
+ if not channel:
352
+ return self._failure(step_index, "No channel specified.")
353
+ try:
354
+ DetectorChannel(channel)
355
+ except ValueError:
356
+ return self._failure(step_index, f"Unknown channel: {channel}")
357
+ return IntermediateOutput(
358
+ output_type=OutputType.CHANNEL_SELECTION,
359
+ step_index=step_index,
360
+ success=True,
361
+ quality_score=0.95,
362
+ summary=f"Analysis channel set to {channel}.",
363
+ data={"channel": channel},
364
+ )
365
+
366
+ # ── Analysis outputs ──────────────────────────────────────────────
367
+
368
+ def _invariant_mass(
369
+ self,
370
+ action: ExperimentAction,
371
+ state: FullLatentState,
372
+ step_index: int,
373
+ ) -> IntermediateOutput:
374
+ if state.progress.n_events_collected <= 0:
375
+ return self._failure(step_index, "No collisions collected yet.")
376
+ window = action.parameters.get("mass_window_gev") or [50.0, 1000.0]
377
+ n_bins = int(action.parameters.get("n_bins", 40))
378
+ true_m = state.particle.mass_gev
379
+ in_window = window[0] <= true_m <= window[1]
380
+ n_sig = state.progress.n_signal_candidates if in_window else 0
381
+ hist = self.noise.histogram(
382
+ n_signal=n_sig,
383
+ n_background=state.progress.n_background_estimate,
384
+ true_mass_gev=true_m,
385
+ resolution_gev=state.detector.detector_resolution_gev,
386
+ window_lo_gev=window[0],
387
+ window_hi_gev=window[1],
388
+ n_bins=n_bins,
389
+ background_alpha=state.detector.background_shape_alpha,
390
+ )
391
+ return IntermediateOutput(
392
+ output_type=OutputType.INVARIANT_MASS_HIST,
393
+ step_index=step_index,
394
+ success=True,
395
+ quality_score=0.85 if in_window else 0.4,
396
+ summary=(
397
+ f"Invariant-mass histogram in [{window[0]:.0f}, {window[1]:.0f}] GeV "
398
+ f"with {n_bins} bins, total {sum(hist)} entries."
399
+ ),
400
+ data={
401
+ "window_gev": window,
402
+ "bin_counts": hist,
403
+ "n_signal_in_window": n_sig,
404
+ "n_background_in_window": state.progress.n_background_estimate,
405
+ },
406
+ uncertainty=0.1,
407
+ )
408
+
409
+ def _subtract_background(
410
+ self,
411
+ action: ExperimentAction,
412
+ state: FullLatentState,
413
+ step_index: int,
414
+ ) -> IntermediateOutput:
415
+ if not state.progress.invariant_mass_built:
416
+ return self._failure(step_index, "Build the invariant-mass histogram first.")
417
+ residual = self.noise.sample_qc_metric(0.05, 0.02, 0.0, 0.5)
418
+ return IntermediateOutput(
419
+ output_type=OutputType.BACKGROUND_SUBTRACTION,
420
+ step_index=step_index,
421
+ success=True,
422
+ quality_score=0.85,
423
+ summary=f"Smooth background subtracted; residual fraction ≈ {residual*100:.1f}%.",
424
+ data={"residual_fraction": residual},
425
+ uncertainty=0.08,
426
+ )
427
+
428
+ def _fit_resonance(
429
+ self,
430
+ action: ExperimentAction,
431
+ state: FullLatentState,
432
+ step_index: int,
433
+ ) -> IntermediateOutput:
434
+ if not state.progress.background_subtracted and not state.progress.invariant_mass_built:
435
+ return self._failure(step_index, "Need a histogram (and ideally background subtraction) before fitting.")
436
+ n_sig = max(state.progress.n_signal_candidates, 1)
437
+ true_m = state.particle.mass_gev
438
+ scale = state.detector.energy_scale_offset
439
+ res = state.detector.detector_resolution_gev
440
+ m_fit = self.noise.fit_mass_estimate(true_m, n_sig, res, scale)
441
+ m_unc = self.noise.fit_mass_uncertainty(n_sig, res)
442
+ w_fit = max(0.001, abs(self.noise.jitter(state.particle.width_gev, 0.1 * res)))
443
+ return IntermediateOutput(
444
+ output_type=OutputType.FIT_RESULT,
445
+ step_index=step_index,
446
+ success=True,
447
+ quality_score=0.9,
448
+ summary=f"Resonance fit: m={m_fit:.2f} ± {m_unc:.2f} GeV, Γ≈{w_fit:.3f} GeV.",
449
+ data={
450
+ "fit_mass_gev": m_fit,
451
+ "fit_mass_unc_gev": m_unc,
452
+ "fit_width_gev": w_fit,
453
+ "n_signal_used": int(n_sig),
454
+ },
455
+ uncertainty=float(np.clip(m_unc / max(true_m, 1.0), 0.0, 1.0)),
456
+ )
457
+
458
+ def _scan_bump(
459
+ self,
460
+ action: ExperimentAction,
461
+ state: FullLatentState,
462
+ step_index: int,
463
+ ) -> IntermediateOutput:
464
+ if state.progress.n_events_collected <= 0:
465
+ return self._failure(step_index, "Collect data before bump-hunting.")
466
+ true_m = state.particle.mass_gev
467
+ m_obs = self.noise.smear_mass(true_m, state.detector.detector_resolution_gev * 1.2)
468
+ return IntermediateOutput(
469
+ output_type=OutputType.BUMP_SCAN,
470
+ step_index=step_index,
471
+ success=True,
472
+ quality_score=0.7,
473
+ summary=f"Bump scan most-significant region near m≈{m_obs:.1f} GeV.",
474
+ data={"candidate_mass_gev": m_obs},
475
+ uncertainty=0.15,
476
+ )
477
+
478
+ def _measure_angular(
479
+ self,
480
+ action: ExperimentAction,
481
+ state: FullLatentState,
482
+ step_index: int,
483
+ ) -> IntermediateOutput:
484
+ spin_truth = state.particle.spin
485
+ # Returns posterior over {0,1,2} biased by truth + noise
486
+ weights = np.array([0.1, 0.1, 0.1])
487
+ weights[spin_truth] += 0.6
488
+ weights += self.noise.rng.normal(0, 0.05, size=3)
489
+ weights = np.clip(weights, 0.01, None)
490
+ weights /= weights.sum()
491
+ return IntermediateOutput(
492
+ output_type=OutputType.ANGULAR_RESULT,
493
+ step_index=step_index,
494
+ success=True,
495
+ quality_score=0.8,
496
+ summary=(
497
+ "Angular distribution favours spin-"
498
+ f"{int(np.argmax(weights))} ({weights.max():.2f} posterior)."
499
+ ),
500
+ data={
501
+ "spin_posterior": weights.tolist(),
502
+ "favoured_spin": int(np.argmax(weights)),
503
+ "parity_estimate": state.particle.parity,
504
+ },
505
+ uncertainty=float(1.0 - weights.max()),
506
+ )
507
+
508
+ def _estimate_significance(
509
+ self,
510
+ action: ExperimentAction,
511
+ state: FullLatentState,
512
+ step_index: int,
513
+ ) -> IntermediateOutput:
514
+ n_sig = state.progress.n_signal_candidates
515
+ n_bg = state.progress.n_background_estimate
516
+ nuisance = 0.0
517
+ if not state.progress.systematics_requested:
518
+ nuisance += 0.15
519
+ if not state.progress.detector_calibrated:
520
+ nuisance += 0.10
521
+ z = self.noise.asimov_significance(n_sig, n_bg, nuisance_inflation=nuisance)
522
+ return IntermediateOutput(
523
+ output_type=OutputType.SIGNIFICANCE,
524
+ step_index=step_index,
525
+ success=True,
526
+ quality_score=0.9,
527
+ summary=f"Estimated local significance Z = {z:.2f} σ.",
528
+ data={
529
+ "significance_sigma": z,
530
+ "n_signal": int(n_sig),
531
+ "n_background": int(n_bg),
532
+ "nuisance_inflation": nuisance,
533
+ },
534
+ uncertainty=float(np.clip(0.05 + nuisance, 0.0, 0.5)),
535
+ )
536
+
537
+ # ── Meta outputs ──────────────────────────────────────────────────
538
+
539
+ def _request_systematics(
540
+ self,
541
+ action: ExperimentAction,
542
+ state: FullLatentState,
543
+ step_index: int,
544
+ ) -> IntermediateOutput:
545
+ method = action.method or "Luminosity_calibration"
546
+ return IntermediateOutput(
547
+ output_type=OutputType.SYSTEMATICS_REPORT,
548
+ step_index=step_index,
549
+ success=True,
550
+ quality_score=0.85,
551
+ summary=f"Systematics study via {method}; nuisance band tightened.",
552
+ data={"method": method},
553
+ uncertainty=0.04,
554
+ )
555
+
556
+ def _request_theory(
557
+ self,
558
+ action: ExperimentAction,
559
+ state: FullLatentState,
560
+ step_index: int,
561
+ ) -> IntermediateOutput:
562
+ return IntermediateOutput(
563
+ output_type=OutputType.THEORY_REVIEW,
564
+ step_index=step_index,
565
+ success=True,
566
+ quality_score=0.7,
567
+ summary="Theory review: candidate consistent with Standard-Model-extension scalar / vector hypotheses.",
568
+ data={"hypotheses": ["BSM scalar", "BSM vector", "SM background fluctuation"]},
569
+ uncertainty=0.2,
570
+ )
571
+
572
+ def _submit_claim(
573
+ self,
574
+ action: ExperimentAction,
575
+ state: FullLatentState,
576
+ step_index: int,
577
+ ) -> IntermediateOutput:
578
+ claim: Dict[str, Any] = action.parameters.get("claim") or {}
579
+ return IntermediateOutput(
580
+ output_type=OutputType.DISCOVERY_CLAIM,
581
+ step_index=step_index,
582
+ success=True,
583
+ quality_score=1.0,
584
+ summary="Discovery claim submitted for grading.",
585
+ data=claim,
586
+ )
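
Collision yields in ``_collect`` follow μ = σ × L × BR × ε_reco × ε_trig before the Poisson draw. A back-of-envelope check against the tables above (the 55 fb cross-section is taken from the Higgs-like scenario later in this commit; no tuning is implied):

    from server.simulator.output_generator import BACKGROUND_PER_FB, TRIGGER_AFFINITY

    xsec_fb, lumi_fb = 55.0, 100.0      # Higgs-like σ at 13 TeV, 100 fb^-1
    br = 0.0023                          # diphoton branching ratio
    eps_reco = 0.45                      # default channel_efficiency["diphoton"]
    eps_trig = TRIGGER_AFFINITY["diphoton_hlt"]["diphoton"]   # 1.0

    mu_signal = xsec_fb * lumi_fb * br * eps_reco * eps_trig
    mu_background = BACKGROUND_PER_FB["diphoton"] * lumi_fb
    print(round(mu_signal, 1), mu_background)   # ~5.7 signal events on 150000.0 background
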
server/simulator/transition.py ADDED
@@ -0,0 +1,197 @@
+ """Pure-function transition engine.
2
+
3
+ Given a (latent_state, action, generated_output) triple, produces the next
4
+ latent state plus the deltas needed for the agent-visible observation. The
5
+ ``TransitionEngine`` does **not** generate randomness directly; it consumes
6
+ artifacts from the ``OutputGenerator``.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from typing import Dict
13
+
14
+ from models import (
15
+ ActionType,
16
+ ExperimentAction,
17
+ IntermediateOutput,
18
+ OutputType,
19
+ )
20
+
21
+ from .latent_state import FullLatentState
22
+
23
+
24
+ # Per-action default cost in (millions of USD, days, compute hours)
25
+ ACTION_COSTS: Dict[ActionType, Dict[str, float]] = {
26
+ ActionType.CONFIGURE_BEAM: {"musd": 0.10, "days": 0.5, "compute": 0.1},
27
+ ActionType.ALLOCATE_LUMINOSITY: {"musd": 0.05, "days": 0.2, "compute": 0.0},
28
+ ActionType.SET_TRIGGER: {"musd": 0.05, "days": 0.1, "compute": 0.0},
29
+ ActionType.COLLECT_COLLISIONS: {"musd": 0.00, "days": 0.0, "compute": 1.0}, # main cost is in luminosity
30
+ ActionType.CALIBRATE_DETECTOR: {"musd": 0.20, "days": 1.0, "compute": 1.5},
31
+ ActionType.RECONSTRUCT_TRACKS: {"musd": 0.15, "days": 0.8, "compute": 5.0},
32
+ ActionType.SELECT_CHANNEL: {"musd": 0.00, "days": 0.05, "compute": 0.0},
33
+ ActionType.BUILD_INVARIANT_MASS: {"musd": 0.05, "days": 0.3, "compute": 1.0},
34
+ ActionType.SUBTRACT_BACKGROUND: {"musd": 0.05, "days": 0.3, "compute": 0.5},
35
+ ActionType.FIT_RESONANCE: {"musd": 0.10, "days": 0.4, "compute": 0.5},
36
+ ActionType.SCAN_BUMP: {"musd": 0.05, "days": 0.2, "compute": 0.5},
37
+ ActionType.MEASURE_ANGULAR: {"musd": 0.10, "days": 0.4, "compute": 0.5},
38
+ ActionType.ESTIMATE_SIGNIFICANCE: {"musd": 0.05, "days": 0.1, "compute": 0.2},
39
+ ActionType.REQUEST_SYSTEMATICS: {"musd": 0.30, "days": 1.5, "compute": 1.0},
40
+ ActionType.REQUEST_THEORY_REVIEW: {"musd": 0.05, "days": 0.5, "compute": 0.0},
41
+ ActionType.SUBMIT_DISCOVERY_CLAIM:{"musd": 0.0, "days": 0.1, "compute": 0.0},
42
+ }
43
+
44
+
45
+ def compute_action_cost(action: ExperimentAction, output: IntermediateOutput) -> Dict[str, float]:
46
+ """Return realised (musd, days, compute_hours, luminosity_fb) for this action."""
47
+ base = ACTION_COSTS.get(action.action_type, {"musd": 0.0, "days": 0.0, "compute": 0.0})
48
+ musd = float(base.get("musd", 0.0))
49
+ days = float(base.get("days", 0.0))
50
+ compute = float(base.get("compute", 0.0))
51
+ lumi_fb = 0.0
52
+
53
+ data = output.data or {}
54
+ if action.action_type == ActionType.COLLECT_COLLISIONS:
55
+ lumi_fb = float(data.get("luminosity_fb", 0.0))
56
+ musd += float(data.get("cost_musd", 0.0))
57
+ days += float(data.get("time_days", 0.0))
58
+
59
+ return {
60
+ "musd": musd,
61
+ "days": days,
62
+ "compute_hours": compute,
63
+ "luminosity_fb": lumi_fb,
64
+ }
65
+
66
+
67
+ @dataclass
68
+ class TransitionResult:
69
+ next_state: FullLatentState
70
+ realised_cost: Dict[str, float]
71
+
72
+
73
+ class TransitionEngine:
74
+ """Applies an action's output to evolve the latent state."""
75
+
76
+ def step(
77
+ self,
78
+ state: FullLatentState,
79
+ action: ExperimentAction,
80
+ output: IntermediateOutput,
81
+ ) -> TransitionResult:
82
+ # We mutate the live state in place, then return it. This is fine
83
+ # because the environment owns the only reference.
84
+ cost = compute_action_cost(action, output)
85
+ state.resources.budget_used_musd += cost["musd"]
86
+ state.resources.time_used_days += cost["days"]
87
+ state.resources.compute_hours_used += cost["compute_hours"]
88
+ state.resources.luminosity_used_fb += cost["luminosity_fb"]
89
+
90
+ if not output.success:
91
+ state.step_count += 1
92
+ return TransitionResult(next_state=state, realised_cost=cost)
93
+
94
+ a = action.action_type
95
+ data = output.data or {}
96
+
97
+ if a == ActionType.CONFIGURE_BEAM:
98
+ beam = data.get("beam_energy")
99
+ state.selected_beam_energy = beam
100
+ state.progress.beam_configured = True
101
+
102
+ elif a == ActionType.ALLOCATE_LUMINOSITY:
103
+ state.progress.luminosity_allocated = True
104
+
105
+ elif a == ActionType.SET_TRIGGER:
106
+ trig = data.get("trigger")
107
+ state.selected_trigger = trig
108
+ state.progress.trigger_set = True
109
+
110
+ elif a == ActionType.COLLECT_COLLISIONS:
111
+ state.progress.collisions_collected = True
112
+ state.progress.n_events_collected += int(
113
+ data.get("n_signal_candidates", 0)
114
+ ) + int(data.get("n_background_estimate", 0))
115
+ state.progress.n_signal_candidates += int(data.get("n_signal_candidates", 0))
116
+ state.progress.n_background_estimate += int(data.get("n_background_estimate", 0))
117
+ state.progress.best_channel = data.get("channel") or state.progress.best_channel
118
+ state.progress.best_beam_energy = (
119
+ data.get("beam_energy") or state.progress.best_beam_energy
120
+ )
121
+
122
+ elif a == ActionType.CALIBRATE_DETECTOR:
123
+ state.progress.detector_calibrated = True
124
+ state.detector.detector_calibrated = True
125
+ improvement = float(data.get("resolution_improvement", 0.0))
126
+ state.detector.detector_resolution_gev = max(
127
+ 0.05,
128
+ state.detector.detector_resolution_gev * (1.0 - improvement),
129
+ )
130
+
131
+ elif a == ActionType.RECONSTRUCT_TRACKS:
132
+ state.progress.tracks_reconstructed = True
133
+ state.detector.tracker_aligned = True
134
+
135
+ elif a == ActionType.SELECT_CHANNEL:
136
+ channel = data.get("channel")
137
+ if channel:
138
+ state.selected_channel = channel
139
+ state.progress.channel_selected = True
140
+
141
+ elif a == ActionType.BUILD_INVARIANT_MASS:
142
+ state.progress.invariant_mass_built = True
143
+
144
+ elif a == ActionType.SUBTRACT_BACKGROUND:
145
+ state.progress.background_subtracted = True
146
+
147
+ elif a == ActionType.FIT_RESONANCE:
148
+ state.progress.resonance_fitted = True
149
+ m = float(data.get("fit_mass_gev", 0.0))
150
+ unc = float(data.get("fit_mass_unc_gev", 0.0))
151
+ w = float(data.get("fit_width_gev", 0.0))
152
+ if m > 0:
153
+ state.candidate_masses_gev.append(m)
154
+ state.candidate_significances.append(0.0)
155
+ state.progress.best_fit_mass_gev = m
156
+ state.progress.best_fit_width_gev = w
157
+
158
+ elif a == ActionType.SCAN_BUMP:
159
+ state.progress.bump_scanned = True
160
+ cm = float(data.get("candidate_mass_gev", 0.0))
161
+ if cm > 0:
162
+ state.candidate_masses_gev.append(cm)
163
+ state.candidate_significances.append(0.0)
164
+
165
+ elif a == ActionType.MEASURE_ANGULAR:
166
+ state.progress.angular_measured = True
167
+
168
+ elif a == ActionType.ESTIMATE_SIGNIFICANCE:
169
+ state.progress.significance_estimated = True
170
+ sig = float(data.get("significance_sigma", 0.0))
171
+ state.progress.best_significance_sigma = max(
172
+ state.progress.best_significance_sigma or 0.0, sig
173
+ )
174
+ if state.candidate_significances:
175
+ state.candidate_significances[-1] = sig
176
+
177
+ elif a == ActionType.REQUEST_SYSTEMATICS:
178
+ state.progress.systematics_requested = True
179
+ state.detector.energy_scale_uncertainty *= 0.6
180
+ state.detector.luminosity_uncertainty *= 0.7
181
+
182
+ elif a == ActionType.REQUEST_THEORY_REVIEW:
183
+ state.progress.theory_review_requested = True
184
+
185
+ elif a == ActionType.SUBMIT_DISCOVERY_CLAIM:
186
+ state.progress.claim_submitted = True
187
+
188
+ state.step_count += 1
189
+ return TransitionResult(next_state=state, realised_cost=cost)
190
+
191
+
192
+ __all__ = [
193
+ "ACTION_COSTS",
194
+ "TransitionEngine",
195
+ "TransitionResult",
196
+ "compute_action_cost",
197
+ ]
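
A cost-accounting sketch for the realised-cost helper (``IntermediateOutput`` and ``ExperimentAction`` come from models.py, outside this hunk; the keyword names below mirror how OutputGenerator constructs them):

    from models import ActionType, ExperimentAction, IntermediateOutput, OutputType
    from server.simulator.transition import compute_action_cost

    # COLLECT_COLLISIONS has no flat money/time cost; it charges whatever the
    # generated artifact reports, plus 1.0 compute hour from ACTION_COSTS.
    output = IntermediateOutput(
        output_type=OutputType.COLLISION_BATCH,
        step_index=3,
        success=True,
        quality_score=0.8,
        summary="Collected 30.0 fb^-1.",
        data={"luminosity_fb": 30.0, "cost_musd": 3.6, "time_days": 24.0},
    )
    action = ExperimentAction(action_type=ActionType.COLLECT_COLLISIONS, parameters={})
    print(compute_action_cost(action, output))
    # {'musd': 3.6, 'days': 24.0, 'compute_hours': 1.0, 'luminosity_fb': 30.0}
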
server/tasks/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """Task generator: curated scenarios + procedural curriculum."""
2
+
3
+ from .scenarios import (
4
+ CURATED_SCENARIOS,
5
+ Scenario,
6
+ sample_scenario,
7
+ )
8
+
9
+ __all__ = ["CURATED_SCENARIOS", "Scenario", "sample_scenario"]
server/tasks/scenarios.py ADDED
@@ -0,0 +1,422 @@
+ """Built-in physics scenarios + procedural sampling.
2
+
3
+ Each scenario binds a hidden ``LatentParticle`` truth and a public
4
+ ``TaskSpec`` (search window, available channels, resource budgets, expected
5
+ findings, paper references). Curated scenarios are inspired by famous LHC
6
+ discoveries; procedural ones randomise mass, channel, width and budgets to
7
+ build a curriculum.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass
13
+ from typing import List, Optional
14
+
15
+ import numpy as np
16
+
17
+ from models import (
18
+ DetectorChannel,
19
+ ExpectedFinding,
20
+ PaperReference,
21
+ TOOL_REGISTRY,
22
+ TaskSpec,
23
+ )
24
+
25
+ from server.simulator.latent_state import (
26
+ DetectorState,
27
+ FullLatentState,
28
+ LatentParticle,
29
+ ResourceState,
30
+ )
31
+
32
+
33
+ @dataclass
34
+ class Scenario:
35
+ name: str
36
+ difficulty: str
37
+ task: TaskSpec
38
+ latent: FullLatentState
39
+
40
+ def fresh_latent(self) -> FullLatentState:
41
+ # Pydantic deep-copy so the env can mutate freely
42
+ return self.latent.model_copy(deep=True)
43
+
44
+
45
+ # ── Curated, story-driven scenarios ──────────────────────────────────────
46
+
47
+
48
+ def _higgs_like_scenario() -> Scenario:
49
+ particle = LatentParticle(
50
+ name="HiggsLike",
51
+ mass_gev=125.0,
52
+ width_gev=0.004,
53
+ spin=0,
54
+ parity="+",
55
+ cross_section_fb=55.0,
56
+ decay_branching={
57
+ "diphoton": 0.0023,
58
+ "dilepton_ee": 0.00003,
59
+ "dilepton_mumu": 0.00022,
60
+ "four_lepton": 0.000125,
61
+ "bb": 0.58,
62
+ "dijet": 0.30,
63
+ },
64
+ primary_channel="diphoton",
65
+ )
66
+ detector = DetectorState(
67
+ detector_resolution_gev=1.5,
68
+ pileup_mu=30.0,
69
+ trigger_efficiency=0.85,
70
+ )
71
+ resources = ResourceState(
72
+ budget_total_musd=120.0,
73
+ luminosity_total_fb=300.0,
74
+ time_limit_days=365.0,
75
+ )
76
+ latent = FullLatentState(
77
+ particle=particle,
78
+ detector=detector,
79
+ resources=resources,
80
+ rng_seed=125,
81
+ )
82
+ task = TaskSpec(
83
+ problem_statement=(
84
+ "An anomalous excess at ~125 GeV is rumoured in early 13 TeV runs. "
85
+ "Plan a campaign to confirm or refute a Standard-Model Higgs-like scalar. "
86
+ "Pick channels, allocate luminosity, fit, and submit a calibrated discovery claim."
87
+ ),
88
+ target_collider="LHC",
89
+ mass_search_window_gev=[100.0, 200.0],
90
+ budget_limit_musd=120.0,
91
+ luminosity_budget_fb=300.0,
92
+ time_limit_days=365.0,
93
+ prior_observations=[
94
+ "Earlier Tevatron data shows a mild diphoton excess near 125 GeV.",
95
+ "ATLAS/CMS rumour mills suggest a 4ℓ excess at low mass.",
96
+ ],
97
+ success_criteria=[
98
+ "Identify a resonance within 1 GeV of the truth.",
99
+ "Reach ≥5σ local significance.",
100
+ "Submit confidence consistent with calibration.",
101
+ ],
102
+ paper_references=[
103
+ PaperReference(
104
+ title="Observation of a new particle in the search for the SM Higgs boson",
105
+ arxiv_id="1207.7214",
106
+ doi="10.1016/j.physletb.2012.08.020",
107
+ ),
108
+ ],
109
+ expected_findings=[
110
+ ExpectedFinding(finding="Diphoton resonance at ~125 GeV", category="discovery"),
111
+ ExpectedFinding(finding="Spin-0, even parity", category="property"),
112
+ ],
113
+ difficulty="medium",
114
+ available_tools=list(TOOL_REGISTRY.keys()),
115
+ )
116
+ return Scenario(name="higgs_like_125", difficulty="medium", task=task, latent=latent)
117
+
118
+
119
+ def _hidden_zprime_scenario() -> Scenario:
120
+ particle = LatentParticle(
121
+ name="ZPrime",
122
+ mass_gev=600.0,
123
+ width_gev=18.0,
124
+ spin=1,
125
+ parity="-",
126
+ cross_section_fb=12.0,
127
+ decay_branching={
128
+ "diphoton": 0.0,
129
+ "dilepton_ee": 0.04,
130
+ "dilepton_mumu": 0.04,
131
+ "four_lepton": 0.0,
132
+ "bb": 0.20,
133
+ "dijet": 0.70,
134
+ },
135
+ primary_channel="dilepton_mumu",
136
+ )
137
+ detector = DetectorState(
138
+ detector_resolution_gev=8.0,
139
+ pileup_mu=45.0,
140
+ trigger_efficiency=0.78,
141
+ qcd_background_strength=1.2,
142
+ )
143
+ resources = ResourceState(
144
+ budget_total_musd=140.0,
145
+ luminosity_total_fb=200.0,
146
+ time_limit_days=400.0,
147
+ )
148
+ latent = FullLatentState(
149
+ particle=particle, detector=detector, resources=resources, rng_seed=600,
150
+ )
151
+ task = TaskSpec(
152
+ problem_statement=(
153
+ "Run-2 dilepton spectra hint at a high-mass excess. Hunt for a heavy "
154
+ "Z'-like vector resonance and characterise spin-1, parity-odd hypothesis."
155
+ ),
156
+ mass_search_window_gev=[300.0, 1500.0],
157
+ budget_limit_musd=140.0,
158
+ luminosity_budget_fb=200.0,
159
+ time_limit_days=400.0,
160
+ prior_observations=[
161
+ "High-pT dilepton tail shows a 2.7σ shoulder near 600 GeV.",
162
+ "Dijet smooth-fit residuals consistent with the same window.",
163
+ ],
164
+ success_criteria=[
165
+ "Identify a high-mass dilepton/dijet resonance.",
166
+ "Constrain spin to be vector (1).",
167
+ "Report calibrated mass within 5% and ≥4σ significance.",
168
+ ],
169
+ paper_references=[
170
+ PaperReference(
171
+ title="Search for high-mass dilepton resonances at the LHC",
172
+ arxiv_id="1903.06248",
173
+ ),
174
+ ],
175
+ expected_findings=[
176
+ ExpectedFinding(finding="Heavy Z'-like dilepton resonance", category="discovery"),
177
+ ExpectedFinding(finding="Spin-1, parity-odd", category="property"),
178
+ ],
179
+ difficulty="hard",
180
+ available_tools=list(TOOL_REGISTRY.keys()),
181
+ )
182
+ return Scenario(name="hidden_zprime_600", difficulty="hard", task=task, latent=latent)
183
+
184
+
185
+ def _diboson_resonance_scenario() -> Scenario:
186
+ particle = LatentParticle(
187
+ name="Graviton",
188
+ mass_gev=750.0,
189
+ width_gev=45.0,
190
+ spin=2,
191
+ parity="+",
192
+ cross_section_fb=6.0,
193
+ decay_branching={
194
+ "diphoton": 0.06,
195
+ "dilepton_ee": 0.005,
196
+ "dilepton_mumu": 0.005,
197
+ "four_lepton": 0.001,
198
+ "bb": 0.15,
199
+ "dijet": 0.70,
200
+ },
201
+ primary_channel="diphoton",
202
+ )
203
+ detector = DetectorState(
204
+ detector_resolution_gev=12.0,
205
+ pileup_mu=50.0,
206
+ trigger_efficiency=0.80,
207
+ )
208
+ resources = ResourceState(
209
+ budget_total_musd=110.0,
210
+ luminosity_total_fb=180.0,
211
+ time_limit_days=350.0,
212
+ )
213
+ latent = FullLatentState(
214
+ particle=particle, detector=detector, resources=resources, rng_seed=750,
215
+ )
216
+ task = TaskSpec(
217
+ problem_statement=(
218
+ "A faint γγ excess at 750 GeV stirred the field briefly in 2015-2016. "
219
+ "Re-investigate with the modern luminosity budget and decide if it is "
220
+ "real or a fluctuation."
221
+ ),
222
+ mass_search_window_gev=[400.0, 1200.0],
223
+ budget_limit_musd=110.0,
224
+ luminosity_budget_fb=180.0,
225
+ time_limit_days=350.0,
226
+ prior_observations=[
227
+ "Public CMS/ATLAS data show a 2-3σ diphoton bump near 750 GeV.",
228
+ "Theory papers proposed graviton, scalar singlet, and SM-fluctuation explanations.",
229
+ ],
230
+ success_criteria=[
231
+ "Decide between discovery and fluctuation with calibrated confidence.",
232
+ ],
233
+ paper_references=[
234
+ PaperReference(
235
+ title="Search for resonant production of high-mass diphoton pairs",
236
+ arxiv_id="1606.04093",
237
+ ),
238
+ ],
239
+ expected_findings=[
240
+ ExpectedFinding(finding="Possible diphoton resonance near 750 GeV", category="discovery"),
241
+ ],
242
+ difficulty="hard",
243
+ available_tools=list(TOOL_REGISTRY.keys()),
244
+ )
245
+ return Scenario(name="diphoton_750", difficulty="hard", task=task, latent=latent)
246
+
247
+
248
+ def _easy_diphoton_scenario() -> Scenario:
249
+ """Generous budgets, narrow scalar, single obvious channel."""
250
+ particle = LatentParticle(
251
+ name="EasyScalar",
252
+ mass_gev=160.0,
253
+ width_gev=0.5,
254
+ spin=0,
255
+ parity="+",
256
+ cross_section_fb=120.0,
257
+ decay_branching={
258
+ "diphoton": 0.05,
259
+ "dilepton_ee": 0.001,
260
+ "dilepton_mumu": 0.005,
261
+ "four_lepton": 0.0001,
262
+ "bb": 0.50,
263
+ "dijet": 0.30,
264
+ },
265
+ primary_channel="diphoton",
266
+ )
267
+ detector = DetectorState(
268
+ detector_resolution_gev=2.0,
269
+ pileup_mu=20.0,
270
+ trigger_efficiency=0.9,
271
+ )
272
+ resources = ResourceState(
273
+ budget_total_musd=200.0,
274
+ luminosity_total_fb=400.0,
275
+ time_limit_days=500.0,
276
+ )
277
+ latent = FullLatentState(
278
+ particle=particle, detector=detector, resources=resources, rng_seed=160,
279
+ )
280
+ task = TaskSpec(
281
+ problem_statement=(
282
+ "Tutorial scenario: discover a narrow scalar that decays cleanly to "
283
+ "two photons. Resources are abundant; focus on running a clean pipeline."
284
+ ),
285
+ mass_search_window_gev=[80.0, 300.0],
286
+ budget_limit_musd=200.0,
287
+ luminosity_budget_fb=400.0,
288
+ time_limit_days=500.0,
289
+ success_criteria=[
290
+ "Identify the diphoton peak and submit a calibrated 5σ claim.",
291
+ ],
292
+ expected_findings=[
293
+ ExpectedFinding(finding="Diphoton scalar near 160 GeV", category="discovery"),
294
+ ],
295
+ difficulty="easy",
296
+ available_tools=list(TOOL_REGISTRY.keys()),
297
+ )
298
+ return Scenario(name="easy_diphoton_160", difficulty="easy", task=task, latent=latent)
299
+
300
+
301
+ CURATED_SCENARIOS: List[Scenario] = [
302
+ _easy_diphoton_scenario(),
303
+ _higgs_like_scenario(),
304
+ _hidden_zprime_scenario(),
305
+ _diboson_resonance_scenario(),
306
+ ]
307
+
308
+
309
+ # ── Procedural sampler ───────────────────────────────────────────────────
310
+
311
+
312
+ _DIFFICULTY_TIERS = {
313
+ "easy": {"mass_lo": 90.0, "mass_hi": 250.0, "xsec_lo": 80.0, "xsec_hi": 150.0, "res": 1.5, "budget": 200.0, "lumi": 400.0},
314
+ "medium": {"mass_lo": 100.0, "mass_hi": 600.0, "xsec_lo": 25.0, "xsec_hi": 80.0, "res": 3.0, "budget": 150.0, "lumi": 300.0},
315
+ "hard": {"mass_lo": 250.0, "mass_hi": 1500.0, "xsec_lo": 5.0, "xsec_hi": 25.0, "res": 8.0, "budget": 110.0, "lumi": 200.0},
316
+ }
317
+
318
+
319
+ def _procedural_scenario(difficulty: str, rng: np.random.Generator) -> Scenario:
320
+ tier = _DIFFICULTY_TIERS.get(difficulty, _DIFFICULTY_TIERS["medium"])
321
+ mass = float(rng.uniform(tier["mass_lo"], tier["mass_hi"]))
322
+ xsec = float(rng.uniform(tier["xsec_lo"], tier["xsec_hi"]))
323
+ spin = int(rng.choice([0, 1, 2]))
324
+ parity = str(rng.choice(["+", "-"]))
325
+ primary = str(rng.choice([c.value for c in DetectorChannel]))
326
+
327
+ branching = {c.value: 0.001 for c in DetectorChannel}
328
+ branching[primary] = float(rng.uniform(0.02, 0.6))
329
+ # normalise so the branching fractions sum to 1
330
+ total = sum(branching.values())
331
+ branching = {k: v / total for k, v in branching.items()}
332
+
333
+ particle = LatentParticle(
334
+ name=f"Mystery_{int(mass)}GeV",
335
+ mass_gev=mass,
336
+ width_gev=float(rng.uniform(0.5, 30.0) if difficulty != "easy" else rng.uniform(0.05, 2.0)),
337
+ spin=spin,
338
+ parity=parity,
339
+ cross_section_fb=xsec,
340
+ decay_branching=branching,
341
+ primary_channel=primary,
342
+ )
343
+ detector = DetectorState(
344
+ detector_resolution_gev=tier["res"],
345
+ pileup_mu=float(rng.uniform(20.0, 60.0)),
346
+ trigger_efficiency=float(rng.uniform(0.7, 0.92)),
347
+ qcd_background_strength=float(rng.uniform(0.8, 1.3)),
348
+ )
349
+ resources = ResourceState(
350
+ budget_total_musd=tier["budget"],
351
+ luminosity_total_fb=tier["lumi"],
352
+ time_limit_days=float(rng.uniform(300.0, 500.0)),
353
+ )
354
+ latent = FullLatentState(
355
+ particle=particle, detector=detector, resources=resources,
356
+ rng_seed=int(rng.integers(1, 1_000_000)),
357
+ )
358
+ window_lo = max(50.0, mass - 200.0)
359
+ window_hi = mass + 300.0
360
+ task = TaskSpec(
361
+ problem_statement=(
362
+ f"Procedural ({difficulty}): a hidden resonance lives somewhere in "
363
+ f"[{window_lo:.0f}, {window_hi:.0f}] GeV. Discover and characterise it."
364
+ ),
365
+ mass_search_window_gev=[window_lo, window_hi],
366
+ budget_limit_musd=tier["budget"],
367
+ luminosity_budget_fb=tier["lumi"],
368
+ time_limit_days=resources.time_limit_days,
369
+ difficulty=difficulty,
370
+ available_tools=list(TOOL_REGISTRY.keys()),
371
+ success_criteria=[
372
+ "Discover the hidden resonance with a calibrated mass and channel.",
373
+ ],
374
+ )
375
+ return Scenario(
376
+ name=f"procedural_{difficulty}_{int(mass)}",
377
+ difficulty=difficulty,
378
+ task=task,
379
+ latent=latent,
380
+ )
381
+
382
+
383
+ def sample_scenario(
384
+ *,
385
+ difficulty: Optional[str] = None,
386
+ name: Optional[str] = None,
387
+ seed: Optional[int] = None,
388
+ ) -> Scenario:
389
+ rng = np.random.default_rng(seed)
390
+
391
+ if name:
392
+ for s in CURATED_SCENARIOS:
393
+ if s.name == name:
394
+ fresh = Scenario(
395
+ name=s.name,
396
+ difficulty=s.difficulty,
397
+ task=s.task,
398
+ latent=s.fresh_latent(),
399
+ )
400
+ if seed is not None:
401
+ fresh.latent.rng_seed = int(seed)
402
+ return fresh
403
+
404
+ if difficulty in {"easy", "medium", "hard"}:
405
+ # mix curated + procedural
406
+ curated_pool = [s for s in CURATED_SCENARIOS if s.difficulty == difficulty]
407
+ if curated_pool and rng.random() < 0.4:
408
+ picked = curated_pool[int(rng.integers(0, len(curated_pool)))]
409
+ return Scenario(
410
+ name=picked.name,
411
+ difficulty=picked.difficulty,
412
+ task=picked.task,
413
+ latent=picked.fresh_latent(),
414
+ )
415
+ return _procedural_scenario(difficulty, rng)
416
+
417
+ # default: random difficulty
418
+ diff = str(rng.choice(["easy", "medium", "hard"]))
419
+ return _procedural_scenario(diff, rng)
420
+
421
+
422
+ __all__ = ["CURATED_SCENARIOS", "Scenario", "sample_scenario"]
space/__init__.py ADDED
File without changes
space/env/Dockerfile ADDED
@@ -0,0 +1,24 @@
1
+ # CERNenv environment Space (Docker, CPU)
2
+ FROM python:3.11-slim
3
+
4
+ ENV PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1 \
6
+ PYTHONPATH=/home/user/app
7
+
8
+ RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ git curl ca-certificates build-essential \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ RUN useradd -ms /bin/bash user
13
+ USER user
14
+ WORKDIR /home/user/app
15
+
16
+ COPY --chown=user:user space/env/requirements.txt /home/user/app/space-env-requirements.txt
17
+ RUN python -m pip install --upgrade pip && \
18
+ python -m pip install --user -r /home/user/app/space-env-requirements.txt
19
+
20
+ COPY --chown=user:user . /home/user/app
21
+
22
+ EXPOSE 7860
23
+
24
+ CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
space/env/README.md ADDED
@@ -0,0 +1,51 @@
1
+ ---
2
+ title: CERNenv
3
+ emoji: ⚛️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ suggested_hardware: cpu-basic
8
+ pinned: false
9
+ license: bsd-3-clause
10
+ short_description: LHC particle-discovery RL environment
11
+ ---
12
+
13
+ # CERNenv — LHC Discovery RL Environment
14
+
15
+ An OpenEnv-compatible reinforcement-learning environment that simulates an
16
+ LHC (Large Hadron Collider) analysis. An LLM (Large Language Model) agent
17
+ configures the beam, allocates luminosity, picks a decay channel and
18
+ trigger, runs reconstruction, fits an invariant-mass spectrum, estimates
19
+ significance, and finally submits a structured discovery claim that is
20
+ graded against a hidden ground-truth particle.
21
+
22
+ The Space exposes the standard OpenEnv HTTP + WebSocket API:
23
+
24
+ * `GET /health` — liveness
25
+ * `GET /schema` — action / observation / state JSON schemas
26
+ * `POST /reset` — start a new episode (`{ "seed": 7, "scenario": "easy_diphoton_160" }`)
27
+ * `POST /step` — execute one action
28
+ * `GET /state` — current `CernState`
29
+ * `WS /ws` — persistent session (recommended for multi-step rollouts)
30
+
31
+ ## Quickstart (Python client)
32
+
33
+ ```python
34
+ import asyncio
35
+ from openenv.core import EnvClient
37
+
38
+ # replace with your space id
39
+ SPACE = "YOUR_HF_USERNAME/cernenv"
40
+
41
+ # connect to the running Space directly
43
+ async def main():
44
+ async with EnvClient.from_env(SPACE) as env: # uses websockets under the hood
45
+ result = await env.reset(seed=7, scenario="easy_diphoton_160")
46
+ ...
47
+
48
+ asyncio.run(main())
49
+ ```
50
+
51
+ For training, see the companion **CERNenv Trainer** Space.
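For clients that skip the OpenEnv SDK, the plain HTTP endpoints above are enough. A minimal sketch with `requests` — the hostname and the exact `/step` payload are assumptions (the field names mirror `ExperimentAction`); `GET /schema` is the authoritative source for both:

```python
import requests

BASE = "https://YOUR_HF_USERNAME-cernenv.hf.space"  # assumption: default Space hostname

assert requests.get(f"{BASE}/health").ok

# Start an episode, then take one action
obs = requests.post(f"{BASE}/reset", json={"seed": 7, "scenario": "easy_diphoton_160"}).json()
step = requests.post(f"{BASE}/step", json={
    "action_type": "configure_beam",   # assumption: lowercase enum values
    "parameters": {"beam_energy": "13TeV"},
    "justification": "start the pipeline",
    "confidence": 0.9,
}).json()
print(step)
```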
space/env/requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ numpy>=1.24.0
2
+ scipy>=1.10.0
3
+ pydantic>=2.0.0
4
+ fastapi>=0.110.0
5
+ uvicorn>=0.27.0
6
+ git+https://github.com/meta-pytorch/OpenEnv.git
space/training/Dockerfile ADDED
@@ -0,0 +1,31 @@
1
+ # CERNenv trainer Space (Docker, A100)
2
+ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
3
+
4
+ ENV DEBIAN_FRONTEND=noninteractive \
5
+ PYTHONUNBUFFERED=1 \
6
+ PIP_NO_CACHE_DIR=1 \
7
+ HF_HOME=/home/user/.cache/huggingface \
8
+ TRANSFORMERS_CACHE=/home/user/.cache/huggingface/transformers \
9
+ PYTHONPATH=/home/user/app
10
+
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ python3.11 python3.11-venv python3.11-dev python3-pip \
13
+ git curl ca-certificates build-essential \
14
+ && rm -rf /var/lib/apt/lists/* \
15
+ && ln -sf /usr/bin/python3.11 /usr/local/bin/python \
16
+ && ln -sf /usr/bin/python3.11 /usr/local/bin/python3
17
+
18
+ RUN useradd -ms /bin/bash user
19
+ USER user
20
+ ENV PATH="/home/user/.local/bin:${PATH}"
21
+ WORKDIR /home/user/app
22
+
23
+ COPY --chown=user:user space/training/requirements.txt /home/user/app/space-training-requirements.txt
24
+ RUN python -m pip install --upgrade pip && \
25
+ python -m pip install --user -r /home/user/app/space-training-requirements.txt
26
+
27
+ COPY --chown=user:user . /home/user/app
28
+
29
+ EXPOSE 7860
30
+
31
+ CMD ["python", "-m", "uvicorn", "space.training.app:app", "--host", "0.0.0.0", "--port", "7860"]
space/training/README.md ADDED
@@ -0,0 +1,64 @@
1
+ ---
2
+ title: CERNenv Trainer
3
+ emoji: ⚛️
4
+ colorFrom: indigo
5
+ colorTo: pink
6
+ sdk: docker
7
+ suggested_hardware: a100-large
8
+ suggested_storage: medium
9
+ pinned: false
10
+ license: bsd-3-clause
11
+ short_description: GRPO trainer for CERNenv (Unsloth + LoRA, A100)
12
+ ---
13
+
14
+ # CERNenv Trainer (Hugging Face Space, A100)
15
+
16
+ Fine-tunes a small instruction-tuned LLM (Large Language Model) to act as
17
+ an LHC (Large Hadron Collider) physicist inside the **CERNenv** OpenEnv
18
+ environment using **GRPO** (Group-Relative Policy Optimization),
19
+ **Unsloth**, and **LoRA** (Low-Rank Adaptation).
20
+
21
+ ## Hardware
22
+ - Recommended: **A100 large (80 GB)**
23
+ - Minimum: T4 / L4 (will use a smaller model + fewer episodes)
24
+
25
+ ## Required Space secrets
26
+ | Secret | Purpose |
27
+ | --- | --- |
28
+ | `HF_TOKEN` | Hugging Face token with `write` access for model push |
29
+ | `HF_USERNAME` | Hub username, used as the default model-repo owner |
30
+
31
+ ## Optional environment variables
32
+ | Variable | Default | Notes |
33
+ | --- | --- | --- |
34
+ | `MODEL_NAME` | `unsloth/Qwen2.5-3B-Instruct` | Any chat model Unsloth supports |
35
+ | `TOTAL_EPISODES` | `400` | Total rollouts (prompts × generations) |
36
+ | `DIFFICULTY` | `easy` | `easy` / `medium` / `hard` |
37
+ | `MAX_STEPS` | `18` | Steps per episode |
38
+ | `NUM_GENERATIONS` | `4` | GRPO group size |
39
+ | `OUTPUT_DIR` | `training/runs/unsloth-grpo` | LoRA adapter output |
40
+ | `PUSH_REPO` | `${HF_USERNAME}/cernenv-grpo-qwen2.5-3b` | Hub repo for adapters |
41
+ | `AUTOSTART` | `0` | Set to `1` to start training on Space boot |
42
+
43
+ ## How to use
44
+
45
+ This Space exposes a tiny FastAPI control panel:
46
+ - `GET /` — status + current run info
47
+ - `POST /train` — start / restart a training run
48
+ - `GET /logs` — live tail of `training.log`
49
+ - `GET /metrics` — reward + success-rate snapshots
50
+
51
+ Click **"Start training"** in the UI, or set `AUTOSTART=1` in the Space variables to kick off immediately on boot.
52
+
53
+ When training finishes, the LoRA adapters are pushed to `PUSH_REPO`.
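
The same endpoints can be driven programmatically, e.g. to kick off a run from a laptop and wait for it to finish. A small sketch with `requests` (the hostname is an assumption):

```python
import time
import requests

BASE = "https://YOUR_HF_USERNAME-cernenv-trainer.hf.space"  # assumption: default Space hostname

requests.post(f"{BASE}/train").raise_for_status()  # 409 if a run is already in progress

# Poll until the background pipeline reports a terminal state
while True:
    status = requests.get(f"{BASE}/status").json()["status"]
    if status in {"finished", "failed"}:
        break
    time.sleep(30)

print(requests.get(f"{BASE}/logs", params={"tail": 50}).text)
print(requests.get(f"{BASE}/metrics").json())
```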
54
+
55
+ ## Local equivalent
56
+
57
+ The same training run is reproducible locally with:
58
+
59
+ ```bash
60
+ PYTHONPATH=. python -m training.training_unsloth \
61
+ --model_name unsloth/Qwen2.5-3B-Instruct \
62
+ --difficulty easy --total_episodes 400 --max_steps 18 \
63
+ --output_dir training/runs/unsloth-grpo
64
+ ```
space/training/__init__.py ADDED
File without changes
space/training/app.py ADDED
@@ -0,0 +1,412 @@
1
+ """FastAPI control panel for the CERNenv trainer Space.
2
+
3
+ Endpoints:
4
+ GET / → status page (HTML)
5
+ GET /status → JSON status of the current training run
6
+ GET /metrics → JSON snapshot of reward / success rate
7
+ GET /logs → tail of the training log
8
+ POST /train → start (or restart) a training run
9
+ GET /health → liveness probe
10
+
11
+ Designed to run on a Hugging Face Space with `sdk: docker`. Heavy training
12
+ work runs in a background thread so the HTTP server stays responsive.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import logging
19
+ import os
20
+ import subprocess
21
+ import sys
22
+ import threading
23
+ import time
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+ from typing import Any, Dict, Optional
27
+
28
+ from fastapi import FastAPI, HTTPException
29
+ from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse
30
+
31
+
32
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ def _resolve_repo_root() -> Path:
37
+ env_root = os.environ.get("CERNENV_ROOT")
38
+ candidates = []
39
+ if env_root:
40
+ candidates.append(Path(env_root))
41
+ candidates.extend([
42
+ Path("/home/user/app"),
43
+ Path(__file__).resolve().parent.parent.parent,
44
+ ])
45
+ for p in candidates:
46
+ try:
47
+ if p.exists():
48
+ return p.resolve()
49
+ except OSError:
50
+ continue
51
+ return candidates[-1].resolve()
52
+
53
+
54
+ REPO_ROOT = _resolve_repo_root()
55
+ LOG_DIR = REPO_ROOT / "training" / "runs"
56
+ try:
57
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
58
+ except OSError as exc: # pragma: no cover - read-only filesystem fallback
59
+ logger.warning("could not create %s (%s); using /tmp", LOG_DIR, exc)
60
+ LOG_DIR = Path("/tmp/cernenv-runs")
61
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
62
+ LOG_FILE = LOG_DIR / "training.log"
63
+ METRICS_FILE = REPO_ROOT / "training" / "plots" / "metrics_summary.json"
64
+
65
+
66
+ def _env(name: str, default: str) -> str:
67
+ return os.environ.get(name, default)
68
+
69
+
70
+ CONFIG = {
71
+ "model_name": _env("MODEL_NAME", "unsloth/Qwen2.5-3B-Instruct"),
72
+ "difficulty": _env("DIFFICULTY", "easy"),
73
+ "total_episodes": int(_env("TOTAL_EPISODES", "400")),
74
+ "max_steps": int(_env("MAX_STEPS", "18")),
75
+ "num_generations": int(_env("NUM_GENERATIONS", "4")),
76
+ "output_dir": _env("OUTPUT_DIR", "training/runs/unsloth-grpo"),
77
+ "hf_username": _env("HF_USERNAME", "YOUR_HF_USERNAME"),
78
+ "push_repo": _env(
79
+ "PUSH_REPO",
80
+ f"{_env('HF_USERNAME', 'YOUR_HF_USERNAME')}/cernenv-grpo-qwen2.5-3b",
81
+ ),
82
+ "autostart": _env("AUTOSTART", "0") == "1",
83
+ }
84
+
85
+
86
+ # ── Run state ────────────────────────────────────────────────────────────
87
+
88
+
89
+ class RunState:
90
+ def __init__(self) -> None:
91
+ self.lock = threading.Lock()
92
+ self.thread: Optional[threading.Thread] = None
93
+ self.process: Optional[subprocess.Popen] = None
94
+ self.status: str = "idle" # idle | running | finished | failed
95
+ self.started_at: Optional[str] = None
96
+ self.finished_at: Optional[str] = None
97
+ self.last_error: Optional[str] = None
98
+ self.last_config: Dict[str, Any] = {}
99
+
100
+ def to_dict(self) -> Dict[str, Any]:
101
+ with self.lock:
102
+ return {
103
+ "status": self.status,
104
+ "started_at": self.started_at,
105
+ "finished_at": self.finished_at,
106
+ "last_error": self.last_error,
107
+ "last_config": self.last_config,
108
+ }
109
+
110
+
111
+ STATE = RunState()
112
+
113
+
114
+ # ── Training pipeline ────────────────────────────────────────────────────
115
+
116
+
117
+ def _stream_subprocess(cmd: list[str], log_handle) -> int:
118
+ log_handle.write(f"\n$ {' '.join(cmd)}\n")
119
+ log_handle.flush()
120
+ proc = subprocess.Popen(
121
+ cmd,
122
+ cwd=str(REPO_ROOT),
123
+ stdout=subprocess.PIPE,
124
+ stderr=subprocess.STDOUT,
125
+ bufsize=1,
126
+ universal_newlines=True,
127
+ env={**os.environ, "PYTHONPATH": str(REPO_ROOT)},
128
+ )
129
+ STATE.process = proc
130
+ assert proc.stdout is not None
131
+ for line in proc.stdout:
132
+ log_handle.write(line)
133
+ log_handle.flush()
134
+ rc = proc.wait()
135
+ log_handle.write(f"[exit code {rc}]\n")
136
+ log_handle.flush()
137
+ STATE.process = None
138
+ return rc
139
+
140
+
141
+ def _training_pipeline(config: Dict[str, Any]) -> None:
142
+ started = datetime.now(timezone.utc).isoformat()
143
+ with STATE.lock:
144
+ STATE.status = "running"
145
+ STATE.started_at = started
146
+ STATE.finished_at = None
147
+ STATE.last_error = None
148
+ STATE.last_config = dict(config)
149
+
150
+ LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
151
+ with open(LOG_FILE, "a") as log:
152
+ log.write(f"\n=== Training started {started} ===\n")
153
+ log.write(json.dumps(config, indent=2) + "\n")
154
+ log.flush()
155
+ try:
156
+ output_dir = config["output_dir"]
157
+ difficulty = config["difficulty"]
158
+ max_steps = str(config["max_steps"])
159
+ episodes = str(config["total_episodes"])
160
+ num_gens = str(config["num_generations"])
161
+ model_name = config["model_name"]
162
+ push_repo = config["push_repo"]
163
+ eval_pre = "training/runs/eval_pre_train.jsonl"
164
+ eval_post = "training/runs/eval_post_train.jsonl"
165
+ plots_dir = "training/plots"
166
+
167
+ log.write("\n--- baseline (heuristic / oracle / random) ---\n")
168
+ log.flush()
169
+ for agent in ("random", "heuristic", "oracle"):
170
+ _stream_subprocess(
171
+ [
172
+ sys.executable, "-m", "scripts.run_agent",
173
+ "--agent", agent, "--difficulty", difficulty,
174
+ "--episodes", "3", "--quiet",
175
+ ],
176
+ log,
177
+ )
178
+
179
+ log.write("\n--- pre-train evaluation ---\n")
180
+ log.flush()
181
+ rc = _stream_subprocess(
182
+ [
183
+ sys.executable, "-m", "training.evaluate",
184
+ "--model_name", model_name,
185
+ "--difficulty", difficulty,
186
+ "--episodes", "16",
187
+ "--max_steps", max_steps,
188
+ "--tag", "pre_train",
189
+ "--out", eval_pre,
190
+ ],
191
+ log,
192
+ )
193
+ if rc != 0:
194
+ raise RuntimeError(f"pre-train eval failed (rc={rc})")
195
+
196
+ log.write("\n--- GRPO training ---\n")
197
+ log.flush()
198
+ rc = _stream_subprocess(
199
+ [
200
+ sys.executable, "-m", "training.training_unsloth",
201
+ "--model_name", model_name,
202
+ "--difficulty", difficulty,
203
+ "--total_episodes", episodes,
204
+ "--max_steps", max_steps,
205
+ "--num_generations", num_gens,
206
+ "--output_dir", output_dir,
207
+ ],
208
+ log,
209
+ )
210
+ if rc != 0:
211
+ raise RuntimeError(f"training failed (rc={rc})")
212
+
213
+ log.write("\n--- post-train evaluation ---\n")
214
+ log.flush()
215
+ rc = _stream_subprocess(
216
+ [
217
+ sys.executable, "-m", "training.evaluate",
218
+ "--model_name", model_name,
219
+ "--adapter_dir", output_dir,
220
+ "--difficulty", difficulty,
221
+ "--episodes", "16",
222
+ "--max_steps", max_steps,
223
+ "--tag", "post_train",
224
+ "--out", eval_post,
225
+ ],
226
+ log,
227
+ )
228
+ if rc != 0:
229
+ raise RuntimeError(f"post-train eval failed (rc={rc})")
230
+
231
+ log.write("\n--- plots ---\n")
232
+ log.flush()
233
+ _stream_subprocess(
234
+ [
235
+ sys.executable, "-m", "training.plots",
236
+ "--pre", eval_pre,
237
+ "--post", eval_post,
238
+ "--out_dir", plots_dir,
239
+ ],
240
+ log,
241
+ )
242
+
243
+ if os.environ.get("HF_TOKEN"):
244
+ log.write("\n--- push adapters to Hub ---\n")
245
+ log.flush()
246
+ _stream_subprocess(
247
+ [
248
+ sys.executable, "-m", "scripts.push_to_hub", "model",
249
+ "--adapter_dir", output_dir,
250
+ "--repo_id", push_repo,
251
+ "--base_model", model_name,
252
+ ],
253
+ log,
254
+ )
255
+ else:
256
+ log.write("\n[skip] HF_TOKEN not set — not pushing to Hub\n")
257
+ log.flush()
258
+
259
+ with STATE.lock:
260
+ STATE.status = "finished"
261
+ except Exception as exc:
262
+ logger.exception("training pipeline failed")
263
+ with STATE.lock:
264
+ STATE.status = "failed"
265
+ STATE.last_error = str(exc)
266
+ finally:
267
+ finished = datetime.now(timezone.utc).isoformat()
268
+ log.write(f"\n=== Training ended {finished} ===\n")
269
+ log.flush()
270
+ with STATE.lock:
271
+ STATE.finished_at = finished
272
+
273
+
274
+ def _start_training(config: Dict[str, Any]) -> None:
275
+ with STATE.lock:
276
+ if STATE.status == "running":
277
+ raise RuntimeError("a training run is already in progress")
278
+ STATE.thread = threading.Thread(
279
+ target=_training_pipeline,
280
+ args=(config,),
281
+ name="cernenv-trainer",
282
+ daemon=True,
283
+ )
284
+ STATE.thread.start()
285
+
286
+
287
+ # ── FastAPI app ──────────────────────────────────────────────────────────
288
+
289
+
290
+ app = FastAPI(title="CERNenv Trainer", version="0.1.0")
291
+
292
+
293
+ _HTML = """\
294
+ <!doctype html>
295
+ <html lang=en>
296
+ <head>
297
+ <meta charset=utf-8>
298
+ <title>CERNenv Trainer</title>
299
+ <style>
300
+ body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 2rem auto; max-width: 760px; color:#111 }
301
+ h1 { margin-bottom: 0 }
302
+ .muted { color:#666 }
303
+ pre { background:#0e1116; color:#e6edf3; padding:1rem; border-radius:6px; overflow-x:auto; max-height:50vh }
304
+ button { font-size:1rem; padding:.6rem 1rem; border-radius:6px; border:1px solid #888; background:#fff; cursor:pointer }
305
+ .pill { display:inline-block; padding:.1rem .5rem; border-radius:999px; background:#eef; color:#225 }
306
+ .ok { background:#dfd; color:#272 }
307
+ .fail { background:#fdd; color:#822 }
308
+ .run { background:#fdf6d8; color:#774 }
309
+ table { border-collapse:collapse; }
310
+ td { padding:.2rem .8rem .2rem 0; }
311
+ </style>
312
+ </head>
313
+ <body>
314
+ <h1>⚛️ CERNenv Trainer</h1>
315
+ <p class=muted>GRPO + Unsloth + LoRA on the CERNenv LHC discovery environment.</p>
316
+
317
+ <h3>Status: <span id=status class=pill>?</span></h3>
318
+ <table id=meta></table>
319
+
320
+ <p>
321
+ <button onclick="startRun()">▶ Start training</button>
322
+ <button onclick="refresh()">↻ Refresh</button>
323
+ </p>
324
+
325
+ <h3>Logs (tail)</h3>
326
+ <pre id=logs>loading…</pre>
327
+
328
+ <script>
329
+ async function refresh() {
330
+ const s = await fetch('/status').then(r => r.json());
331
+ const pill = document.getElementById('status');
332
+ pill.textContent = s.status;
333
+ pill.className = 'pill ' + ({idle:'',running:'run',finished:'ok',failed:'fail'}[s.status] || '');
334
+
335
+ const meta = document.getElementById('meta');
336
+ meta.innerHTML = '';
337
+ for (const [k, v] of Object.entries({
338
+ started_at: s.started_at, finished_at: s.finished_at, error: s.last_error,
339
+ ...(s.last_config || {}),
340
+ })) {
341
+ if (v == null || v === '') continue;
342
+ const tr = document.createElement('tr');
343
+ tr.innerHTML = `<td><b>${k}</b></td><td><code>${v}</code></td>`;
344
+ meta.appendChild(tr);
345
+ }
346
+
347
+ const logs = await fetch('/logs?tail=200').then(r => r.text());
348
+ document.getElementById('logs').textContent = logs || '(no logs yet)';
349
+ }
350
+ async function startRun() {
351
+ await fetch('/train', {method:'POST'});
352
+ setTimeout(refresh, 500);
353
+ }
354
+ refresh();
355
+ setInterval(refresh, 5000);
356
+ </script>
357
+ </body>
358
+ </html>
359
+ """
360
+
361
+
362
+ @app.get("/", response_class=HTMLResponse)
363
+ def index() -> HTMLResponse:
364
+ return HTMLResponse(_HTML)
365
+
366
+
367
+ @app.get("/health")
368
+ def health() -> Dict[str, str]:
369
+ return {"status": "ok"}
370
+
371
+
372
+ @app.get("/status")
373
+ def status() -> JSONResponse:
374
+ return JSONResponse(STATE.to_dict())
375
+
376
+
377
+ @app.get("/metrics")
378
+ def metrics() -> JSONResponse:
379
+ if METRICS_FILE.exists():
380
+ try:
381
+ return JSONResponse(json.loads(METRICS_FILE.read_text()))
382
+ except Exception:
383
+ return JSONResponse({"error": "metrics file unreadable"}, status_code=500)
384
+ return JSONResponse({"pre": None, "post": None})
385
+
386
+
387
+ @app.get("/logs", response_class=PlainTextResponse)
388
+ def logs(tail: int = 400) -> PlainTextResponse:
389
+ if not LOG_FILE.exists():
390
+ return PlainTextResponse("")
391
+ text = LOG_FILE.read_text()
392
+ lines = text.splitlines()
393
+ return PlainTextResponse("\n".join(lines[-max(tail, 1):]))
394
+
395
+
396
+ @app.post("/train")
397
+ def train() -> JSONResponse:
398
+ try:
399
+ _start_training(dict(CONFIG))
400
+ except RuntimeError as exc:
401
+ raise HTTPException(status_code=409, detail=str(exc))
402
+ return JSONResponse({"status": "started", "config": CONFIG})
403
+
404
+
405
+ @app.on_event("startup")
406
+ def _maybe_autostart() -> None:
407
+ if CONFIG["autostart"]:
408
+ try:
409
+ _start_training(dict(CONFIG))
410
+ logger.info("autostarted training run")
411
+ except RuntimeError as exc:
412
+ logger.warning("autostart skipped: %s", exc)
space/training/requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu121
2
+ torch==2.4.0
3
+ unsloth
4
+ unsloth_zoo
5
+ transformers>=4.44.0
6
+ trl>=0.9.0
7
+ peft>=0.10.0
8
+ accelerate>=1.0.0
9
+ datasets>=2.18.0
10
+ bitsandbytes>=0.43.0
11
+ matplotlib>=3.8.0
12
+ numpy>=1.24.0
13
+ scipy>=1.10.0
14
+ pydantic>=2.0.0
15
+ fastapi>=0.110.0
16
+ uvicorn>=0.27.0
17
+ huggingface_hub>=0.24.0
18
+ git+https://github.com/meta-pytorch/OpenEnv.git
training/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Training utilities: rollout collection, GRPO/PPO training, evaluation."""
training/colab_train_unsloth.ipynb ADDED
@@ -0,0 +1,260 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# CERNenv — Unsloth + LoRA + GRPO training\n",
8
+ "\n",
9
+ "Trains a small instruction-tuned LLM (Large Language Model) to act as an LHC (Large Hadron Collider) physicist inside the **CERNenv** OpenEnv environment, using **GRPO** (Group-Relative Policy Optimization) with **Unsloth** + **LoRA** (Low-Rank Adaptation).\n",
10
+ "\n",
11
+ "Runs on:\n",
12
+ "- a **Hugging Face Space** with an A100 GPU (recommended)\n",
13
+ "- Google **Colab** (T4 / L4) as a fallback\n",
14
+ "\n",
15
+ "Outputs:\n",
16
+ "- LoRA adapters at `runs/unsloth-grpo`\n",
17
+ "- Reward / success-rate curves at `training/plots/`\n",
18
+ "- Final adapters pushed to your Hugging Face Hub repo"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {},
24
+ "source": [
25
+ "## 1. Environment setup"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "%%capture\n",
35
+ "import sys, os\n",
36
+ "IN_COLAB = 'google.colab' in sys.modules\n",
37
+ "IN_HF_SPACE = os.environ.get('SPACE_ID') is not None\n",
38
+ "print('Colab:', IN_COLAB, '| HF Space:', IN_HF_SPACE)\n",
39
+ "\n",
40
+ "if IN_COLAB:\n",
41
+ " !git clone https://github.com/YOUR_HF_USERNAME/CERNenv.git\n",
42
+ " %cd CERNenv\n",
43
+ "elif IN_HF_SPACE:\n",
44
+ " %cd /home/user/app\n",
45
+ "else:\n",
46
+ " pass\n",
47
+ "\n",
48
+ "!pip install -q -r requirements-unsloth.txt"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "import os, json, subprocess, sys\n",
58
+ "from pathlib import Path\n",
59
+ "import torch\n",
60
+ "print('CUDA:', torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)\n",
61
+ "Path('training/plots').mkdir(parents=True, exist_ok=True)\n",
62
+ "Path('training/runs').mkdir(parents=True, exist_ok=True)"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "markdown",
67
+ "metadata": {},
68
+ "source": [
69
+ "## 2. Hugging Face authentication\n",
70
+ "\n",
71
+ "On a Space, set the `HF_TOKEN` Space-secret. Locally / on Colab, paste a token below. The token must have **write** access to your model repo."
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "from huggingface_hub import login\n",
81
+ "HF_TOKEN = os.environ.get('HF_TOKEN')\n",
82
+ "if HF_TOKEN:\n",
83
+ " login(HF_TOKEN)\n",
84
+ " print('logged in via HF_TOKEN env var')\n",
85
+ "else:\n",
86
+ " from getpass import getpass\n",
87
+ " login(getpass('Paste HF token: '))"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "markdown",
92
+ "metadata": {},
93
+ "source": [
94
+ "## 3. Configure the run"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "HF_USERNAME = os.environ.get('HF_USERNAME', 'YOUR_HF_USERNAME')\n",
104
+ "MODEL_NAME = os.environ.get('MODEL_NAME', 'unsloth/Qwen2.5-3B-Instruct')\n",
105
+ "TOTAL_EPISODES = int(os.environ.get('TOTAL_EPISODES', '400'))\n",
106
+ "DIFFICULTY = os.environ.get('DIFFICULTY', 'easy')\n",
107
+ "MAX_STEPS = int(os.environ.get('MAX_STEPS', '18'))\n",
108
+ "OUTPUT_DIR = os.environ.get('OUTPUT_DIR', 'training/runs/unsloth-grpo')\n",
109
+ "PUSH_REPO = os.environ.get('PUSH_REPO', f'{HF_USERNAME}/cernenv-grpo-qwen2.5-3b')\n",
110
+ "print({'model': MODEL_NAME, 'episodes': TOTAL_EPISODES, 'difficulty': DIFFICULTY,\n",
111
+ " 'max_steps': MAX_STEPS, 'out': OUTPUT_DIR, 'repo': PUSH_REPO})"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "markdown",
116
+ "metadata": {},
117
+ "source": [
118
+ "## 4. Quick sanity check: heuristic vs random baseline\n",
119
+ "\n",
120
+ "Before training, confirm the environment + reward signal are working."
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "!PYTHONPATH=. python -m scripts.run_agent --agent random --difficulty $DIFFICULTY --episodes 3 --quiet\n",
130
+ "!PYTHONPATH=. python -m scripts.run_agent --agent heuristic --difficulty $DIFFICULTY --episodes 3 --quiet\n",
131
+ "!PYTHONPATH=. python -m scripts.run_agent --agent oracle --difficulty $DIFFICULTY --episodes 3 --quiet"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "markdown",
136
+ "metadata": {},
137
+ "source": [
138
+ "## 5. Pre-training evaluation (zero-shot LLM)"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "!PYTHONPATH=. python -m training.evaluate \\\n",
148
+ " --model_name $MODEL_NAME \\\n",
149
+ " --difficulty $DIFFICULTY \\\n",
150
+ " --episodes 16 \\\n",
151
+ " --max_steps $MAX_STEPS \\\n",
152
+ " --tag pre_train \\\n",
153
+ " --out training/runs/eval_pre_train.jsonl"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "markdown",
158
+ "metadata": {},
159
+ "source": [
160
+ "## 6. Train with Unsloth + LoRA + GRPO"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": null,
166
+ "metadata": {},
167
+ "outputs": [],
168
+ "source": [
169
+ "!PYTHONPATH=. python -m training.training_unsloth \\\n",
170
+ " --model_name $MODEL_NAME \\\n",
171
+ " --difficulty $DIFFICULTY \\\n",
172
+ " --total_episodes $TOTAL_EPISODES \\\n",
173
+ " --max_steps $MAX_STEPS \\\n",
174
+ " --num_generations 4 \\\n",
175
+ " --output_dir $OUTPUT_DIR"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "markdown",
180
+ "metadata": {},
181
+ "source": [
182
+ "## 7. Post-training evaluation"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "!PYTHONPATH=. python -m training.evaluate \\\n",
192
+ " --model_name $MODEL_NAME \\\n",
193
+ " --adapter_dir $OUTPUT_DIR \\\n",
194
+ " --difficulty $DIFFICULTY \\\n",
195
+ " --episodes 16 \\\n",
196
+ " --max_steps $MAX_STEPS \\\n",
197
+ " --tag post_train \\\n",
198
+ " --out training/runs/eval_post_train.jsonl"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "markdown",
203
+ "metadata": {},
204
+ "source": [
205
+ "## 8. Plot before / after"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": null,
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "!PYTHONPATH=. python -m training.plots \\\n",
215
+ " --pre training/runs/eval_pre_train.jsonl \\\n",
216
+ " --post training/runs/eval_post_train.jsonl \\\n",
217
+ " --out_dir training/plots"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "markdown",
222
+ "metadata": {},
223
+ "source": [
224
+ "## 9. Push trained adapters to the Hugging Face Hub"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "!PYTHONPATH=. python -m scripts.push_to_hub model \\\n",
234
+ " --adapter_dir $OUTPUT_DIR \\\n",
235
+ " --repo_id $PUSH_REPO \\\n",
236
+ " --base_model $MODEL_NAME"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "markdown",
241
+ "metadata": {},
242
+ "source": [
243
+ "Done. Reward + success-rate plots live in `training/plots/`, model adapters at `OUTPUT_DIR`, and a copy is pushed to `PUSH_REPO`."
244
+ ]
245
+ }
246
+ ],
247
+ "metadata": {
248
+ "kernelspec": {
249
+ "display_name": "Python 3",
250
+ "language": "python",
251
+ "name": "python3"
252
+ },
253
+ "language_info": {
254
+ "name": "python",
255
+ "version": "3.11"
256
+ }
257
+ },
258
+ "nbformat": 4,
259
+ "nbformat_minor": 5
260
+ }
training/evaluate.py ADDED
@@ -0,0 +1,152 @@
1
+ """Evaluate an LLM (with optional LoRA adapters) on CERNenv.
2
+
3
+ Usage:
4
+ python -m training.evaluate --model_name unsloth/Qwen2.5-3B-Instruct \\
5
+ --difficulty easy --episodes 16 --tag pre_train \\
6
+ --out training/runs/eval_pre_train.jsonl
7
+
8
+ python -m training.evaluate --model_name unsloth/Qwen2.5-3B-Instruct \\
9
+ --adapter_dir training/runs/unsloth-grpo --difficulty easy \\
10
+ --episodes 16 --tag post_train --out training/runs/eval_post_train.jsonl
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import logging
17
+ from pathlib import Path
18
+ from typing import Dict, List, Optional
22
+
23
+
24
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ def _build_generate_fn(
29
+ *,
30
+ model_name: str,
31
+ adapter_dir: Optional[str],
32
+ use_unsloth: bool,
33
+ max_seq_length: int,
34
+ ):
35
+ if use_unsloth:
36
+ from unsloth import FastLanguageModel # type: ignore
37
+
38
+ model, tokenizer = FastLanguageModel.from_pretrained(
39
+ model_name=model_name,
40
+ max_seq_length=max_seq_length,
41
+ load_in_4bit=True,
42
+ fast_inference=True,
43
+ )
44
+ if adapter_dir:
45
+ model.load_adapter(adapter_dir)
46
+ FastLanguageModel.for_inference(model)
47
+ else:
48
+ import torch
49
+ from transformers import AutoModelForCausalLM, AutoTokenizer
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
52
+ model = AutoModelForCausalLM.from_pretrained(
53
+ model_name,
54
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
55
+ device_map="auto" if torch.cuda.is_available() else None,
56
+ )
57
+ if adapter_dir:
58
+ from peft import PeftModel # type: ignore
59
+ model = PeftModel.from_pretrained(model, adapter_dir)
60
+
61
+ if tokenizer.pad_token is None:
62
+ tokenizer.pad_token = tokenizer.eos_token
63
+
64
+ def prompt_fn(chat: List[Dict[str, str]]) -> str:
65
+ return tokenizer.apply_chat_template(
66
+ chat, add_generation_prompt=True, tokenize=False
67
+ )
68
+
69
+ def generate_fn(prompt: str, config) -> str:
70
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
71
+ outputs = model.generate(
72
+ **inputs,
73
+ max_new_tokens=config.max_new_tokens,
74
+ do_sample=True,
75
+ temperature=config.temperature,
76
+ top_p=config.top_p,
77
+ pad_token_id=tokenizer.pad_token_id,
78
+ )
79
+ gen = outputs[0][inputs["input_ids"].shape[1]:]
80
+ return tokenizer.decode(gen, skip_special_tokens=True)
81
+
82
+ return prompt_fn, generate_fn
83
+
84
+
85
+ def main() -> None: # pragma: no cover
86
+ parser = argparse.ArgumentParser()
87
+ parser.add_argument("--model_name", required=True)
88
+ parser.add_argument("--adapter_dir", default=None)
89
+ parser.add_argument("--scenario", default=None)
90
+ parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default="easy")
91
+ parser.add_argument("--episodes", type=int, default=16)
92
+ parser.add_argument("--seed", type=int, default=1000)
93
+ parser.add_argument("--max_steps", type=int, default=18)
94
+ parser.add_argument("--max_seq_length", type=int, default=2048)
95
+ parser.add_argument("--no_unsloth", action="store_true")
96
+ parser.add_argument("--tag", default="eval")
97
+ parser.add_argument("--out", required=True)
98
+ args = parser.parse_args()
99
+
100
+ from server.environment import CERNCollisionEnvironment
101
+ from training.llm_agent import LLMAgentConfig
102
+ from training.rollouts import collect_episode, save_episodes_jsonl
103
+
104
+ use_unsloth = not args.no_unsloth
105
+ try:
106
+ prompt_fn, generate_fn = _build_generate_fn(
107
+ model_name=args.model_name,
108
+ adapter_dir=args.adapter_dir,
109
+ use_unsloth=use_unsloth,
110
+ max_seq_length=args.max_seq_length,
111
+ )
112
+ except ImportError as exc:
113
+ logger.warning("Unsloth not available (%s); falling back to transformers.", exc)
114
+ prompt_fn, generate_fn = _build_generate_fn(
115
+ model_name=args.model_name,
116
+ adapter_dir=args.adapter_dir,
117
+ use_unsloth=False,
118
+ max_seq_length=args.max_seq_length,
119
+ )
120
+
121
+ env = CERNCollisionEnvironment(max_steps=args.max_steps)
122
+ cfg = LLMAgentConfig()
123
+
124
+ episodes = []
125
+ for ep in range(args.episodes):
126
+ seed = args.seed + ep
127
+ rec = collect_episode(
128
+ env=env,
129
+ seed=seed,
130
+ scenario=args.scenario,
131
+ difficulty=args.difficulty,
132
+ prompt_fn=prompt_fn,
133
+ generate_fn=generate_fn,
134
+ config=cfg,
135
+ )
136
+ episodes.append(rec)
137
+ logger.info(
138
+ "[%s][%d/%d] reward=%+.3f discovered=%s mass=%s channel=%s",
139
+ args.tag, ep + 1, args.episodes,
140
+ rec.cumulative_reward, rec.discovered, rec.correct_mass, rec.correct_channel,
141
+ )
142
+
143
+ Path(args.out).parent.mkdir(parents=True, exist_ok=True)
144
+ save_episodes_jsonl(episodes, args.out)
145
+
146
+ rewards = [e.cumulative_reward for e in episodes]
147
+ success = sum(1 for e in episodes if e.discovered) / len(episodes)
148
+ logger.info("[%s] mean_reward=%.3f success_rate=%.2f", args.tag, sum(rewards) / len(rewards), success)
149
+
150
+
151
+ if __name__ == "__main__": # pragma: no cover
152
+ main()
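Each JSONL line written here is one flat episode record. A minimal sketch of reading a file back — it assumes `save_episodes_jsonl` serialises the fields logged above (`cumulative_reward`, `discovered`, `correct_mass`, `correct_channel`), which is also how `training.plots` consumes them:

```python
import json

with open("training/runs/eval_pre_train.jsonl") as f:
    episodes = [json.loads(line) for line in f if line.strip()]

n = max(len(episodes), 1)
mean_reward = sum(float(e.get("cumulative_reward") or 0.0) for e in episodes) / n
success = sum(1 for e in episodes if e.get("discovered")) / n
print(f"n={len(episodes)} mean_reward={mean_reward:+.3f} success_rate={success:.2f}")
```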
training/llm_agent.py ADDED
@@ -0,0 +1,227 @@
1
+ """LLM (Large Language Model) agent that picks the next CERNenv action.
2
+
3
+ The agent renders an observation as a short prompt, asks the LLM for a
4
+ JSON-formatted ``ExperimentAction``, validates the response, and falls back
5
+ to a safe default action if parsing fails. This is the unit shared by
6
+ evaluation and the GRPO (Group-Relative Policy Optimization) training loop.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import re
13
+ from dataclasses import dataclass
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from models import (
17
+ ActionType,
18
+ CollisionObservation,
19
+ ExperimentAction,
20
+ build_agent_observation_context,
21
+ build_agent_system_prompt,
22
+ )
23
+
24
+
25
+ _VALID_ACTIONS = {a.value for a in ActionType}
26
+
27
+
28
+ @dataclass
29
+ class LLMAgentConfig:
30
+ """Knobs for prompt formatting and decoding."""
31
+
32
+ max_history_steps: int = 6
33
+ temperature: float = 0.7
34
+ max_new_tokens: int = 256
35
+ top_p: float = 0.95
36
+
37
+
38
+ def render_history(obs: CollisionObservation, max_steps: int) -> str:
39
+ if not obs.pipeline_history:
40
+ return " (none yet — pick a starting action)"
41
+ lines: List[str] = []
42
+ history = obs.pipeline_history[-max_steps:]
43
+ for rec in history:
44
+ success = "OK" if rec.success else "FAIL"
45
+ lines.append(
46
+ f" step {rec.step_index:>2} {rec.action_type.value:<24} {success}: {rec.output_summary[:80]}"
47
+ )
48
+ return "\n".join(lines)
49
+
50
+
51
+ def render_resources(obs: CollisionObservation) -> str:
52
+ r = obs.resource_usage
53
+ return (
54
+ f"budget {r.budget_remaining_musd:.1f}/{r.budget_remaining_musd + r.budget_used_musd:.1f} M$ left, "
55
+ f"luminosity {r.luminosity_remaining_fb:.1f}/{r.luminosity_remaining_fb + r.luminosity_used_fb:.1f} fb^-1 left, "
56
+ f"time {r.time_remaining_days:.0f}/{r.time_remaining_days + r.time_used_days:.0f} days left"
57
+ )
58
+
59
+
60
+ def render_user_prompt(
61
+ obs: CollisionObservation,
62
+ config: LLMAgentConfig = LLMAgentConfig(),
63
+ ) -> str:
64
+ parts: List[str] = []
65
+ parts.append("Task:")
66
+ parts.append(" " + obs.task.problem_statement.strip())
67
+ parts.append("")
68
+ parts.append("Public state:")
69
+ parts.append(" " + build_agent_observation_context(obs).replace("\n", "\n "))
70
+ parts.append("")
71
+ parts.append("Resources:")
72
+ parts.append(" " + render_resources(obs))
73
+ parts.append("")
74
+ parts.append("Recent steps:")
75
+ parts.append(render_history(obs, max_steps=config.max_history_steps))
76
+ if obs.rule_violations:
77
+ parts.append("")
78
+ parts.append("Last-step violations: " + ", ".join(obs.rule_violations))
79
+ parts.append("")
80
+ parts.append("Choose ONE next action and respond with a single JSON object.")
81
+ return "\n".join(parts)
82
+
83
+
84
+ def build_chat(
85
+ obs: CollisionObservation,
86
+ config: LLMAgentConfig = LLMAgentConfig(),
87
+ ) -> List[Dict[str, str]]:
88
+ return [
89
+ {"role": "system", "content": build_agent_system_prompt()},
90
+ {"role": "user", "content": render_user_prompt(obs, config)},
91
+ ]
92
+
93
+
94
+ # ── Robust JSON extraction ───────────────────────────────────────────────
95
+
96
+
97
+ _JSON_RE = re.compile(r"\{[\s\S]*\}")
98
+
99
+
100
+ def extract_first_json(text: str) -> Optional[Dict[str, Any]]:
101
+ """Return the first parseable JSON object found inside ``text``."""
102
+ if not text:
103
+ return None
104
+ m = _JSON_RE.search(text)
105
+ if not m:
106
+ return None
107
+ candidate = m.group(0)
108
+ try:
109
+ return json.loads(candidate)
110
+ except json.JSONDecodeError:
111
+ # Try a relaxed pass: trim trailing commas
112
+ cleaned = re.sub(r",\s*([}\]])", r"\1", candidate)
113
+ try:
114
+ return json.loads(cleaned)
115
+ except json.JSONDecodeError:
116
+ return None
117
+
118
+
119
+ def parse_action(text: str) -> Optional[ExperimentAction]:
+     payload = extract_first_json(text)
+     if payload is None:
+         return None
+     action_type = payload.get("action_type")
+     if action_type not in _VALID_ACTIONS:
+         return None
+     try:
+         return ExperimentAction(
+             action_type=ActionType(action_type),
+             method=payload.get("method") or None,
+             parameters=payload.get("parameters") or {},
+             justification=payload.get("justification"),
+             confidence=float(payload.get("confidence", 0.5) or 0.5),
+         )
+     except Exception:
+         return None
+
+
+ def safe_default_action(obs: CollisionObservation) -> ExperimentAction:
+     """Pick the next sensible scripted step when the LLM output is invalid."""
+     prog = obs.pipeline_history
+     flags = {a.value: False for a in ActionType}
+     for rec in prog:
+         if rec.success:
+             flags[rec.action_type.value] = True
+
+     if not flags[ActionType.CONFIGURE_BEAM.value]:
+         return ExperimentAction(
+             action_type=ActionType.CONFIGURE_BEAM,
+             parameters={"beam_energy": "13TeV"},
+             justification="default fallback",
+         )
+     if not flags[ActionType.SELECT_CHANNEL.value]:
+         return ExperimentAction(
+             action_type=ActionType.SELECT_CHANNEL,
+             parameters={"channel": obs.task.available_channels[0] if obs.task.available_channels else "diphoton"},
+             justification="default fallback",
+         )
+     if not flags[ActionType.SET_TRIGGER.value]:
+         return ExperimentAction(
+             action_type=ActionType.SET_TRIGGER,
+             parameters={"trigger": "diphoton_hlt"},
+             justification="default fallback",
+         )
+     if not flags[ActionType.ALLOCATE_LUMINOSITY.value]:
+         return ExperimentAction(
+             action_type=ActionType.ALLOCATE_LUMINOSITY,
+             parameters={"luminosity_fb": 50.0},
+             justification="default fallback",
+         )
+     if not flags[ActionType.COLLECT_COLLISIONS.value]:
+         return ExperimentAction(
+             action_type=ActionType.COLLECT_COLLISIONS,
+             parameters={"luminosity_fb": 50.0},
+             justification="default fallback",
+         )
+     if not flags[ActionType.RECONSTRUCT_TRACKS.value]:
+         return ExperimentAction(
+             action_type=ActionType.RECONSTRUCT_TRACKS,
+             justification="default fallback",
+         )
+     if not flags[ActionType.BUILD_INVARIANT_MASS.value]:
+         return ExperimentAction(
+             action_type=ActionType.BUILD_INVARIANT_MASS,
+             parameters={"mass_window_gev": obs.task.mass_search_window_gev},
+             justification="default fallback",
+         )
+     if not flags[ActionType.FIT_RESONANCE.value]:
+         return ExperimentAction(
+             action_type=ActionType.FIT_RESONANCE,
+             method="ROOT_RooFit",
+             justification="default fallback",
+         )
+     if not flags[ActionType.ESTIMATE_SIGNIFICANCE.value]:
+         return ExperimentAction(
+             action_type=ActionType.ESTIMATE_SIGNIFICANCE,
+             method="Asimov_significance",
+             justification="default fallback",
+         )
+
+     mass = obs.candidate_masses_gev[-1] if obs.candidate_masses_gev else 125.0
+     return ExperimentAction(
+         action_type=ActionType.SUBMIT_DISCOVERY_CLAIM,
+         parameters={
+             "claim": {
+                 "mass_estimate_gev": mass,
+                 "mass_uncertainty_gev": 1.0,
+                 "significance_sigma": obs.cumulative_significance,
+                 "decay_channel": obs.selected_channel or "diphoton",
+                 "spin_hypothesis": 0,
+                 "parity": "+",
+                 "confidence": 0.7,
+             }
+         },
+         justification="default fallback claim",
+     )
+
+
+ __all__ = [
+     "LLMAgentConfig",
+     "build_chat",
+     "extract_first_json",
+     "parse_action",
+     "render_history",
+     "render_resources",
+     "render_user_prompt",
+     "safe_default_action",
+ ]
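A quick sanity check of the parser contract above (a minimal sketch: it assumes ``extract_first_json`` pulls the first JSON object out of free text, as its name suggests, and that the literal "configure_beam" matches ``ActionType.CONFIGURE_BEAM.value``):

    from training.llm_agent import parse_action

    completion = '''Reasoning: configure the beam first.
    {"action_type": "configure_beam", "parameters": {"beam_energy": "13TeV"},
     "justification": "standard Run-2 energy", "confidence": 0.8}'''

    action = parse_action(completion)
    assert action is not None and action.confidence == 0.8
    assert parse_action("no json here") is None  # callers then use safe_default_action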
training/plots.py ADDED
@@ -0,0 +1,93 @@
+ """Plot before/after evaluation curves and reward breakdowns.
+
+ Reads two JSONL evaluation files (typically ``eval_pre_train.jsonl`` and
+ ``eval_post_train.jsonl``) produced by ``training.evaluate`` and writes
+ publication-ready PNGs under ``--out_dir``.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+
+ def _load(path: str) -> List[Dict[str, Any]]:
+     eps = []
+     with open(path) as f:
+         for line in f:
+             line = line.strip()
+             if line:
+                 eps.append(json.loads(line))
+     return eps
+
+
+ def _summarise(eps: List[Dict[str, Any]]) -> Dict[str, float]:
+     if not eps:
+         return {"mean": 0.0, "success_rate": 0.0, "mass_acc": 0.0, "channel_acc": 0.0}
+     rewards = [float(e.get("cumulative_reward") or 0.0) for e in eps]
+     return {
+         "mean": sum(rewards) / len(rewards),
+         "success_rate": sum(1 for e in eps if e.get("discovered")) / len(eps),
+         "mass_acc": sum(1 for e in eps if e.get("correct_mass")) / len(eps),
+         "channel_acc": sum(1 for e in eps if e.get("correct_channel")) / len(eps),
+     }
+
+
+ def main() -> None:  # pragma: no cover
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--pre", required=True)
+     parser.add_argument("--post", required=True)
+     parser.add_argument("--out_dir", default="training/plots")
+     args = parser.parse_args()
+
+     import matplotlib
+     matplotlib.use("Agg")
+     import matplotlib.pyplot as plt
+
+     pre = _load(args.pre)
+     post = _load(args.post)
+     pre_stats = _summarise(pre)
+     post_stats = _summarise(post)
+
+     out = Path(args.out_dir)
+     out.mkdir(parents=True, exist_ok=True)
+
+     pre_rewards = [float(e.get("cumulative_reward") or 0.0) for e in pre]
+     post_rewards = [float(e.get("cumulative_reward") or 0.0) for e in post]
+
+     fig, ax = plt.subplots(figsize=(7, 4))
+     ax.hist(pre_rewards, bins=15, alpha=0.5, label=f"pre (μ={pre_stats['mean']:+.2f})")
+     ax.hist(post_rewards, bins=15, alpha=0.5, label=f"post (μ={post_stats['mean']:+.2f})")
+     ax.set_xlabel("episode cumulative reward")
+     ax.set_ylabel("episode count")
+     ax.set_title("CERNenv reward distribution: pre vs post training")
+     ax.legend()
+     fig.tight_layout()
+     fig.savefig(out / "reward_distribution.png", dpi=140)
+     plt.close(fig)
+
+     metrics = ["mean", "success_rate", "mass_acc", "channel_acc"]
+     pre_vals = [pre_stats[m] for m in metrics]
+     post_vals = [post_stats[m] for m in metrics]
+     x = list(range(len(metrics)))
+     fig, ax = plt.subplots(figsize=(7, 4))
+     ax.bar([i - 0.18 for i in x], pre_vals, width=0.36, label="pre")
+     ax.bar([i + 0.18 for i in x], post_vals, width=0.36, label="post")
+     ax.set_xticks(x)
+     ax.set_xticklabels(metrics)
+     ax.set_title("Mean reward & accuracy: pre vs post training")
+     ax.legend()
+     fig.tight_layout()
+     fig.savefig(out / "metrics_summary.png", dpi=140)
+     plt.close(fig)
+
+     with open(out / "metrics_summary.json", "w") as f:
+         json.dump({"pre": pre_stats, "post": post_stats}, f, indent=2)
+
+     print("wrote:", list(out.glob("*")))
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     main()
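A toy check of ``_summarise`` (hypothetical records; only the keys the function actually reads are filled in):

    from training.plots import _summarise

    eps = [
        {"cumulative_reward": 3.2, "discovered": True, "correct_mass": True},
        {"cumulative_reward": -1.0, "discovered": False},
    ]
    stats = _summarise(eps)
    assert abs(stats["mean"] - 1.1) < 1e-9
    assert stats["success_rate"] == 0.5 and stats["mass_acc"] == 0.5

The matching CLI call is ``python -m training.plots --pre eval_pre_train.jsonl --post eval_post_train.jsonl``.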
training/rollouts.py ADDED
@@ -0,0 +1,160 @@
+ """Rollout collector for LLM-driven CERNenv episodes.
+
+ Runs an LLM agent in-process against ``CERNCollisionEnvironment`` and
+ records full per-step trajectories: prompt, completion, parsed action,
+ reward, observation snapshot, and final episode summary.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ from dataclasses import asdict, dataclass, field
+ from typing import Any, Callable, Dict, List, Optional
+
+ from models import CollisionObservation
+ from server.environment import CERNCollisionEnvironment
+
+ from .llm_agent import (
+     LLMAgentConfig,
+     build_chat,
+     parse_action,
+     safe_default_action,
+ )
+
+
+ logger = logging.getLogger(__name__)
+
+
+ PromptFn = Callable[[List[Dict[str, str]]], str]
+ """Callable: tokenizer-aware prompt formatter (e.g. apply_chat_template)."""
+
+ GenerateFn = Callable[[str, LLMAgentConfig], str]
+ """Callable: actually run the LLM and return the raw completion string."""
+
+
+ @dataclass
+ class StepRecord:
+     step: int
+     prompt: str
+     completion: str
+     action: Dict[str, Any]
+     parsed_ok: bool
+     reward: float
+     done: bool
+     rule_violations: List[str]
+     observation_summary: Dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class EpisodeRecord:
+     seed: int
+     scenario: Optional[str]
+     difficulty: Optional[str]
+     truth: Optional[Dict[str, Any]]
+     total_reward: float
+     cumulative_reward: float
+     terminal_reward: Optional[float]
+     discovered: Optional[bool]
+     correct_mass: Optional[bool]
+     correct_channel: Optional[bool]
+     correct_spin: Optional[bool]
+     steps: List[StepRecord]
+
+
+ def _summarise_obs(obs: CollisionObservation) -> Dict[str, Any]:
+     return {
+         "step_index": obs.step_index,
+         "selected_channel": obs.selected_channel,
+         "selected_beam_energy": obs.selected_beam_energy,
+         "n_candidates": len(obs.candidate_masses_gev),
+         "best_significance": obs.cumulative_significance,
+         "budget_remaining_musd": obs.resource_usage.budget_remaining_musd,
+         "luminosity_remaining_fb": obs.resource_usage.luminosity_remaining_fb,
+     }
+
+
+ def collect_episode(
+     *,
+     env: CERNCollisionEnvironment,
+     seed: int,
+     scenario: Optional[str],
+     difficulty: Optional[str],
+     prompt_fn: PromptFn,
+     generate_fn: GenerateFn,
+     config: LLMAgentConfig = LLMAgentConfig(),
+     max_steps: Optional[int] = None,
+ ) -> EpisodeRecord:
+     obs = env.reset(seed=seed, scenario=scenario, difficulty=difficulty)
+     steps: List[StepRecord] = []
+     total_reward = 0.0
+
+     cap = max_steps or env.max_steps
+     while not obs.done and len(steps) < cap:
+         chat = build_chat(obs, config)
+         prompt = prompt_fn(chat)
+         completion = generate_fn(prompt, config)
+
+         action = parse_action(completion)
+         parsed_ok = action is not None
+         if action is None:
+             action = safe_default_action(obs)
+
+         next_obs = env.step(action)
+         reward = float(next_obs.reward or 0.0)
+         total_reward += reward
+
+         steps.append(
+             StepRecord(
+                 step=obs.step_index,
+                 prompt=prompt,
+                 completion=completion,
+                 action=action.model_dump(),
+                 parsed_ok=parsed_ok,
+                 reward=reward,
+                 done=next_obs.done,
+                 rule_violations=list(next_obs.rule_violations),
+                 observation_summary=_summarise_obs(obs),
+             )
+         )
+         obs = next_obs
+
+     return EpisodeRecord(
+         seed=seed,
+         scenario=env.state.scenario_name,
+         difficulty=env.state.difficulty,
+         truth=env.hidden_truth(),
+         total_reward=total_reward,
+         cumulative_reward=float(env.state.cumulative_reward),
+         terminal_reward=env.state.terminal_reward,
+         discovered=env.state.discovered,
+         correct_mass=env.state.correct_mass,
+         correct_channel=env.state.correct_channel,
+         correct_spin=env.state.correct_spin,
+         steps=steps,
+     )
+
+
+ def save_episodes_jsonl(episodes: List[EpisodeRecord], path: str) -> None:
+     with open(path, "w") as f:
+         for ep in episodes:
+             f.write(json.dumps(asdict(ep), default=str) + "\n")
+
+
+ def load_episodes_jsonl(path: str) -> List[Dict[str, Any]]:
+     eps: List[Dict[str, Any]] = []
+     with open(path) as f:
+         for line in f:
+             line = line.strip()
+             if line:
+                 eps.append(json.loads(line))
+     return eps
+
+
+ __all__ = [
+     "EpisodeRecord",
+     "StepRecord",
+     "collect_episode",
+     "save_episodes_jsonl",
+     "load_episodes_jsonl",
+ ]
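``collect_episode`` is model-agnostic: the two callables are the only LLM-specific pieces. A minimal smoke-test wiring (a sketch; the canned ``generate_fn`` stands in for a real ``model.generate`` call, so every step falls through to ``safe_default_action``):

    from transformers import AutoTokenizer

    from server.environment import CERNCollisionEnvironment
    from training.rollouts import collect_episode, save_episodes_jsonl

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")

    def prompt_fn(chat):
        return tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)

    def generate_fn(prompt, config):
        return "not json"  # deliberately unparseable

    env = CERNCollisionEnvironment()
    ep = collect_episode(env=env, seed=0, scenario=None, difficulty="easy",
                         prompt_fn=prompt_fn, generate_fn=generate_fn)
    save_episodes_jsonl([ep], "rollout_smoke.jsonl")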
training/training_script.py ADDED
@@ -0,0 +1,211 @@
+ """GRPO (Group-Relative Policy Optimization) training script for CERNenv.
+
+ Uses Hugging Face TRL (Transformer Reinforcement Learning) ``GRPOTrainer`` to
+ fine-tune a small instruction-tuned model on full episodes of the CERN
+ environment. Each training prompt is sampled from a freshly-reset env;
+ the reward function rolls the model's response through the environment and
+ returns the per-step + (optional) terminal reward.
+
+ This script is intentionally CPU-friendly and self-contained. For
+ GPU-accelerated training with LoRA, prefer ``training_unsloth.py``.
+
+ Run:
+     python -m training.training_script \
+         --model_name HuggingFaceTB/SmolLM2-360M-Instruct \
+         --total_episodes 200 --max_steps 18 --output_dir training/grpo-output
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import logging
+ from dataclasses import dataclass
+ from typing import Any, List, Optional
+
+ import torch
+ from datasets import Dataset
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from server.environment import CERNCollisionEnvironment
+ from training.llm_agent import build_chat, parse_action, safe_default_action
+
+
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+ logger = logging.getLogger(__name__)
+
+
+ # ── Episode reward harness ───────────────────────────────────────────────
+
+
+ @dataclass
+ class EpisodeContext:
+     """Per-prompt reusable env + episode settings used by the reward fn."""
+
+     env: CERNCollisionEnvironment
+     seed: int
+     scenario: Optional[str]
+     difficulty: Optional[str]
+
+
+ def _stepwise_reward(
+     *,
+     completion_text: str,
+     ctx: EpisodeContext,
+ ) -> float:
+     """Roll the model's first response through one full episode and
+     return the cumulative reward (per-step + terminal).
+
+     The completion is interpreted as the first action only; subsequent
+     steps fall back to the safe default policy. This keeps reward
+     evaluation cheap during early-exploration training without requiring
+     multi-turn rollouts inside GRPO.
+     """
+     env = ctx.env
+     obs = env.reset(seed=ctx.seed, scenario=ctx.scenario, difficulty=ctx.difficulty)
+
+     action = parse_action(completion_text) or safe_default_action(obs)
+     obs = env.step(action)
+     cumulative = float(obs.reward or 0.0)
+
+     while not obs.done:
+         fallback = safe_default_action(obs)
+         obs = env.step(fallback)
+         cumulative += float(obs.reward or 0.0)
+
+     return cumulative
+
+
+ def _format_validity_bonus(completion_text: str) -> float:
+     return 0.5 if parse_action(completion_text) is not None else -0.5
+
+
+ def make_reward_fn(ctx: EpisodeContext):
+     """Return a TRL-compatible reward function (closes over ``ctx``)."""
+
+     def reward_fn(prompts: List[str], completions: List[str], **kwargs: Any) -> List[float]:
+         # Each completion is scored by replaying the same seeded episode
+         # from ``ctx``: prompts vary per dataset row, but the reward
+         # scenario stays fixed, so group-relative scores stay comparable.
+         rewards: List[float] = []
+         for completion in completions:
+             r = _stepwise_reward(completion_text=completion, ctx=ctx)
+             r += _format_validity_bonus(completion)
+             rewards.append(float(r))
+         return rewards
+
+     return reward_fn
+
+
+ # ── Prompt dataset ───────────────────────────────────────────────────────
+
+
+ def build_dataset(
+     *,
+     tokenizer,
+     n_prompts: int,
+     seed: int,
+     scenario: Optional[str],
+     difficulty: Optional[str],
+ ) -> Dataset:
+     env = CERNCollisionEnvironment()
+     prompts: List[str] = []
+     for i in range(n_prompts):
+         obs = env.reset(seed=seed + i, scenario=scenario, difficulty=difficulty)
+         chat = build_chat(obs)
+         prompt = tokenizer.apply_chat_template(
+             chat, add_generation_prompt=True, tokenize=False
+         )
+         prompts.append(prompt)
+     return Dataset.from_dict({"prompt": prompts})
+
+
+ # ── Main ─────────────────────────────────────────────────────────────────
+
+
+ def main() -> None:  # pragma: no cover - training entrypoint
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_name", default="HuggingFaceTB/SmolLM2-360M-Instruct")
+     parser.add_argument("--scenario", default=None)
+     parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default="easy")
+     parser.add_argument("--total_episodes", type=int, default=200)
+     parser.add_argument("--seed", type=int, default=42)
+     parser.add_argument("--max_steps", type=int, default=18)
+     parser.add_argument("--num_generations", type=int, default=4)
+     parser.add_argument("--learning_rate", type=float, default=1e-5)
+     parser.add_argument("--max_prompt_length", type=int, default=1024)
+     parser.add_argument("--max_completion_length", type=int, default=256)
+     parser.add_argument("--output_dir", default="training/grpo-output")
+     args = parser.parse_args()
+
+     try:
+         from trl import GRPOConfig, GRPOTrainer
+     except ImportError as exc:  # pragma: no cover
+         raise SystemExit(
+             "TRL (Transformer Reinforcement Learning) is required: "
+             "pip install -r requirements-train.txt"
+         ) from exc
+
+     logger.info("Loading tokenizer + model: %s", args.model_name)
+     tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+     model = AutoModelForCausalLM.from_pretrained(
+         args.model_name,
+         torch_dtype=torch.float32,  # full precision: CPU-friendly default
+     )
+
+     logger.info("Building prompt dataset (%d prompts)", args.total_episodes)
+     dataset = build_dataset(
+         tokenizer=tokenizer,
+         n_prompts=args.total_episodes,
+         seed=args.seed,
+         scenario=args.scenario,
+         difficulty=args.difficulty,
+     )
+
+     env = CERNCollisionEnvironment(max_steps=args.max_steps)
+     ctx = EpisodeContext(
+         env=env,
+         seed=args.seed,
+         scenario=args.scenario,
+         difficulty=args.difficulty,
+     )
+     reward_fn = make_reward_fn(ctx)
+
+     cfg = GRPOConfig(
+         output_dir=args.output_dir,
+         per_device_train_batch_size=2,
+         gradient_accumulation_steps=2,
+         num_generations=args.num_generations,
+         learning_rate=args.learning_rate,
+         max_prompt_length=args.max_prompt_length,
+         max_completion_length=args.max_completion_length,
+         logging_steps=5,
+         save_steps=50,
+         seed=args.seed,
+         bf16=False,
+         fp16=False,
+         report_to=[],
+     )
+
+     trainer = GRPOTrainer(
+         model=model,
+         processing_class=tokenizer,
+         train_dataset=dataset,
+         reward_funcs=[reward_fn],
+         args=cfg,
+     )
+     logger.info("Starting GRPO training")
+     trainer.train()
+     trainer.save_model(args.output_dir)
+     tokenizer.save_pretrained(args.output_dir)
+     logger.info("Saved model to %s", args.output_dir)
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     main()
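Outside TRL, the reward function is a plain ``(prompts, completions) -> List[float]`` callable, so it can be exercised directly. A sketch (an unparseable completion still scores: the safe-default rollout reward minus the 0.5 format penalty):

    from server.environment import CERNCollisionEnvironment
    from training.training_script import EpisodeContext, make_reward_fn

    ctx = EpisodeContext(
        env=CERNCollisionEnvironment(max_steps=18),
        seed=42, scenario=None, difficulty="easy",
    )
    reward_fn = make_reward_fn(ctx)
    print(reward_fn(prompts=["<prompt>"], completions=["not json"]))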
training/training_unsloth.py ADDED
@@ -0,0 +1,130 @@
+ """Unsloth + LoRA (Low-Rank Adaptation) GRPO training for CERNenv.
+
+ This is the recommended path for Colab / single-GPU runs because Unsloth's
+ fused kernels and 4-bit loading let us train 2B–8B models with limited VRAM.
+
+ Run on Colab:
+     !pip install -q unsloth unsloth_zoo trl peft datasets bitsandbytes
+     !python -m training.training_unsloth \
+         --model_name unsloth/Qwen2.5-3B-Instruct \
+         --total_episodes 400 --num_generations 4 --output_dir runs/unsloth-grpo
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import logging
+ from typing import Any, List
+
+ from datasets import Dataset
+
+
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+ logger = logging.getLogger(__name__)
+
+
+ def main() -> None:  # pragma: no cover - heavy GPU path
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_name", default="unsloth/Qwen2.5-3B-Instruct")
+     parser.add_argument("--scenario", default=None)
+     parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default="easy")
+     parser.add_argument("--total_episodes", type=int, default=400)
+     parser.add_argument("--seed", type=int, default=42)
+     parser.add_argument("--max_steps", type=int, default=18)
+     parser.add_argument("--num_generations", type=int, default=4)
+     parser.add_argument("--max_prompt_length", type=int, default=2048)
+     parser.add_argument("--max_completion_length", type=int, default=384)
+     parser.add_argument("--learning_rate", type=float, default=5e-6)
+     # BooleanOptionalAction exposes --load_in_4bit / --no-load_in_4bit;
+     # the previous store_true with default=True made the flag a no-op.
+     parser.add_argument("--load_in_4bit", action=argparse.BooleanOptionalAction, default=True)
+     parser.add_argument("--lora_rank", type=int, default=16)
+     parser.add_argument("--lora_alpha", type=int, default=16)
+     parser.add_argument("--output_dir", default="training/runs/unsloth-grpo")
+     args = parser.parse_args()
+
+     from unsloth import FastLanguageModel
+     from trl import GRPOConfig, GRPOTrainer
+
+     from server.environment import CERNCollisionEnvironment
+     from training.llm_agent import build_chat
+     from training.training_script import EpisodeContext, _format_validity_bonus, _stepwise_reward
+
+     logger.info("Loading Unsloth model: %s", args.model_name)
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=args.model_name,
+         max_seq_length=args.max_prompt_length + args.max_completion_length,
+         load_in_4bit=args.load_in_4bit,
+         fast_inference=True,
+     )
+     model = FastLanguageModel.get_peft_model(
+         model,
+         r=args.lora_rank,
+         lora_alpha=args.lora_alpha,
+         target_modules=[
+             "q_proj", "k_proj", "v_proj", "o_proj",
+             "gate_proj", "up_proj", "down_proj",
+         ],
+         use_gradient_checkpointing="unsloth",
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # Build prompts
+     env = CERNCollisionEnvironment(max_steps=args.max_steps)
+     prompts: List[str] = []
+     for i in range(args.total_episodes):
+         obs = env.reset(seed=args.seed + i, scenario=args.scenario, difficulty=args.difficulty)
+         chat = build_chat(obs)
+         prompts.append(
+             tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
+         )
+     dataset = Dataset.from_dict({"prompt": prompts})
+
+     ctx = EpisodeContext(
+         env=env, seed=args.seed,
+         scenario=args.scenario, difficulty=args.difficulty,
+     )
+
+     def reward_fn(prompts: List[str], completions: List[str], **kwargs: Any) -> List[float]:
+         rewards: List[float] = []
+         for completion in completions:
+             r = _stepwise_reward(completion_text=completion, ctx=ctx)
+             r += _format_validity_bonus(completion)
+             rewards.append(float(r))
+         return rewards
+
+     cfg = GRPOConfig(
+         output_dir=args.output_dir,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=4,
+         num_generations=args.num_generations,
+         learning_rate=args.learning_rate,
+         max_prompt_length=args.max_prompt_length,
+         max_completion_length=args.max_completion_length,
+         logging_steps=5,
+         save_steps=50,
+         seed=args.seed,
+         bf16=True,
+         report_to=[],
+     )
+
+     trainer = GRPOTrainer(
+         model=model,
+         processing_class=tokenizer,
+         train_dataset=dataset,
+         reward_funcs=[reward_fn],
+         args=cfg,
+     )
+     logger.info("Starting Unsloth + LoRA GRPO training")
+     trainer.train()
+     trainer.save_model(args.output_dir)
+     tokenizer.save_pretrained(args.output_dir)
+     logger.info("Saved adapters to %s", args.output_dir)
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     main()
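``trainer.save_model`` writes LoRA adapters rather than merged weights. One way to reload them for evaluation, sketched with PEFT's ``AutoPeftModelForCausalLM`` (merging into the base model is optional and a deployment choice):

    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    out_dir = "training/runs/unsloth-grpo"  # the default --output_dir above
    model = AutoPeftModelForCausalLM.from_pretrained(out_dir, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(out_dir)
    model = model.merge_and_unload()  # optional: bake adapters into the base weights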