OpenEnv 0.2.3 conformance: mount /openenv sub-app, add adapter + tests + example client
Browse files- README.md +57 -0
- example_client.py +80 -0
- openenv.yaml +20 -4
- opensleuth_env/openenv_adapter.py +267 -0
- requirements.txt +10 -3
- server.py +58 -2
- tests/__init__.py +0 -0
- tests/test_env.py +334 -0
- tests/test_openenv_conformance.py +257 -0
README.md
CHANGED
|
@@ -111,6 +111,55 @@ additive. `/functions` returns the same shape as before (with one *additive*
|
|
| 111 |
`source` field). Open-ended/Hub tasks are exposed via the new `/tasks`
|
| 112 |
endpoint so older clients aren't surprised.
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
## Hardware
|
| 115 |
|
| 116 |
CPU-only — `cpu-basic` is plenty. Do **not** assign GPU to this Space.
|
|
@@ -120,4 +169,12 @@ CPU-only — `cpu-basic` is plenty. Do **not** assign GPU to this Space.
|
|
| 120 |
```bash
|
| 121 |
pip install -r requirements.txt
|
| 122 |
uvicorn server:app --port 7860 --reload
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
```
|
|
|
|
| 111 |
`source` field). Open-ended/Hub tasks are exposed via the new `/tasks`
|
| 112 |
endpoint so older clients aren't surprised.
|
| 113 |
|
| 114 |
+
## OpenEnv conformance
|
| 115 |
+
|
| 116 |
+
This Space targets the [meta-pytorch / OpenEnv](https://github.com/meta-pytorch/OpenEnv)
|
| 117 |
+
v0.2.3 spec (`pip install openenv-core==0.2.3`). The OpenEnv-conformant
|
| 118 |
+
surface is mounted at **`/openenv/*`** alongside (not on top of) the legacy
|
| 119 |
+
endpoints listed above so the in-flight trainer keeps working unchanged.
|
| 120 |
+
|
| 121 |
+
| OpenEnv route | Path | Notes |
|
| 122 |
+
|--------------------------|-----------------------|----------------------------------------------------------|
|
| 123 |
+
| `GET /health` | `/openenv/health` | `{"status": "healthy"}` |
|
| 124 |
+
| `GET /metadata` | `/openenv/metadata` | `EnvironmentMetadata` (name, description, version, ...) |
|
| 125 |
+
| `GET /schema` | `/openenv/schema` | JSON schemas for `action`, `observation`, `state` |
|
| 126 |
+
| `GET /state` | `/openenv/state` | Episode `State` (episode_id, step_count, ...) |
|
| 127 |
+
| `POST /reset` | `/openenv/reset` | Returns `{"observation", "reward", "done"}` envelope |
|
| 128 |
+
| `POST /step` | `/openenv/step` | Body: `{"action": {"action_type": "probe"|"submit", ...}}` |
|
| 129 |
+
| `WS /ws` | `/openenv/ws` | Persistent session: `reset` → `step`* → `state` → `close` |
|
| 130 |
+
|
| 131 |
+
`OpenSleuthEnvironment` (in `opensleuth_env/openenv_adapter.py`) subclasses
|
| 132 |
+
`openenv.core.env_server.interfaces.Environment`, so any OpenEnv-aware
|
| 133 |
+
harness (`openenv` CLI, `GenericEnvClient`, TRL/torchforge integrations,
|
| 134 |
+
LightningAI Studio, ...) can pick it up via standard introspection.
|
| 135 |
+
|
| 136 |
+
### Talking to it as an OpenEnv client
|
| 137 |
+
|
| 138 |
+
```python
|
| 139 |
+
import asyncio
|
| 140 |
+
from openenv import GenericEnvClient, GenericAction
|
| 141 |
+
|
| 142 |
+
async def main():
|
| 143 |
+
base = "https://anugrah55-opensleuth-env-gemini-cli.hf.space/openenv"
|
| 144 |
+
async with GenericEnvClient(base_url=base) as env:
|
| 145 |
+
result = await env.reset(target_name="fibonacci", max_steps=8)
|
| 146 |
+
result = await env.step(GenericAction(action_type="probe", input_repr="10"))
|
| 147 |
+
print(result.observation["probe_history"][-1])
|
| 148 |
+
|
| 149 |
+
asyncio.run(main())
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
A runnable end-to-end example lives in [`example_client.py`](example_client.py).
|
| 153 |
+
|
| 154 |
+
### What is *not* yet conformant
|
| 155 |
+
|
| 156 |
+
* No MCP tool surface (RFC 003). Our actions are typed Pydantic models, not
|
| 157 |
+
MCP tools, because the underlying probe/submit semantics map cleanly to a
|
| 158 |
+
single `OpenSleuthAction` discriminator. Adding MCP would be additive.
|
| 159 |
+
* No Rubric/EvalHarness integration (RFC 004) — reward shaping lives in
|
| 160 |
+
`opensleuth_env/env.py` and is intentionally not split into a separate
|
| 161 |
+
rubric for now.
|
| 162 |
+
|
| 163 |
## Hardware
|
| 164 |
|
| 165 |
CPU-only — `cpu-basic` is plenty. Do **not** assign GPU to this Space.
|
|
|
|
| 169 |
```bash
|
| 170 |
pip install -r requirements.txt
|
| 171 |
uvicorn server:app --port 7860 --reload
|
| 172 |
+
# legacy contract: http://localhost:7860/{health,reset,step,state/{eid}}
|
| 173 |
+
# OpenEnv-conformant surface: http://localhost:7860/openenv/{health,reset,step,state,schema,metadata,ws}
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
To run only the OpenEnv conformance tests:
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
PYTHONPATH=. python -m pytest tests/test_openenv_conformance.py -v
|
| 180 |
```
|
example_client.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example: talk to the OpenSleuth env via the upstream OpenEnv client.
|
| 2 |
+
|
| 3 |
+
This script connects to the deployed Space using the canonical OpenEnv
|
| 4 |
+
``GenericEnvClient`` (HTTP+WebSocket) and runs one episode end-to-end.
|
| 5 |
+
|
| 6 |
+
Usage::
|
| 7 |
+
|
| 8 |
+
pip install openenv-core==0.2.3
|
| 9 |
+
python example_client.py # hits the deployed Space
|
| 10 |
+
python example_client.py http://localhost:7860 # against a local server
|
| 11 |
+
|
| 12 |
+
We hit the ``/openenv`` sub-app rather than the legacy bare routes, because
|
| 13 |
+
the OpenEnv client requires an OpenEnv-conformant ``/ws`` WebSocket. The
|
| 14 |
+
legacy ``/reset`` and ``/step`` endpoints used by the in-flight trainer are
|
| 15 |
+
preserved unchanged at the root.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import asyncio
|
| 21 |
+
import sys
|
| 22 |
+
|
| 23 |
+
DEFAULT_BASE = (
|
| 24 |
+
"https://anugrah55-opensleuth-env-gemini-cli.hf.space/openenv"
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
async def main(base_url: str) -> None:
|
| 29 |
+
from openenv import GenericEnvClient, GenericAction
|
| 30 |
+
|
| 31 |
+
print(f"Connecting to {base_url} ...")
|
| 32 |
+
async with GenericEnvClient(base_url=base_url) as env:
|
| 33 |
+
# Reset with the default ('fibonacci') target. Pass any of the legacy
|
| 34 |
+
# OpenSleuth reset kwargs as extra fields; OpenEnv ResetRequest has
|
| 35 |
+
# extra='allow', so target_name / target_code / max_steps / etc. all
|
| 36 |
+
# flow through.
|
| 37 |
+
result = await env.reset(target_name="fibonacci", max_steps=8, seed=42)
|
| 38 |
+
obs = result.observation
|
| 39 |
+
print("\n[reset]")
|
| 40 |
+
print(f" episode_id = {obs['episode_id']}")
|
| 41 |
+
print(f" target = {obs['target_function_name']} ({obs['difficulty']})")
|
| 42 |
+
|
| 43 |
+
# Probe a few inputs.
|
| 44 |
+
for repr_input in ("1", "5", "10", "-1", "'oops'"):
|
| 45 |
+
result = await env.step(
|
| 46 |
+
GenericAction(action_type="probe", input_repr=repr_input)
|
| 47 |
+
)
|
| 48 |
+
last = result.observation["probe_history"][-1]
|
| 49 |
+
print(
|
| 50 |
+
f"[probe {repr_input!s:>8}] -> output={last['output_repr']!r:>30} "
|
| 51 |
+
f"reward={result.reward:+.2f} done={result.done}"
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
# Submit a perfect implementation.
|
| 55 |
+
code = (
|
| 56 |
+
"def fibonacci(n):\n"
|
| 57 |
+
" if not isinstance(n, int) or isinstance(n, bool) or n <= 0 or n > 90:\n"
|
| 58 |
+
" raise ValueError('bad')\n"
|
| 59 |
+
" a, b = 0, 1\n"
|
| 60 |
+
" for _ in range(n - 1):\n"
|
| 61 |
+
" a, b = b, a + b\n"
|
| 62 |
+
" return b\n"
|
| 63 |
+
)
|
| 64 |
+
result = await env.step(GenericAction(action_type="submit", code=code))
|
| 65 |
+
info = result.observation.get("info", {})
|
| 66 |
+
print("\n[submit reference impl]")
|
| 67 |
+
print(f" reward = {result.reward:.2f}")
|
| 68 |
+
print(f" done = {result.done}")
|
| 69 |
+
print(f" info = {info}")
|
| 70 |
+
|
| 71 |
+
# State endpoint sanity check.
|
| 72 |
+
state = await env.state()
|
| 73 |
+
print(f"\n[state] {state}")
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
base = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_BASE
|
| 78 |
+
if not base.rstrip("/").endswith("/openenv"):
|
| 79 |
+
base = base.rstrip("/") + "/openenv"
|
| 80 |
+
asyncio.run(main(base))
|
openenv.yaml
CHANGED
|
@@ -1,5 +1,21 @@
|
|
|
|
|
| 1 |
name: opensleuth
|
| 2 |
-
version: 0.
|
| 3 |
-
description:
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
name: opensleuth
|
| 3 |
+
version: "0.5.0"
|
| 4 |
+
description: >-
|
| 5 |
+
OpenSleuth: an OpenEnv-conformant environment that trains LLMs to
|
| 6 |
+
reverse-engineer hidden Python functions by probing them and submitting code
|
| 7 |
+
that reproduces them. Used for GRPO RL post-training.
|
| 8 |
+
author: anugrah55
|
| 9 |
+
type: space
|
| 10 |
+
runtime: fastapi
|
| 11 |
+
app: server:app
|
| 12 |
+
port: 7860
|
| 13 |
+
action: OpenSleuthAction
|
| 14 |
+
observation: OpenSleuthObservation
|
| 15 |
+
documentation_url: https://huggingface.co/spaces/anugrah55/opensleuth-env-gemini-cli
|
| 16 |
+
tags:
|
| 17 |
+
- rl
|
| 18 |
+
- grpo
|
| 19 |
+
- code
|
| 20 |
+
- openenv
|
| 21 |
+
- openenv-conformant
|
opensleuth_env/openenv_adapter.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv-conformant adapter for OpenSleuthEnv.
|
| 2 |
+
|
| 3 |
+
Wraps the existing multi-episode :class:`OpenSleuthEnv` registry as a
|
| 4 |
+
single-episode-per-session :class:`openenv.core.env_server.interfaces.Environment`
|
| 5 |
+
so the canonical OpenEnv HTTP / WebSocket protocol can be served alongside
|
| 6 |
+
the legacy ``/reset`` + ``/step`` endpoints the in-flight trainer uses.
|
| 7 |
+
|
| 8 |
+
This module is *additive*. It does not touch the legacy server contract;
|
| 9 |
+
``server.py`` mounts the OpenEnv-style sub-application at ``/openenv/*`` so the
|
| 10 |
+
trainer (which talks to the bare ``/reset`` and ``/step``) is unaffected.
|
| 11 |
+
|
| 12 |
+
The adapter conforms to OpenEnv 0.2.x:
|
| 13 |
+
|
| 14 |
+
* ``Environment.reset(seed, episode_id, **kwargs) -> Observation``
|
| 15 |
+
* ``Environment.step(action, timeout_s, **kwargs) -> Observation``
|
| 16 |
+
* ``Environment.state -> State``
|
| 17 |
+
* ``Environment.get_metadata() -> EnvironmentMetadata``
|
| 18 |
+
|
| 19 |
+
See https://github.com/meta-pytorch/OpenEnv (v0.2.3, BSD-3) for the spec.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
from typing import Any, List, Literal, Optional
|
| 25 |
+
from uuid import uuid4
|
| 26 |
+
|
| 27 |
+
from pydantic import Field
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
from openenv.core.env_server.interfaces import Environment
|
| 31 |
+
from openenv.core.env_server.types import (
|
| 32 |
+
Action as OEAction,
|
| 33 |
+
EnvironmentMetadata,
|
| 34 |
+
Observation as OEObservation,
|
| 35 |
+
State as OEState,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
OPENENV_AVAILABLE = True
|
| 39 |
+
except ImportError: # pragma: no cover - openenv is required at runtime in the Space
|
| 40 |
+
OPENENV_AVAILABLE = False
|
| 41 |
+
OEAction = object # type: ignore[assignment, misc]
|
| 42 |
+
OEObservation = object # type: ignore[assignment, misc]
|
| 43 |
+
OEState = object # type: ignore[assignment, misc]
|
| 44 |
+
Environment = object # type: ignore[assignment, misc]
|
| 45 |
+
EnvironmentMetadata = object # type: ignore[assignment, misc]
|
| 46 |
+
|
| 47 |
+
from .env import OpenSleuthEnv
|
| 48 |
+
from .models import ProbeAction, SubmitAction
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
if OPENENV_AVAILABLE:
|
| 52 |
+
|
| 53 |
+
class OpenSleuthAction(OEAction):
|
| 54 |
+
"""Unified OpenEnv-style action.
|
| 55 |
+
|
| 56 |
+
The OpenEnv spec wants a single concrete Action subclass per
|
| 57 |
+
environment; we encode the probe / submit choice via the
|
| 58 |
+
``action_type`` discriminator field. Internally we still translate
|
| 59 |
+
to the original :class:`ProbeAction` / :class:`SubmitAction` so the
|
| 60 |
+
legacy reward shaping is preserved bit-for-bit.
|
| 61 |
+
"""
|
| 62 |
+
|
| 63 |
+
action_type: Literal["probe", "submit"] = Field(
|
| 64 |
+
..., description="Either 'probe' (with input_repr) or 'submit' (with code)."
|
| 65 |
+
)
|
| 66 |
+
input_repr: Optional[str] = Field(
|
| 67 |
+
default=None,
|
| 68 |
+
description="Python literal repr of the probe input. Required when action_type='probe'.",
|
| 69 |
+
)
|
| 70 |
+
code: Optional[str] = Field(
|
| 71 |
+
default=None,
|
| 72 |
+
description="Python source defining the target function. Required when action_type='submit'.",
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
class OpenSleuthObservation(OEObservation):
|
| 76 |
+
"""OpenEnv observation wrapper.
|
| 77 |
+
|
| 78 |
+
OpenEnv's ``Observation`` base class supplies ``done``, ``reward``,
|
| 79 |
+
and ``metadata``. We add OpenSleuth-specific fields for the agent
|
| 80 |
+
(target signature, probe history, etc.). Trainer-facing structured
|
| 81 |
+
info is also surfaced via ``info`` for backwards compat.
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
episode_id: str = Field(default="", description="Per-session episode id.")
|
| 85 |
+
target_function_name: str = Field(default="")
|
| 86 |
+
target_function_signature: str = Field(
|
| 87 |
+
default="", description="Public signature + docstring for the target."
|
| 88 |
+
)
|
| 89 |
+
probe_history: List[dict] = Field(
|
| 90 |
+
default_factory=list,
|
| 91 |
+
description="Recent probe records (input_repr, output_repr, is_error, ...).",
|
| 92 |
+
)
|
| 93 |
+
last_error: str = Field(default="", description="Last error string, if any.")
|
| 94 |
+
steps_taken: int = Field(default=0)
|
| 95 |
+
max_steps: int = Field(default=25)
|
| 96 |
+
difficulty: Optional[str] = Field(default=None)
|
| 97 |
+
coverage_buckets_seen: int = Field(default=0)
|
| 98 |
+
seen_outputs_count: int = Field(default=0)
|
| 99 |
+
seen_error_types_count: int = Field(default=0)
|
| 100 |
+
info: dict = Field(
|
| 101 |
+
default_factory=dict,
|
| 102 |
+
description="Structured info from the underlying step (matches the legacy info dict).",
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
class OpenSleuthState(OEState):
|
| 106 |
+
"""OpenEnv-style episode state."""
|
| 107 |
+
|
| 108 |
+
target_function_name: Optional[str] = Field(default=None)
|
| 109 |
+
max_steps: int = Field(default=25)
|
| 110 |
+
finished: bool = Field(default=False)
|
| 111 |
+
|
| 112 |
+
class OpenSleuthEnvironment(Environment):
|
| 113 |
+
"""OpenEnv-conformant adapter around :class:`OpenSleuthEnv`.
|
| 114 |
+
|
| 115 |
+
One adapter instance == one episode (one WebSocket session). Inside,
|
| 116 |
+
we keep a single :class:`OpenSleuthEnv` registry but only ever populate
|
| 117 |
+
a single episode at a time.
|
| 118 |
+
|
| 119 |
+
``SUPPORTS_CONCURRENT_SESSIONS = True`` is safe because each WebSocket
|
| 120 |
+
connection in OpenEnv's :class:`HTTPEnvServer` instantiates its own
|
| 121 |
+
:class:`OpenSleuthEnvironment`, and our underlying registries are
|
| 122 |
+
per-instance.
|
| 123 |
+
"""
|
| 124 |
+
|
| 125 |
+
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 126 |
+
|
| 127 |
+
def __init__(self) -> None:
|
| 128 |
+
super().__init__()
|
| 129 |
+
self._env = OpenSleuthEnv()
|
| 130 |
+
self._episode_id: Optional[str] = None
|
| 131 |
+
self._target_function_name: Optional[str] = None
|
| 132 |
+
self._max_steps: int = 25
|
| 133 |
+
self._step_count: int = 0
|
| 134 |
+
self._done: bool = False
|
| 135 |
+
|
| 136 |
+
def reset( # type: ignore[override]
|
| 137 |
+
self,
|
| 138 |
+
seed: Optional[int] = None,
|
| 139 |
+
episode_id: Optional[str] = None,
|
| 140 |
+
target_name: Optional[str] = None,
|
| 141 |
+
target_code: Optional[str] = None,
|
| 142 |
+
target_function_name: Optional[str] = None,
|
| 143 |
+
max_steps: int = 25,
|
| 144 |
+
edge_cases: Optional[list] = None,
|
| 145 |
+
fuzz_spec: Optional[dict] = None,
|
| 146 |
+
**kwargs: Any,
|
| 147 |
+
) -> "OpenSleuthObservation":
|
| 148 |
+
# Default to a builtin so a bare reset() still produces a valid
|
| 149 |
+
# episode (per OpenEnv spec, reset() with no args must work).
|
| 150 |
+
if not target_name and not target_code:
|
| 151 |
+
target_name = "fibonacci"
|
| 152 |
+
obs = self._env.reset(
|
| 153 |
+
target_name=target_name,
|
| 154 |
+
seed=seed if seed is not None else 0,
|
| 155 |
+
max_steps=max_steps,
|
| 156 |
+
target_code=target_code,
|
| 157 |
+
target_function_name=target_function_name,
|
| 158 |
+
edge_cases=edge_cases,
|
| 159 |
+
fuzz_spec=fuzz_spec,
|
| 160 |
+
)
|
| 161 |
+
self._episode_id = episode_id or obs.episode_id
|
| 162 |
+
self._target_function_name = obs.target_function_name
|
| 163 |
+
self._max_steps = max_steps
|
| 164 |
+
self._step_count = 0
|
| 165 |
+
self._done = False
|
| 166 |
+
return self._wrap_obs(obs, reward=None, done=False, info={})
|
| 167 |
+
|
| 168 |
+
def step( # type: ignore[override]
|
| 169 |
+
self,
|
| 170 |
+
action: "OpenSleuthAction",
|
| 171 |
+
timeout_s: Optional[float] = None,
|
| 172 |
+
**kwargs: Any,
|
| 173 |
+
) -> "OpenSleuthObservation":
|
| 174 |
+
if self._episode_id is None:
|
| 175 |
+
# Auto-reset on first step with the default target so HTTP /step
|
| 176 |
+
# smoke tests don't 500 just because /reset wasn't called first.
|
| 177 |
+
self.reset()
|
| 178 |
+
|
| 179 |
+
internal_action: Any
|
| 180 |
+
if action.action_type == "probe":
|
| 181 |
+
if action.input_repr is None:
|
| 182 |
+
raise ValueError(
|
| 183 |
+
"OpenSleuthAction(action_type='probe') requires input_repr."
|
| 184 |
+
)
|
| 185 |
+
internal_action = ProbeAction(input_repr=action.input_repr)
|
| 186 |
+
elif action.action_type == "submit":
|
| 187 |
+
if action.code is None:
|
| 188 |
+
raise ValueError(
|
| 189 |
+
"OpenSleuthAction(action_type='submit') requires code."
|
| 190 |
+
)
|
| 191 |
+
internal_action = SubmitAction(code=action.code)
|
| 192 |
+
else: # pragma: no cover - Pydantic Literal already constrains this
|
| 193 |
+
raise ValueError(f"Unknown action_type: {action.action_type!r}")
|
| 194 |
+
|
| 195 |
+
assert self._episode_id is not None
|
| 196 |
+
resp = self._env.step(self._episode_id, internal_action)
|
| 197 |
+
self._step_count += 1
|
| 198 |
+
self._done = resp.done
|
| 199 |
+
return self._wrap_obs(
|
| 200 |
+
resp.observation, reward=resp.reward, done=resp.done, info=resp.info
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
@property
|
| 204 |
+
def state(self) -> "OpenSleuthState": # type: ignore[override]
|
| 205 |
+
return OpenSleuthState(
|
| 206 |
+
episode_id=self._episode_id,
|
| 207 |
+
step_count=self._step_count,
|
| 208 |
+
target_function_name=self._target_function_name,
|
| 209 |
+
max_steps=self._max_steps,
|
| 210 |
+
finished=self._done,
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
def get_metadata(self) -> "EnvironmentMetadata": # type: ignore[override]
|
| 214 |
+
return EnvironmentMetadata(
|
| 215 |
+
name="OpenSleuth",
|
| 216 |
+
description=(
|
| 217 |
+
"Algorithmic detective: probe a hidden Python function then submit "
|
| 218 |
+
"code that reproduces it. Used for GRPO RL training on Qwen-2.5."
|
| 219 |
+
),
|
| 220 |
+
version="0.4.1",
|
| 221 |
+
author="OpenSleuth team",
|
| 222 |
+
documentation_url=(
|
| 223 |
+
"https://huggingface.co/spaces/anugrah55/opensleuth-env-gemini-cli"
|
| 224 |
+
),
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
def close(self) -> None: # type: ignore[override]
|
| 228 |
+
self._episode_id = None
|
| 229 |
+
self._target_function_name = None
|
| 230 |
+
self._step_count = 0
|
| 231 |
+
self._done = False
|
| 232 |
+
|
| 233 |
+
def _wrap_obs(
|
| 234 |
+
self,
|
| 235 |
+
internal_obs: Any,
|
| 236 |
+
*,
|
| 237 |
+
reward: Optional[float],
|
| 238 |
+
done: bool,
|
| 239 |
+
info: dict,
|
| 240 |
+
) -> "OpenSleuthObservation":
|
| 241 |
+
return OpenSleuthObservation(
|
| 242 |
+
done=done,
|
| 243 |
+
reward=reward,
|
| 244 |
+
episode_id=internal_obs.episode_id,
|
| 245 |
+
target_function_name=internal_obs.target_function_name,
|
| 246 |
+
target_function_signature=internal_obs.target_function_signature,
|
| 247 |
+
probe_history=[r.model_dump() for r in internal_obs.probe_history],
|
| 248 |
+
last_error=internal_obs.last_error,
|
| 249 |
+
steps_taken=internal_obs.steps_taken,
|
| 250 |
+
max_steps=internal_obs.max_steps,
|
| 251 |
+
difficulty=internal_obs.difficulty,
|
| 252 |
+
coverage_buckets_seen=internal_obs.coverage_buckets_seen,
|
| 253 |
+
seen_outputs_count=internal_obs.seen_outputs_count,
|
| 254 |
+
seen_error_types_count=internal_obs.seen_error_types_count,
|
| 255 |
+
info=info,
|
| 256 |
+
metadata={"info": info},
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
__all__ = ["OPENENV_AVAILABLE"]
|
| 261 |
+
if OPENENV_AVAILABLE:
|
| 262 |
+
__all__ += [
|
| 263 |
+
"OpenSleuthAction",
|
| 264 |
+
"OpenSleuthObservation",
|
| 265 |
+
"OpenSleuthState",
|
| 266 |
+
"OpenSleuthEnvironment",
|
| 267 |
+
]
|
requirements.txt
CHANGED
|
@@ -1,8 +1,15 @@
|
|
| 1 |
-
fastapi==0.
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
| 4 |
# Level 2: Hub-driven task catalog. We swallow load failures at runtime so
|
| 5 |
# the env still functions if Hub is offline, but the dependency is required
|
| 6 |
# for Hub-backed tasks to be discoverable.
|
| 7 |
datasets>=3.0.0
|
| 8 |
huggingface_hub>=0.25.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# fastapi >=0.118 / starlette >=0.48 are required because openenv-core 0.2.3
|
| 2 |
+
# references status.HTTP_422_UNPROCESSABLE_CONTENT (added in starlette 0.48).
|
| 3 |
+
fastapi>=0.118.0
|
| 4 |
+
starlette>=0.48.0
|
| 5 |
+
uvicorn[standard]>=0.32.1
|
| 6 |
+
pydantic>=2.10.3
|
| 7 |
# Level 2: Hub-driven task catalog. We swallow load failures at runtime so
|
| 8 |
# the env still functions if Hub is offline, but the dependency is required
|
| 9 |
# for Hub-backed tasks to be discoverable.
|
| 10 |
datasets>=3.0.0
|
| 11 |
huggingface_hub>=0.25.0
|
| 12 |
+
# Hackathon conformance: meta-pytorch/OpenEnv 0.2.x -- exposes the canonical
|
| 13 |
+
# /openenv/{reset,step,state,health,metadata,schema,ws} surface alongside our
|
| 14 |
+
# legacy contract. See opensleuth_env/openenv_adapter.py.
|
| 15 |
+
openenv-core==0.2.3
|
server.py
CHANGED
|
@@ -1,4 +1,17 @@
|
|
| 1 |
-
"""FastAPI server exposing the OpenSleuth environment over HTTP.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
@@ -23,10 +36,53 @@ from opensleuth_env.task_catalog import TaskResolutionError
|
|
| 23 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
| 24 |
log = logging.getLogger("opensleuth.server")
|
| 25 |
|
| 26 |
-
app = FastAPI(title="OpenSleuth Env", version="0.
|
| 27 |
env = OpenSleuthEnv()
|
| 28 |
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
@app.get("/health")
|
| 31 |
def health():
|
| 32 |
return {
|
|
|
|
| 1 |
+
"""FastAPI server exposing the OpenSleuth environment over HTTP.
|
| 2 |
+
|
| 3 |
+
Two HTTP surfaces are served from this app:
|
| 4 |
+
|
| 5 |
+
* The legacy OpenSleuth contract (``/health``, ``/functions``, ``/tasks``,
|
| 6 |
+
``/reset``, ``/step``, ``/state/{episode_id}``, ``/probe_once``) used by the
|
| 7 |
+
in-flight trainer and eval harness.
|
| 8 |
+
* The OpenEnv-conformant sub-app mounted at ``/openenv/*`` (added in v0.5.0
|
| 9 |
+
for hackathon conformance) -- exposes ``/openenv/reset``, ``/openenv/step``,
|
| 10 |
+
``/openenv/state``, ``/openenv/health``, ``/openenv/metadata``,
|
| 11 |
+
``/openenv/schema``, and the canonical ``/openenv/ws`` WebSocket. See
|
| 12 |
+
:mod:`opensleuth_env.openenv_adapter` and
|
| 13 |
+
https://github.com/meta-pytorch/OpenEnv (v0.2.3).
|
| 14 |
+
"""
|
| 15 |
|
| 16 |
from __future__ import annotations
|
| 17 |
|
|
|
|
| 36 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
| 37 |
log = logging.getLogger("opensleuth.server")
|
| 38 |
|
| 39 |
+
app = FastAPI(title="OpenSleuth Env", version="0.5.0")
|
| 40 |
env = OpenSleuthEnv()
|
| 41 |
|
| 42 |
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
# OpenEnv conformance: mount an upstream-spec sub-app at /openenv.
|
| 45 |
+
# This is kept additive so the existing trainer (which talks to the bare
|
| 46 |
+
# /reset and /step routes above) is completely unaffected.
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
from openenv.core.env_server.http_server import HTTPEnvServer
|
| 51 |
+
|
| 52 |
+
from opensleuth_env.openenv_adapter import (
|
| 53 |
+
OPENENV_AVAILABLE,
|
| 54 |
+
OpenSleuthAction,
|
| 55 |
+
OpenSleuthEnvironment,
|
| 56 |
+
OpenSleuthObservation,
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
if OPENENV_AVAILABLE:
|
| 60 |
+
openenv_app = FastAPI(
|
| 61 |
+
title="OpenSleuth (OpenEnv-conformant)",
|
| 62 |
+
version="0.5.0",
|
| 63 |
+
description=(
|
| 64 |
+
"OpenEnv 0.2.x conformant surface for the OpenSleuth environment.\n\n"
|
| 65 |
+
"See https://github.com/meta-pytorch/OpenEnv -- this sub-app implements"
|
| 66 |
+
" the canonical reset/step/state/health/metadata/schema HTTP routes plus"
|
| 67 |
+
" the /ws WebSocket session protocol."
|
| 68 |
+
),
|
| 69 |
+
)
|
| 70 |
+
_openenv_server = HTTPEnvServer(
|
| 71 |
+
env=OpenSleuthEnvironment,
|
| 72 |
+
action_cls=OpenSleuthAction,
|
| 73 |
+
observation_cls=OpenSleuthObservation,
|
| 74 |
+
max_concurrent_envs=8,
|
| 75 |
+
)
|
| 76 |
+
_openenv_server.register_routes(openenv_app)
|
| 77 |
+
app.mount("/openenv", openenv_app)
|
| 78 |
+
log.info("Mounted OpenEnv-conformant sub-app at /openenv (openenv-core %s)",
|
| 79 |
+
_openenv_server.__class__.__module__)
|
| 80 |
+
else: # pragma: no cover
|
| 81 |
+
log.warning("openenv-core not importable; /openenv/* will be unavailable.")
|
| 82 |
+
except Exception as e: # pragma: no cover - fail open so legacy routes keep working
|
| 83 |
+
log.warning("Could not register OpenEnv sub-app: %s: %s", type(e).__name__, e)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
@app.get("/health")
|
| 87 |
def health():
|
| 88 |
return {
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for the OpenSleuth env + verifier.
|
| 2 |
+
|
| 3 |
+
Run with `pytest -q` from the env/ directory.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
from opensleuth_env import (
|
| 11 |
+
BLACK_BOX_FUNCTIONS,
|
| 12 |
+
OpenSleuthEnv,
|
| 13 |
+
ProbeAction,
|
| 14 |
+
SubmitAction,
|
| 15 |
+
)
|
| 16 |
+
from opensleuth_env.env import _bucket_of, NEW_BUCKET_BONUS, NEW_OUTPUT_BONUS, PROBE_STEP_COST
|
| 17 |
+
from opensleuth_env.verifier import (
|
| 18 |
+
calculate_complexity_penalty,
|
| 19 |
+
generate_fuzz_inputs,
|
| 20 |
+
get_edge_inputs,
|
| 21 |
+
verify_submission,
|
| 22 |
+
_looks_like_reference_import,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ---------- env transitions ------------------------------------------------
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_reset_returns_episode_id_and_signature():
|
| 30 |
+
env = OpenSleuthEnv()
|
| 31 |
+
obs = env.reset("fibonacci")
|
| 32 |
+
assert obs.episode_id
|
| 33 |
+
assert obs.target_function_name == "fibonacci"
|
| 34 |
+
assert "fibonacci" in obs.target_function_signature
|
| 35 |
+
assert obs.probe_history == []
|
| 36 |
+
assert obs.steps_taken == 0
|
| 37 |
+
# New v0.3 metadata.
|
| 38 |
+
assert obs.difficulty == "easy"
|
| 39 |
+
assert obs.coverage_buckets_seen == 0
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_unknown_target_raises():
    """Resetting to a function not in the catalogue must fail loudly."""
    environment = OpenSleuthEnv()
    with pytest.raises(ValueError):
        environment.reset("not_a_real_function")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def test_probe_with_int_input_records_output():
    """A well-formed integer probe executes and records its output."""
    environment = OpenSleuthEnv()
    episode = environment.reset("fibonacci")
    result = environment.step(episode.episode_id, ProbeAction(input_repr="10"))
    assert result.done is False
    last_probe = result.observation.probe_history[-1]
    assert last_probe.is_error is False
    assert last_probe.output_repr == "55"
    # The first successful probe earns both novelty bonuses plus the step cost.
    assert result.reward == pytest.approx(
        NEW_OUTPUT_BONUS + NEW_BUCKET_BONUS + PROBE_STEP_COST
    )
    assert result.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert result.info["bucket"] == "int:medium"
    assert result.observation.coverage_buckets_seen == 1
    assert result.observation.seen_outputs_count == 1
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def test_probe_with_invalid_literal_returns_parse_error():
    """Garbage probe text is surfaced as a ParseError, never a crash."""
    environment = OpenSleuthEnv()
    episode = environment.reset("fibonacci")
    outcome = environment.step(
        episode.episode_id, ProbeAction(input_repr="not a literal")
    )
    assert outcome.done is False
    assert outcome.observation.probe_history[-1].error_type == "ParseError"
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def test_repeated_output_only_pays_intrinsic_once():
    """Re-probing an already-seen input earns only the flat step cost."""
    environment = OpenSleuthEnv()
    episode = environment.reset("fibonacci")
    first = environment.step(episode.episode_id, ProbeAction(input_repr="10"))
    second = environment.step(episode.episode_id, ProbeAction(input_repr="10"))
    assert first.reward > second.reward
    # A second hit on the same bucket+output pays only the per-step cost.
    assert second.reward == pytest.approx(PROBE_STEP_COST)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def test_step_limit_terminates_episode():
    """Reaching max_steps flips `done` on the final transition."""
    environment = OpenSleuthEnv()
    episode = environment.reset("fibonacci", max_steps=2)
    environment.step(episode.episode_id, ProbeAction(input_repr="1"))
    final = environment.step(episode.episode_id, ProbeAction(input_repr="2"))
    assert final.done is True
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def test_unknown_episode_id_raises():
    """Stepping a nonexistent episode raises KeyError rather than failing silently."""
    environment = OpenSleuthEnv()
    with pytest.raises(KeyError):
        environment.step("does-not-exist", ProbeAction(input_repr="1"))
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# ---------- coverage bucketing (CovRL-Fuzz inspired) -----------------------
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def test_bucket_of_distinguishes_qualitative_input_classes():
    """Each qualitatively different input lands in its own coverage bucket."""
    expected_buckets = [
        (0, "int:zero"),
        (-1, "int:negative"),
        (5, "int:small"),
        (50, "int:medium"),
        (5000, "int:large"),
        (50_000, "int:huge"),
        ("", "str:empty"),
        ("a", "str:singleton"),
        ([], "list:empty"),
        ((1, 2), "tuple:short"),
        (True, "bool:True"),  # bool must not be folded into the int buckets
        (None, "none"),
    ]
    for value, bucket in expected_buckets:
        assert _bucket_of(value) == bucket
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_probe_distinct_buckets_each_pay_coverage_bonus():
    """Each previously-unseen bucket pays the bonus; revisits pay nothing."""
    environment = OpenSleuthEnv()
    episode = environment.reset("fibonacci")
    # 1 -> int:small (new), 50 -> int:medium (new), 5 -> int:small (revisit).
    small = environment.step(episode.episode_id, ProbeAction(input_repr="1"))
    medium = environment.step(episode.episode_id, ProbeAction(input_repr="50"))
    repeat = environment.step(episode.episode_id, ProbeAction(input_repr="5"))
    assert small.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert medium.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert repeat.info["coverage_bonus"] == pytest.approx(0.0)
    assert repeat.observation.coverage_buckets_seen == 2
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ---------- verifier -------------------------------------------------------
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def test_verifier_perfect_score_on_reference_impl():
    """A correct reimplementation earns the full execution reward, passes
    every edge input, and incurs no penalties."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    # Iterative fibonacci mirroring the hidden reference, including its
    # input-validation contract (positive int, capped at 90).
    code = (
        "def fibonacci(n):\n"
        "    if not isinstance(n, int) or n <= 0 or n > 90:\n"
        "        raise ValueError('bad')\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n - 1):\n"
        "        a, b = b, a + b\n"
        "    return b\n"
    )
    inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    edges = get_edge_inputs(spec)
    result = verify_submission(code, spec.fn, inputs, target_name="fibonacci", edge_inputs=edges)
    # All 30 random fuzz inputs plus every declared edge input must match.
    assert result.matches == 30 + len(edges)
    assert result.execution_reward == pytest.approx(100.0)
    assert result.edge_pass_rate == pytest.approx(1.0)
    assert result.floor_penalty == 0.0
    assert result.reward_hack_penalty == 0.0
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def test_verifier_partial_score_on_buggy_impl():
    """An off-by-one implementation matches nothing and trips the hard floor."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    # Identical to the reference fibonacci except it returns b + 1,
    # which is wrong for every valid input.
    buggy = (
        "def fibonacci(n):\n"
        "    if not isinstance(n, int) or n <= 0 or n > 90:\n"
        "        raise ValueError('bad')\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n - 1):\n"
        "        a, b = b, a + b\n"
        "    return b + 1\n"
    )
    inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    result = verify_submission(buggy, spec.fn, inputs, target_name="fibonacci")
    assert result.execution_reward == pytest.approx(0.0)
    assert result.matches == 0
    # Sub-50% match rate triggers the hard floor.
    assert result.floor_penalty == 25.0
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def test_verifier_syntax_error_returns_define_error_and_full_penalty():
    """Unparseable code fails at definition time and takes the full
    complexity + floor penalties with zero execution reward."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    # "def fib(:" is a SyntaxError — the target can never be defined.
    result = verify_submission("def fib(:\n pass", spec.fn, inputs, target_name="fibonacci")
    assert result.define_error is not None
    assert result.execution_reward == 0.0
    assert result.complexity_penalty == 50.0
    assert result.floor_penalty == 25.0
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def test_verifier_missing_target_returns_error():
    """Code that never defines the target function scores zero with an error."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    fuzz = generate_fuzz_inputs(spec, count=10, seed=0)
    outcome = verify_submission(
        "def other(x): return x", spec.fn, fuzz, target_name="fibonacci"
    )
    assert outcome.define_error is not None
    assert outcome.execution_reward == 0.0
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def test_complexity_penalty_low_for_simple_fn():
    """A one-line identity function incurs essentially no complexity penalty."""
    assert calculate_complexity_penalty("def f(x): return x\n") < 1.0
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def test_complexity_penalty_high_for_branchy_fn():
    """A 100-branch if-chain must be penalised as over-complex."""
    branches = "\n    ".join(f"if x == {i}: return {i}" for i in range(100))
    source = f"def f(x):\n    {branches}\n    return -1\n"
    assert calculate_complexity_penalty(source) > 5.0
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# ---------- anti-reward-hacking --------------------------------------------
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def test_sandbox_blocks_import_of_reference_module():
    """Critical regression: previously an agent could write::

        from opensleuth_env.black_box import _fibonacci
        def fibonacci(n): return _fibonacci(n)

    and reward-hack to a perfect score. The hardened sandbox must block this.
    """
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    # The exact hack shape from the original incident report.
    hack = (
        "def fibonacci(n):\n"
        "    from opensleuth_env.black_box import _fibonacci\n"
        "    return _fibonacci(n)\n"
    )
    inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    result = verify_submission(hack, spec.fn, inputs, target_name="fibonacci")
    # Either definition fails (no __import__) or per-call fails. Either way
    # the agent must NOT score positively.
    assert result.execution_reward < 50.0
    # Static detector flagged the import attempt.
    assert result.reward_hack_penalty >= 25.0
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def test_static_detector_flags_opensleuth_import():
    """The static scanner flags any mention of the reference package."""
    hacky = "import opensleuth_env\ndef f(x): return x\n"
    assert _looks_like_reference_import(hacky) is True
    assert _looks_like_reference_import("def f(x): return x\n") is False
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def test_constant_function_collapse_is_penalised():
    """An agent that learns to always return the same value should be
    penalised even if some random inputs happen to match (e.g. for
    `digit_sum`, `lambda x: 0` matches only x=0)."""
    spec = BLACK_BOX_FUNCTIONS["digit_sum"]
    constant_impl = "def digit_sum(n):\n    return 0\n"
    probe_inputs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999]
    outcome = verify_submission(
        constant_impl, spec.fn, probe_inputs, target_name="digit_sum"
    )
    # Every distinct input yields 0 (one output signature) while the
    # reference produces many — the collapse detector must fire.
    assert outcome.reward_hack_penalty >= 15.0
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def test_sandbox_blocks_open_and_eval():
    """Filesystem builtins like ``open`` must be unavailable in the sandbox."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    bad = (
        "def fibonacci(n):\n"
        "    open('/tmp/x', 'w')\n"
        "    return 0\n"
    )
    inputs = generate_fuzz_inputs(spec, count=5, seed=0)
    result = verify_submission(bad, spec.fn, inputs, target_name="fibonacci")
    # Either the per-call NameError on `open` makes everything mismatch,
    # or it raises at definition time. Either way, low reward.
    assert result.execution_reward < 50.0
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ---------- stratified scoring (edge vs random) ----------------------------
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def test_edge_cases_are_always_evaluated():
    """Declared edge inputs are scored even when the fuzzer never rolls them."""
    spec = BLACK_BOX_FUNCTIONS["reverse_string"]
    # Submission that fails the empty-string edge case but works for non-empty.
    code = (
        "def reverse_string(s):\n"
        "    if s == '':\n"
        "        return 'OOPS'\n"
        "    return s[::-1]\n"
    )
    inputs = generate_fuzz_inputs(spec, count=20, seed=0)
    edges = get_edge_inputs(spec)
    assert "" in edges
    result = verify_submission(
        code, spec.fn, inputs, target_name="reverse_string", edge_inputs=edges
    )
    # Should pass most random + most edge except the empty-string edge case.
    assert result.matches_by_category["edge"] == len(edges) - 1
    assert result.edge_pass_rate < 1.0
    assert result.matches_by_category["random"] >= 18  # very rare to roll empty
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
# ---------- end-to-end submission via env ----------------------------------
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def test_env_submit_reference_implementation_gives_high_reward():
    """A perfect submission ends the episode with the full bonus stack."""
    environment = OpenSleuthEnv(fuzz_count=20)
    episode = environment.reset("reverse_string")
    submission = "def reverse_string(s):\n    return s[::-1]\n"
    outcome = environment.step(episode.episode_id, SubmitAction(code=submission))
    assert outcome.done is True
    # 100 execution - tiny complexity penalty + 50 perfect bonus.
    assert outcome.reward > 140.0
    assert outcome.info["execution_reward"] == pytest.approx(100.0)
    assert outcome.info["edge_pass_rate"] == pytest.approx(1.0)
    assert outcome.info["floor_penalty"] == 0.0
    assert outcome.info["reward_hack_penalty"] == 0.0
    assert outcome.info["perfect_bonus"] == 50.0
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def test_env_submit_buggy_function_lands_clearly_negative():
    """Wrong submissions must end up clearly negative so the trainer's GRPO
    advantage penalises 'just emit any function'."""
    environment = OpenSleuthEnv(fuzz_count=10)
    episode = environment.reset("digit_sum")
    outcome = environment.step(
        episode.episode_id, SubmitAction(code="def digit_sum(n):\n    return -1\n")
    )
    assert outcome.done is True
    assert outcome.info["execution_reward"] < 50.0
    assert outcome.reward < 0.0
    assert outcome.info["floor_penalty"] == 25.0
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def test_env_submit_import_hack_scores_clearly_negative():
    """End-to-end: the import-based reward hack submitted through the env
    must terminate the episode with a clearly negative reward."""
    env = OpenSleuthEnv(fuzz_count=10)
    obs = env.reset("fibonacci")
    code = (
        "def fibonacci(n):\n"
        "    from opensleuth_env.black_box import _fibonacci\n"
        "    return _fibonacci(n)\n"
    )
    resp = env.step(obs.episode_id, SubmitAction(code=code))
    assert resp.done is True
    assert resp.reward < 0.0
    assert resp.info["reward_hack_penalty"] >= 25.0
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
# ---------- spec metadata --------------------------------------------------
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def test_all_specs_have_difficulty_and_edge_cases():
    """Every catalogue entry carries a valid difficulty and enough edge cases."""
    allowed = {"easy", "medium", "hard"}
    for fn_name, fn_spec in BLACK_BOX_FUNCTIONS.items():
        assert fn_spec.difficulty in allowed, (
            f"{fn_name} has invalid difficulty {fn_spec.difficulty!r}"
        )
        assert isinstance(fn_spec.edge_cases, list)
        assert len(fn_spec.edge_cases) >= 3, (
            f"{fn_name} should declare >=3 edge cases for robust scoring"
        )
|
tests/test_openenv_conformance.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv 0.2.x protocol conformance tests for the OpenSleuth env.
|
| 2 |
+
|
| 3 |
+
These tests are *additive* and orthogonal to the existing legacy contract
|
| 4 |
+
covered in ``test_env.py`` / ``test_open_env.py``.
|
| 5 |
+
|
| 6 |
+
What we verify:
|
| 7 |
+
|
| 8 |
+
* The OpenEnv ``Environment`` adapter (:class:`OpenSleuthEnvironment`) implements
|
| 9 |
+
all four required methods (``reset`` / ``step`` / ``state`` / ``get_metadata``)
|
| 10 |
+
and returns instances of OpenEnv's ``Observation`` / ``State`` /
|
| 11 |
+
``EnvironmentMetadata`` base classes (so it would pass any ``isinstance``
|
| 12 |
+
check by an OpenEnv-aware harness).
|
| 13 |
+
* The ``/openenv/*`` HTTP sub-app exposes every endpoint OpenEnv 0.2.x
|
| 14 |
+
promises: ``/health``, ``/metadata``, ``/schema``, ``/state``, ``/reset``,
|
| 15 |
+
``/step``. (The ``/ws`` WebSocket is exercised separately via the
|
| 16 |
+
``smoke_openenv_client.py`` script run against the live Space.)
|
| 17 |
+
* ``/openenv/reset`` returns the canonical ``{"observation", "reward", "done"}``
|
| 18 |
+
envelope (NOT a bare observation, which is the legacy shape).
|
| 19 |
+
* ``/openenv/step`` accepts the canonical ``{"action": {...}}`` envelope (NOT
|
| 20 |
+
``{"episode_id", "action"}``, which is the legacy shape).
|
| 21 |
+
* The legacy bare ``/reset`` and ``/step`` routes the trainer uses are
|
| 22 |
+
untouched.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import pytest
|
| 28 |
+
|
| 29 |
+
pytest.importorskip(
|
| 30 |
+
"openenv.core.env_server.types",
|
| 31 |
+
reason="openenv-core not installed; conformance tests skipped.",
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
from fastapi.testclient import TestClient
|
| 35 |
+
from openenv.core.env_server.types import (
|
| 36 |
+
EnvironmentMetadata,
|
| 37 |
+
Observation as OEObservation,
|
| 38 |
+
State as OEState,
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
from opensleuth_env.openenv_adapter import (
|
| 42 |
+
OpenSleuthAction,
|
| 43 |
+
OpenSleuthEnvironment,
|
| 44 |
+
OpenSleuthObservation,
|
| 45 |
+
OpenSleuthState,
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# Adapter-level: exercises the Environment subclass directly (no HTTP).
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class TestEnvironmentSubclass:
    """Exercises the OpenEnv ``Environment`` adapter directly (no HTTP)."""

    def test_observation_inherits_openenv_base(self) -> None:
        # The adapter's observation type must be a real OpenEnv Observation
        # subclass, not a lookalike, so isinstance checks in harnesses pass.
        env = OpenSleuthEnvironment()
        obs = env.reset()
        assert isinstance(obs, OEObservation), (
            "OpenSleuthObservation must subclass openenv.core...types.Observation "
            "so OpenEnv tooling (rubrics, evals, web UI) can introspect it."
        )
        # Must expose the OpenEnv-required fields.
        assert obs.done is False
        assert obs.reward is None
        assert isinstance(obs.metadata, dict)

    def test_state_inherits_openenv_base(self) -> None:
        # state is read *after* reset: a fresh episode id, zero steps taken.
        env = OpenSleuthEnvironment()
        env.reset()
        state = env.state
        assert isinstance(state, OEState)
        assert state.episode_id is not None
        assert state.step_count == 0

    def test_metadata_is_openenv_environment_metadata(self) -> None:
        # get_metadata() must return OpenEnv's EnvironmentMetadata with the
        # three fields the /metadata endpoint serializes.
        env = OpenSleuthEnvironment()
        meta = env.get_metadata()
        assert isinstance(meta, EnvironmentMetadata)
        assert meta.name == "OpenSleuth"
        assert meta.description
        assert meta.version

    def test_reset_step_full_loop(self) -> None:
        # Full probe-then-submit episode through the adapter.
        env = OpenSleuthEnvironment()
        env.reset(target_name="fibonacci", max_steps=10, seed=0)

        probe = env.step(
            OpenSleuthAction(action_type="probe", input_repr="10")
        )
        assert probe.done is False
        assert probe.reward is not None and probe.reward > 0
        assert probe.probe_history[-1]["output_repr"] == "55"
        assert env.state.step_count == 1

        # Submitting always terminates the episode and yields some reward
        # (positive or negative), flipping the adapter's finished flag.
        submit = env.step(
            OpenSleuthAction(
                action_type="submit",
                code="def fibonacci(n):\n a,b=0,1\n for _ in range(n-1):\n a,b=b,a+b\n return b\n",
            )
        )
        assert submit.done is True
        assert submit.reward is not None
        assert env.state.finished is True

    def test_reset_with_no_args_uses_safe_default(self) -> None:
        """OpenEnv requires reset() to work with zero arguments. We use
        'fibonacci' as the implicit default so a bare reset always produces
        a valid episode."""
        env = OpenSleuthEnvironment()
        obs = env.reset()
        assert obs.target_function_name == "fibonacci"

    def test_supports_concurrent_sessions_flag(self) -> None:
        """OpenEnv's HTTPEnvServer refuses max_concurrent_envs > 1 unless
        the env opts in via SUPPORTS_CONCURRENT_SESSIONS."""
        assert OpenSleuthEnvironment.SUPPORTS_CONCURRENT_SESSIONS is True

    def test_action_is_extra_forbid(self) -> None:
        """OpenEnv Action base sets extra='forbid' to catch typo'd fields
        early. Our OpenSleuthAction must inherit that behavior."""
        from pydantic import ValidationError

        with pytest.raises(ValidationError):
            OpenSleuthAction(action_type="probe", input_repr="1", made_up_field=1)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
# HTTP-level: verifies the /openenv/* sub-app routes that judges will hit.
|
| 129 |
+
# ---------------------------------------------------------------------------
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@pytest.fixture(scope="module")
def http_client() -> TestClient:
    """Module-scoped TestClient over the full app (legacy + /openenv routes).

    NOTE(review): as a yield fixture, a more precise annotation would be
    ``Iterator[TestClient]`` — confirm before changing.
    """
    # Imported lazily so collecting this module does not import the server
    # before the importorskip("openenv...") guard at the top has run.
    from server import app

    with TestClient(app) as client:
        yield client
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class TestOpenEnvHttpSurface:
    """The endpoints the OpenEnv spec / `openenv validate` look for."""

    def test_health(self, http_client: TestClient) -> None:
        # OpenEnv health contract: exact {"status": "healthy"} payload.
        r = http_client.get("/openenv/health")
        assert r.status_code == 200, r.text
        assert r.json() == {"status": "healthy"}

    def test_metadata(self, http_client: TestClient) -> None:
        r = http_client.get("/openenv/metadata")
        assert r.status_code == 200, r.text
        body = r.json()
        for key in ("name", "description", "version"):
            assert key in body, f"missing {key} in /openenv/metadata"
        assert body["name"] == "OpenSleuth"

    def test_schema(self, http_client: TestClient) -> None:
        # /schema must expose JSON schemas for all three model kinds.
        r = http_client.get("/openenv/schema")
        assert r.status_code == 200, r.text
        body = r.json()
        for key in ("action", "observation", "state"):
            assert key in body, f"missing {key} in /openenv/schema"
            assert "properties" in body[key], (
                f"/openenv/schema {key!r} is not a valid JSON schema"
            )
        # action discriminator should be visible in the schema
        assert "action_type" in body["action"]["properties"]

    def test_state(self, http_client: TestClient) -> None:
        r = http_client.get("/openenv/state")
        assert r.status_code == 200, r.text
        body = r.json()
        assert "episode_id" in body
        assert "step_count" in body

    def test_reset_returns_canonical_envelope(self, http_client: TestClient) -> None:
        r = http_client.post("/openenv/reset", json={"target_name": "fibonacci"})
        assert r.status_code == 200, r.text
        body = r.json()
        # Canonical OpenEnv shape: {"observation": {...}, "reward": ..., "done": ...}
        assert set(body.keys()) == {"observation", "reward", "done"}, (
            f"Expected OpenEnv envelope, got keys: {sorted(body)}"
        )
        assert body["done"] is False
        assert body["observation"]["target_function_name"] == "fibonacci"

    def test_reset_with_no_body_works(self, http_client: TestClient) -> None:
        """OpenEnv ResetRequest defaults to an empty body. Must still work."""
        r = http_client.post("/openenv/reset")
        assert r.status_code == 200, r.text
        body = r.json()
        assert "observation" in body

    def test_step_canonical_envelope_with_probe(self, http_client: TestClient) -> None:
        r = http_client.post(
            "/openenv/step",
            json={"action": {"action_type": "probe", "input_repr": "10"}},
        )
        assert r.status_code == 200, r.text
        body = r.json()
        assert set(body.keys()) == {"observation", "reward", "done"}
        # Note: under HTTP (stateless), each /openenv/step gets a fresh env;
        # we auto-reset so a probe still produces a valid history.
        assert body["observation"]["probe_history"], "probe should produce history"

    def test_step_rejects_unknown_action_field(self, http_client: TestClient) -> None:
        r = http_client.post(
            "/openenv/step",
            json={"action": {"action_type": "probe", "input_repr": "1", "wat": True}},
        )
        # OpenEnv's deserialize_action raises ValidationError -> 422.
        assert r.status_code == 422
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# ---------------------------------------------------------------------------
|
| 215 |
+
# Regression: the legacy trainer-facing routes must still work unchanged.
|
| 216 |
+
# ---------------------------------------------------------------------------
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
class TestLegacyContractPreserved:
    """Regression guard: the trainer-facing legacy routes must not change
    while the /openenv/* sub-app exists alongside them."""

    def test_legacy_health(self, http_client: TestClient) -> None:
        # Legacy health payload uses "ok" (OpenEnv uses "healthy").
        r = http_client.get("/health")
        assert r.status_code == 200
        assert r.json()["status"] == "ok"

    def test_legacy_reset_returns_bare_observation(self, http_client: TestClient) -> None:
        """Trainer expects {episode_id, target_function_name, ...} at the top
        level (NOT wrapped in {observation: ...}). Must NOT regress."""
        r = http_client.post(
            "/reset",
            json={"target_name": "fibonacci", "seed": 0, "max_steps": 5},
        )
        assert r.status_code == 200, r.text
        body = r.json()
        assert "episode_id" in body, (
            "Legacy /reset must return a bare observation, not the OpenEnv envelope. "
            "If this fails the trainer will break."
        )
        assert "observation" not in body  # don't accidentally double-wrap

    def test_legacy_step_returns_step_response(self, http_client: TestClient) -> None:
        # Legacy /step is episode-addressed: the client passes the id
        # obtained from /reset along with the action payload.
        reset = http_client.post(
            "/reset",
            json={"target_name": "fibonacci", "seed": 0, "max_steps": 5},
        ).json()
        eid = reset["episode_id"]
        r = http_client.post(
            "/step",
            json={
                "episode_id": eid,
                "action": {"action_type": "probe", "input_repr": "5"},
            },
        )
        assert r.status_code == 200, r.text
        body = r.json()
        # Legacy shape: {observation, reward, done, info}
        assert {"observation", "reward", "done", "info"} <= set(body.keys())
        assert "execution_reward" not in body  # only present on submit info
|