burtenshaw HF Staff commited on
Commit
ed784ca
·
verified ·
1 Parent(s): 47149b0

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +10 -6
  2. README.md +18 -1
  3. client.py +2 -2
  4. envs/tbench2_env/README.md +207 -0
  5. envs/tbench2_env/__init__.py +18 -0
  6. envs/tbench2_env/client.py +75 -0
  7. envs/tbench2_env/models.py +58 -0
  8. envs/tbench2_env/openenv.yaml +7 -0
  9. envs/tbench2_env/openenv_tbench2_env.egg-info/PKG-INFO +13 -0
  10. envs/tbench2_env/openenv_tbench2_env.egg-info/SOURCES.txt +14 -0
  11. envs/tbench2_env/openenv_tbench2_env.egg-info/dependency_links.txt +1 -0
  12. envs/tbench2_env/openenv_tbench2_env.egg-info/entry_points.txt +2 -0
  13. envs/tbench2_env/openenv_tbench2_env.egg-info/requires.txt +11 -0
  14. envs/tbench2_env/openenv_tbench2_env.egg-info/top_level.txt +1 -0
  15. envs/tbench2_env/pyproject.toml +44 -0
  16. envs/tbench2_env/server/Dockerfile +81 -0
  17. envs/tbench2_env/server/__init__.py +12 -0
  18. envs/tbench2_env/server/app.py +107 -0
  19. envs/tbench2_env/server/tbench2_env_environment.py +728 -0
  20. openenv_tbench2_env.egg-info/PKG-INFO +13 -0
  21. openenv_tbench2_env.egg-info/SOURCES.txt +14 -0
  22. openenv_tbench2_env.egg-info/dependency_links.txt +1 -0
  23. openenv_tbench2_env.egg-info/entry_points.txt +2 -0
  24. openenv_tbench2_env.egg-info/requires.txt +11 -0
  25. openenv_tbench2_env.egg-info/top_level.txt +1 -0
  26. pyproject.toml +1 -1
  27. server/Dockerfile +81 -0
  28. server/app.py +6 -3
  29. server/tbench2_env_environment.py +47 -13
  30. src/__init__.py +7 -0
  31. src/core/README.md +212 -0
  32. src/core/__init__.py +81 -0
  33. src/core/client_types.py +23 -0
  34. src/core/containers/__init__.py +7 -0
  35. src/core/containers/images/Dockerfile +64 -0
  36. src/core/containers/images/README.md +92 -0
  37. src/core/containers/runtime/__init__.py +25 -0
  38. src/core/containers/runtime/daytona_provider.py +572 -0
  39. src/core/containers/runtime/providers.py +669 -0
  40. src/core/containers/runtime/uv_provider.py +224 -0
  41. src/core/containers/test_local_docker_provider.py +260 -0
  42. src/core/env_client.py +484 -0
  43. src/core/env_server/__init__.py +150 -0
  44. src/core/env_server/base_transforms.py +29 -0
  45. src/core/env_server/exceptions.py +105 -0
  46. src/core/env_server/gradio_theme.py +128 -0
  47. src/core/env_server/gradio_ui.py +240 -0
  48. src/core/env_server/http_server.py +1646 -0
  49. src/core/env_server/interfaces.py +297 -0
  50. src/core/env_server/mcp_environment.py +645 -0
Dockerfile CHANGED
@@ -11,13 +11,13 @@
11
  # The build script (openenv build) handles context detection and sets appropriate build args.
12
 
13
  ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
- FROM ${BASE_IMAGE} AS builder
15
 
16
  WORKDIR /app
17
 
18
  # Ensure git is available (required for installing dependencies from VCS)
19
  RUN apt-get update && \
20
- apt-get install -y --no-install-recommends git && \
21
  rm -rf /var/lib/apt/lists/*
22
 
23
  # Build argument to control whether we're building standalone or in-repo
@@ -40,22 +40,26 @@ RUN if ! command -v uv >/dev/null 2>&1; then \
40
 
41
  # Install dependencies using uv sync
42
  # If uv.lock exists, use it; otherwise resolve on the fly
 
 
 
 
43
  RUN --mount=type=cache,target=/root/.cache/uv \
44
  if [ -f uv.lock ]; then \
45
- uv sync --frozen --no-install-project --no-editable; \
46
  else \
47
  uv sync --no-install-project --no-editable; \
48
  fi
49
 
50
  RUN --mount=type=cache,target=/root/.cache/uv \
51
  if [ -f uv.lock ]; then \
52
- uv sync --frozen --no-editable; \
53
  else \
54
  uv sync --no-editable; \
55
  fi
56
 
57
  # Final runtime stage
58
- FROM ${BASE_IMAGE}
59
 
60
  WORKDIR /app
61
 
@@ -70,6 +74,7 @@ ENV PATH="/app/.venv/bin:$PATH"
70
 
71
  # Set PYTHONPATH so imports work correctly
72
  ENV PYTHONPATH="/app/env:$PYTHONPATH"
 
73
 
74
  # Health check
75
  HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
@@ -77,5 +82,4 @@ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
77
 
78
  # Run the FastAPI server
79
  # The module path is constructed to work with the /app/env structure
80
- ENV ENABLE_WEB_INTERFACE=true
81
  CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
 
11
  # The build script (openenv build) handles context detection and sets appropriate build args.
12
 
13
  ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ghcr.io/meta-pytorch/openenv-base:latest AS builder
15
 
16
  WORKDIR /app
17
 
18
  # Ensure git is available (required for installing dependencies from VCS)
19
  RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git gcc python3-dev && \
21
  rm -rf /var/lib/apt/lists/*
22
 
23
  # Build argument to control whether we're building standalone or in-repo
 
40
 
41
  # Install dependencies using uv sync
42
  # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
44
+ install -m 0755 /root/.local/bin/uv /usr/local/bin/uv && \
45
+ install -m 0755 /root/.local/bin/uvx /usr/local/bin/uvx
46
+
47
  RUN --mount=type=cache,target=/root/.cache/uv \
48
  if [ -f uv.lock ]; then \
49
+ uv sync --no-install-project --no-editable; \
50
  else \
51
  uv sync --no-install-project --no-editable; \
52
  fi
53
 
54
  RUN --mount=type=cache,target=/root/.cache/uv \
55
  if [ -f uv.lock ]; then \
56
+ uv sync --no-editable; \
57
  else \
58
  uv sync --no-editable; \
59
  fi
60
 
61
  # Final runtime stage
62
+ FROM ghcr.io/meta-pytorch/openenv-base:latest
63
 
64
  WORKDIR /app
65
 
 
74
 
75
  # Set PYTHONPATH so imports work correctly
76
  ENV PYTHONPATH="/app/env:$PYTHONPATH"
77
+ ENV ENABLE_WEB_INTERFACE=true
78
 
79
  # Health check
80
  HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
 
82
 
83
  # Run the FastAPI server
84
  # The module path is constructed to work with the /app/env structure
 
85
  CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -2,17 +2,34 @@
2
  title: TB2 Environment Server
3
  emoji: "🧪"
4
  colorFrom: blue
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  app_port: 8000
9
  base_path: /web
10
  tags:
 
11
  - openenv
12
  - terminal-bench-2
13
  - spaces
14
  ---
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # TB2 Environment (Terminal-Bench 2)
17
 
18
  OpenEnv wrapper for [Terminal-Bench 2](https://github.com/laude-institute/terminal-bench-2) tasks. Supports two execution modes:
 
2
  title: TB2 Environment Server
3
  emoji: "🧪"
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
8
  app_port: 8000
9
  base_path: /web
10
  tags:
11
+ - openenv-0.2.3
12
  - openenv
13
  - terminal-bench-2
14
  - spaces
15
  ---
16
 
17
+ ## Hugging Face Space Deployment
18
+
19
+ This Space is built from OpenEnv environment `tbench2_env`.
20
+
21
+ - Space URL: `https://huggingface.co/spaces/openenv/tbench2`
22
+ - OpenEnv pinned ref: `0.2.3`
23
+ - Hub tag: `openenv`
24
+
25
+ ### Connecting from Code
26
+
27
+ ```python
28
+ from envs.tbench2_env import Env
29
+
30
+ env = Env(base_url="https://huggingface.co/spaces/openenv/tbench2")
31
+ ```
32
+
33
  # TB2 Environment (Terminal-Bench 2)
34
 
35
  OpenEnv wrapper for [Terminal-Bench 2](https://github.com/laude-institute/terminal-bench-2) tasks. Supports two execution modes:
client.py CHANGED
@@ -19,12 +19,12 @@ try:
19
 
20
  from .models import Tbench2Action, Tbench2Observation, Tbench2State
21
  except ImportError:
 
 
22
  # Standalone imports (when environment is standalone with openenv from pip)
23
  from openenv.core.client_types import StepResult
24
  from openenv.core.env_client import EnvClient
25
 
26
- from models import Tbench2Action, Tbench2Observation, Tbench2State
27
-
28
 
29
  class Tbench2Env(EnvClient[Tbench2Action, Tbench2Observation, Tbench2State]):
30
  """HTTP client for the TB2 environment."""
 
19
 
20
  from .models import Tbench2Action, Tbench2Observation, Tbench2State
21
  except ImportError:
22
+ from models import Tbench2Action, Tbench2Observation, Tbench2State
23
+
24
  # Standalone imports (when environment is standalone with openenv from pip)
25
  from openenv.core.client_types import StepResult
26
  from openenv.core.env_client import EnvClient
27
 
 
 
28
 
29
  class Tbench2Env(EnvClient[Tbench2Action, Tbench2Observation, Tbench2State]):
30
  """HTTP client for the TB2 environment."""
envs/tbench2_env/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: TB2 Environment Server
3
+ emoji: "🧪"
4
+ colorFrom: blue
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
+ - terminal-bench-2
13
+ - spaces
14
+ ---
15
+
16
+ # TB2 Environment (Terminal-Bench 2)
17
+
18
+ OpenEnv wrapper for [Terminal-Bench 2](https://github.com/laude-institute/terminal-bench-2) tasks. Supports two execution modes:
19
+
20
+ | Mode | Description | Use Case |
21
+ |------|-------------|----------|
22
+ | **Local** | Runs commands in the server process (no Docker) | Hugging Face Spaces, environments without Docker access |
23
+ | **Docker** | Runs each task in its own container | Full TB2.0 fidelity with custom task images |
24
+
25
+ ## Quick Start
26
+
27
+ ```python
28
+ from tbench2_env import Tbench2Env, Tbench2Action
29
+
30
+ env = Tbench2Env(base_url="http://localhost:8000")
31
+ result = env.reset(task_id="headless-terminal")
32
+ print(result.observation.instruction)
33
+
34
+ result = env.step(Tbench2Action(action_type="exec", command="ls -la"))
35
+ print(result.observation.output)
36
+
37
+ result = env.step(Tbench2Action(action_type="evaluate"))
38
+ print(result.reward, result.done)
39
+
40
+ env.close()
41
+ ```
42
+
43
+ ## Building the Docker Image
44
+
45
+ Before using the environment, build the Docker image:
46
+
47
+ ```bash
48
+ # From project root
49
+ docker build -t tbench2-env:latest -f envs/tbench2_env/server/Dockerfile .
50
+ ```
51
+
52
+ ## Environment Details
53
+
54
+ ### Action
55
+ **Tbench2Action**: Controls interaction with the TB2 task session
56
+
57
+ | Field | Type | Default | Description |
58
+ |-------|------|---------|-------------|
59
+ | `action_type` | str | `"exec"` | Action to perform (`exec`, `write`, `view`, `wait`, `kill`, `write_file`, `evaluate`, `close`) |
60
+ | `command` | str | `""` | Shell command or input to send |
61
+ | `session_id` | str \| None | `None` | Session ID for streaming processes |
62
+ | `block` | bool | `True` | Whether to block until command completes |
63
+ | `wait_seconds` | float \| None | `None` | Time to wait (for `wait` action) |
64
+ | `file_path` | str | `""` | File path (for `write_file` action) |
65
+ | `content` | str | `""` | Content to write (for `write_file` action) |
66
+
67
+ ### Observation
68
+ **Tbench2Observation**: Contains the environment response
69
+
70
+ | Field | Type | Description |
71
+ |-------|------|-------------|
72
+ | `instruction` | str | Task instruction/prompt from the TB2 task |
73
+ | `output` | str | Command output (stdout/stderr) |
74
+ | `success` | bool | Whether the action succeeded |
75
+ | `error` | str | Error message if action failed |
76
+ | `task_id` | str | Current task identifier |
77
+ | `task_path` | str | Path to the task directory |
78
+ | `session_id` | str \| None | Session ID for streaming processes |
79
+ | `action_type` | str | The action type that produced this observation |
80
+ | `info` | dict | Additional metadata |
81
+
82
+ ### State
83
+ **Tbench2State**: Server-side state for the task session
84
+
85
+ | Field | Type | Description |
86
+ |-------|------|-------------|
87
+ | `task_id` | str | Current task identifier |
88
+ | `task_path` | str | Path to the task directory |
89
+ | `session_id` | str | Active session ID |
90
+ | `terminal_ready` | bool | Whether the terminal is ready for commands |
91
+ | `last_action_type` | str | Last action type executed |
92
+ | `last_command` | str | Last command executed |
93
+ | `last_output` | str | Output from last command |
94
+
95
+ ## Execution Modes
96
+
97
+ ### Local Mode (Default)
98
+
99
+ Commands execute directly in the server process. Ideal for HF Spaces where Docker-in-Docker is unavailable.
100
+
101
+ ```bash
102
+ # Default - local mode
103
+ python -m tbench2_env.server.app
104
+
105
+ # Or explicitly set mode
106
+ TB2_MODE=local python -m tbench2_env.server.app
107
+ ```
108
+
109
+ **Note:** Local mode ignores Docker images specified in task.toml. Tasks requiring specific runtime environments may fail.
110
+
111
+ ### Docker Mode
112
+
113
+ Each task runs in its own Docker container, using the image specified in the task's `task.toml`:
114
+
115
+ ```bash
116
+ # Enable Docker mode
117
+ TB2_MODE=docker python -m tbench2_env.server.app
118
+ ```
119
+
120
+ **Requirements:**
121
+ - Docker socket mounted at `/var/run/docker.sock`
122
+ - Sufficient disk space for container images
123
+ - Network access to pull images if not cached
124
+
125
+ **Environment Variables for Docker Mode:**
126
+ - `TB2_MODE=docker` - Enable Docker-backed execution
127
+ - Docker socket must be accessible (mounted volume)
128
+
129
+ ## Action Types
130
+
131
+ | Action | Description | Required Fields |
132
+ |--------|-------------|-----------------|
133
+ | `exec` | Run a shell command | `command`, optionally `block`, `session_id` |
134
+ | `write` | Send input to a running session | `session_id`, `command` |
135
+ | `view` | Read pending output | `session_id` |
136
+ | `wait` | Wait for output | `session_id`, optionally `wait_seconds` |
137
+ | `kill` | Terminate a running session | `session_id` |
138
+ | `write_file` | Write content to a file | `file_path`, `content` |
139
+ | `evaluate` | Run pytest tests, return reward | (none) |
140
+ | `close` | Stop and cleanup | (none) |
141
+
142
+ ## Session IDs (Streaming Processes)
143
+
144
+ `session_id` is **only** required when you start a non-blocking process and want to interact with it (`write`, `view`, `wait`, `kill`). For plain `exec` commands, you can omit it.
145
+
146
+ Example (Python):
147
+ ```python
148
+ # Start a long-running process
149
+ env.step(Tbench2Action(action_type="exec", command="python -i", block=False, session_id="sess1"))
150
+
151
+ # Send input to it
152
+ env.step(Tbench2Action(action_type="write", session_id="sess1", command="print(2+2)\n"))
153
+
154
+ # Read its output
155
+ env.step(Tbench2Action(action_type="view", session_id="sess1"))
156
+ ```
157
+
158
+ ## Environment Variables
159
+
160
+ | Variable | Default | Description |
161
+ |----------|---------|-------------|
162
+ | `TB2_MODE` | `local` | Execution mode: `local` or `docker` |
163
+ | `TB2_TASKS_DIR` | (auto-download) | Path to local Terminal-Bench-2 repo checkout |
164
+ | `TB2_OUTPUT_DIR` | `/tmp/tbench2_env_runs` | Directory for session logs and cache |
165
+ | `TB2_CACHE_DIR` | `$TB2_OUTPUT_DIR/repo_cache` | Where to extract TB2 repo |
166
+ | `TB2_REPO_URL` | (GitHub main.zip) | Repo zip URL for auto-download |
167
+
168
+ ## Reward
169
+
170
+ Binary reward on `evaluate` action:
171
+ - `1.0` - All pytest tests pass (exit code 0)
172
+ - `0.0` - Tests fail (non-zero exit code)
173
+
174
+ Intermediate steps return `reward=None`.
175
+
176
+ ## Running the Server
177
+
178
+ ```bash
179
+ # Install dependencies
180
+ uv sync --all-extras
181
+
182
+ # Local mode (default, for Spaces)
183
+ python -m tbench2_env.server.app --port 8000
184
+
185
+ # Docker mode (full TB2.0 compatibility)
186
+ TB2_MODE=docker python -m tbench2_env.server.app --port 8000
187
+
188
+ # With local TB2 repo
189
+ TB2_TASKS_DIR=/path/to/terminal-bench-2 python -m tbench2_env.server.app
190
+ ```
191
+
192
+ ## Project Structure
193
+
194
+ ```
195
+ tbench2_env/
196
+ ├── __init__.py # Module exports (Tbench2Env, Tbench2Action, etc.)
197
+ ├── README.md # This file
198
+ ├── client.py # Tbench2Env client implementation
199
+ ├── models.py # Tbench2Action, Tbench2Observation, Tbench2State
200
+ ├── openenv.yaml # OpenEnv configuration
201
+ ├── pyproject.toml # Package dependencies
202
+ └── server/
203
+ ├── __init__.py # Server exports
204
+ ├── app.py # FastAPI application
205
+ ├── tbench2_env_environment.py # Core environment logic
206
+ └── Dockerfile # Container image definition
207
+ ```
envs/tbench2_env/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Tbench2 Env Environment."""
8
+
9
+ from .client import Tbench2Env
10
+ from .models import Tbench2Action, Tbench2Observation, Tbench2State
11
+
12
+
13
+ __all__ = [
14
+ "Tbench2Action",
15
+ "Tbench2Observation",
16
+ "Tbench2Env",
17
+ "Tbench2State",
18
+ ]
envs/tbench2_env/client.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """TB2 Environment Client."""
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+
14
+ # Support both in-repo and standalone imports
15
+ try:
16
+ # In-repo imports (when running from OpenEnv repository)
17
+ from openenv.core.client_types import StepResult
18
+ from openenv.core.env_client import EnvClient
19
+
20
+ from .models import Tbench2Action, Tbench2Observation, Tbench2State
21
+ except ImportError:
22
+ from models import Tbench2Action, Tbench2Observation, Tbench2State
23
+
24
+ # Standalone imports (when environment is standalone with openenv from pip)
25
+ from openenv.core.client_types import StepResult
26
+ from openenv.core.env_client import EnvClient
27
+
28
+
29
class Tbench2Env(EnvClient[Tbench2Action, Tbench2Observation, Tbench2State]):
    """HTTP client for the TB2 environment."""

    def _step_payload(self, action: Tbench2Action) -> dict[str, Any]:
        """Serialize *action* into the JSON payload sent to the server."""
        # Forward every action field; the server decides which ones are
        # relevant to the requested action_type.
        field_names = (
            "action_type",
            "command",
            "session_id",
            "block",
            "wait_seconds",
            "file_path",
            "content",
        )
        return {name: getattr(action, name) for name in field_names}

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[Tbench2Observation]:
        """Build a StepResult[Tbench2Observation] from a raw step response."""
        raw_obs = payload.get("observation", {})
        reward = payload.get("reward")
        done = payload.get("done", False)
        observation = Tbench2Observation(
            instruction=raw_obs.get("instruction", ""),
            output=raw_obs.get("output", ""),
            success=raw_obs.get("success", True),
            error=raw_obs.get("error", ""),
            task_id=raw_obs.get("task_id", ""),
            task_path=raw_obs.get("task_path", ""),
            session_id=raw_obs.get("session_id"),
            action_type=raw_obs.get("action_type", ""),
            info=raw_obs.get("info", {}),
            # reward/done are mirrored onto the observation as well as the
            # StepResult, matching the server's response shape.
            reward=reward,
            done=done,
            metadata=raw_obs.get("metadata", {}),
        )
        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: dict[str, Any]) -> Tbench2State:
        """Build a Tbench2State from a raw state response payload."""
        get = payload.get
        return Tbench2State(
            episode_id=get("episode_id"),
            step_count=get("step_count", 0),
            task_id=get("task_id", ""),
            task_path=get("task_path", ""),
            terminal_ready=get("terminal_ready", False),
            last_action_type=get("last_action_type", ""),
            last_command=get("last_command", ""),
            last_output=get("last_output", ""),
        )
envs/tbench2_env/models.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Data models for the TB2 environment.
9
+ """
10
+
11
+ from pydantic import Field
12
+
13
+
14
# Support both in-repo and standalone imports.
# NOTE(review): the original try/except ImportError fallback imported the
# *identical* module path in both branches, so the fallback could never
# resolve anything different; a single import is equivalent and clearer.
from openenv.core.env_server.types import Action, Observation, State
21
+
22
+
23
class Tbench2Action(Action):
    """Action for interacting with a TB2 task session."""

    # Operation to perform: "exec", "write", "view", "wait", "kill",
    # "write_file", "evaluate", or "close".
    action_type: str = Field(default="exec")
    # Shell command (for "exec") or raw input to send (for "write").
    command: str = Field(default="")
    # Target session for streaming actions; None for plain blocking exec.
    session_id: str | None = Field(default=None)
    # When True, "exec" blocks until the command completes.
    block: bool = Field(default=True)
    # Seconds to wait (used by the "wait" action).
    wait_seconds: float | None = Field(default=None)
    # Destination path (used by the "write_file" action).
    file_path: str = Field(default="")
    # File contents (used by the "write_file" action).
    content: str = Field(default="")
33
+
34
+
35
class Tbench2Observation(Observation):
    """Observation returned from the TB2 environment."""

    # Task instruction/prompt from the TB2 task definition.
    instruction: str = Field(default="")
    # Command output (stdout/stderr) from the last action.
    output: str = Field(default="")
    # Whether the action succeeded.
    success: bool = Field(default=True)
    # Error message when the action failed, otherwise "".
    error: str = Field(default="")
    # Current task identifier.
    task_id: str = Field(default="")
    # Path to the task directory.
    task_path: str = Field(default="")
    # Session ID for streaming processes; None when not applicable.
    session_id: str | None = Field(default=None)
    # The action_type that produced this observation.
    action_type: str = Field(default="")
    # Additional metadata returned by the server.
    info: dict = Field(default_factory=dict)
47
+
48
+
49
class Tbench2State(State):
    """Server-side state for a TB2 task."""

    # Current task identifier.
    task_id: str = Field(default="")
    # Path to the task directory.
    task_path: str = Field(default="")
    # Active session ID ("" when no streaming session is open).
    session_id: str = Field(default="")
    # Whether the terminal is ready to accept commands.
    terminal_ready: bool = Field(default=False)
    # Last action type executed.
    last_action_type: str = Field(default="")
    # Last command executed.
    last_command: str = Field(default="")
    # Output from the last command.
    last_output: str = Field(default="")
envs/tbench2_env/openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: tbench2
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
envs/tbench2_env/openenv_tbench2_env.egg-info/PKG-INFO ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: openenv-tbench2_env
3
+ Version: 0.1.0
4
+ Summary: Tbench2 Env environment for OpenEnv
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: openenv-core[core]>=0.2.2
7
+ Requires-Dist: pytest>=8.4.0
8
+ Requires-Dist: camel-ai
9
+ Requires-Dist: docker>=7.0.0
10
+ Requires-Dist: tomli>=2.0.0; python_version < "3.11"
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
13
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
envs/tbench2_env/openenv_tbench2_env.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ ./__init__.py
4
+ ./client.py
5
+ ./models.py
6
+ openenv_tbench2_env.egg-info/PKG-INFO
7
+ openenv_tbench2_env.egg-info/SOURCES.txt
8
+ openenv_tbench2_env.egg-info/dependency_links.txt
9
+ openenv_tbench2_env.egg-info/entry_points.txt
10
+ openenv_tbench2_env.egg-info/requires.txt
11
+ openenv_tbench2_env.egg-info/top_level.txt
12
+ server/__init__.py
13
+ server/app.py
14
+ server/tbench2_env_environment.py
envs/tbench2_env/openenv_tbench2_env.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
envs/tbench2_env/openenv_tbench2_env.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ server = tbench2_env.server.app:main
envs/tbench2_env/openenv_tbench2_env.egg-info/requires.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ pytest>=8.4.0
3
+ camel-ai
4
+ docker>=7.0.0
5
+
6
+ [:python_version < "3.11"]
7
+ tomli>=2.0.0
8
+
9
+ [dev]
10
+ pytest>=8.0.0
11
+ pytest-cov>=4.0.0
envs/tbench2_env/openenv_tbench2_env.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tbench2_env
envs/tbench2_env/pyproject.toml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-tbench2_env"
13
+ version = "0.1.0"
14
+ description = "Tbench2 Env environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git@v0.2.3",
20
+ "pytest>=8.4.0",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ "camel-ai",
24
+ # Docker-backed mode (optional, for full TB2.0 compatibility)
25
+ "docker>=7.0.0",
26
+ # TOML parsing (tomllib for Python 3.11+, tomli for older versions)
27
+ "tomli>=2.0.0; python_version < '3.11'",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ dev = [
32
+ "pytest>=8.0.0",
33
+ "pytest-cov>=4.0.0",
34
+ ]
35
+
36
+ [project.scripts]
37
+ # Server entry point - enables running via: uv run --project . server
38
+ # or: python -m tbench2_env.server.app
39
+ server = "tbench2_env.server.app:main"
40
+
41
+ [tool.setuptools]
42
+ include-package-data = true
43
+ packages = ["tbench2_env", "tbench2_env.server"]
44
+ package-dir = { "tbench2_env" = ".", "tbench2_env.server" = "server" }
envs/tbench2_env/server/Dockerfile ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
+
16
+ WORKDIR /app
17
+
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git gcc python3-dev && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=tbench2_env
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
+
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
64
+
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
+
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
70
+
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
+ ENV ENABLE_WEB_INTERFACE=true
74
+
75
+ # Health check
76
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
77
+ CMD curl -f http://localhost:8000/health || exit 1
78
+
79
+ # Run the FastAPI server
80
+ # The module path is constructed to work with the /app/env structure
81
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
envs/tbench2_env/server/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Tbench2 Env environment server components."""
8
+
9
+ from .tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
10
+
11
+
12
+ __all__ = ["Tbench2Environment", "Tbench2DockerEnvironment"]
envs/tbench2_env/server/app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the Tbench2 Env Environment.
9
+
10
+ This module creates an HTTP server that exposes the Tbench2Environment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ Endpoints:
14
+ - POST /reset: Reset the environment
15
+ - POST /step: Execute an action
16
+ - GET /state: Get current environment state
17
+ - GET /schema: Get action/observation schemas
18
+ - WS /ws: WebSocket endpoint for persistent sessions
19
+
20
+ Usage:
21
+ # Development (with auto-reload):
22
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
+
24
+ # Production:
25
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
+
27
+ # Or run directly:
28
+ python -m server.app
29
+ """
30
+
31
+ import os
32
+
33
+
34
+ try:
35
+ from openenv.core.env_server.http_server import create_app
36
+
37
+ # In-repo imports
38
+ from tbench2_env.models import Tbench2Action, Tbench2Observation
39
+
40
+ from .tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
41
+ except Exception as e: # pragma: no cover
42
+ from models import Tbench2Action, Tbench2Observation
43
+
44
+ # Standalone imports (when environment is standalone with openenv from pip)
45
+ from openenv.core.env_server.http_server import create_app
46
+ from server.tbench2_env_environment import (
47
+ Tbench2DockerEnvironment,
48
+ Tbench2Environment,
49
+ )
50
+
51
+ _IMPORT_ERROR = e
52
+
53
+
54
# Determine which environment class to use based on TB2_MODE.
_TB2_MODE = os.getenv("TB2_MODE", "local").lower()

if _TB2_MODE == "docker":
    _DEFAULT_ENVIRONMENT = Tbench2DockerEnvironment
    _ENV_SUFFIX = " (Docker mode)"
elif _TB2_MODE == "auto":
    # NOTE(review): despite the name, "auto" does not probe for Docker —
    # this branch unconditionally selects the local environment, so it
    # currently behaves exactly like "local" except for the display suffix.
    _DEFAULT_ENVIRONMENT = Tbench2Environment
    _ENV_SUFFIX = " (auto-detect mode)"
else:
    # Any other value (including the default "local") runs in-process.
    _DEFAULT_ENVIRONMENT = Tbench2Environment
    _ENV_SUFFIX = " (local mode)"
67
+
68
+
69
+ # Create the app with web interface and README integration
70
+ app = create_app(
71
+ _DEFAULT_ENVIRONMENT,
72
+ Tbench2Action,
73
+ Tbench2Observation,
74
+ env_name="tbench2_env" + _ENV_SUFFIX,
75
+ max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
76
+ )
77
+
78
+
79
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """
    Entry point for direct execution via uv run or python -m.

    This function enables running the server without Docker:
        uv run --project . server
        uv run --project . server --port 8001
        python -m tbench2_env.server.app

    Args:
        host: Host address to bind to (default: "0.0.0.0")
        port: Port number to listen on (default: 8000)

    For production deployments, consider using uvicorn directly with
    multiple workers:
        uvicorn tbench2_env.server.app:app --workers 4
    """
    # Imported lazily so importing this module (e.g. for the `app` object)
    # does not require uvicorn at import time.
    import uvicorn

    uvicorn.run(app, host=host, port=port)
99
+
100
+
101
if __name__ == "__main__":
    import argparse

    # main() accepts both host and port, but the original CLI only exposed
    # --port; expose --host too with the same default so behavior is
    # backward compatible.
    parser = argparse.ArgumentParser(description="Run the TB2 environment server.")
    parser.add_argument(
        "--host", default="0.0.0.0", help="Bind address (default: 0.0.0.0)"
    )
    parser.add_argument(
        "--port", type=int, default=8000, help="Port to listen on (default: 8000)"
    )
    args = parser.parse_args()
    main(host=args.host, port=args.port)
envs/tbench2_env/server/tbench2_env_environment.py ADDED
@@ -0,0 +1,728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """TB2 environment server implementation (Spaces-compatible local mode)."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import sys
14
+ import urllib.request
15
+ import zipfile
16
+ from pathlib import Path
17
+ from typing import Any
18
+ from uuid import uuid4
19
+
20
+
21
+ if sys.version_info >= (3, 11):
22
+ import tomllib
23
+ else:
24
+ import tomli as tomllib
25
+
26
+ from openenv.core.env_server.interfaces import Environment
27
+
28
+
29
+ # Support both in-repo and standalone imports
30
+ try:
31
+ # In-repo imports (when running from OpenEnv repository)
32
+ from tbench2_env.models import Tbench2Action, Tbench2Observation, Tbench2State
33
+ except ImportError:
34
+ # Standalone imports (when environment is standalone with openenv from pip)
35
+ from models import Tbench2Action, Tbench2Observation, Tbench2State
36
+
37
# First failed import of camel-ai, cached so later calls fail fast.
_CAMEL_IMPORT_ERROR = None

# Single source of truth for the error message (was duplicated verbatim
# in both raise sites).
_CAMEL_MISSING_MSG = (
    "camel-ai (TerminalToolkit) is required for TB2. "
    "Install from PyPI or from the CAMEL repo."
)


def _require_terminal_toolkit() -> Any:
    """Import and return camel's ``TerminalToolkit`` class.

    The first failed import is cached in ``_CAMEL_IMPORT_ERROR`` so that
    subsequent calls raise immediately instead of re-attempting the import.

    Returns:
        The ``camel.toolkits.TerminalToolkit`` class.

    Raises:
        RuntimeError: If camel-ai is not installed (chained from the
            original import error).
    """
    global _CAMEL_IMPORT_ERROR
    if _CAMEL_IMPORT_ERROR is not None:
        raise RuntimeError(_CAMEL_MISSING_MSG) from _CAMEL_IMPORT_ERROR

    try:
        from camel.toolkits import TerminalToolkit
    except Exception as exc:  # pragma: no cover
        _CAMEL_IMPORT_ERROR = exc
        raise RuntimeError(_CAMEL_MISSING_MSG) from exc

    return TerminalToolkit
56
+
57
+
58
def _download_tb2_repo(cache_dir: Path) -> Path:
    """Download and extract the Terminal-Bench 2 task repository.

    The archive URL can be overridden via the ``TB2_REPO_URL`` environment
    variable. Both the downloaded zip and the extracted tree are cached in
    *cache_dir*, so repeated calls are cheap.

    Args:
        cache_dir: Directory used to cache the archive and extracted repo.

    Returns:
        Path to the extracted repository root directory.

    Raises:
        ValueError: If the downloaded archive contains no entries.
    """
    repo_url = os.getenv(
        "TB2_REPO_URL",
        "https://github.com/laude-institute/terminal-bench-2/archive/refs/heads/main.zip",
    )
    cache_dir.mkdir(parents=True, exist_ok=True)
    archive_path = cache_dir / "terminal-bench-2.zip"

    if not archive_path.exists():
        # Download to a temporary file, then atomically move into place, so
        # an interrupted transfer never leaves a truncated zip cached
        # (urlretrieve writing directly to archive_path would).
        partial_path = archive_path.with_name(archive_path.name + ".part")
        urllib.request.urlretrieve(repo_url, partial_path)
        os.replace(partial_path, archive_path)

    with zipfile.ZipFile(archive_path) as zf:
        names = zf.namelist()
        if not names:
            raise ValueError(f"Downloaded archive is empty: {archive_path}")
        # GitHub archives wrap everything in a single "<repo>-<ref>/" root.
        root = names[0].split("/")[0]
        extract_dir = cache_dir / root
        if not extract_dir.exists():
            zf.extractall(cache_dir)

    return extract_dir
76
+
77
+
78
def _read_instruction(task_dir: Path) -> str:
    """Return the contents of the task's instruction.md, or "" if absent."""
    path = task_dir / "instruction.md"
    return path.read_text(encoding="utf-8") if path.exists() else ""
83
+
84
+
85
+ def _read_timeout(task_dir: Path, fallback: float) -> float:
86
+ task_toml = task_dir / "task.toml"
87
+ if not task_toml.exists():
88
+ return fallback
89
+ try:
90
+ data = tomllib.loads(task_toml.read_text(encoding="utf-8"))
91
+ except Exception:
92
+ return fallback
93
+ verifier = data.get("verifier", {})
94
+ return float(verifier.get("timeout_sec", fallback))
95
+
96
+
97
class Tbench2Environment(Environment[Tbench2Action, Tbench2Observation, Tbench2State]):
    """OpenEnv wrapper around Terminal-Bench 2 tasks (local execution)."""

    # Each session gets its own instance, so concurrent sessions are allowed.
    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(
        self,
        tasks_dir: str | None = None,
        output_dir: str | None = None,
        command_timeout_s: float = 20.0,
        safe_mode: bool = False,
        default_task_id: str | None = None,
    ) -> None:
        """Configure the local TB2 environment.

        Args:
            tasks_dir: Directory holding TB2 task folders; falls back to the
                ``TB2_TASKS_DIR`` env var. When empty, the TB2 repository is
                downloaded on demand (see ``_resolve_task_path``).
            output_dir: Where per-trial logs are written; falls back to
                ``TB2_OUTPUT_DIR``, then ``/tmp/tbench2_env_runs``.
            command_timeout_s: Per-command timeout forwarded to the
                TerminalToolkit.
            safe_mode: Forwarded to the TerminalToolkit ``safe_mode`` flag.
            default_task_id: Task used when reset() receives none; falls back
                to ``TB2_DEFAULT_TASK_ID``, then ``"headless-terminal"``.
        """
        super().__init__()
        self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
        self.output_dir = Path(
            output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs")
        )
        self.command_timeout_s = command_timeout_s
        self.safe_mode = safe_mode
        self.default_task_id = default_task_id or os.getenv(
            "TB2_DEFAULT_TASK_ID", "headless-terminal"
        )

        # Mutable episode state; (re)initialized by reset().
        self._state = Tbench2State()
        self._task_dir: Path | None = None
        self._terminal_toolkit = None
        self._instruction = ""

    def reset(
        self,
        seed: int | None = None,
        episode_id: str | None = None,
        **kwargs: Any,
    ) -> Tbench2Observation:
        """Start a new episode for a TB2 task.

        Accepts ``task_id``/``task_name`` or ``task_path``/``path`` in
        *kwargs*; otherwise uses ``self.default_task_id``. ``seed`` is
        ignored (tasks are deterministic on disk).
        """
        del seed

        TerminalToolkit = _require_terminal_toolkit()

        task_id = (
            kwargs.get("task_id") or kwargs.get("task_name") or self.default_task_id
        )
        task_path = kwargs.get("task_path") or kwargs.get("path")

        task_dir = self._resolve_task_path(task_id, task_path)
        resolved_task_id = task_id or task_dir.name

        self._instruction = _read_instruction(task_dir)
        self._task_dir = task_dir

        # One log directory per trial so repeated/concurrent runs don't clash.
        trial_name = f"{resolved_task_id}.{episode_id or uuid4().hex}"
        session_logs_dir = (
            self.output_dir / trial_name / "terminal_toolkit_session_logs"
        )
        session_logs_dir.mkdir(parents=True, exist_ok=True)

        self._terminal_toolkit = TerminalToolkit(
            timeout=self.command_timeout_s,
            working_directory=str(task_dir),
            use_docker_backend=False,
            session_logs_dir=session_logs_dir,
            safe_mode=self.safe_mode,
        )

        self._state = Tbench2State(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
            task_id=resolved_task_id,
            task_path=str(task_dir),
            terminal_ready=True,
        )

        return Tbench2Observation(
            instruction=self._instruction,
            output="",
            success=True,
            error="",
            task_id=resolved_task_id,
            task_path=str(task_dir),
            session_id=None,
            action_type="reset",
            info={},
            reward=0.0,
            done=False,
        )

    def step(
        self,
        action: Tbench2Action,
        timeout_s: float | None = None,
        **kwargs: Any,
    ) -> Tbench2Observation:
        """Dispatch one action to the terminal toolkit.

        Supported action types: ``exec``, ``write``, ``view``, ``wait``,
        ``kill``, ``write_file``, ``evaluate``, ``close``. Toolkit errors are
        captured into the observation (``success=False``, ``error`` set)
        rather than propagated, so an agent loop keeps running.
        """
        del timeout_s, kwargs

        if not isinstance(action, Tbench2Action):
            raise TypeError(f"Expected Tbench2Action, got {type(action)}")

        if self._terminal_toolkit is None or self._task_dir is None:
            raise RuntimeError("TB2 environment not initialized. Call reset() first.")

        self._state.step_count += 1
        self._state.last_action_type = action.action_type
        self._state.last_command = action.command

        output = ""
        error = ""
        success = True
        reward = None
        done = False
        info: dict[str, Any] = {}
        # "exec" may create a session implicitly with this default id; the
        # other session-bound actions require an explicit id (checked below).
        session_id = action.session_id or "tb2-session"

        try:
            if action.action_type == "exec":
                output = self._terminal_toolkit.shell_exec(
                    command=action.command,
                    block=action.block,
                    id=session_id,
                )
            elif action.action_type == "write":
                self._ensure_session_id(action.session_id, action.action_type)
                output = self._terminal_toolkit.shell_write_to_process(
                    id=action.session_id,
                    command=action.command,
                )
            elif action.action_type == "view":
                self._ensure_session_id(action.session_id, action.action_type)
                output = self._terminal_toolkit.shell_view(id=action.session_id)
            elif action.action_type == "wait":
                self._ensure_session_id(action.session_id, action.action_type)
                wait_seconds = action.wait_seconds or 0.0
                output = self._terminal_toolkit.shell_wait(
                    id=action.session_id,
                    wait_seconds=wait_seconds,
                )
            elif action.action_type == "kill":
                self._ensure_session_id(action.session_id, action.action_type)
                self._terminal_toolkit.shell_kill_process(id=action.session_id)
                output = f"Killed session {action.session_id}"
            elif action.action_type == "write_file":
                self._terminal_toolkit.shell_write_content_to_file(
                    content=action.content,
                    file_path=action.file_path,
                )
                output = f"Wrote content to {action.file_path}"
            elif action.action_type == "evaluate":
                output, reward, info = self._evaluate_task()
                done = True
            elif action.action_type == "close":
                self.close()
                output = "Closed TB2 environment."
                done = True
            else:
                raise ValueError(f"Unsupported action_type: {action.action_type}")
        except Exception as exc:  # pragma: no cover
            success = False
            error = str(exc)

        self._state.last_output = output
        self._state.session_id = session_id or ""

        return Tbench2Observation(
            instruction=self._instruction,
            output=output,
            success=success,
            error=error,
            task_id=self._state.task_id,
            task_path=self._state.task_path,
            session_id=session_id or "",
            action_type=action.action_type,
            info=info,
            reward=reward,
            done=done,
        )

    @property
    def state(self) -> Tbench2State:
        """Current episode state (task, step count, last action/output)."""
        return self._state

    def close(self) -> None:
        """Drop toolkit and task references; reset() must be called again."""
        self._terminal_toolkit = None
        self._task_dir = None
        self._instruction = ""

    def _resolve_task_path(self, task_id: str | None, task_path: str | None) -> Path:
        """Resolve a task directory from an explicit path or a task id.

        An explicit ``task_path`` wins. With only a ``task_id``, the task is
        looked up under ``self.tasks_dir``, or — when that is unset — inside
        a copy of the TB2 repository downloaded into the cache directory.

        Raises:
            ValueError: If neither task_id nor task_path was given.
            FileNotFoundError: If the resolved directory does not exist.
        """
        if task_path:
            resolved = Path(task_path).expanduser().resolve()
            if not resolved.exists():
                raise FileNotFoundError(f"Task path not found: {resolved}")
            return resolved

        if not task_id:
            raise ValueError("Provide task_id or task_path to reset TB2 environment.")

        if not self.tasks_dir:
            cache_dir = Path(
                os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache"))
            )
            repo_dir = _download_tb2_repo(cache_dir)
            resolved = repo_dir / task_id
        else:
            resolved = Path(self.tasks_dir).expanduser().resolve() / task_id

        if not resolved.exists():
            raise FileNotFoundError(f"Task path not found: {resolved}")
        return resolved

    def _ensure_session_id(self, session_id: str | None, action_type: str) -> None:
        """Raise ValueError when a session-bound action lacks a session_id."""
        if not session_id:
            raise ValueError(f"session_id is required for action_type='{action_type}'")

    def _evaluate_task(self) -> tuple[str, float, dict[str, Any]]:
        """Run the task's pytest suite and derive a binary reward.

        Returns:
            ``(pytest_output, reward, info)`` where reward is 1.0 iff the
            suite exits with code 0, and info records the parsed exit code.
        """
        if self._task_dir is None:
            raise RuntimeError("TB2 environment not initialized. Call reset() first.")
        if self._terminal_toolkit is None:
            raise RuntimeError("Terminal toolkit not initialized.")

        # NOTE(review): the configured verifier timeout is read but not
        # applied to the pytest invocation below — confirm this is intended.
        _read_timeout(self._task_dir, fallback=900.0)  # Validate timeout config
        tests_dir = self._task_dir / "tests"
        # Echo a marker so pytest's exit status can be recovered from the
        # combined terminal output.
        cmd = f"cd {self._task_dir} && python -m pytest -q {tests_dir} -rA; echo __TB2_EXIT_CODE__:$?"
        output = self._terminal_toolkit.shell_exec(
            id="tb2-tests",
            command=cmd,
            block=True,
        )

        exit_code = 1
        marker = "__TB2_EXIT_CODE__"
        # Scan from the end: the marker line is the last thing echoed.
        for line in output.splitlines()[::-1]:
            if marker in line:
                try:
                    exit_code = int(line.split(":", 1)[1].strip())
                except Exception:
                    exit_code = 1
                break

        reward = 1.0 if exit_code == 0 else 0.0
        info = {"tests_passed": exit_code == 0, "exit_code": exit_code}
        return output, reward, info
336
+
337
+
338
class Tbench2DockerEnvironment(
    Environment[Tbench2Action, Tbench2Observation, Tbench2State]
):
    """OpenEnv wrapper around Terminal-Bench 2 tasks with Docker isolation.

    This environment runs each task in its own Docker container, reading
    the image specification from task.toml's [environment] section.

    Requires:
        - Docker socket mounted (/var/run/docker.sock)
        - Sufficient disk space for container images
    """

    # Each session gets its own instance (and its own container).
    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(
        self,
        tasks_dir: str | None = None,
        output_dir: str | None = None,
        command_timeout_s: float = 300.0,
        safe_mode: bool = True,
        default_task_id: str | None = None,
    ) -> None:
        """Configure the Docker-backed TB2 environment.

        Arguments mirror ``Tbench2Environment``; note the larger default
        command timeout (300s) and ``safe_mode`` defaulting to True.
        """
        super().__init__()
        self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
        self.output_dir = Path(
            output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs")
        )
        self.command_timeout_s = command_timeout_s
        self.safe_mode = safe_mode
        self.default_task_id = default_task_id or os.getenv(
            "TB2_DEFAULT_TASK_ID", "headless-terminal"
        )

        # Mutable episode state; populated by reset().
        self._state = Tbench2State()
        self._task_dir: Path | None = None
        self._docker_client = None
        self._container = None
        self._instruction = ""
        self._task_image = ""
        self._task_config: dict[str, Any] = {}

    def _get_docker_client(self) -> Any:
        """Lazy initialization of Docker client."""
        if self._docker_client is None:
            try:
                import docker

                self._docker_client = docker.from_env()
            except Exception as exc:
                raise RuntimeError(
                    f"Docker client not available. Ensure Docker socket is mounted. Error: {exc}"
                ) from exc
        return self._docker_client

    def reset(
        self,
        seed: int | None = None,
        episode_id: str | None = None,
        **kwargs: Any,
    ) -> Tbench2Observation:
        """Start a new episode, launching a task container when configured.

        The Docker image is read from ``[environment].docker_image`` in the
        task's task.toml; when absent, the environment falls back to local
        execution. ``seed`` is ignored.
        """
        del seed

        task_id = (
            kwargs.get("task_id") or kwargs.get("task_name") or self.default_task_id
        )
        task_path = kwargs.get("task_path") or kwargs.get("path")

        task_dir = self._resolve_task_path(task_id, task_path)
        resolved_task_id = task_id or task_dir.name

        # Read task configuration including Docker image
        task_toml_path = task_dir / "task.toml"
        if task_toml_path.exists():
            self._task_config = tomllib.loads(
                task_toml_path.read_text(encoding="utf-8")
            )
            self._task_image = self._task_config.get("environment", {}).get(
                "docker_image", ""
            )
        else:
            self._task_image = ""
            self._task_config = {}

        self._instruction = _read_instruction(task_dir)
        self._task_dir = task_dir

        # Create trial directory for logs
        trial_name = f"{resolved_task_id}.{episode_id or uuid4().hex}"
        trial_dir = self.output_dir / trial_name
        trial_dir.mkdir(parents=True, exist_ok=True)

        # Start Docker container if image is specified
        if self._task_image:
            # NOTE(review): _start_container builds its own state with a
            # fresh uuid4, so the caller-supplied episode_id is not kept in
            # Docker mode — confirm whether that is intended.
            self._start_container(task_dir, trial_dir)
        else:
            # Fallback to local mode if no image specified
            self._state = Tbench2State(
                episode_id=episode_id or str(uuid4()),
                step_count=0,
                task_id=resolved_task_id,
                task_path=str(task_dir),
                # Always True in this branch, since it only runs when no
                # container is needed.
                terminal_ready=not self._task_image,  # Ready if no container needed
            )

        return Tbench2Observation(
            instruction=self._instruction,
            output="",
            success=True,
            error="",
            task_id=resolved_task_id,
            task_path=str(task_dir),
            session_id=None,
            action_type="reset",
            info={"docker_image": self._task_image} if self._task_image else {},
            reward=0.0,
            done=False,
        )

    def _start_container(self, task_dir: Path, trial_dir: Path) -> None:
        """Start a Docker container for the task.

        Uses file copying instead of bind mounts to support Docker-in-Docker
        scenarios where the server runs inside a container. Bind mounts reference
        host paths, which don't exist when the server is containerized.
        """
        docker = self._get_docker_client()

        try:
            # Pull image if needed
            try:
                docker.images.get(self._task_image)
            except Exception:
                logging.info(f"Pulling image {self._task_image}...")
                docker.images.pull(self._task_image)

            # Start container WITHOUT bind mounts (for DinD compatibility).
            # NOTE(review): network_mode="host" gives the task container the
            # host network — confirm that is acceptable for untrusted tasks.
            self._container = docker.containers.run(
                image=self._task_image,
                command="sleep infinity",
                detach=True,
                network_mode="host",
                working_dir="/task",
                remove=False,
            )

            # Copy task files into container using tar archive
            # This works in Docker-in-Docker because we read files from our
            # filesystem and stream them to the container via the Docker API
            self._copy_dir_to_container(task_dir, "/task")

            self._state = Tbench2State(
                episode_id=str(uuid4()),
                step_count=0,
                task_id=task_dir.name,
                task_path=str(task_dir),
                terminal_ready=True,
            )

        except Exception as exc:
            raise RuntimeError(f"Failed to start container: {exc}") from exc

    def _copy_dir_to_container(self, src_dir: Path, dest_path: str) -> None:
        """Copy a directory into the container using tar archive.

        This method streams files via the Docker API, avoiding bind mount
        issues in Docker-in-Docker scenarios.
        """
        import io
        import tarfile

        if self._container is None:
            raise RuntimeError("Container not started")

        # Create tar archive in memory
        tar_stream = io.BytesIO()
        with tarfile.open(fileobj=tar_stream, mode="w") as tar:
            for item in src_dir.rglob("*"):
                arcname = str(item.relative_to(src_dir))
                tar.add(str(item), arcname=arcname)

        tar_stream.seek(0)

        # Copy to container
        self._container.put_archive(dest_path, tar_stream.getvalue())

    def _exec_in_container(
        self, command: str, workdir: str = "/task"
    ) -> tuple[int, str]:
        """Execute a command inside the container.

        Returns:
            ``(exit_code, combined stdout+stderr as text)``.
        """
        if self._container is None:
            raise RuntimeError("Container not started. Call reset() first.")

        # NOTE(review): single quotes inside `command` (or `workdir`) break
        # this `bash -c '...'` wrapping since nothing is escaped — consider
        # shlex.quote or passing an argv list. Verify against expected
        # commands.
        exit_code, output = self._container.exec_run(
            cmd=f"bash -c 'cd {workdir} && {command}'",
            workdir="/task",
            stdout=True,
            stderr=True,
        )
        return exit_code, output.decode("utf-8", errors="replace")

    def step(
        self,
        action: Tbench2Action,
        timeout_s: float | None = None,
        **kwargs: Any,
    ) -> Tbench2Observation:
        """Dispatch one action, in-container when available, else locally.

        Docker mode supports only ``exec``, ``write_file``, ``evaluate`` and
        ``close``. Errors are captured into the observation rather than
        propagated.
        """
        del timeout_s, kwargs

        if not isinstance(action, Tbench2Action):
            raise TypeError(f"Expected Tbench2Action, got {type(action)}")

        if self._task_dir is None:
            raise RuntimeError("TB2 environment not initialized. Call reset() first.")

        self._state.step_count += 1
        self._state.last_action_type = action.action_type
        self._state.last_command = action.command

        output = ""
        error = ""
        success = True
        reward = None
        done = False
        info: dict[str, Any] = {}
        session_id = action.session_id or "tb2-session"

        try:
            if action.action_type == "exec":
                if self._container:
                    exit_code, output = self._exec_in_container(action.command)
                    success = exit_code == 0
                else:
                    # Fallback to local execution
                    import subprocess

                    result = subprocess.run(
                        action.command,
                        shell=True,
                        capture_output=True,
                        text=True,
                        timeout=self.command_timeout_s,
                    )
                    output = result.stdout + result.stderr
                    success = result.returncode == 0

            elif action.action_type == "write_file":
                if self._container:
                    # Write to container.
                    # NOTE(review): content containing a line equal to "EOF"
                    # truncates this heredoc; binary/quoted content may also
                    # be mangled by the shell wrapping — confirm inputs.
                    exit_code, _ = self._exec_in_container(
                        f"cat > {action.file_path} << 'EOF'\n{action.content}\nEOF"
                    )
                    success = exit_code == 0
                    output = f"Wrote to {action.file_path}"
                else:
                    # Local write
                    Path(action.file_path).write_text(action.content)
                    output = f"Wrote to {action.file_path}"

            elif action.action_type == "evaluate":
                if self._container:
                    output, reward, info = self._evaluate_docker()
                else:
                    output, reward, info = self._evaluate_local()
                done = True

            elif action.action_type == "close":
                self.close()
                output = "Closed TB2 environment."
                done = True

            else:
                raise ValueError(
                    f"Unsupported action_type in Docker mode: {action.action_type}"
                )

        except Exception as exc:
            success = False
            error = str(exc)

        self._state.last_output = output
        self._state.session_id = session_id or ""

        return Tbench2Observation(
            instruction=self._instruction,
            output=output,
            success=success,
            error=error,
            task_id=self._state.task_id,
            task_path=self._state.task_path,
            session_id=session_id or "",
            action_type=action.action_type,
            info=info,
            reward=reward,
            done=done,
        )

    def _evaluate_docker(self) -> tuple[str, float, dict[str, Any]]:
        """Evaluate task inside Docker container."""
        if self._container is None:
            raise RuntimeError("Container not started.")
        assert self._task_dir is not None, "Task directory not set"

        # Run pytest in the container's /task directory
        # Use exit code marker for consistency with local mode
        cmd = "cd /task && python -m pytest -q tests/ -rA; echo __TB2_EXIT_CODE__:$?"

        exit_code, output = self._container.exec_run(
            cmd=f"bash -c '{cmd}'",
            workdir="/task",
            stdout=True,
            stderr=True,
        )
        output_str = output.decode("utf-8", errors="replace")

        # Parse exit code from marker (same logic as local mode)
        ec = 1
        marker = "__TB2_EXIT_CODE__"
        for line in output_str.splitlines()[::-1]:
            if marker in line:
                try:
                    ec = int(line.split(":", 1)[1].strip())
                except Exception:
                    ec = 1
                break

        reward = 1.0 if ec == 0 else 0.0
        info = {"tests_passed": ec == 0, "exit_code": ec}
        return output_str, reward, info

    def _evaluate_local(self) -> tuple[str, float, dict[str, Any]]:
        """Evaluate task locally (fallback)."""
        if self._task_dir is None:
            raise RuntimeError("Task not initialized.")

        tests_dir = self._task_dir / "tests"
        # Same exit-code marker scheme as the Docker path.
        cmd = f"cd {self._task_dir} && python -m pytest -q {tests_dir} -rA; echo __TB2_EXIT_CODE__:$?"

        import subprocess

        result = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=900.0,
        )
        output = result.stdout + result.stderr
        exit_code = result.returncode

        reward = 1.0 if exit_code == 0 else 0.0
        info = {"tests_passed": exit_code == 0, "exit_code": exit_code}
        return output, reward, info

    @property
    def state(self) -> Tbench2State:
        """Current episode state (task, step count, last action/output)."""
        return self._state

    def close(self) -> None:
        """Stop and remove the task container (best effort), drop task refs."""
        if self._container:
            try:
                self._container.stop(timeout=10)
                self._container.remove(force=True)
            except Exception:
                pass
            self._container = None
        self._task_dir = None
        self._instruction = ""

    def _resolve_task_path(self, task_id: str | None, task_path: str | None) -> Path:
        """Resolve a task directory from an explicit path or a task id.

        Same resolution rules as ``Tbench2Environment._resolve_task_path``.

        Raises:
            ValueError: If neither task_id nor task_path was given.
            FileNotFoundError: If the resolved directory does not exist.
        """
        if task_path:
            resolved = Path(task_path).expanduser().resolve()
            if not resolved.exists():
                raise FileNotFoundError(f"Task path not found: {resolved}")
            return resolved

        if not task_id:
            raise ValueError("Provide task_id or task_path to reset TB2 environment.")

        if not self.tasks_dir:
            cache_dir = Path(
                os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache"))
            )
            repo_dir = _download_tb2_repo(cache_dir)
            resolved = repo_dir / task_id
        else:
            resolved = Path(self.tasks_dir).expanduser().resolve() / task_id

        if not resolved.exists():
            raise FileNotFoundError(f"Task path not found: {resolved}")
        return resolved
openenv_tbench2_env.egg-info/PKG-INFO ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: openenv-tbench2_env
3
+ Version: 0.1.0
4
+ Summary: Tbench2 Env environment for OpenEnv
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: openenv-core[core]>=0.2.2
7
+ Requires-Dist: pytest>=8.4.0
8
+ Requires-Dist: camel-ai
9
+ Requires-Dist: docker>=7.0.0
10
+ Requires-Dist: tomli>=2.0.0; python_version < "3.11"
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
13
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
openenv_tbench2_env.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ ./__init__.py
4
+ ./client.py
5
+ ./models.py
6
+ openenv_tbench2_env.egg-info/PKG-INFO
7
+ openenv_tbench2_env.egg-info/SOURCES.txt
8
+ openenv_tbench2_env.egg-info/dependency_links.txt
9
+ openenv_tbench2_env.egg-info/entry_points.txt
10
+ openenv_tbench2_env.egg-info/requires.txt
11
+ openenv_tbench2_env.egg-info/top_level.txt
12
+ server/__init__.py
13
+ server/app.py
14
+ server/tbench2_env_environment.py
openenv_tbench2_env.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
openenv_tbench2_env.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ server = tbench2_env.server.app:main
openenv_tbench2_env.egg-info/requires.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ pytest>=8.4.0
3
+ camel-ai
4
+ docker>=7.0.0
5
+
6
+ [:python_version < "3.11"]
7
+ tomli>=2.0.0
8
+
9
+ [dev]
10
+ pytest>=8.0.0
11
+ pytest-cov>=4.0.0
openenv_tbench2_env.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tbench2_env
pyproject.toml CHANGED
@@ -16,7 +16,7 @@ requires-python = ">=3.10"
16
  dependencies = [
17
  # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
  # install from github
19
- "openenv-core @ git+https://github.com/meta-pytorch/OpenEnv.git@v0.2.1",
20
  "pytest>=8.4.0",
21
  # Environment-specific dependencies
22
  # Add all dependencies needed for your environment here
 
16
  dependencies = [
17
  # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
  # install from github
19
+ "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git@v0.2.3",
20
  "pytest>=8.4.0",
21
  # Environment-specific dependencies
22
  # Add all dependencies needed for your environment here
server/Dockerfile ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
#   - In-repo environments (with local OpenEnv sources)
#   - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git gcc python3-dev && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
# NOTE(review): BUILD_MODE and ENV_NAME are declared but not referenced
# below — presumably consumed by the openenv build tooling; confirm.
ARG BUILD_MODE=in-repo
ARG ENV_NAME=tbench2_env

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
# First pass installs only third-party dependencies (cache-friendly layer).
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Second pass installs the project itself on top of the cached deps.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"
ENV ENABLE_WEB_INTERFACE=true

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the FastAPI server
# The module path is constructed to work with the /app/env structure
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
server/app.py CHANGED
@@ -39,11 +39,14 @@ try:
39
 
40
  from .tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
41
  except Exception as e: # pragma: no cover
 
 
42
  # Standalone imports (when environment is standalone with openenv from pip)
43
  from openenv.core.env_server.http_server import create_app
44
- from server.tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
45
-
46
- from models import Tbench2Action, Tbench2Observation
 
47
 
48
  _IMPORT_ERROR = e
49
 
 
39
 
40
  from .tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
41
  except Exception as e: # pragma: no cover
42
+ from models import Tbench2Action, Tbench2Observation
43
+
44
  # Standalone imports (when environment is standalone with openenv from pip)
45
  from openenv.core.env_server.http_server import create_app
46
+ from server.tbench2_env_environment import (
47
+ Tbench2DockerEnvironment,
48
+ Tbench2Environment,
49
+ )
50
 
51
  _IMPORT_ERROR = e
52
 
server/tbench2_env_environment.py CHANGED
@@ -105,12 +105,18 @@ class Tbench2Environment(Environment[Tbench2Action, Tbench2Observation, Tbench2S
105
  output_dir: str | None = None,
106
  command_timeout_s: float = 20.0,
107
  safe_mode: bool = False,
 
108
  ) -> None:
109
  super().__init__()
110
  self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
111
- self.output_dir = Path(output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs"))
 
 
112
  self.command_timeout_s = command_timeout_s
113
  self.safe_mode = safe_mode
 
 
 
114
 
115
  self._state = Tbench2State()
116
  self._task_dir: Path | None = None
@@ -127,7 +133,9 @@ class Tbench2Environment(Environment[Tbench2Action, Tbench2Observation, Tbench2S
127
 
128
  TerminalToolkit = _require_terminal_toolkit()
129
 
130
- task_id = kwargs.get("task_id") or kwargs.get("task_name")
 
 
131
  task_path = kwargs.get("task_path") or kwargs.get("path")
132
 
133
  task_dir = self._resolve_task_path(task_id, task_path)
@@ -137,7 +145,9 @@ class Tbench2Environment(Environment[Tbench2Action, Tbench2Observation, Tbench2S
137
  self._task_dir = task_dir
138
 
139
  trial_name = f"{resolved_task_id}.{episode_id or uuid4().hex}"
140
- session_logs_dir = self.output_dir / trial_name / "terminal_toolkit_session_logs"
 
 
141
  session_logs_dir.mkdir(parents=True, exist_ok=True)
142
 
143
  self._terminal_toolkit = TerminalToolkit(
@@ -279,7 +289,9 @@ class Tbench2Environment(Environment[Tbench2Action, Tbench2Observation, Tbench2S
279
  raise ValueError("Provide task_id or task_path to reset TB2 environment.")
280
 
281
  if not self.tasks_dir:
282
- cache_dir = Path(os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache")))
 
 
283
  repo_dir = _download_tb2_repo(cache_dir)
284
  resolved = repo_dir / task_id
285
  else:
@@ -323,7 +335,9 @@ class Tbench2Environment(Environment[Tbench2Action, Tbench2Observation, Tbench2S
323
  return output, reward, info
324
 
325
 
326
- class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tbench2State]):
 
 
327
  """OpenEnv wrapper around Terminal-Bench 2 tasks with Docker isolation.
328
 
329
  This environment runs each task in its own Docker container, reading
@@ -342,12 +356,18 @@ class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tb
342
  output_dir: str | None = None,
343
  command_timeout_s: float = 300.0,
344
  safe_mode: bool = True,
 
345
  ) -> None:
346
  super().__init__()
347
  self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
348
- self.output_dir = Path(output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs"))
 
 
349
  self.command_timeout_s = command_timeout_s
350
  self.safe_mode = safe_mode
 
 
 
351
 
352
  self._state = Tbench2State()
353
  self._task_dir: Path | None = None
@@ -378,7 +398,9 @@ class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tb
378
  ) -> Tbench2Observation:
379
  del seed
380
 
381
- task_id = kwargs.get("task_id") or kwargs.get("task_name")
 
 
382
  task_path = kwargs.get("task_path") or kwargs.get("path")
383
 
384
  task_dir = self._resolve_task_path(task_id, task_path)
@@ -387,8 +409,12 @@ class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tb
387
  # Read task configuration including Docker image
388
  task_toml_path = task_dir / "task.toml"
389
  if task_toml_path.exists():
390
- self._task_config = tomllib.loads(task_toml_path.read_text(encoding="utf-8"))
391
- self._task_image = self._task_config.get("environment", {}).get("docker_image", "")
 
 
 
 
392
  else:
393
  self._task_image = ""
394
  self._task_config = {}
@@ -495,7 +521,9 @@ class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tb
495
  # Copy to container
496
  self._container.put_archive(dest_path, tar_stream.getvalue())
497
 
498
- def _exec_in_container(self, command: str, workdir: str = "/task") -> tuple[int, str]:
 
 
499
  """Execute a command inside the container."""
500
  if self._container is None:
501
  raise RuntimeError("Container not started. Call reset() first.")
@@ -556,7 +584,9 @@ class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tb
556
  elif action.action_type == "write_file":
557
  if self._container:
558
  # Write to container
559
- exit_code, _ = self._exec_in_container(f"cat > {action.file_path} << 'EOF'\n{action.content}\nEOF")
 
 
560
  success = exit_code == 0
561
  output = f"Wrote to {action.file_path}"
562
  else:
@@ -577,7 +607,9 @@ class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tb
577
  done = True
578
 
579
  else:
580
- raise ValueError(f"Unsupported action_type in Docker mode: {action.action_type}")
 
 
581
 
582
  except Exception as exc:
583
  success = False
@@ -683,7 +715,9 @@ class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tb
683
  raise ValueError("Provide task_id or task_path to reset TB2 environment.")
684
 
685
  if not self.tasks_dir:
686
- cache_dir = Path(os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache")))
 
 
687
  repo_dir = _download_tb2_repo(cache_dir)
688
  resolved = repo_dir / task_id
689
  else:
 
105
  output_dir: str | None = None,
106
  command_timeout_s: float = 20.0,
107
  safe_mode: bool = False,
108
+ default_task_id: str | None = None,
109
  ) -> None:
110
  super().__init__()
111
  self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
112
+ self.output_dir = Path(
113
+ output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs")
114
+ )
115
  self.command_timeout_s = command_timeout_s
116
  self.safe_mode = safe_mode
117
+ self.default_task_id = default_task_id or os.getenv(
118
+ "TB2_DEFAULT_TASK_ID", "headless-terminal"
119
+ )
120
 
121
  self._state = Tbench2State()
122
  self._task_dir: Path | None = None
 
133
 
134
  TerminalToolkit = _require_terminal_toolkit()
135
 
136
+ task_id = (
137
+ kwargs.get("task_id") or kwargs.get("task_name") or self.default_task_id
138
+ )
139
  task_path = kwargs.get("task_path") or kwargs.get("path")
140
 
141
  task_dir = self._resolve_task_path(task_id, task_path)
 
145
  self._task_dir = task_dir
146
 
147
  trial_name = f"{resolved_task_id}.{episode_id or uuid4().hex}"
148
+ session_logs_dir = (
149
+ self.output_dir / trial_name / "terminal_toolkit_session_logs"
150
+ )
151
  session_logs_dir.mkdir(parents=True, exist_ok=True)
152
 
153
  self._terminal_toolkit = TerminalToolkit(
 
289
  raise ValueError("Provide task_id or task_path to reset TB2 environment.")
290
 
291
  if not self.tasks_dir:
292
+ cache_dir = Path(
293
+ os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache"))
294
+ )
295
  repo_dir = _download_tb2_repo(cache_dir)
296
  resolved = repo_dir / task_id
297
  else:
 
335
  return output, reward, info
336
 
337
 
338
+ class Tbench2DockerEnvironment(
339
+ Environment[Tbench2Action, Tbench2Observation, Tbench2State]
340
+ ):
341
  """OpenEnv wrapper around Terminal-Bench 2 tasks with Docker isolation.
342
 
343
  This environment runs each task in its own Docker container, reading
 
356
  output_dir: str | None = None,
357
  command_timeout_s: float = 300.0,
358
  safe_mode: bool = True,
359
+ default_task_id: str | None = None,
360
  ) -> None:
361
  super().__init__()
362
  self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
363
+ self.output_dir = Path(
364
+ output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs")
365
+ )
366
  self.command_timeout_s = command_timeout_s
367
  self.safe_mode = safe_mode
368
+ self.default_task_id = default_task_id or os.getenv(
369
+ "TB2_DEFAULT_TASK_ID", "headless-terminal"
370
+ )
371
 
372
  self._state = Tbench2State()
373
  self._task_dir: Path | None = None
 
398
  ) -> Tbench2Observation:
399
  del seed
400
 
401
+ task_id = (
402
+ kwargs.get("task_id") or kwargs.get("task_name") or self.default_task_id
403
+ )
404
  task_path = kwargs.get("task_path") or kwargs.get("path")
405
 
406
  task_dir = self._resolve_task_path(task_id, task_path)
 
409
  # Read task configuration including Docker image
410
  task_toml_path = task_dir / "task.toml"
411
  if task_toml_path.exists():
412
+ self._task_config = tomllib.loads(
413
+ task_toml_path.read_text(encoding="utf-8")
414
+ )
415
+ self._task_image = self._task_config.get("environment", {}).get(
416
+ "docker_image", ""
417
+ )
418
  else:
419
  self._task_image = ""
420
  self._task_config = {}
 
521
  # Copy to container
522
  self._container.put_archive(dest_path, tar_stream.getvalue())
523
 
524
+ def _exec_in_container(
525
+ self, command: str, workdir: str = "/task"
526
+ ) -> tuple[int, str]:
527
  """Execute a command inside the container."""
528
  if self._container is None:
529
  raise RuntimeError("Container not started. Call reset() first.")
 
584
  elif action.action_type == "write_file":
585
  if self._container:
586
  # Write to container
587
+ exit_code, _ = self._exec_in_container(
588
+ f"cat > {action.file_path} << 'EOF'\n{action.content}\nEOF"
589
+ )
590
  success = exit_code == 0
591
  output = f"Wrote to {action.file_path}"
592
  else:
 
607
  done = True
608
 
609
  else:
610
+ raise ValueError(
611
+ f"Unsupported action_type in Docker mode: {action.action_type}"
612
+ )
613
 
614
  except Exception as exc:
615
  success = False
 
715
  raise ValueError("Provide task_id or task_path to reset TB2 environment.")
716
 
717
  if not self.tasks_dir:
718
+ cache_dir = Path(
719
+ os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache"))
720
+ )
721
  repo_dir = _download_tb2_repo(cache_dir)
722
  resolved = repo_dir / task_id
723
  else:
src/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """EnvTorch: Standardized agentic execution environments."""
src/core/README.md ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # <img width="35" height="35" alt="image" src="https://github.com/user-attachments/assets/2700a971-e5d6-4036-b03f-2f89c9791609" /> OpenEnv: Agentic Execution Environments
2
+
3
+ An e2e framework for creating, deploying and using isolated execution environments for agentic RL training, built using Gymnasium style simple APIs. OpenEnv provides a standard for interacting with agentic execution environments via simple Gymnasium style APIs - step(), reset(), state(). Users of agentic execution environments can interact with the environment during RL training loops using these simple APIs.
4
+
5
+ In addition to making it easier for researchers and RL framework writers, we also provide tools for environment creators making it easier for them to create richer environments and make them available over familiar protocols like HTTP and packaged using canonical technologies like docker. Environment creators can use the OpenEnv framework to create environments that are isolated, secure, and easy to deploy and use.
6
+
7
+
8
+ ## Overview
9
+ `openenv.core` provides the foundational building blocks for creating and interacting with containerized environments over HTTP. It enables you to build agent environments that can be deployed as Docker containers and accessed via a simple HTTP API.
10
+
11
+ > ⚠️ **Early Development Warning** OpenEnv is currently in an experimental
12
+ > stage. You should expect bugs, incomplete features, and APIs that may change
13
+ > in future versions. The project welcomes bugfixes, but to make sure things are
14
+ > well coordinated you should discuss any significant change before starting the
15
+ > work. It's recommended that you signal your intention to contribute in the
16
+ > issue tracker, either by filing a new issue or by claiming an existing one.
17
+
18
+
19
+ # OpenEnv Core
20
+
21
+ Core components for OpenEnv - a framework for building HTTP-based agentic environments.
22
+
23
+ ## Features
24
+
25
+ - **EnvClient**: Async-first client for interacting with remote environments
26
+ - **SyncEnvClient**: Synchronous wrapper via `.sync()` for sync codebases
27
+ - **HTTPEnvServer**: FastAPI-based server wrapper for exposing environments over HTTP/WebSocket
28
+ - **Container Providers**: Pluggable architecture for running containers (Docker, Kubernetes, etc.)
29
+ - **Type System**: Strongly-typed Action/Observation/State interfaces
30
+ - **Web Interface**: Optional web UI for interacting with environments
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ pip install "openenv[core]"
36
+ ```
37
+
38
+ For development:
39
+ ```bash
40
+ pip install "openenv[core]"
41
+ ```
42
+
43
+ ## Quick Start
44
+
45
+ ### Creating an Environment Client
46
+
47
+ EnvClient is **async by default**. Use `async with` and `await` for all operations:
48
+
49
+ ```python
50
+ import asyncio
51
+ from openenv.core import EnvClient, StepResult
52
+ from dataclasses import dataclass
53
+ from typing import Any
54
+
55
+ @dataclass
56
+ class MyAction:
57
+ text: str
58
+
59
+ @dataclass
60
+ class MyObservation:
61
+ response: str
62
+
63
+ class MyEnvClient(EnvClient[MyAction, MyObservation, Any]):
64
+ def _step_payload(self, action: MyAction) -> dict:
65
+ return {"text": action.text}
66
+
67
+ def _parse_result(self, payload: dict) -> StepResult[MyObservation]:
68
+ obs_data = payload["observation"]
69
+ return StepResult(
70
+ observation=MyObservation(**obs_data),
71
+ reward=payload.get("reward"),
72
+ done=payload.get("done", False)
73
+ )
74
+
75
+ def _parse_state(self, payload: dict) -> Any:
76
+ return payload
77
+
78
+ # Async usage (recommended)
79
+ async def main():
80
+ client = await MyEnvClient.from_docker_image("my-env:latest")
81
+ async with client:
82
+ result = await client.reset()
83
+ step_result = await client.step(MyAction(text="hello"))
84
+
85
+ asyncio.run(main())
86
+
87
+ # Sync usage (via .sync() wrapper)
88
+ with MyEnvClient(base_url="http://localhost:8000").sync() as client:
89
+ result = client.reset()
90
+ step_result = client.step(MyAction(text="hello"))
91
+ ```
92
+
93
+ ### Creating an Environment Server
94
+
95
+ ```python
96
+ from openenv.core.env_server import Environment, HTTPEnvServer, create_app
97
+ from dataclasses import dataclass
98
+
99
+ @dataclass
100
+ class MyAction:
101
+ text: str
102
+
103
+ @dataclass
104
+ class MyObservation:
105
+ response: str
106
+ reward: float = 0.0
107
+ done: bool = False
108
+
109
+ class MyEnvironment(Environment):
110
+ def reset(self) -> MyObservation:
111
+ return MyObservation(response="Ready")
112
+
113
+ def step(self, action: MyAction) -> MyObservation:
114
+ return MyObservation(
115
+ response=f"Echo: {action.text}",
116
+ reward=1.0,
117
+ done=False
118
+ )
119
+
120
+ # Create FastAPI app
121
+ env = MyEnvironment()
122
+ app = create_app(env, MyAction, MyObservation)
123
+
124
+ # Run with: uvicorn module:app --host 0.0.0.0 --port 8000
125
+ ```
126
+
127
+ ## Container Providers
128
+
129
+ OpenEnv Core supports multiple container providers:
130
+
131
+ ### Local Docker Provider
132
+
133
+ ```python
134
+ from openenv.core.containers.runtime import LocalDockerProvider
135
+
136
+ provider = LocalDockerProvider()
137
+ base_url = provider.start_container("my-env:latest")
138
+ provider.wait_for_ready(base_url)
139
+ # Use environment...
140
+ provider.stop_container()
141
+ ```
142
+
143
+ ### Kubernetes Provider (Coming Soon)
144
+
145
+ ```python
146
+ from openenv.core.containers.runtime import KubernetesProvider
147
+
148
+ provider = KubernetesProvider(namespace="envs")
149
+ base_url = provider.start_container("my-env:latest")
150
+ # Use environment...
151
+ provider.stop_container()
152
+ ```
153
+
154
+
155
+ ## API Reference
156
+
157
+ ### EnvClient
158
+
159
+ Async base class for environment clients. Key methods:
160
+
161
+ - `async connect()`: Establish WebSocket connection
162
+ - `async reset(**kwargs)`: Reset environment
163
+ - `async step(action)`: Execute action
164
+ - `async state()`: Get current state
165
+ - `async close()`: Close connection and cleanup
166
+ - `sync()`: Return a SyncEnvClient wrapper for synchronous usage
167
+
168
+ Abstract methods to implement:
169
+ - `_step_payload(action)`: Convert action to JSON
170
+ - `_parse_result(payload)`: Parse response to StepResult
171
+ - `_parse_state(payload)`: Parse state response
172
+
173
+ ### SyncEnvClient
174
+
175
+ Synchronous wrapper around EnvClient. Use `client.sync()` to get one:
176
+
177
+ ```python
178
+ sync_client = async_client.sync()
179
+ with sync_client:
180
+ result = sync_client.reset()
181
+ result = sync_client.step(action)
182
+ ```
183
+
184
+ ### HTTPEnvServer
185
+
186
+ Server wrapper with these methods:
187
+
188
+ - `register_routes(app)`: Register endpoints on FastAPI app
189
+ - `_deserialize_action(data)`: Convert JSON to Action
190
+ - `_serialize_observation(obs)`: Convert Observation to JSON
191
+
192
+ ### Environment Interface
193
+
194
+ Base interface for environment implementations:
195
+
196
+ - `reset()`: Reset environment and return initial observation
197
+ - `step(action)`: Execute action and return observation
198
+ - `state`: Property returning current environment state
199
+
200
+ ## License
201
+
202
+ This project is licensed under the BSD-3-Clause License - see the LICENSE file for details.
203
+
204
+ ## Contributing
205
+
206
+ Contributions are welcome! Please see the main OpenEnv repository for contribution guidelines.
207
+
208
+ ## Links
209
+
210
+ - **Homepage**: https://github.com/meta-pytorch/OpenEnv
211
+ - **Documentation**: https://github.com/meta-pytorch/OpenEnv/blob/main/README.md
212
+ - **Bug Tracker**: https://github.com/meta-pytorch/OpenEnv/issues
src/core/__init__.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Core components for agentic environments."""
8
+
9
+ from __future__ import annotations
10
+
11
+ from importlib import import_module
12
+ from typing import TYPE_CHECKING
13
+
14
+ from . import env_server
15
+ from .env_server import * # noqa: F403
16
+
17
+ if TYPE_CHECKING:
18
+ from .env_client import EnvClient
19
+ from .generic_client import GenericAction, GenericEnvClient
20
+ from .llm_client import (
21
+ AnthropicClient,
22
+ create_llm_client,
23
+ LLMClient,
24
+ LLMResponse,
25
+ OpenAIClient,
26
+ ToolCall,
27
+ )
28
+ from .mcp_client import MCPClientBase, MCPToolClient
29
+ from .sync_client import SyncEnvClient
30
+
31
+ __all__ = [
32
+ "EnvClient",
33
+ "SyncEnvClient",
34
+ "GenericEnvClient",
35
+ "GenericAction",
36
+ "MCPClientBase",
37
+ "MCPToolClient",
38
+ "AnthropicClient",
39
+ "LLMClient",
40
+ "LLMResponse",
41
+ "OpenAIClient",
42
+ "ToolCall",
43
+ "create_llm_client",
44
+ ] + env_server.__all__ # type: ignore
45
+
46
+
47
+ _LAZY_ATTRS = {
48
+ "EnvClient": (".env_client", "EnvClient"),
49
+ "SyncEnvClient": (".sync_client", "SyncEnvClient"),
50
+ "GenericEnvClient": (".generic_client", "GenericEnvClient"),
51
+ "GenericAction": (".generic_client", "GenericAction"),
52
+ "MCPClientBase": (".mcp_client", "MCPClientBase"),
53
+ "MCPToolClient": (".mcp_client", "MCPToolClient"),
54
+ "AnthropicClient": (".llm_client", "AnthropicClient"),
55
+ "LLMClient": (".llm_client", "LLMClient"),
56
+ "LLMResponse": (".llm_client", "LLMResponse"),
57
+ "OpenAIClient": (".llm_client", "OpenAIClient"),
58
+ "ToolCall": (".llm_client", "ToolCall"),
59
+ "create_llm_client": (".llm_client", "create_llm_client"),
60
+ }
61
+
62
+
63
+ def __getattr__(name: str):
64
+ if name in _LAZY_ATTRS:
65
+ module_path, attr_name = _LAZY_ATTRS[name]
66
+ module = import_module(module_path, __name__)
67
+ value = getattr(module, attr_name)
68
+ globals()[name] = value
69
+ return value
70
+
71
+ try:
72
+ value = getattr(env_server, name)
73
+ except AttributeError as exc:
74
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from exc
75
+
76
+ globals()[name] = value
77
+ return value
78
+
79
+
80
+ def __dir__() -> list[str]:
81
+ return sorted(set(globals().keys()) | set(__all__))
src/core/client_types.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Type definitions for EnvTorch
2
+ from dataclasses import dataclass
3
+ from typing import Generic, Optional, TypeVar
4
+
5
+ # Generic type for observations
6
+ ObsT = TypeVar("ObsT")
7
+ StateT = TypeVar("StateT")
8
+
9
+
10
+ @dataclass
11
+ class StepResult(Generic[ObsT]):
12
+ """
13
+ Represents the result of one environment step.
14
+
15
+ Attributes:
16
+ observation: The environment's observation after the action.
17
+ reward: Scalar reward for this step (optional).
18
+ done: Whether the episode is finished.
19
+ """
20
+
21
+ observation: ObsT
22
+ reward: Optional[float] = None
23
+ done: bool = False
src/core/containers/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Container management for environment servers."""
src/core/containers/images/Dockerfile ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ #
8
+ # OpenEnv Base Image
9
+ #
10
+ # This is the standard base image for all OpenEnv environment servers.
11
+ # It includes the minimal dependencies needed to run HTTP environment servers
12
+ # and uv for fast dependency management.
13
+ #
14
+ # Build from repo root: docker build -t openenv-base:latest -f src/openenv/core/containers/images/Dockerfile .
15
+ # Tag: docker tag openenv-base:latest openenv-base:0.2.0
16
+ #
17
+
18
+ FROM ghcr.io/astral-sh/uv:0.5.27-python3.11-bookworm-slim AS builder
19
+
20
+ # Set working directory
21
+ WORKDIR /app
22
+
23
+ # Copy core pyproject.toml and lockfile for dependency installation
24
+ COPY pyproject.toml uv.lock* ./
25
+
26
+ # Install core dependencies using uv with cache mount
27
+ RUN --mount=type=cache,target=/root/.cache/uv \
28
+ uv pip install --system -r pyproject.toml
29
+
30
+ # Final runtime stage
31
+ FROM python:3.11-slim
32
+
33
+ # Set metadata
34
+ LABEL maintainer="OpenEnv Team"
35
+ LABEL description="Base image for OpenEnv based environment servers with uv"
36
+ LABEL version="0.2.0"
37
+
38
+ # Install system dependencies
39
+ RUN apt-get update && apt-get install -y --no-install-recommends \
40
+ curl \
41
+ ca-certificates \
42
+ && rm -rf /var/lib/apt/lists/*
43
+
44
+ # Copy uv from builder
45
+ COPY --from=builder /usr/local/bin/uv /usr/local/bin/uvx /usr/local/bin/
46
+
47
+ # Copy installed Python packages from builder
48
+ COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
49
+
50
+ # Copy console scripts installed by pip (uvicorn, fastapi, etc.)
51
+ COPY --from=builder /usr/local/bin/uvicorn /usr/local/bin/fastapi /usr/local/bin/
52
+
53
+ # Set working directory
54
+ WORKDIR /app
55
+
56
+ # Default environment variables
57
+ ENV PYTHONPATH=/app/src
58
+ ENV PYTHONUNBUFFERED=1
59
+ ENV UV_SYSTEM_PYTHON=1
60
+
61
+ # Default expose port (can be overridden)
62
+ EXPOSE 8000
63
+
64
+ # Note: CMD should be specified in child Dockerfiles
src/core/containers/images/README.md ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenEnv Base Image
2
+
3
+ Standard base image for all OpenEnv environment servers.
4
+
5
+ ## What's Included
6
+
7
+ | Layer | Size | Contents |
8
+ |-------|------|----------|
9
+ | python:3.11-slim | 200 MB | Base Python runtime |
10
+ | + Dependencies | 100 MB | FastAPI, uvicorn, requests |
11
+ | **Total** | **~300 MB** | Ready for environment servers |
12
+
13
+ ## Image Sizes
14
+
15
+ ```
16
+ openenv-base:latest 300 MB (python + fastapi + uvicorn)
17
+ ```
18
+ echo-env:latest 500 MB (python + fastapi + uvicorn + app)
19
+ coding-env:latest 520 MB (python + fastapi + uvicorn + app + tools)
20
+ another-env:latest 510 MB (python + fastapi + uvicorn + app)
21
+ ---
22
+ Total: 1.5 GB (with lots of duplication)
23
+ ```
24
+
25
+ ### With Base Images (✅ Solution)
26
+ ```
27
+ openenv-base:latest 300 MB (python + fastapi + uvicorn)
28
+ echo-env:latest 50 MB (app only, uses base)
29
+ coding-env:latest 70 MB (app + tools, uses base)
30
+ another-env:latest 45 MB (app only, uses base)
31
+ ---
32
+ Total: 465 MB (base shared, minimal duplication)
33
+ ```
34
+
35
+ ## Building the Base Image
36
+
37
+ ```bash
38
+ # From project root
39
+ docker build -t openenv-base:latest -f src/openenv/core/containers/images/Dockerfile .
40
+ ```
41
+
42
+ ## Usage in Environment Dockerfiles
43
+
44
+ Each environment Dockerfile should start with:
45
+
46
+ ```dockerfile
47
+ FROM openenv-base:latest
48
+
49
+ # Copy only environment-specific files
50
+ COPY src/openenv/core/ /app/src/openenv/core/
51
+ COPY envs/my_env/ /app/envs/my_env/
52
+
53
+ # Run the server
54
+ CMD ["uvicorn", "envs.my_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
55
+ ```
56
+
57
+ ## Base Image Contents
58
+
59
+ - Python 3.11-slim
60
+ - FastAPI >= 0.104.0
61
+ - Uvicorn >= 0.24.0
62
+ - Requests >= 2.25.0
63
+ - curl (for health checks)
64
+
65
+ ## Example: Building Echo Environment
66
+
67
+ ```bash
68
+ # Step 1: Build base image (do this once)
69
+ docker build -t openenv-base:latest -f src/openenv/core/containers/images/Dockerfile .
70
+
71
+ # Step 2: Build echo environment (uses base)
72
+ docker build -t echo-env:latest -f envs/echo_env/server/Dockerfile .
73
+
74
+ # Step 3: Run echo environment
75
+ docker run -p 8000:8000 echo-env:latest
76
+ ```
77
+
78
+ ## Updating the Base
79
+
80
+ When dependencies need updating:
81
+
82
+ 1. Update `src/openenv/core/containers/images/Dockerfile`
83
+ 2. Rebuild base image
84
+ 3. Rebuild all environment images (they'll use new base)
85
+
86
+ ```bash
87
+ # Update base
88
+ docker build -t openenv-base:latest -f src/openenv/core/containers/images/Dockerfile .
89
+
90
+ # Rebuild environments (they automatically use new base)
91
+ docker build -t echo-env:latest -f envs/echo_env/server/Dockerfile .
92
+ ```
src/core/containers/runtime/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Container runtime providers."""
8
+
9
+ from .providers import (
10
+ ContainerProvider,
11
+ DockerSwarmProvider,
12
+ KubernetesProvider,
13
+ LocalDockerProvider,
14
+ RuntimeProvider,
15
+ )
16
+ from .uv_provider import UVProvider
17
+
18
+ __all__ = [
19
+ "ContainerProvider",
20
+ "DockerSwarmProvider",
21
+ "LocalDockerProvider",
22
+ "KubernetesProvider",
23
+ "RuntimeProvider",
24
+ "UVProvider",
25
+ ]
src/core/containers/runtime/daytona_provider.py ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Daytona container provider for running OpenEnv environments in Daytona cloud sandboxes.
9
+
10
+ Requires the ``daytona`` SDK: ``pip install daytona>=0.10``
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import os
17
+ import shlex
18
+ import time
19
+ from typing import Any, Callable, Dict, Optional
20
+
21
+ import yaml
22
+
23
+ from .providers import ContainerProvider
24
+
25
+
26
+ class DaytonaProvider(ContainerProvider):
27
+ """
28
+ Container provider that runs environments in Daytona cloud sandboxes.
29
+
30
+ Example:
31
+ >>> provider = DaytonaProvider(api_key="your-key")
32
+ >>> image = DaytonaProvider.image_from_dockerfile("envs/echo_env/server/Dockerfile")
33
+ >>> base_url = provider.start_container(image)
34
+ >>> provider.wait_for_ready(base_url)
35
+ >>> provider.stop_container()
36
+ """
37
+
38
+ _dockerfile_registry: Dict[str, Dict[str, Any]] = {}
39
+
40
+ def __init__(
41
+ self,
42
+ *,
43
+ api_key: Optional[str] = None,
44
+ public: bool = False,
45
+ resources: Optional[Any] = None,
46
+ auto_stop_interval: int = 15,
47
+ target: Optional[str] = None,
48
+ on_snapshot_create_logs: Optional[Callable[[str], None]] = None,
49
+ cmd: Optional[str] = None,
50
+ create_timeout: float = 300,
51
+ ):
52
+ """
53
+ Args:
54
+ api_key: Daytona API key. Falls back to ``DAYTONA_API_KEY`` env var.
55
+ public: If True, the sandbox preview is publicly accessible.
56
+ resources: Optional ``daytona.Resources`` instance for CPU/memory.
57
+ auto_stop_interval: Minutes of inactivity before auto-stop (0 disables).
58
+ target: Daytona target region (e.g. "us").
59
+ on_snapshot_create_logs: Callback for snapshot build log lines.
60
+ cmd: Shell command to start the server inside the sandbox.
61
+ create_timeout: Seconds to wait for sandbox creation (default 300).
62
+ Heavy images (e.g. with Playwright/Chromium) may need more.
63
+ """
64
+ from daytona import Daytona, DaytonaConfig
65
+
66
+ config_kwargs: Dict[str, Any] = {}
67
+ resolved_key = api_key or os.environ.get("DAYTONA_API_KEY")
68
+ if resolved_key:
69
+ config_kwargs["api_key"] = resolved_key
70
+ if target:
71
+ config_kwargs["target"] = target
72
+
73
+ self._daytona = Daytona(DaytonaConfig(**config_kwargs))
74
+ self._public = public
75
+ self._resources = resources
76
+ self._auto_stop_interval = auto_stop_interval
77
+ self._on_snapshot_create_logs = on_snapshot_create_logs
78
+ self._cmd = cmd
79
+ self._create_timeout = create_timeout
80
+ self._sandbox: Any = None
81
+ self._preview_url: Optional[str] = None
82
+
83
def _discover_server_cmd(self, sandbox: Any, port: int = 8000) -> str:
    """Derive the server start command from ``openenv.yaml`` in *sandbox*.

    Locates the manifest, extracts its ``app`` entry, and builds a
    ``cd <env_root> && python -m uvicorn <app> --host 0.0.0.0 --port <port>``
    command string.

    Raises:
        ValueError: When ``openenv.yaml`` is missing or has no ``app`` key.
    """
    manifest = self._find_openenv_yaml(sandbox)
    if manifest is None:
        raise ValueError(
            "Could not find openenv.yaml inside the sandbox. "
            "Pass an explicit cmd= to DaytonaProvider or start_container()."
        )

    response = sandbox.process.exec(f"cat {shlex.quote(manifest)}", timeout=10)
    raw = response.result if hasattr(response, "result") else str(response)
    app_target = self._parse_app_field(raw)
    if app_target is None:
        raise ValueError(
            f"openenv.yaml at {manifest} does not contain an 'app' field. "
            "Pass an explicit cmd= to DaytonaProvider or start_container()."
        )

    # The manifest's parent directory is the environment root.
    root = manifest.rsplit("/", 1)[0]
    uvicorn_part = (
        f"python -m uvicorn {shlex.quote(app_target)} --host 0.0.0.0 --port {port}"
    )
    return f"cd {shlex.quote(root)} && {uvicorn_part}"
114
+
115
def _find_openenv_yaml(self, sandbox: Any) -> Optional[str]:
    """Locate ``openenv.yaml`` inside the sandbox.

    Tries the modern layout path ``/app/env/openenv.yaml`` first, then
    falls back to a ``find`` over ``/app`` for the old layout.

    Returns:
        Absolute path to the manifest, or ``None`` when not found.
    """
    # Fast path: modern Dockerfile layout.
    resp = sandbox.process.exec(
        "test -f /app/env/openenv.yaml && echo found", timeout=10
    )
    out = resp.result if hasattr(resp, "result") else str(resp)
    if "found" in (out or ""):
        return "/app/env/openenv.yaml"

    # Fallback: search for it (redirect stderr so error messages
    # like "No such file or directory" don't get mistaken for paths).
    resp = sandbox.process.exec(
        "find /app -maxdepth 4 -name openenv.yaml -print -quit 2>/dev/null",
        timeout=10,
    )
    # BUG FIX: the old one-liner was
    #   (resp.result if hasattr(resp, "result") else str(resp) or "").strip()
    # where `or ""` bound only to the else branch, so a `result`
    # attribute of None crashed with AttributeError on `.strip()`.
    # Normalize to "" before stripping.
    raw = resp.result if hasattr(resp, "result") else str(resp)
    path = (raw or "").strip()
    if path and path.startswith("/"):
        return path

    return None
140
+
141
@staticmethod
def _parse_app_field(yaml_content: str) -> Optional[str]:
    """Pull the ``app`` value out of raw openenv.yaml text.

    Delegates to PyYAML so quoting, comments, and nesting are handled
    correctly. Returns ``None`` for unparseable content, non-mapping
    documents, or a missing/blank/non-string ``app`` key.
    """
    try:
        parsed = yaml.safe_load(yaml_content) or {}
    except Exception:
        return None

    if not isinstance(parsed, dict):
        return None

    app = parsed.get("app")
    if not isinstance(app, str):
        return None
    app = app.strip()
    return app or None
160
+
161
@staticmethod
def _parse_dockerfile_cmd(dockerfile_content: str) -> Optional[str]:
    """Extract the server command from the last ``CMD`` in a Dockerfile.

    Understands both exec form (``CMD ["prog", "arg"]``) and shell form
    (``CMD prog arg``). As in Docker itself, when several ``CMD``
    instructions exist (e.g. multi-stage builds) the final one takes
    effect. Lines where ``CMD`` sits inside a comment are skipped.

    Returns:
        The command as one string, or ``None`` when no ``CMD`` exists.
    """
    import re

    winner: Optional[str] = None
    for raw_line in dockerfile_content.splitlines():
        text = raw_line.strip()
        if text.startswith("#"):
            continue  # commented-out CMDs don't count
        hit = re.match(r"CMD\s+(.+)", text, flags=re.IGNORECASE)
        if hit is not None:
            winner = hit.group(1).strip()

    if winner is None:
        return None

    # Exec form: a JSON array of strings.
    if winner.startswith("["):
        try:
            pieces = json.loads(winner)
        except (json.JSONDecodeError, TypeError):
            pieces = None
        if isinstance(pieces, list) and all(isinstance(p, str) for p in pieces):
            return " ".join(pieces)

    # Shell form (or exec form that failed to parse): return verbatim.
    return winner or None
200
+
201
@staticmethod
def strip_buildkit_syntax(dockerfile_content: str) -> str:
    """Remove BuildKit ``--mount=...`` flags from ``RUN`` instructions.

    Handles single-line flags, multi-line continuations, and multiple
    ``--mount`` flags spread across continuation lines. Only leading
    ``--mount`` flags are removed (before the actual command starts).

    Daytona's ``Image.from_dockerfile`` does not support BuildKit
    ``--mount`` syntax. This helper strips the flags so that standard
    Dockerfiles (like the ones generated by ``openenv build``) can
    be used directly.
    """
    import re

    def strip_leading_mounts(text: str) -> str:
        # Repeatedly peel `--mount=<value>` tokens off the front of the
        # text. NOTE(review): `\S+` assumes mount values contain no
        # whitespace (true for the usual key=value,key=value form).
        remaining = text
        while True:
            match = re.match(r"\s*--mount=\S+\s*", remaining)
            if not match:
                return remaining
            remaining = remaining[match.end() :]

    lines = dockerfile_content.split("\n")
    result: list[str] = []
    # Line-by-line state machine:
    #   in_run          - currently inside a (possibly continued) RUN
    #   in_mount_prefix - still in the flag region before the command body
    in_run = False
    in_mount_prefix = False

    for line in lines:
        line_out = line
        run_start = False
        if re.match(r"\s*RUN(\s+|$)", line, flags=re.IGNORECASE):
            in_run = True
            in_mount_prefix = True
            run_start = True

        if in_run and in_mount_prefix:
            # Remember whether this physical line continued onto the next
            # so we can re-append the backslash after stripping.
            original_ends_with_slash = line_out.rstrip().endswith("\\")
            if run_start:
                # Split "RUN " from the rest so only the flag region is
                # rewritten and the keyword/indent are preserved.
                match = re.match(r"(\s*RUN\s+)(.*)$", line_out, flags=re.IGNORECASE)
                if match:
                    run_prefix, remainder = match.group(1), match.group(2)
                else:
                    run_prefix, remainder = line_out, ""
                new_remainder = strip_leading_mounts(remainder)
                line_out = run_prefix + new_remainder
                content_for_check = new_remainder
            else:
                # Continuation line while still inside the flag prefix:
                # the whole line may be flags.
                new_remainder = strip_leading_mounts(line_out)
                line_out = new_remainder
                content_for_check = new_remainder

            # Stripping may have eaten the trailing continuation marker;
            # restore it so the multi-line RUN stays intact.
            if original_ends_with_slash and not line_out.rstrip().endswith("\\"):
                line_out = line_out.rstrip() + " \\"

            # Once real command text appears, stop treating subsequent
            # lines as part of the flag prefix.
            if content_for_check.strip() not in ("", "\\"):
                in_mount_prefix = False

        # A line without a trailing backslash ends the RUN instruction.
        if in_run and not line_out.rstrip().endswith("\\"):
            in_run = False
            in_mount_prefix = False

        result.append(line_out)

    return "\n".join(result)
266
+
267
@classmethod
def image_from_dockerfile(
    cls,
    dockerfile_path: str,
    context_dir: str | None = None,
) -> str:
    """Validate a Dockerfile and return a ``dockerfile:`` URI for
    :meth:`start_container`.

    Eagerly validates the Dockerfile (existence, COPY sources,
    BuildKit stripping) and stores the processed content in an
    internal registry. The actual ``daytona.Image`` is created
    later inside ``start_container``.

    Args:
        dockerfile_path: Path to the Dockerfile on disk.
        context_dir: Build context directory. Defaults to the
            Dockerfile's grandparent directory, matching the
            ``openenv init`` convention where Dockerfiles live in
            ``<env>/server/Dockerfile`` and the build context is
            ``<env>/``. Pass explicitly for non-standard layouts
            (e.g. ``context_dir="."`` for repo-root contexts).

    Returns:
        A ``"dockerfile:<abs_path>"`` string to pass to
        ``start_container``.

    Raises:
        FileNotFoundError: If *dockerfile_path* does not exist.
        ValueError: If *context_dir* is given but does not exist,
            or if COPY sources in the Dockerfile cannot be found
            under the resolved context directory.
    """
    import pathlib
    import re

    src = pathlib.Path(dockerfile_path).resolve()
    if not src.is_file():
        raise FileNotFoundError(f"Dockerfile not found: {dockerfile_path}")

    if context_dir is not None:
        ctx = pathlib.Path(context_dir)
        if not ctx.is_dir():
            raise ValueError(f"context_dir does not exist: {context_dir}")
    else:
        # Default: grandparent of the Dockerfile, matching the
        # openenv init layout (<env>/server/Dockerfile -> <env>/).
        ctx = src.parent.parent

    content = src.read_text()
    stripped = cls.strip_buildkit_syntax(content)

    # Validate that COPY sources exist under the context directory.
    # This catches mismatches early (e.g. a Dockerfile expecting repo
    # root as context when we defaulted to the env directory).
    for line in stripped.splitlines():
        m = re.match(r"^\s*COPY\s+(.*)", line, re.IGNORECASE)
        if not m:
            continue
        tokens = m.group(1).split()
        # COPY --from=<stage> pulls from another build stage, not the
        # local context, so there is nothing to validate on disk.
        if any(t.startswith("--from=") for t in tokens):
            continue
        # BUG FIX: skip all COPY flags (--chown=, --chmod=, --link, ...).
        # The old regex took the first token after COPY as the source,
        # so flagged COPY lines were wrongly rejected as missing paths.
        operands = [t for t in tokens if not t.startswith("--")]
        if len(operands) < 2:
            # Malformed COPY (no src + dest); let Docker report it.
            continue
        # Every operand except the last (the destination) is a source.
        for copy_src in operands[:-1]:
            if copy_src.startswith("/"):
                continue  # absolute paths are not context-relative
            resolved = ctx / copy_src
            if not resolved.exists() and not any(ctx.glob(copy_src)):
                raise ValueError(
                    f"Dockerfile COPY source '{copy_src}' not found "
                    f"under context_dir '{ctx}'. This Dockerfile may "
                    f"expect a different build context (e.g. the repo "
                    f"root). Pass context_dir explicitly."
                )

    # Parse CMD from the original Dockerfile so start_container can
    # use it as a fallback when openenv.yaml is unavailable.
    parsed_cmd = cls._parse_dockerfile_cmd(content)

    cls._dockerfile_registry[str(src)] = {
        "stripped_content": stripped,
        "context_dir": str(ctx),
        "server_cmd": parsed_cmd,
    }

    return f"dockerfile:{src}"
349
+
350
def start_container(
    self,
    image: str,
    port: Optional[int] = None,
    env_vars: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> str:
    """
    Create a Daytona sandbox from a Docker image or snapshot.

    Daytona does not execute the image's CMD (known bug — ENTRYPOINT
    runs, CMD does not). The server command is resolved in order:

    1. Explicit ``cmd`` passed to the constructor.
    2. ``cmd`` key in ``**kwargs`` (popped before forwarding).
    3. Auto-discovered from ``openenv.yaml`` inside the sandbox.
    4. ``CMD`` parsed from the Dockerfile (when *image* came from
       ``image_from_dockerfile``).

    Args:
        image: Docker image name (e.g. ``"echo-env:latest"``),
            ``"snapshot:<name>"`` to create from a pre-built snapshot,
            or ``"dockerfile:<path>"`` returned by
            :meth:`image_from_dockerfile`.
        port: Must be ``None`` or ``8000``. Daytona exposes port 8000
            via its preview proxy; other ports raise ``ValueError``.
        env_vars: Environment variables forwarded to the sandbox.
        **kwargs: ``cmd`` (str) to override the server command;
            remaining kwargs passed through to ``Daytona.create()``.

    Returns:
        HTTPS preview URL for the sandbox (base_url).

    Raises:
        ValueError: On an unsupported port, unregistered dockerfile
            metadata, or when no server command can be resolved.
    """
    if port is not None and port != 8000:
        raise ValueError(
            f"DaytonaProvider only supports port 8000 (got {port}). "
            "The Daytona preview proxy routes to port 8000 inside the sandbox."
        )

    # Resolve the server command (may be None; discovery happens after
    # sandbox creation when we can inspect the filesystem).
    cmd = kwargs.pop("cmd", None) or self._cmd

    # CMD parsed from Dockerfile (populated for "dockerfile:" images).
    parsed_cmd: Optional[str] = None

    # Build creation params shared by all three image forms.
    create_kwargs: Dict[str, Any] = {}
    if env_vars:
        create_kwargs["env_vars"] = env_vars
    if self._public:
        create_kwargs["public"] = True
    if self._auto_stop_interval != 15:
        # Only forward when it differs from Daytona's default.
        create_kwargs["auto_stop_interval"] = self._auto_stop_interval

    if image.startswith("snapshot:"):
        from daytona import CreateSandboxFromSnapshotParams

        snapshot_name = image[len("snapshot:") :]
        params = CreateSandboxFromSnapshotParams(
            snapshot=snapshot_name, **create_kwargs
        )
    elif image.startswith("dockerfile:"):
        from daytona import CreateSandboxFromImageParams, Image

        dockerfile_path = image[len("dockerfile:") :]
        meta = self._dockerfile_registry.get(dockerfile_path)
        if meta is None:
            raise ValueError(
                f"No registered Dockerfile metadata for {dockerfile_path}. "
                "Call DaytonaProvider.image_from_dockerfile() first."
            )

        parsed_cmd = meta.get("server_cmd")

        # Build the daytona Image from the pre-stripped content.
        import pathlib
        import uuid

        ctx = pathlib.Path(meta["context_dir"])
        # Image.from_dockerfile reads from disk, so materialize the
        # stripped content as a uniquely named temp file inside the
        # build context, and remove it once the Image is constructed.
        tmp_name = f".daytona-{uuid.uuid4().hex[:8]}.dockerfile"
        tmp_path = ctx / tmp_name
        try:
            tmp_path.write_text(meta["stripped_content"])
            daytona_image = Image.from_dockerfile(str(tmp_path))
        finally:
            tmp_path.unlink(missing_ok=True)

        img_kwargs: Dict[str, Any] = {
            "image": daytona_image,
            **create_kwargs,
        }
        if self._resources is not None:
            img_kwargs["resources"] = self._resources
        params = CreateSandboxFromImageParams(**img_kwargs)
    else:
        from daytona import CreateSandboxFromImageParams

        # Plain image-name form (e.g. "echo-env:latest").
        img_kwargs = {"image": image, **create_kwargs}
        if self._resources is not None:
            img_kwargs["resources"] = self._resources
        params = CreateSandboxFromImageParams(**img_kwargs)

    # Create sandbox; remaining kwargs pass straight to Daytona.create().
    extra: Dict[str, Any] = dict(kwargs)
    if self._on_snapshot_create_logs is not None:
        extra["on_snapshot_create_logs"] = self._on_snapshot_create_logs

    self._sandbox = self._daytona.create(
        params, timeout=self._create_timeout, **extra
    )

    try:
        # Discover server command from openenv.yaml if not explicitly set.
        if cmd is None:
            try:
                cmd = self._discover_server_cmd(self._sandbox)
            except ValueError:
                # Fall back to CMD parsed from Dockerfile (if available).
                if parsed_cmd:
                    cmd = parsed_cmd
                else:
                    raise

        # Wrap in bash -c so compound commands (cd ... && uvicorn ...)
        # are handled correctly by nohup. Write PID so we can check
        # if the process crashed later in wait_for_ready().
        escaped_cmd = shlex.quote(cmd)
        self._sandbox.process.exec(
            f"nohup bash -c {escaped_cmd} > /tmp/openenv-server.log 2>&1 &"
            " echo $! > /tmp/openenv-server.pid",
            timeout=10,
        )

        # Get a signed preview URL for port 8000. The token is
        # embedded in the URL itself so no extra headers are needed.
        signed = self._sandbox.create_signed_preview_url(
            8000, expires_in_seconds=86400
        )
        self._preview_url = signed.url
    except Exception:
        # Any failure after creation tears the sandbox down so we
        # never leak a running (billable) sandbox.
        self.stop_container()
        raise

    return self._preview_url
495
+
496
def refresh_preview_url(self) -> str:
    """Mint a fresh signed preview URL (valid for 24h).

    Daytona signed URLs expire after at most 24 hours, so long-running
    sessions should call this to obtain a new one. The URL targets the
    same sandbox — clients must reconnect using the fresh value.

    Raises:
        RuntimeError: When no sandbox is currently active.
    """
    if self._sandbox is None:
        raise RuntimeError("No active sandbox to refresh URL for.")
    fresh = self._sandbox.create_signed_preview_url(8000, expires_in_seconds=86400)
    self._preview_url = fresh.url
    return self._preview_url
508
+
509
def stop_container(self) -> None:
    """Tear down the Daytona sandbox, if one is active."""
    sandbox = self._sandbox
    if sandbox is None:
        return

    try:
        self._daytona.delete(sandbox)
    finally:
        # Always forget the handle, even when deletion fails, so a
        # retry never operates on a stale sandbox reference.
        self._sandbox = None
        self._preview_url = None
519
+
520
def wait_for_ready(self, base_url: str, timeout_s: float = 120.0) -> None:
    """
    Poll the /health endpoint until the sandbox is ready.

    Uses a longer default timeout (120s) than Docker providers because
    Daytona sandboxes may have cold-start latency.

    Args:
        base_url: Preview URL returned by ``start_container()``.
        timeout_s: Maximum seconds to wait.

    Raises:
        TimeoutError: If the sandbox doesn't become ready in time.
        RuntimeError: If the server process died (detected via PID check).
    """
    import requests

    health_url = f"{base_url}/health"

    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            response = requests.get(health_url, timeout=5.0)
            if response.status_code == 200:
                return
        except requests.RequestException:
            # Proxy may return transient errors while the sandbox boots.
            pass

        # Early exit: if the server process died, raise immediately
        # instead of waiting for the full health-check timeout.
        # The PID file was written by start_container()'s nohup line.
        if self._sandbox is not None:
            resp = self._sandbox.process.exec(
                "kill -0 $(cat /tmp/openenv-server.pid) 2>/dev/null"
                " && echo RUNNING || echo DEAD",
                timeout=10,
            )
            out = resp.result if hasattr(resp, "result") else str(resp)
            if "DEAD" in (out or ""):
                # Surface the server log so the failure is debuggable
                # without manually attaching to the sandbox.
                log_resp = self._sandbox.process.exec(
                    "cat /tmp/openenv-server.log 2>/dev/null", timeout=10
                )
                log = (
                    log_resp.result
                    if hasattr(log_resp, "result")
                    else str(log_resp)
                )
                raise RuntimeError(f"Server process died.\nLog:\n{log}")

        time.sleep(1.0)

    raise TimeoutError(
        f"Daytona sandbox at {base_url} did not become ready within {timeout_s}s"
    )
src/core/containers/runtime/providers.py ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Container provider abstractions for running environment servers.
9
+
10
+ This module provides a pluggable architecture for different container providers
11
+ (local Docker, Kubernetes, cloud providers, etc.) to be used with EnvClient.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from abc import ABC, abstractmethod
17
+ from typing import Any, Dict, Optional, Sequence
18
+
19
+
20
class ContainerProvider(ABC):
    """
    Base interface for pluggable container backends.

    Concrete implementations target different container platforms:
    - LocalDockerProvider: local Docker daemon
    - KubernetesProvider: Kubernetes cluster
    - FargateProvider: AWS Fargate
    - CloudRunProvider: Google Cloud Run

    A provider owns exactly one container's lifecycle and exposes the
    base URL used to reach it.

    Example:
        >>> provider = LocalDockerProvider()
        >>> base_url = provider.start_container("echo-env:latest")
        >>> print(base_url)  # http://localhost:8000
        >>> # Use the environment via base_url
        >>> provider.stop_container()
    """

    @abstractmethod
    def start_container(
        self,
        image: str,
        port: Optional[int] = None,
        env_vars: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> str:
        """
        Launch a container built from *image*.

        Args:
            image: Container image name (e.g., "echo-env:latest")
            port: Port to publish; the provider picks one when None
            env_vars: Environment variables injected into the container
            **kwargs: Backend-specific options

        Returns:
            Base URL for reaching the container
            (e.g., "http://localhost:8000")

        Raises:
            RuntimeError: When the container cannot be started
        """
        ...

    @abstractmethod
    def stop_container(self) -> None:
        """
        Stop and remove the container created by start_container().
        """
        ...

    @abstractmethod
    def wait_for_ready(self, base_url: str, timeout_s: float = 30.0) -> None:
        """
        Block until the container answers requests, typically by
        polling the /health endpoint for an HTTP 200.

        Args:
            base_url: Base URL of the container
            timeout_s: Maximum seconds to wait

        Raises:
            TimeoutError: When the container is not ready in time
        """
        ...
90
+
91
+
92
class LocalDockerProvider(ContainerProvider):
    """
    Container provider for local Docker daemon.

    This provider runs containers on the local machine using Docker.
    Useful for development and testing.

    Example:
        >>> provider = LocalDockerProvider()
        >>> base_url = provider.start_container("echo-env:latest")
        >>> # Container running on http://localhost:<random-port>
        >>> provider.stop_container()
    """

    def __init__(self):
        """Initialize the provider and verify Docker is reachable.

        Raises:
            RuntimeError: If the ``docker`` CLI is missing, hangs, or
                reports an error.
        """
        self._container_id: Optional[str] = None
        self._container_name: Optional[str] = None

        # Check if Docker is available
        import subprocess

        try:
            subprocess.run(
                ["docker", "version"],
                check=True,
                capture_output=True,
                timeout=5,
            )
        except (
            subprocess.CalledProcessError,
            FileNotFoundError,
            subprocess.TimeoutExpired,
        ) as exc:
            # FIX: chain the original error (`from exc`) for easier
            # diagnosis — consistent with DockerSwarmProvider.
            raise RuntimeError(
                "Docker is not available. Please install Docker Desktop or Docker Engine."
            ) from exc

    def start_container(
        self,
        image: str,
        port: Optional[int] = None,
        env_vars: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> str:
        """
        Start a Docker container locally.

        Args:
            image: Docker image name
            port: Host port to publish (if None, finds a free port)
            env_vars: Environment variables for the container
            **kwargs: Additional Docker run options (currently unused)

        Returns:
            Base URL to connect to the container

        Raises:
            RuntimeError: If ``docker run`` fails.
        """
        import subprocess
        import time

        # Find available port if not specified
        if port is None:
            port = self._find_available_port()

        # Generate a unique container name for later cleanup.
        self._container_name = self._generate_container_name(image)

        # Build docker run command; the server always listens on 8000
        # inside the container.
        cmd = [
            "docker",
            "run",
            "-d",  # Detached
            "--name",
            self._container_name,
            "-p",
            f"{port}:8000",  # Map host port -> container port 8000
        ]

        # Add environment variables
        if env_vars:
            for key, value in env_vars.items():
                cmd.extend(["-e", f"{key}={value}"])

        # Add image
        cmd.append(image)

        # Run container
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            self._container_id = result.stdout.strip()
        except subprocess.CalledProcessError as e:
            error_msg = f"Failed to start Docker container.\nCommand: {' '.join(cmd)}\nExit code: {e.returncode}\nStderr: {e.stderr}\nStdout: {e.stdout}"
            raise RuntimeError(error_msg) from e

        # Wait a moment for container to start
        time.sleep(1)

        base_url = f"http://localhost:{port}"
        return base_url

    def stop_container(self) -> None:
        """
        Stop and remove the Docker container (best effort — never raises
        from cleanup).
        """
        if self._container_id is None:
            return

        import subprocess

        try:
            # Stop container
            subprocess.run(
                ["docker", "stop", self._container_id],
                capture_output=True,
                check=True,
                timeout=10,
            )

            # Remove container
            subprocess.run(
                ["docker", "rm", self._container_id],
                capture_output=True,
                check=True,
                timeout=10,
            )
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
            # Container might already be stopped/removed, or the daemon
            # is unresponsive. BUG FIX: TimeoutExpired was previously
            # uncaught and could escape from teardown despite timeout=10.
            pass
        finally:
            self._container_id = None
            self._container_name = None

    def wait_for_ready(self, base_url: str, timeout_s: float = 30.0) -> None:
        """
        Wait for container to be ready by polling /health endpoint.

        Args:
            base_url: Base URL of the container
            timeout_s: Maximum time to wait

        Raises:
            TimeoutError: If container doesn't become ready
        """
        import time

        import requests

        start_time = time.time()
        health_url = f"{base_url}/health"

        # Bypass proxy for localhost to avoid proxy issues
        proxies = {"http": None, "https": None}

        while time.time() - start_time < timeout_s:
            try:
                response = requests.get(health_url, timeout=2.0, proxies=proxies)
                if response.status_code == 200:
                    return
            except requests.RequestException:
                pass

            time.sleep(0.5)

        raise TimeoutError(
            f"Container at {base_url} did not become ready within {timeout_s}s"
        )

    def _find_available_port(self) -> int:
        """
        Find an available port on localhost.

        Note: the port is released before docker binds it, so a race
        with another process is possible (accepted for dev use).

        Returns:
            An available port number
        """
        import socket

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("", 0))  # port 0 = let the OS choose
            s.listen(1)
            port = s.getsockname()[1]
        return port

    def _generate_container_name(self, image: str) -> str:
        """
        Generate a unique container name from the image name and a
        millisecond timestamp.

        Args:
            image: Docker image name

        Returns:
            A unique container name
        """
        import time

        # Strip any registry/repo prefix and tag: "repo/echo-env:latest"
        # -> "echo-env".
        clean_image = image.split("/")[-1].split(":")[0]
        timestamp = int(time.time() * 1000)
        return f"{clean_image}-{timestamp}"
289
+
290
+
291
+ class DockerSwarmProvider(ContainerProvider):
292
+ """
293
+ Container provider that uses Docker Swarm services for local concurrency.
294
+
295
+ This provider creates a replicated Swarm service backed by the local Docker
296
+ engine. The built-in load-balancer fans requests across the replicas,
297
+ allowing multiple container instances to run concurrently on the developer
298
+ workstation (mirroring the workflow described in the Docker stack docs).
299
+ """
300
+
301
def __init__(
    self,
    *,
    auto_init_swarm: bool = True,
    overlay_network: Optional[str] = None,
):
    """
    Args:
        auto_init_swarm: Whether to call ``docker swarm init`` when Swarm
            is not active. Otherwise, user must manually initialize Swarm.
        overlay_network: Optional overlay network name for the service.
            When provided, the network is created with
            ``docker network create --driver overlay --attachable`` if it
            does not already exist.

    Raises:
        RuntimeError: From the environment checks below — when Docker is
            unavailable, or Swarm is inactive and cannot/may not be
            initialized.
    """
    # Populated by start_container(); cleared by stop_container().
    self._service_name: Optional[str] = None
    self._service_id: Optional[str] = None
    self._published_port: Optional[int] = None
    self._overlay_network = overlay_network
    self._auto_init_swarm = auto_init_swarm

    # Fail fast at construction time rather than on first use.
    self._ensure_docker_available()
    self._ensure_swarm_initialized()
    if self._overlay_network:
        self._ensure_overlay_network(self._overlay_network)
326
+
327
def start_container(
    self,
    image: str,
    port: Optional[int] = None,
    env_vars: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> str:
    """
    Start (or scale) a Swarm service for the given image.

    Supported kwargs:
        replicas (int): Number of container replicas (default: 2).
        cpu_limit (float | str): CPU limit passed to ``--limit-cpu``.
        memory_limit (str): Memory limit passed to ``--limit-memory``.
        constraints (Sequence[str]): Placement constraints.
        labels (Dict[str, str]): Service labels.
        command (Sequence[str] | str): Override container command.

    Returns:
        Base URL of the Swarm ingress (``http://localhost:<port>``);
        the routing mesh load-balances across replicas.

    Raises:
        ValueError: On unrecognized kwargs.
        RuntimeError: If ``docker service create`` fails.
    """
    import shlex
    import subprocess
    import time

    # Reject typos early instead of silently ignoring unknown options.
    allowed_kwargs = {
        "replicas",
        "cpu_limit",
        "memory_limit",
        "constraints",
        "labels",
        "command",
    }
    unknown = set(kwargs) - allowed_kwargs
    if unknown:
        raise ValueError(f"Unsupported kwargs for DockerSwarmProvider: {unknown}")

    replicas = int(kwargs.get("replicas", 2))
    cpu_limit = kwargs.get("cpu_limit")
    memory_limit = kwargs.get("memory_limit")
    constraints: Optional[Sequence[str]] = kwargs.get("constraints")
    labels: Optional[Dict[str, str]] = kwargs.get("labels")
    command_override = kwargs.get("command")

    if port is None:
        port = self._find_available_port()

    self._service_name = self._generate_service_name(image)
    self._published_port = port

    # Assemble the `docker service create` invocation; containers
    # always listen on 8000, published on the chosen host port.
    cmd = [
        "docker",
        "service",
        "create",
        "--detach",
        "--name",
        self._service_name,
        "--replicas",
        str(max(1, replicas)),  # never schedule zero replicas
        "--publish",
        f"{port}:8000",
    ]

    if self._overlay_network:
        cmd.extend(["--network", self._overlay_network])

    if env_vars:
        for key, value in env_vars.items():
            cmd.extend(["--env", f"{key}={value}"])

    if cpu_limit is not None:
        cmd.extend(["--limit-cpu", str(cpu_limit)])

    if memory_limit is not None:
        cmd.extend(["--limit-memory", str(memory_limit)])

    if constraints:
        for constraint in constraints:
            cmd.extend(["--constraint", constraint])

    if labels:
        for key, value in labels.items():
            cmd.extend(["--label", f"{key}={value}"])

    cmd.append(image)

    # Any command override goes after the image, mirroring
    # `docker service create IMAGE [COMMAND] [ARG...]`.
    if command_override:
        if isinstance(command_override, str):
            cmd.extend(shlex.split(command_override))
        else:
            cmd.extend(command_override)

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True,
        )
        self._service_id = result.stdout.strip()
    except subprocess.CalledProcessError as e:
        error_msg = (
            "Failed to start Docker Swarm service.\n"
            f"Command: {' '.join(cmd)}\n"
            f"Exit code: {e.returncode}\n"
            f"Stdout: {e.stdout}\n"
            f"Stderr: {e.stderr}"
        )
        raise RuntimeError(error_msg) from e

    # Give Swarm a brief moment to schedule the tasks.
    time.sleep(1.0)

    return f"http://localhost:{port}"
438
+
439
def stop_container(self) -> None:
    """
    Remove the Swarm service (the Swarm manager itself keeps running).
    """
    name = self._service_name
    if not name:
        return

    import subprocess

    try:
        subprocess.run(
            ["docker", "service", "rm", name],
            capture_output=True,
            check=True,
            timeout=10,
        )
    except subprocess.CalledProcessError:
        # Service may already be gone; ignore.
        pass
    finally:
        # Reset all service bookkeeping regardless of outcome.
        self._service_name = None
        self._service_id = None
        self._published_port = None
462
+
463
def wait_for_ready(self, base_url: str, timeout_s: float = 30.0) -> None:
    """
    Poll /health until at least one replica answers with HTTP 200.

    Because Swarm's load balancer round-robins requests across replicas,
    success only proves one replica is responding; others may still be
    starting when this returns.

    Raises:
        TimeoutError: When no replica responds within *timeout_s*.
    """
    import time

    import requests

    stop_at = time.time() + timeout_s
    url = f"{base_url}/health"

    # Bypass any configured proxy for localhost traffic.
    no_proxy = {"http": None, "https": None}

    while time.time() < stop_at:
        try:
            if requests.get(url, timeout=2.0, proxies=no_proxy).status_code == 200:
                return
        except requests.RequestException:
            pass
        time.sleep(0.5)

    raise TimeoutError(
        f"Swarm service at {base_url} did not become ready within {timeout_s}s"
    )
494
+
495
+ def _ensure_docker_available(self) -> None:
496
+ import subprocess
497
+
498
+ try:
499
+ subprocess.run(
500
+ ["docker", "version"],
501
+ check=True,
502
+ capture_output=True,
503
+ timeout=5,
504
+ )
505
+ except (
506
+ subprocess.CalledProcessError,
507
+ FileNotFoundError,
508
+ subprocess.TimeoutExpired,
509
+ ) as exc:
510
+ raise RuntimeError(
511
+ "Docker is not available. Please install Docker Desktop or Docker Engine."
512
+ ) from exc
513
+
514
def _ensure_swarm_initialized(self) -> None:
    """Make sure this Docker node is an active Swarm node.

    If the node is not part of a swarm, either initialize one (when
    ``self._auto_init_swarm`` is truthy) or raise a ``RuntimeError``
    telling the caller to enable Swarm manually.

    Raises:
        RuntimeError: If Swarm is inactive and auto-init is disabled, or
            if ``docker swarm init`` fails.
    """
    import subprocess

    # NOTE(review): FileNotFoundError / TimeoutExpired are not handled here;
    # presumably _ensure_docker_available() has already run — confirm.
    try:
        result = subprocess.run(
            ["docker", "info", "--format", "{{.Swarm.LocalNodeState}}"],
            capture_output=True,
            text=True,
            check=True,
            timeout=5,
        )
        state = result.stdout.strip().lower()
        if state == "active":
            return  # Already a Swarm node; nothing to do.
    except subprocess.CalledProcessError:
        # `docker info` itself failed; fall through with an unknown state.
        state = "unknown"

    if not self._auto_init_swarm:
        raise RuntimeError(
            f"Docker Swarm is not active (state={state}). Enable Swarm manually or pass auto_init_swarm=True."
        )

    try:
        subprocess.run(
            ["docker", "swarm", "init"],
            check=True,
            capture_output=True,
            timeout=10,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError("Failed to initialize Docker Swarm") from e
545
+
546
def _ensure_overlay_network(self, network: str) -> None:
    """Create the attachable overlay network ``network`` if it does not exist."""
    import subprocess

    # `docker network inspect` exits 0 only when the network already exists.
    inspect_result = subprocess.run(
        ["docker", "network", "inspect", network],
        capture_output=True,
        text=True,
        check=False,
    )
    if inspect_result.returncode == 0:
        return

    create_cmd = [
        "docker",
        "network",
        "create",
        "--driver",
        "overlay",
        "--attachable",
        network,
    ]
    try:
        subprocess.run(create_cmd, check=True, capture_output=True, timeout=10)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to create overlay network '{network}'") from e
575
+
576
+ def _find_available_port(self) -> int:
577
+ import socket
578
+
579
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
580
+ s.bind(("", 0))
581
+ s.listen(1)
582
+ port = s.getsockname()[1]
583
+ return port
584
+
585
+ def _generate_service_name(self, image: str) -> str:
586
+ import time
587
+
588
+ clean_image = image.split("/")[-1].split(":")[0]
589
+ timestamp = int(time.time() * 1000)
590
+ return f"{clean_image}-swarm-{timestamp}"
591
+
592
+
593
class KubernetesProvider(ContainerProvider):
    """
    Container provider for Kubernetes clusters.

    This provider creates pods in a Kubernetes cluster and exposes them
    via services or port-forwarding.

    NOTE: This is currently an unimplemented placeholder — the class body
    defines no methods of its own, so any abstract methods declared on
    ContainerProvider remain unimplemented here.

    Example:
        >>> provider = KubernetesProvider(namespace="envtorch-dev")
        >>> base_url = provider.start_container("echo-env:latest")
        >>> # Pod running in k8s, accessible via service or port-forward
        >>> provider.stop_container()
    """

    pass
608
+
609
+
610
class RuntimeProvider(ABC):
    """
    Abstract base class for runtime providers that are not container providers.
    Providers implement this interface to support different runtime platforms:
    - UVProvider: Runs environments via `uv run`

    The provider manages a single runtime lifecycle and provides the base URL
    for connecting to it.

    Example:
        >>> provider = UVProvider(project_path="/path/to/env")
        >>> base_url = provider.start()
        >>> print(base_url)  # http://localhost:8000
        >>> provider.stop()
    """

    @abstractmethod
    def start(
        self,
        port: Optional[int] = None,
        env_vars: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> str:
        """
        Start the runtime.

        Args:
            port: Port to expose (if None, provider chooses)
            env_vars: Environment variables for the runtime
            **kwargs: Additional runtime options

        Returns:
            Base URL of the started runtime (e.g. "http://localhost:8000").
        """

    @abstractmethod
    def stop(self) -> None:
        """
        Stop the runtime.
        """
        pass

    @abstractmethod
    def wait_for_ready(self, timeout_s: float = 30.0) -> None:
        """
        Wait for the runtime to be ready to accept requests.

        Args:
            timeout_s: Maximum time to wait, in seconds.
        """
        pass

    def __enter__(self) -> "RuntimeProvider":
        """
        Start the runtime and return this provider (context-manager entry).
        """
        self.start()
        return self

    def __exit__(self, exc_type, exc, tb) -> bool:
        """
        Stop the runtime on context exit; never suppresses exceptions.
        """
        self.stop()
        return False
src/core/containers/runtime/uv_provider.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Providers for launching ASGI applications via ``uv run``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import socket
7
+ import subprocess
8
+ import time
9
+ from typing import Dict, Optional
10
+
11
+ import requests
12
+
13
+ from .providers import RuntimeProvider
14
+
15
+
16
+ def _check_uv_installed() -> None:
17
+ try:
18
+ subprocess.check_output(["uv", "--version"])
19
+ except FileNotFoundError as exc:
20
+ raise RuntimeError(
21
+ "`uv` executable not found. Install uv from https://docs.astral.sh and ensure it is on PATH."
22
+ ) from exc
23
+
24
+
25
+ def _find_free_port() -> int:
26
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
27
+ sock.bind(("", 0))
28
+ sock.listen(1)
29
+ return sock.getsockname()[1]
30
+
31
+
32
+ def _create_uv_command(
33
+ *,
34
+ host: str,
35
+ port: int,
36
+ reload: bool,
37
+ workers: int,
38
+ app: str,
39
+ project_path: str,
40
+ ) -> list[str]:
41
+ command: list[str] = ["uv", "run", "--isolated", "--project", project_path]
42
+
43
+ command.append("--")
44
+ command.extend(
45
+ [
46
+ "uvicorn",
47
+ app,
48
+ "--host",
49
+ host,
50
+ "--port",
51
+ str(port),
52
+ "--workers",
53
+ str(workers),
54
+ ]
55
+ )
56
+
57
+ if reload:
58
+ command.append("--reload")
59
+
60
+ return command
61
+
62
+
63
def _poll_health(health_url: str, timeout_s: float) -> None:
    """Poll a health endpoint until it returns HTTP 200 or times out.

    Args:
        health_url: Full URL of the health endpoint to poll.
        timeout_s: Overall deadline, in seconds.

    Raises:
        TimeoutError: If no HTTP 200 response arrived before the deadline.
    """
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            # Cap the per-request timeout at 2s, but never exceed the deadline.
            timeout = max(0.0001, min(deadline - time.time(), 2.0))
            response = requests.get(health_url, timeout=timeout)
            if response.status_code == 200:
                return
        except requests.RequestException:
            # Server not up yet (e.g. connection refused returns instantly).
            # Fall through to the sleep below — the previous `continue` here
            # skipped the sleep and busy-looped while the server was down.
            pass

        time.sleep(0.5)

    raise TimeoutError(f"Server did not become ready within {timeout_s:.1f} seconds")
79
+
80
+
81
class UVProvider(RuntimeProvider):
    """
    RuntimeProvider implementation backed by ``uv run``.

    Args:
        project_path: Local path to a uv project (passed to ``uv run --project``)
        app: ASGI application path for uvicorn (defaults to ``server.app:app``)
        host: Host interface to bind to (defaults to ``0.0.0.0``)
        reload: Whether to enable uvicorn's reload mode
        env_vars: Environment variables to pass through to the spawned process
        context_timeout_s: How long to wait for the environment to become ready

    Example:
        >>> provider = UVProvider(project_path="/path/to/env")
        >>> base_url = provider.start()
        >>> print(base_url)  # http://localhost:8000
        >>> # Use the environment via base_url
        >>> provider.stop()
    """

    def __init__(
        self,
        *,
        project_path: str,
        app: str = "server.app:app",
        host: str = "0.0.0.0",
        reload: bool = False,
        env_vars: Optional[Dict[str, str]] = None,
        context_timeout_s: float = 60.0,
    ):
        """Initialize the UVProvider and verify that ``uv`` is installed."""
        self.project_path = os.path.abspath(project_path)
        self.app = app
        self.host = host
        self.reload = reload
        self.env_vars = env_vars
        self.context_timeout_s = context_timeout_s
        _check_uv_installed()
        # Handle to the spawned `uv run` process; None when not running.
        self._process: Optional[subprocess.Popen] = None
        self._base_url: Optional[str] = None

    def start(
        self,
        port: Optional[int] = None,
        env_vars: Optional[Dict[str, str]] = None,
        workers: int = 1,
        **_: Dict[str, str],
    ) -> str:
        """
        Start the environment via `uv run`.

        Args:
            port: The port to bind the environment to (a free port is
                chosen when None)
            env_vars: Environment variables to pass to the environment
                (merged over those given at construction time)
            workers: The number of uvicorn workers to use

        Returns:
            The base URL of the environment

        Raises:
            RuntimeError: If the environment is already running or the
                `uv` process could not be launched
        """
        if self._process is not None and self._process.poll() is None:
            raise RuntimeError("UVProvider is already running")

        bind_port = port or _find_free_port()

        command = _create_uv_command(
            host=self.host,
            port=bind_port,
            reload=self.reload,
            workers=workers,
            app=self.app,
            project_path=self.project_path,
        )

        # Constructor-level env vars first, then call-level overrides.
        env = os.environ.copy()
        if self.env_vars:
            env.update(self.env_vars)
        if env_vars:
            env.update(env_vars)

        try:
            self._process = subprocess.Popen(command, env=env)
        except OSError as exc:
            raise RuntimeError(f"Failed to launch `uv run`: {exc}") from exc

        # A wildcard bind address is not connectable; clients use loopback.
        client_host = "127.0.0.1" if self.host in {"0.0.0.0", "::"} else self.host
        self._base_url = f"http://{client_host}:{bind_port}"
        return self._base_url

    def wait_for_ready(self, timeout_s: float = 60.0) -> None:
        """
        Wait for the environment to become ready.

        Args:
            timeout_s: The timeout to wait for the environment to become ready

        Raises:
            RuntimeError: If the environment is not running (never started,
                or the `uv` process already exited)
            TimeoutError: If the environment does not become ready within the timeout
        """
        # Previously a never-started provider would poll "None/health";
        # fail fast instead, matching the documented contract.
        if self._base_url is None:
            raise RuntimeError("UVProvider has not been started")

        if self._process and self._process.poll() is not None:
            code = self._process.returncode
            raise RuntimeError(f"uv process exited prematurely with code {code}")

        _poll_health(f"{self._base_url}/health", timeout_s=timeout_s)

    def stop(self) -> None:
        """
        Stop the environment.

        This is a no-op when the environment is not running. The process is
        terminated gracefully first, then killed if it does not exit.
        """
        if self._process is None:
            return

        if self._process.poll() is None:
            self._process.terminate()
            try:
                self._process.wait(timeout=10.0)
            except subprocess.TimeoutExpired:
                self._process.kill()
                self._process.wait(timeout=5.0)

        self._process = None
        self._base_url = None

    @property
    def base_url(self) -> str:
        """
        The base URL of the environment.

        Returns:
            The base URL of the environment

        Raises:
            RuntimeError: If the environment is not running
        """
        if self._base_url is None:
            raise RuntimeError("UVProvider has not been started")
        return self._base_url
src/core/containers/test_local_docker_provider.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ End-to-end test for LocalDockerProvider.
4
+
5
+ This script tests the complete flow:
6
+ 1. Start a container using LocalDockerProvider
7
+ 2. Wait for it to be ready
8
+ 3. Make HTTP requests to test the environment
9
+ 4. Clean up the container
10
+ """
11
+
12
+ import sys
13
+ from pathlib import Path
14
+
15
+ # Add src to path
16
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
17
+
18
+ import requests
19
+ from openenv.core.containers.runtime import LocalDockerProvider
20
+
21
+
22
+ # TODO: Remove this test or make it a functional test sicne this will be tested in e2e test for echo env
23
def test_local_docker_provider():
    """Test LocalDockerProvider end-to-end.

    Walks the full lifecycle: start container, wait for readiness, exercise
    /health, /reset, /step and /state, then clean up. Returns True on
    success, False on any failure (errors are printed, not raised).
    """
    print("=" * 60)
    print("LocalDockerProvider End-to-End Test")
    print("=" * 60)
    print()

    provider = None

    try:
        # Step 1: Create provider
        print("Step 1: Creating LocalDockerProvider...")
        provider = LocalDockerProvider()
        print("✓ Provider created\n")

        # Step 2: Start container
        print("Step 2: Starting echo-env container...")
        base_url = provider.start_container("echo-env:latest")
        print(f"✓ Container started at: {base_url}")
        if provider._container_id:
            print(f" Container ID: {provider._container_id[:12]}...")
        if provider._container_name:
            print(f" Container name: {provider._container_name}\n")

        # Step 3: Wait for ready
        print("Step 3: Waiting for container to be ready...")
        provider.wait_for_ready(base_url, timeout_s=30.0)
        print("✓ Container is ready!\n")

        # Step 4: Test health endpoint
        print("Step 4: Testing /health endpoint...")
        response = requests.get(f"{base_url}/health")
        print(f" Status: {response.status_code}")
        print(f" Response: {response.json()}")
        assert response.status_code == 200
        assert response.json()["status"] == "healthy"
        print("✓ Health check passed\n")

        # Step 5: Test reset endpoint
        print("Step 5: Testing /reset endpoint...")
        response = requests.post(
            f"{base_url}/reset",
            json={},
            headers={"Content-Type": "application/json"},
        )
        print(f" Status: {response.status_code}")
        data = response.json()
        print(f" Message: {data['observation']['echoed_message']}")
        print(f" Reward: {data['reward']}")
        print(f" Done: {data['done']}")
        assert response.status_code == 200
        assert data["observation"]["echoed_message"] == "Echo environment ready!"
        print("✓ Reset test passed\n")

        # Step 6: Test step endpoint
        print("Step 6: Testing /step endpoint...")
        response = requests.post(
            f"{base_url}/step",
            json={"action": {"message": "Hello from LocalDockerProvider!"}},
            headers={"Content-Type": "application/json"},
        )
        print(f" Status: {response.status_code}")
        data = response.json()
        print(f" Echoed: {data['observation']['echoed_message']}")
        print(f" Length: {data['observation']['message_length']}")
        print(f" Reward: {data['reward']}")
        assert response.status_code == 200
        assert (
            data["observation"]["echoed_message"] == "Hello from LocalDockerProvider!"
        )
        # "Hello from LocalDockerProvider!" is 31 characters long.
        assert data["observation"]["message_length"] == 31
        print("✓ Step test passed\n")

        # Step 7: Test state endpoint
        print("Step 7: Testing /state endpoint...")
        response = requests.get(f"{base_url}/state")
        print(f" Status: {response.status_code}")
        data = response.json()
        print(f" Episode ID: {data['episode_id']}")
        print(f" Step count: {data['step_count']}")
        assert response.status_code == 200
        assert data["step_count"] == 1  # One step from above
        print("✓ State test passed\n")

        # Step 8: Multiple steps
        print("Step 8: Testing multiple steps...")
        for i in range(3):
            response = requests.post(
                f"{base_url}/step",
                json={"action": {"message": f"Message {i + 1}"}},
                headers={"Content-Type": "application/json"},
            )
            assert response.status_code == 200
            print(f" Step {i + 1}: ✓")

        # Check state updated
        response = requests.get(f"{base_url}/state")
        data = response.json()
        assert data["step_count"] == 4  # 1 + 3 more steps
        print(f" Final step count: {data['step_count']}")
        print("✓ Multiple steps test passed\n")

        print("=" * 60)
        print("✓ All tests passed!")
        print("=" * 60)
        print()

        return True

    except Exception as e:
        print(f"\n❌ Test failed: {e}")
        import traceback

        traceback.print_exc()
        return False

    finally:
        # Step 9: Cleanup — always attempt to remove the container, even on
        # failure, but never let cleanup errors mask the test result.
        if provider is not None:
            print("\nStep 9: Cleaning up container...")
            try:
                provider.stop_container()
                print("✓ Container stopped and removed\n")
            except Exception as e:
                print(f"⚠️ Cleanup warning: {e}\n")
149
+
150
def test_provider_with_custom_port():
    """Test provider with custom port.

    Verifies that start_container honours an explicit ``port=`` argument by
    checking the returned base URL. Returns True/False instead of raising.
    """
    print("=" * 60)
    print("LocalDockerProvider with Custom Port Test")
    print("=" * 60)
    print()

    provider = None

    try:
        provider = LocalDockerProvider()

        print("Starting container on custom port 8123...")
        base_url = provider.start_container("echo-env:latest", port=8123)
        print(f"✓ Started at: {base_url}")
        assert ":8123" in base_url

        print("Waiting for ready...")
        provider.wait_for_ready(base_url)
        print("✓ Ready!")

        print("Testing health...")
        response = requests.get(f"{base_url}/health")
        assert response.status_code == 200
        print("✓ Health check passed")

        print("\n✓ Custom port test passed!\n")
        return True

    except Exception as e:
        print(f"\n❌ Test failed: {e}")
        return False

    finally:
        # NOTE(review): unlike the basic test, cleanup here is not wrapped in
        # try/except — a failing stop_container() would mask the test result.
        if provider is not None:
            provider.stop_container()
            print("✓ Cleaned up\n")
187
+
188
+
189
def test_provider_with_env_vars():
    """Test provider with environment variables.

    Starts the container with extra env vars and checks it still becomes
    healthy. Only liveness is verified — the values themselves are not read
    back. Returns True/False instead of raising.
    """
    print("=" * 60)
    print("LocalDockerProvider with Environment Variables Test")
    print("=" * 60)
    print()

    provider = None

    try:
        provider = LocalDockerProvider()

        print("Starting container with environment variables...")
        base_url = provider.start_container(
            "echo-env:latest", env_vars={"DEBUG": "true", "LOG_LEVEL": "info"}
        )
        print(f"✓ Started at: {base_url}")

        print("Waiting for ready...")
        provider.wait_for_ready(base_url)
        print("✓ Ready!")

        print("Testing health...")
        response = requests.get(f"{base_url}/health")
        assert response.status_code == 200
        print("✓ Health check passed")

        print("\n✓ Environment variables test passed!\n")
        return True

    except Exception as e:
        print(f"\n❌ Test failed: {e}")
        return False

    finally:
        # NOTE(review): cleanup is not wrapped in try/except here — a failing
        # stop_container() would mask the test result.
        if provider is not None:
            provider.stop_container()
            print("✓ Cleaned up\n")
227
+
228
+
229
if __name__ == "__main__":
    print()
    print("🐳 LocalDockerProvider Test Suite")
    print()

    # Run each scenario in order, recording (label, passed) pairs.
    suite = [
        ("Basic End-to-End", test_local_docker_provider),
        ("Custom Port", test_provider_with_custom_port),
        ("Environment Variables", test_provider_with_env_vars),
    ]
    results = [(label, scenario()) for label, scenario in suite]

    # Summary
    print("=" * 60)
    print("Test Summary")
    print("=" * 60)
    for name, passed in results:
        status = "✓ PASSED" if passed else "✗ FAILED"
        print(f"{name:25} {status}")
    print("=" * 60)

    if all(passed for _, passed in results):
        print("\n🎉 All tests passed!")
        exit(0)
    else:
        print("\n❌ Some tests failed")
        exit(1)
src/core/env_client.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Environment client for persistent sessions.
9
+
10
+ This module provides a WebSocket-based client that maintains a persistent connection
11
+ to an environment server, enabling efficient multi-step interactions without
12
+ the overhead of HTTP request/response cycles.
13
+
14
+ The client is async by default. For synchronous usage, use the `.sync()` method
15
+ to get a `SyncEnvClient` wrapper.
16
+
17
+ Example (async):
18
+ >>> async with GenericEnvClient(base_url="ws://localhost:8000") as env:
19
+ ... result = await env.reset()
20
+ ... result = await env.step({"code": "print('hello')"})
21
+
22
+ Example (sync wrapper):
23
+ >>> env = GenericEnvClient(base_url="ws://localhost:8000").sync()
24
+ >>> with env:
25
+ ... result = env.reset()
26
+ ... result = env.step({"code": "print('hello')"})
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import asyncio
32
+ import json
33
+ import os
34
+ from abc import ABC, abstractmethod
35
+ from typing import Any, Dict, Generic, Optional, Type, TYPE_CHECKING, TypeVar
36
+
37
+ from .client_types import StateT, StepResult
38
+ from .containers.runtime import LocalDockerProvider, UVProvider
39
+ from .utils import convert_to_ws_url
40
+
41
+ if TYPE_CHECKING:
42
+ from websockets.asyncio.client import ClientConnection
43
+
44
+ from .containers.runtime import ContainerProvider, RuntimeProvider
45
+ from .sync_client import SyncEnvClient
46
+
47
+ from websockets.asyncio.client import connect as ws_connect
48
+
49
+ ActT = TypeVar("ActT")
50
+ ObsT = TypeVar("ObsT")
51
+ EnvClientT = TypeVar("EnvClientT", bound="EnvClient")
52
+
53
+
54
+ class EnvClient(ABC, Generic[ActT, ObsT, StateT]):
55
+ """
56
+ Async environment client for persistent sessions.
57
+
58
+ This client maintains a persistent WebSocket connection to an environment
59
+ server, enabling efficient multi-step interactions. Each client instance
60
+ corresponds to a dedicated environment session on the server.
61
+
62
+ The client is async by default. For synchronous usage, use the `.sync()`
63
+ method to get a `SyncEnvClient` wrapper.
64
+
65
+ Features:
66
+ - Lower latency for sequential interactions
67
+ - Session state is maintained server-side
68
+ - Better suited for long-running episodes
69
+ - Async by default for modern Python async/await patterns
70
+
71
+ Example (async):
72
+ >>> from envs.coding_env.client import CodingEnv
73
+ >>>
74
+ >>> # Connect to a server using async context manager
75
+ >>> async with CodingEnv(base_url="ws://localhost:8000") as env:
76
+ ... result = await env.reset(seed=42)
77
+ ... while not result.done:
78
+ ... action = agent.predict(result.observation)
79
+ ... result = await env.step(action)
80
+
81
+ Example (sync wrapper):
82
+ >>> env = CodingEnv(base_url="ws://localhost:8000").sync()
83
+ >>> with env:
84
+ ... result = env.reset(seed=42)
85
+ ... result = env.step(action)
86
+ """
87
+
88
def __init__(
    self,
    base_url: str,
    connect_timeout_s: float = 10.0,
    message_timeout_s: float = 60.0,
    max_message_size_mb: float = 100.0,
    provider: Optional["ContainerProvider | RuntimeProvider"] = None,
    mode: Optional[str] = None,
):
    """
    Initialize environment client.

    Args:
        base_url: Base URL of the environment server (http:// or ws://).
            Will be converted to ws:// if http:// is provided.
        connect_timeout_s: Timeout for establishing WebSocket connection
        message_timeout_s: Timeout for receiving responses to messages
        max_message_size_mb: Maximum WebSocket message size in megabytes.
            Default 100MB to handle large observations (screenshots, DOM, etc.)
        provider: Optional container/runtime provider for lifecycle management.
            Can be a ContainerProvider (Docker) or RuntimeProvider (UV).
        mode: Communication mode: 'simulation' for Gym-style API (default) or
            'production' for MCP JSON-RPC protocol. Can also be set via the
            OPENENV_CLIENT_MODE environment variable. Constructor parameter
            takes precedence over environment variable. Case-insensitive.

    Raises:
        ValueError: If ``mode`` is not 'simulation' or 'production'.
    """
    # Determine mode (constructor > env var > default)
    if mode is None:
        mode = os.environ.get("OPENENV_CLIENT_MODE", "simulation")

    # Normalize and validate mode
    mode = mode.lower()
    if mode not in ("simulation", "production"):
        raise ValueError(
            f"Invalid mode: '{mode}'. Must be 'simulation' or 'production'. "
            f"Set via constructor parameter or OPENENV_CLIENT_MODE environment variable."
        )

    # Store mode (use object.__setattr__ to bypass immutability).
    # object.__setattr__ skips this class's __setattr__ guard, which
    # forbids re-assigning _mode once it exists.
    object.__setattr__(self, "_mode", mode)

    # Convert HTTP URL to WebSocket URL
    ws_url = convert_to_ws_url(base_url)

    self._ws_url = f"{ws_url}/ws"
    self._connect_timeout = connect_timeout_s
    self._message_timeout = message_timeout_s
    self._max_message_size = int(
        max_message_size_mb * 1024 * 1024
    )  # Convert MB to bytes
    self._provider = provider
    # Lazily created by connect(); None means "not connected".
    self._ws: Optional[ClientConnection] = None
+
141
def __setattr__(self, name: str, value: Any) -> None:
    """Reject re-assignment of ``_mode`` once set; pass everything else through."""
    mode_already_set = name == "_mode" and hasattr(self, "_mode")
    if mode_already_set:
        raise AttributeError("Cannot modify mode after initialization")
    super().__setattr__(name, value)
146
+
147
async def connect(self) -> "EnvClient":
    """
    Establish WebSocket connection to the server.

    Idempotent: returns immediately if already connected.

    Returns:
        self for method chaining

    Raises:
        ConnectionError: If connection cannot be established
    """
    if self._ws is not None:
        return self

    # Bypass proxy for localhost connections
    ws_url_lower = self._ws_url.lower()
    is_localhost = "localhost" in ws_url_lower or "127.0.0.1" in ws_url_lower

    old_no_proxy = os.environ.get("NO_PROXY")
    if is_localhost:
        # Set NO_PROXY to bypass proxy for localhost
        current_no_proxy = old_no_proxy or ""
        if "localhost" not in current_no_proxy.lower():
            os.environ["NO_PROXY"] = (
                f"{current_no_proxy},localhost,127.0.0.1"
                if current_no_proxy
                else "localhost,127.0.0.1"
            )

    # NOTE(review): os.environ is process-global — concurrent connect() calls
    # could observe each other's temporary NO_PROXY patch; confirm callers
    # do not connect concurrently from multiple tasks/threads.
    try:
        self._ws = await ws_connect(
            self._ws_url,
            open_timeout=self._connect_timeout,
            max_size=self._max_message_size,
        )
    except Exception as e:
        raise ConnectionError(f"Failed to connect to {self._ws_url}: {e}") from e
    finally:
        # Restore original NO_PROXY value
        if is_localhost:
            if old_no_proxy is None:
                os.environ.pop("NO_PROXY", None)
            else:
                os.environ["NO_PROXY"] = old_no_proxy

    return self
192
+
193
async def disconnect(self) -> None:
    """Close the WebSocket session, swallowing any errors (best-effort)."""
    if self._ws is None:
        return

    # Politely tell the server we are done; failure here is non-fatal.
    try:
        await self._send({"type": "close"})
    except Exception:
        pass

    try:
        await self._ws.close()
    except Exception:
        pass

    self._ws = None
206
+
207
async def _ensure_connected(self) -> None:
    """Lazily open the WebSocket connection on first use."""
    if self._ws is not None:
        return
    await self.connect()
211
+
212
async def _send(self, message: Dict[str, Any]) -> None:
    """Serialize ``message`` as JSON and push it over the WebSocket."""
    await self._ensure_connected()
    assert self._ws is not None  # guaranteed by _ensure_connected
    payload = json.dumps(message)
    await self._ws.send(payload)
217
+
218
async def _receive(self) -> Dict[str, Any]:
    """Wait (bounded by the message timeout) for the next JSON message."""
    assert self._ws is not None
    frame = await asyncio.wait_for(self._ws.recv(), timeout=self._message_timeout)
    return json.loads(frame)
223
+
224
async def _send_and_receive(self, message: Dict[str, Any]) -> Dict[str, Any]:
    """Send ``message`` and return the server's reply, raising on error replies."""
    await self._send(message)
    reply = await self._receive()

    # The server signals failures via a {"type": "error", "data": ...} frame.
    if reply.get("type") == "error":
        details = reply.get("data", {})
        text = details.get("message", "Unknown error")
        code = details.get("code", "UNKNOWN")
        raise RuntimeError(f"Server error: {text} (code: {code})")

    return reply
238
+
239
@classmethod
async def from_docker_image(
    cls: Type[EnvClientT],
    image: str,
    provider: Optional["ContainerProvider"] = None,
    **kwargs: Any,
) -> EnvClientT:
    """
    Launch ``image`` in a container and return a connected client for it.

    Args:
        image: Docker image name to run (e.g., "coding-env:latest")
        provider: Container provider to use (defaults to LocalDockerProvider)
        **kwargs: Additional arguments to pass to provider.start_container()

    Returns:
        Connected client instance
    """
    container_provider = provider if provider is not None else LocalDockerProvider()

    # Boot the container and block until its HTTP server answers.
    base_url = container_provider.start_container(image, **kwargs)
    container_provider.wait_for_ready(base_url)

    client = cls(base_url=base_url, provider=container_provider)
    await client.connect()
    return client
271
+
272
+ @classmethod
273
+ async def from_env(
274
+ cls: Type[EnvClientT],
275
+ repo_id: str,
276
+ *,
277
+ use_docker: bool = True,
278
+ provider: Optional["ContainerProvider | RuntimeProvider"] = None,
279
+ **provider_kwargs: Any,
280
+ ) -> EnvClientT:
281
+ """
282
+ Create a client from a Hugging Face Space.
283
+
284
+ Args:
285
+ repo_id: Hugging Face space identifier ``{org}/{space}``.
286
+ use_docker: When ``True`` (default) pull from the HF registry and
287
+ launch via :class:`LocalDockerProvider`. When ``False`` run the
288
+ space locally with :class:`UVProvider`.
289
+ provider: Optional provider instance to reuse. Must be a
290
+ :class:`ContainerProvider` when ``use_docker=True`` and a
291
+ :class:`RuntimeProvider` otherwise.
292
+ provider_kwargs: Additional keyword arguments forwarded to
293
+ either the container provider's ``start_container`` (docker)
294
+ or to the ``UVProvider`` constructor/start (uv). When
295
+ ``use_docker=False``, the ``project_path`` argument can be
296
+ used to override the default git URL
297
+ (``git+https://huggingface.co/spaces/{repo_id}``).
298
+
299
+ Returns:
300
+ Connected client instance
301
+
302
+ Examples:
303
+ >>> # Pull and run from HF Docker registry
304
+ >>> env = await MyEnv.from_env("openenv/echo-env")
305
+ >>>
306
+ >>> # Run locally with UV (clones the space)
307
+ >>> env = await MyEnv.from_env("openenv/echo-env", use_docker=False)
308
+ >>>
309
+ >>> # Run from a local checkout
310
+ >>> env = await MyEnv.from_env(
311
+ ... "openenv/echo-env",
312
+ ... use_docker=False,
313
+ ... project_path="/path/to/local/checkout"
314
+ ... )
315
+ """
316
+ # Extract start args that apply to both providers
317
+ start_args = {}
318
+ for key in ("port", "env_vars", "workers"):
319
+ if key in provider_kwargs:
320
+ start_args[key] = provider_kwargs.pop(key)
321
+
322
+ if use_docker:
323
+ # Docker mode: pull from HF registry
324
+ docker_provider = provider or LocalDockerProvider()
325
+ tag = provider_kwargs.pop("tag", "latest")
326
+ image = f"registry.hf.space/{repo_id.replace('/', '-')}:{tag}"
327
+ base_url = docker_provider.start_container(
328
+ image, **start_args, **provider_kwargs
329
+ )
330
+ docker_provider.wait_for_ready(base_url)
331
+
332
+ client = cls(base_url=base_url, provider=docker_provider)
333
+ await client.connect()
334
+ return client
335
+ else:
336
+ # UV mode: clone and run with uv
337
+ if provider is None:
338
+ uv_kwargs = dict(provider_kwargs)
339
+ project_path = uv_kwargs.pop("project_path", None)
340
+ if project_path is None:
341
+ project_path = f"git+https://huggingface.co/spaces/{repo_id}"
342
+
343
+ provider = UVProvider(project_path=project_path, **uv_kwargs)
344
+ else:
345
+ if provider_kwargs:
346
+ raise ValueError(
347
+ "provider_kwargs cannot be used when supplying a provider instance"
348
+ )
349
+
350
+ base_url = provider.start(**start_args)
351
+ provider.wait_for_ready()
352
+
353
+ client = cls(base_url=base_url, provider=provider)
354
+ await client.connect()
355
+ return client
356
+
357
+ @abstractmethod
358
+ def _step_payload(self, action: ActT) -> Dict[str, Any]:
359
+ """Convert an Action object to the JSON data expected by the env server."""
360
+ raise NotImplementedError
361
+
362
+ @abstractmethod
363
+ def _parse_result(self, payload: Dict[str, Any]) -> StepResult[ObsT]:
364
+ """Convert a JSON response from the env server to StepResult[ObsT]."""
365
+ raise NotImplementedError
366
+
367
+ @abstractmethod
368
+ def _parse_state(self, payload: Dict[str, Any]) -> StateT:
369
+ """Convert a JSON response from the state endpoint to a State object."""
370
+ raise NotImplementedError
371
+
372
+ async def reset(self, **kwargs: Any) -> StepResult[ObsT]:
373
+ """
374
+ Reset the environment with optional parameters.
375
+
376
+ Args:
377
+ **kwargs: Optional parameters passed to the environment's reset method.
378
+ Common parameters include:
379
+ - seed: Random seed for reproducibility
380
+ - episode_id: Custom episode identifier
381
+
382
+ Returns:
383
+ StepResult containing initial observation
384
+ """
385
+ message = {
386
+ "type": "reset",
387
+ "data": kwargs,
388
+ }
389
+ response = await self._send_and_receive(message)
390
+ return self._parse_result(response.get("data", {}))
391
+
392
+ async def step(self, action: ActT, **kwargs: Any) -> StepResult[ObsT]:
393
+ """
394
+ Execute an action in the environment.
395
+
396
+ Args:
397
+ action: The action to execute
398
+ **kwargs: Optional parameters (currently ignored)
399
+
400
+ Returns:
401
+ StepResult containing observation, reward, and done status
402
+ """
403
+ message = {
404
+ "type": "step",
405
+ "data": self._step_payload(action),
406
+ }
407
+ response = await self._send_and_receive(message)
408
+ return self._parse_result(response.get("data", {}))
409
+
410
+ async def state(self) -> StateT:
411
+ """
412
+ Get the current environment state from the server.
413
+
414
+ Returns:
415
+ State object with environment state information
416
+ """
417
+ message = {"type": "state"}
418
+ response = await self._send_and_receive(message)
419
+ return self._parse_state(response.get("data", {}))
420
+
421
+ async def close(self) -> None:
422
+ """
423
+ Close the WebSocket connection and clean up resources.
424
+
425
+ If this client was created via from_docker_image() or from_env(),
426
+ this will also stop and remove the associated container/process.
427
+ """
428
+ await self.disconnect()
429
+
430
+ if self._provider is not None:
431
+ # Handle both ContainerProvider and RuntimeProvider
432
+ if hasattr(self._provider, "stop_container"):
433
+ self._provider.stop_container()
434
+ elif hasattr(self._provider, "stop"):
435
+ self._provider.stop()
436
+
437
+ async def __aenter__(self) -> "EnvClient":
438
+ """Enter async context manager, ensuring connection is established."""
439
+ await self.connect()
440
+ return self
441
+
442
+ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
443
+ """Exit async context manager, closing connection."""
444
+ await self.close()
445
+
446
+ def __enter__(self) -> "EnvClient":
447
+ """Sync context manager entry - raises error suggesting async usage."""
448
+ raise TypeError(
449
+ "EnvClient is async by default. Use 'async with' instead of 'with', "
450
+ "or call .sync() to get a synchronous wrapper:\n"
451
+ " async with client: # async usage\n"
452
+ " with client.sync(): # sync wrapper"
453
+ )
454
+
455
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
456
+ """Sync context manager exit - should not be reached."""
457
+ pass # pragma: no cover
458
+
459
+ def sync(self) -> "SyncEnvClient":
460
+ """
461
+ Return a synchronous wrapper around this async client.
462
+
463
+ Use this method when you need synchronous access to the environment
464
+ without async/await syntax. This is useful for:
465
+ - Integration with synchronous codebases
466
+ - Interactive/REPL usage
467
+ - Stopping async from "infecting" the call stack
468
+
469
+ Returns:
470
+ SyncEnvClient wrapper that provides synchronous methods
471
+
472
+ Example:
473
+ >>> # Create async client and get sync wrapper
474
+ >>> async_client = GenericEnvClient(base_url="http://localhost:8000")
475
+ >>> sync_client = async_client.sync()
476
+ >>>
477
+ >>> # Use synchronous API
478
+ >>> with sync_client:
479
+ ... result = sync_client.reset()
480
+ ... result = sync_client.step({"code": "print('hello')"})
481
+ """
482
+ from .sync_client import SyncEnvClient
483
+
484
+ return SyncEnvClient(self)
src/core/env_server/__init__.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Core environment interfaces and types."""
8
+
9
+ from .base_transforms import CompositeTransform, NullTransform
10
+ from .exceptions import (
11
+ ConcurrencyConfigurationError,
12
+ EnvironmentFactoryError,
13
+ OpenEnvError,
14
+ SessionCapacityError,
15
+ SessionCreationError,
16
+ SessionNotFoundError,
17
+ )
18
+ from .http_server import create_app, create_fastapi_app, HTTPEnvServer
19
+ from .interfaces import Environment, Message, ModelTokenizer, Transform
20
+
21
+ try:
22
+ from .mcp_environment import MCPEnvironment
23
+ except ModuleNotFoundError:
24
+ MCPEnvironment = None # type: ignore[assignment]
25
+
26
+ from .mcp_types import (
27
+ CallToolAction,
28
+ CallToolObservation,
29
+ JsonRpcError,
30
+ # JSON-RPC types
31
+ JsonRpcErrorCode,
32
+ JsonRpcRequest,
33
+ JsonRpcResponse,
34
+ ListToolsAction,
35
+ ListToolsObservation,
36
+ McpMethod,
37
+ RESERVED_TOOL_NAMES,
38
+ Tool,
39
+ ToolError,
40
+ ToolErrorType,
41
+ WSMCPMessage,
42
+ WSMCPResponse,
43
+ )
44
+ from .route_config import GetEndpointConfig
45
+ from .serialization import (
46
+ deserialize_action,
47
+ deserialize_action_with_preprocessing,
48
+ serialize_observation,
49
+ )
50
+ from .types import (
51
+ Action,
52
+ BaseMessage,
53
+ ConcurrencyConfig,
54
+ HealthResponse,
55
+ HealthStatus,
56
+ Observation,
57
+ SchemaResponse,
58
+ ServerCapacityStatus,
59
+ ServerMode,
60
+ SessionInfo,
61
+ State,
62
+ WSCloseMessage,
63
+ WSErrorCode,
64
+ WSErrorResponse,
65
+ WSIncomingMessage,
66
+ WSObservationResponse,
67
+ WSResetMessage,
68
+ WSStateMessage,
69
+ WSStateResponse,
70
+ WSStepMessage,
71
+ )
72
+
73
+ try:
74
+ from .web_interface import create_web_interface_app, WebInterfaceManager
75
+ except ModuleNotFoundError:
76
+ create_web_interface_app = None # type: ignore[assignment]
77
+ WebInterfaceManager = None # type: ignore[assignment]
78
+
79
+ __all__ = [
80
+ # Core interfaces
81
+ "Environment",
82
+ "Transform",
83
+ "Message",
84
+ "ModelTokenizer",
85
+ # Types
86
+ "Action",
87
+ "Observation",
88
+ "State",
89
+ "SchemaResponse",
90
+ "HealthResponse",
91
+ # Enums
92
+ "HealthStatus",
93
+ "ServerMode",
94
+ "WSErrorCode",
95
+ # WebSocket message types
96
+ "BaseMessage",
97
+ "WSIncomingMessage",
98
+ "WSResetMessage",
99
+ "WSStepMessage",
100
+ "WSStateMessage",
101
+ "WSCloseMessage",
102
+ "WSObservationResponse",
103
+ "WSStateResponse",
104
+ "WSErrorResponse",
105
+ # Concurrency types
106
+ "ConcurrencyConfig",
107
+ "ServerCapacityStatus",
108
+ "SessionInfo",
109
+ # Exceptions
110
+ "OpenEnvError",
111
+ "ConcurrencyConfigurationError",
112
+ "SessionCapacityError",
113
+ "SessionNotFoundError",
114
+ "SessionCreationError",
115
+ "EnvironmentFactoryError",
116
+ # Base transforms
117
+ "CompositeTransform",
118
+ "NullTransform",
119
+ # HTTP Server
120
+ "HTTPEnvServer",
121
+ "create_app",
122
+ "create_fastapi_app",
123
+ # Web Interface
124
+ "create_web_interface_app",
125
+ "WebInterfaceManager",
126
+ # Serialization utilities
127
+ "deserialize_action",
128
+ "deserialize_action_with_preprocessing",
129
+ "serialize_observation",
130
+ # Route configuration
131
+ "GetEndpointConfig",
132
+ # MCP types
133
+ "Tool",
134
+ "ToolError",
135
+ "ToolErrorType",
136
+ "ListToolsAction",
137
+ "CallToolAction",
138
+ "ListToolsObservation",
139
+ "CallToolObservation",
140
+ "WSMCPMessage",
141
+ "WSMCPResponse",
142
+ "RESERVED_TOOL_NAMES",
143
+ "MCPEnvironment",
144
+ # JSON-RPC types
145
+ "JsonRpcErrorCode",
146
+ "JsonRpcError",
147
+ "JsonRpcRequest",
148
+ "JsonRpcResponse",
149
+ "McpMethod",
150
+ ]
src/core/env_server/base_transforms.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Base transform implementations for composing environment-specific transforms."""
8
+
9
+ from .interfaces import Transform
10
+ from .types import Observation
11
+
12
+
13
+ class CompositeTransform(Transform):
14
+ """Combines multiple transforms into a single transform."""
15
+
16
+ def __init__(self, transforms: list[Transform]):
17
+ self.transforms = transforms
18
+
19
+ def __call__(self, observation: Observation) -> Observation:
20
+ for transform in self.transforms:
21
+ observation = transform(observation)
22
+ return observation
23
+
24
+
25
+ class NullTransform(Transform):
26
+ """Default transform that passes through unchanged."""
27
+
28
+ def __call__(self, observation: Observation) -> Observation:
29
+ return observation
src/core/env_server/exceptions.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Custom exceptions for environment server operations."""
8
+
9
+ from typing import Optional
10
+
11
+
12
+ class OpenEnvError(Exception):
13
+ """Base exception for all OpenEnv errors."""
14
+
15
+ pass
16
+
17
+
18
+ class ConcurrencyConfigurationError(OpenEnvError):
19
+ """
20
+ Raised when an environment is misconfigured for concurrent sessions.
21
+
22
+ This error is raised during server startup when max_concurrent_envs > 1
23
+ is specified for an environment that is not marked as SUPPORTS_CONCURRENT_SESSIONS.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ environment_name: str,
29
+ max_concurrent_envs: int,
30
+ message: Optional[str] = None,
31
+ ):
32
+ self.environment_name = environment_name
33
+ self.max_concurrent_envs = max_concurrent_envs
34
+
35
+ if message is None:
36
+ message = (
37
+ f"Environment '{environment_name}' is not marked as SUPPORTS_CONCURRENT_SESSIONS. "
38
+ f"Cannot run with max_concurrent_envs={max_concurrent_envs}. "
39
+ f"Either set max_concurrent_envs=1 or ensure the environment "
40
+ f"properly isolates session state and set SUPPORTS_CONCURRENT_SESSIONS=True."
41
+ )
42
+
43
+ super().__init__(message)
44
+
45
+
46
+ class SessionCapacityError(OpenEnvError):
47
+ """
48
+ Raised when the server cannot accept new sessions due to capacity limits.
49
+
50
+ This error is raised when a new WebSocket connection is attempted but
51
+ the server has already reached max_concurrent_envs active sessions.
52
+ """
53
+
54
+ def __init__(
55
+ self,
56
+ active_sessions: int,
57
+ max_sessions: int,
58
+ message: Optional[str] = None,
59
+ ):
60
+ self.active_sessions = active_sessions
61
+ self.max_sessions = max_sessions
62
+
63
+ if message is None:
64
+ message = (
65
+ f"Server at capacity: {active_sessions}/{max_sessions} sessions active. "
66
+ f"Cannot accept new connections."
67
+ )
68
+
69
+ super().__init__(message)
70
+
71
+
72
+ class SessionNotFoundError(OpenEnvError):
73
+ """Raised when attempting to access a session that does not exist."""
74
+
75
+ def __init__(self, session_id: str, message: Optional[str] = None):
76
+ self.session_id = session_id
77
+
78
+ if message is None:
79
+ message = f"Session '{session_id}' not found."
80
+
81
+ super().__init__(message)
82
+
83
+
84
+ class SessionCreationError(OpenEnvError):
85
+ """Raised when a session cannot be created."""
86
+
87
+ def __init__(self, reason: str, message: Optional[str] = None):
88
+ self.reason = reason
89
+
90
+ if message is None:
91
+ message = f"Failed to create session: {reason}"
92
+
93
+ super().__init__(message)
94
+
95
+
96
+ class EnvironmentFactoryError(OpenEnvError):
97
+ """Raised when the environment factory fails to create an instance."""
98
+
99
+ def __init__(self, factory_name: str, message: Optional[str] = None):
100
+ self.factory_name = factory_name
101
+
102
+ if message is None:
103
+ message = f"Environment factory '{factory_name}' failed to create instance."
104
+
105
+ super().__init__(message)
src/core/env_server/gradio_theme.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Unified terminal-style theme for OpenEnv Gradio UI (light/dark)."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import gradio as gr
12
+
13
+ _MONO_FONTS = (
14
+ "JetBrains Mono",
15
+ "Fira Code",
16
+ "Cascadia Code",
17
+ "Consolas",
18
+ "ui-monospace",
19
+ "monospace",
20
+ )
21
+
22
+ _CORE_FONT = (
23
+ "Lato",
24
+ "Inter",
25
+ "Arial",
26
+ "Helvetica",
27
+ "sans-serif",
28
+ )
29
+
30
+ _ZERO_RADIUS = gr.themes.Size(
31
+ xxs="0px",
32
+ xs="0px",
33
+ sm="0px",
34
+ md="0px",
35
+ lg="0px",
36
+ xl="0px",
37
+ xxl="0px",
38
+ )
39
+
40
+ _GREEN_HUE = gr.themes.Color(
41
+ c50="#e6f4ea",
42
+ c100="#ceead6",
43
+ c200="#a8dab5",
44
+ c300="#6fcc8b",
45
+ c400="#3fb950",
46
+ c500="#238636",
47
+ c600="#1a7f37",
48
+ c700="#116329",
49
+ c800="#0a4620",
50
+ c900="#033a16",
51
+ c950="#04200d",
52
+ )
53
+
54
+ _NEUTRAL_HUE = gr.themes.Color(
55
+ c50="#f6f8fa",
56
+ c100="#eaeef2",
57
+ c200="#d0d7de",
58
+ c300="#afb8c1",
59
+ c400="#8c959f",
60
+ c500="#6e7781",
61
+ c600="#57606a",
62
+ c700="#424a53",
63
+ c800="#32383f",
64
+ c900="#24292f",
65
+ c950="#1b1f24",
66
+ )
67
+
68
+ OPENENV_GRADIO_THEME = gr.themes.Base(
69
+ primary_hue=_GREEN_HUE,
70
+ secondary_hue=_NEUTRAL_HUE,
71
+ neutral_hue=_NEUTRAL_HUE,
72
+ font=_CORE_FONT,
73
+ font_mono=_MONO_FONTS,
74
+ radius_size=_ZERO_RADIUS,
75
+ ).set(
76
+ body_background_fill="#ffffff",
77
+ background_fill_primary="#ffffff",
78
+ background_fill_secondary="#f6f8fa",
79
+ block_background_fill="#ffffff",
80
+ block_border_color="#ffffff",
81
+ block_label_text_color="#57606a",
82
+ block_title_text_color="#24292f",
83
+ border_color_primary="#d0d7de",
84
+ input_background_fill="#ffffff",
85
+ input_border_color="#d0d7de",
86
+ button_primary_background_fill="#1a7f37",
87
+ button_primary_background_fill_hover="#116329",
88
+ button_primary_text_color="#ffffff",
89
+ button_secondary_background_fill="#f6f8fa",
90
+ button_secondary_background_fill_hover="#eaeef2",
91
+ button_secondary_text_color="#24292f",
92
+ button_secondary_border_color="#d0d7de",
93
+ body_background_fill_dark="#0d1117",
94
+ background_fill_primary_dark="#0d1117",
95
+ background_fill_secondary_dark="#0d1117",
96
+ block_background_fill_dark="#0d1117",
97
+ block_border_color_dark="#0d1117",
98
+ block_label_text_color_dark="#8b949e",
99
+ block_title_text_color_dark="#c9d1d9",
100
+ border_color_primary_dark="#30363d",
101
+ input_background_fill_dark="#0d1117",
102
+ input_border_color_dark="#30363d",
103
+ button_primary_background_fill_dark="#30363d",
104
+ button_primary_background_fill_hover_dark="#484f58",
105
+ button_primary_text_color_dark="#c9d1d9",
106
+ button_secondary_background_fill_dark="#21262d",
107
+ button_secondary_background_fill_hover_dark="#30363d",
108
+ button_secondary_text_color_dark="#c9d1d9",
109
+ button_secondary_border_color_dark="#30363d",
110
+ )
111
+
112
+ OPENENV_GRADIO_CSS = """
113
+ * { border-radius: 0 !important; }
114
+ .col-left { padding: 16px !important; }
115
+ .col-right { padding: 16px !important; }
116
+ .prose, .markdown-text, .md,
117
+ .prose > *, .markdown-text > * {
118
+ background: transparent !important;
119
+ border: none !important;
120
+ box-shadow: none !important;
121
+ }
122
+ .dark .col-left {
123
+ border-left-color: rgba(139, 148, 158, 0.4) !important;
124
+ }
125
+ .dark .col-right {
126
+ border-left-color: rgba(201, 209, 217, 0.3) !important;
127
+ }
128
+ """
src/core/env_server/gradio_ui.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Gradio-based web UI for OpenEnv environments.
9
+
10
+ Replaces the legacy HTML/JavaScript interface when ENABLE_WEB_INTERFACE is set.
11
+ Mount at /web via gr.mount_gradio_app() from create_web_interface_app().
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import re
18
+ from typing import Any, Dict, List, Optional
19
+
20
+ import gradio as gr
21
+
22
+ from .types import EnvironmentMetadata
23
+
24
+
25
+ def _escape_md(text: str) -> str:
26
+ """Escape Markdown special characters in user-controlled content."""
27
+ return re.sub(r"([\\`*_\{\}\[\]()#+\-.!|~>])", r"\\\1", str(text))
28
+
29
+
30
+ def _format_observation(data: Dict[str, Any]) -> str:
31
+ """Format reset/step response for Markdown display."""
32
+ lines: List[str] = []
33
+ obs = data.get("observation", {})
34
+ if isinstance(obs, dict):
35
+ if obs.get("prompt"):
36
+ lines.append(f"**Prompt:**\n\n{_escape_md(obs['prompt'])}\n")
37
+ messages = obs.get("messages", [])
38
+ if messages:
39
+ lines.append("**Messages:**\n")
40
+ for msg in messages:
41
+ sender = _escape_md(str(msg.get("sender_id", "?")))
42
+ content = _escape_md(str(msg.get("content", "")))
43
+ cat = _escape_md(str(msg.get("category", "")))
44
+ lines.append(f"- `[{cat}]` Player {sender}: {content}")
45
+ lines.append("")
46
+ reward = data.get("reward")
47
+ done = data.get("done")
48
+ if reward is not None:
49
+ lines.append(f"**Reward:** `{reward}`")
50
+ if done is not None:
51
+ lines.append(f"**Done:** `{done}`")
52
+ return "\n".join(lines) if lines else "*No observation data*"
53
+
54
+
55
+ def _readme_section(metadata: Optional[EnvironmentMetadata]) -> str:
56
+ """README content for the left panel."""
57
+ if not metadata or not metadata.readme_content:
58
+ return "*No README available.*"
59
+ return metadata.readme_content
60
+
61
+
62
+ def get_gradio_display_title(
63
+ metadata: Optional[EnvironmentMetadata],
64
+ fallback: str = "OpenEnv Environment",
65
+ ) -> str:
66
+ """Return the title used for the Gradio app (browser tab and Blocks)."""
67
+ name = metadata.name if metadata else fallback
68
+ return f"OpenEnv Agentic Environment: {name}"
69
+
70
+
71
+ def build_gradio_app(
72
+ web_manager: Any,
73
+ action_fields: List[Dict[str, Any]],
74
+ metadata: Optional[EnvironmentMetadata],
75
+ is_chat_env: bool,
76
+ title: str = "OpenEnv Environment",
77
+ quick_start_md: Optional[str] = None,
78
+ ) -> gr.Blocks:
79
+ """
80
+ Build a Gradio Blocks app for the OpenEnv web interface.
81
+
82
+ Args:
83
+ web_manager: WebInterfaceManager (reset/step_environment, get_state).
84
+ action_fields: Field dicts from _extract_action_fields(action_cls).
85
+ metadata: Environment metadata for README/name.
86
+ is_chat_env: If True, single message textbox; else form from action_fields.
87
+ title: App title (overridden by metadata.name when present; see get_gradio_display_title).
88
+ quick_start_md: Optional Quick Start markdown (class names already replaced).
89
+
90
+ Returns:
91
+ gr.Blocks to mount with gr.mount_gradio_app(app, blocks, path="/web").
92
+ """
93
+ readme_content = _readme_section(metadata)
94
+ display_title = get_gradio_display_title(metadata, fallback=title)
95
+
96
+ async def reset_env():
97
+ try:
98
+ data = await web_manager.reset_environment()
99
+ obs_md = _format_observation(data)
100
+ return (
101
+ obs_md,
102
+ json.dumps(data, indent=2),
103
+ "Environment reset successfully.",
104
+ )
105
+ except Exception as e:
106
+ return ("", "", f"Error: {e}")
107
+
108
+ def _step_with_action(action_data: Dict[str, Any]):
109
+ async def _run():
110
+ try:
111
+ data = await web_manager.step_environment(action_data)
112
+ obs_md = _format_observation(data)
113
+ return (
114
+ obs_md,
115
+ json.dumps(data, indent=2),
116
+ "Step complete.",
117
+ )
118
+ except Exception as e:
119
+ return ("", "", f"Error: {e}")
120
+
121
+ return _run
122
+
123
+ async def step_chat(message: str):
124
+ if not (message or str(message).strip()):
125
+ return ("", "", "Please enter an action message.")
126
+ action = {"message": str(message).strip()}
127
+ return await _step_with_action(action)()
128
+
129
+ def get_state_sync():
130
+ try:
131
+ data = web_manager.get_state()
132
+ return json.dumps(data, indent=2)
133
+ except Exception as e:
134
+ return f"Error: {e}"
135
+
136
+ with gr.Blocks(title=display_title) as demo:
137
+ with gr.Row():
138
+ with gr.Column(scale=1, elem_classes="col-left"):
139
+ if quick_start_md:
140
+ with gr.Accordion("Quick Start", open=True):
141
+ gr.Markdown(quick_start_md)
142
+ with gr.Accordion("README", open=False):
143
+ gr.Markdown(readme_content)
144
+
145
+ with gr.Column(scale=2, elem_classes="col-right"):
146
+ obs_display = gr.Markdown(
147
+ value=("# Playground\n\nClick **Reset** to start a new episode."),
148
+ )
149
+ with gr.Group():
150
+ if is_chat_env:
151
+ action_input = gr.Textbox(
152
+ label="Action message",
153
+ placeholder="e.g. Enter your message...",
154
+ )
155
+ step_inputs = [action_input]
156
+ step_fn = step_chat
157
+ else:
158
+ step_inputs = []
159
+ for field in action_fields:
160
+ name = field["name"]
161
+ field_type = field.get("type", "text")
162
+ label = name.replace("_", " ").title()
163
+ placeholder = field.get("placeholder", "")
164
+ if field_type == "checkbox":
165
+ inp = gr.Checkbox(label=label)
166
+ elif field_type == "number":
167
+ inp = gr.Number(label=label)
168
+ elif field_type == "select":
169
+ choices = field.get("choices") or []
170
+ inp = gr.Dropdown(
171
+ choices=choices,
172
+ label=label,
173
+ allow_custom_value=False,
174
+ )
175
+ elif field_type in ("textarea", "tensor"):
176
+ inp = gr.Textbox(
177
+ label=label,
178
+ placeholder=placeholder,
179
+ lines=3,
180
+ )
181
+ else:
182
+ inp = gr.Textbox(
183
+ label=label,
184
+ placeholder=placeholder,
185
+ )
186
+ step_inputs.append(inp)
187
+
188
+ async def step_form(*values):
189
+ if not action_fields:
190
+ return await _step_with_action({})()
191
+ action_data = {}
192
+ for i, field in enumerate(action_fields):
193
+ if i >= len(values):
194
+ break
195
+ name = field["name"]
196
+ val = values[i]
197
+ if field.get("type") == "checkbox":
198
+ action_data[name] = bool(val)
199
+ elif val is not None and val != "":
200
+ action_data[name] = val
201
+ return await _step_with_action(action_data)()
202
+
203
+ step_fn = step_form
204
+
205
+ with gr.Row():
206
+ step_btn = gr.Button("Step", variant="primary")
207
+ reset_btn = gr.Button("Reset", variant="secondary")
208
+ state_btn = gr.Button("Get state", variant="secondary")
209
+ with gr.Row():
210
+ status = gr.Textbox(
211
+ label="Status",
212
+ interactive=False,
213
+ )
214
+ raw_json = gr.Code(
215
+ label="Raw JSON response",
216
+ language="json",
217
+ interactive=False,
218
+ )
219
+
220
+ reset_btn.click(
221
+ fn=reset_env,
222
+ outputs=[obs_display, raw_json, status],
223
+ )
224
+ step_btn.click(
225
+ fn=step_fn,
226
+ inputs=step_inputs,
227
+ outputs=[obs_display, raw_json, status],
228
+ )
229
+ if is_chat_env:
230
+ action_input.submit(
231
+ fn=step_fn,
232
+ inputs=step_inputs,
233
+ outputs=[obs_display, raw_json, status],
234
+ )
235
+ state_btn.click(
236
+ fn=get_state_sync,
237
+ outputs=[raw_json],
238
+ )
239
+
240
+ return demo
src/core/env_server/http_server.py ADDED
@@ -0,0 +1,1646 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ HTTP server wrapper for Environment instances.
9
+
10
+ This module provides utilities to wrap any Environment subclass and expose it
11
+ over HTTP and WebSocket endpoints that EnvClient can consume.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import asyncio
17
+ import inspect
18
+ import json
19
+ import logging
20
+ import os
21
+ import time
22
+ import uuid
23
+ from concurrent.futures import ThreadPoolExecutor
24
+ from contextlib import AsyncExitStack
25
+ from typing import Any, AsyncContextManager, Callable, cast, Dict, Optional, Type
26
+
27
+ _MISSING = object()
28
+
29
+ from fastapi import (
30
+ Body,
31
+ FastAPI,
32
+ HTTPException,
33
+ Request,
34
+ status,
35
+ WebSocket,
36
+ WebSocketDisconnect,
37
+ )
38
+ from pydantic import ValidationError
39
+
40
+ from .interfaces import Environment
41
+ from .mcp_environment import get_server_tools
42
+ from .mcp_types import (
43
+ JsonRpcErrorCode,
44
+ JsonRpcRequest,
45
+ JsonRpcResponse,
46
+ McpMethod,
47
+ WSMCPMessage,
48
+ WSMCPResponse,
49
+ )
50
+ from .route_config import GetEndpointConfig, register_get_endpoints
51
+ from .serialization import deserialize_action, serialize_observation
52
+ from .types import (
53
+ Action,
54
+ ConcurrencyConfig,
55
+ EnvironmentMetadata,
56
+ HealthResponse,
57
+ HealthStatus,
58
+ Observation,
59
+ ResetRequest,
60
+ ResetResponse,
61
+ SchemaResponse,
62
+ ServerCapacityStatus,
63
+ ServerMode,
64
+ SessionInfo,
65
+ State,
66
+ StepRequest,
67
+ StepResponse,
68
+ WSCloseMessage,
69
+ WSErrorCode,
70
+ WSErrorResponse,
71
+ WSObservationResponse,
72
+ WSResetMessage,
73
+ WSStateMessage,
74
+ WSStateResponse,
75
+ WSStepMessage,
76
+ )
77
+
78
+
79
+ def _make_json_serializable(obj: Any) -> Any:
80
+ """
81
+ Convert an object to a JSON-serializable form.
82
+
83
+ Handles Pydantic models, dataclasses, and other common types.
84
+
85
+ Args:
86
+ obj: The object to convert
87
+
88
+ Returns:
89
+ A JSON-serializable representation of the object
90
+ """
91
+ if obj is None:
92
+ return None
93
+ if isinstance(obj, (str, int, float, bool)):
94
+ return obj
95
+ if isinstance(obj, (list, tuple)):
96
+ return [_make_json_serializable(item) for item in obj]
97
+ if isinstance(obj, dict):
98
+ return {k: _make_json_serializable(v) for k, v in obj.items()}
99
+ if hasattr(obj, "model_dump"):
100
+ # Pydantic model
101
+ return obj.model_dump()
102
+ if hasattr(obj, "__dict__"):
103
+ # Object with __dict__
104
+ return {k: _make_json_serializable(v) for k, v in obj.__dict__.items()}
105
+ # Fallback to string representation
106
+ return str(obj)
107
+
108
+
109
+ from .exceptions import (
110
+ ConcurrencyConfigurationError,
111
+ EnvironmentFactoryError,
112
+ SessionCapacityError,
113
+ )
114
+
115
+
116
+ class HTTPEnvServer:
117
+ """
118
+ HTTP server wrapper for Environment instances.
119
+
120
+ This class wraps an Environment and exposes its reset(), step(), and state
121
+ methods as HTTP and WebSocket endpoints compatible with EnvClient.
122
+
123
+ The server expects:
124
+ - Action deserialization: Converts JSON dict to Action subclass
125
+ - Observation serialization: Converts Observation subclass to JSON dict
126
+
127
+ Example:
128
+ >>> from core.env_server import HTTPEnvServer
129
+ >>> from envs.coding_env.server import CodeExecutionEnvironment
130
+ >>> from envs.coding_env.models import CodeAction, CodeObservation
131
+ >>>
132
+ >>> # Pass environment class (factory pattern)
133
+ >>> server = HTTPEnvServer(
134
+ ... env=CodeExecutionEnvironment,
135
+ ... action_cls=CodeAction,
136
+ ... observation_cls=CodeObservation,
137
+ ... max_concurrent_envs=4,
138
+ ... )
139
+ >>>
140
+ >>> # Register routes with FastAPI
141
+ >>> from fastapi import FastAPI
142
+ >>> app = FastAPI()
143
+ >>> server.register_routes(app)
144
+ """
145
+
146
    def __init__(
        self,
        env: Callable[[], Environment],
        action_cls: Type[Action],
        observation_cls: Type[Observation],
        max_concurrent_envs: Optional[int] = None,
        concurrency_config: Optional[ConcurrencyConfig] = None,
    ):
        """
        Initialize HTTP server wrapper.

        Args:
            env: Environment factory (callable) that creates new instances.
                Will be called to create a new environment for each WebSocket session.
            action_cls: The Action subclass this environment expects
            observation_cls: The Observation subclass this environment returns
            max_concurrent_envs: Maximum number of concurrent WebSocket sessions.
                Mutually exclusive with concurrency_config.
            concurrency_config: Optional ConcurrencyConfig for advanced concurrency settings.
                Mutually exclusive with max_concurrent_envs.

        Raises:
            TypeError: If env is not callable (e.g. an instance was passed).
            ValueError: If both max_concurrent_envs and concurrency_config are provided.
            ConcurrencyConfigurationError: If max_concurrent_envs > 1 for an
                environment that is not marked as SUPPORTS_CONCURRENT_SESSIONS.
        """
        # Validate that env is callable
        if not callable(env):
            raise TypeError(
                f"env must be a callable (class or factory function), got {type(env)}. "
                f"Pass the environment class (e.g., MyEnvironment) not an instance (e.g., MyEnvironment())."
            )

        self._env_factory: Callable[[], Environment] = env

        # Handle concurrency configuration: explicit config wins, then the
        # simple max_concurrent_envs knob, then a single-session default.
        if max_concurrent_envs is not None and concurrency_config is not None:
            raise ValueError(
                "Cannot specify both 'max_concurrent_envs' and 'concurrency_config'. "
                "Please use only one method to configure concurrency."
            )

        if concurrency_config is not None:
            self._concurrency_config = concurrency_config
        elif max_concurrent_envs is not None:
            self._concurrency_config = ConcurrencyConfig(
                max_concurrent_envs=max_concurrent_envs,
                session_timeout=None,
            )
        else:
            # Default configuration
            self._concurrency_config = ConcurrencyConfig(
                max_concurrent_envs=1,
                session_timeout=None,
            )

        self._max_concurrent_envs = self._concurrency_config.max_concurrent_envs

        # Validate concurrency configuration (raises if the environment class
        # does not opt in to concurrent sessions but more than one was requested)
        self._validate_concurrency_safety()

        self.action_cls = action_cls
        self.observation_cls = observation_cls

        # Session management for WebSocket connections.
        # A `None` value in _sessions marks a reserved slot whose environment
        # is still being created (see _create_session).
        self._sessions: Dict[str, Optional[Environment]] = {}
        self._session_executors: Dict[str, ThreadPoolExecutor] = {}
        self._session_stacks: Dict[str, AsyncExitStack] = {}
        self._session_info: Dict[str, SessionInfo] = {}
        self._session_lock = asyncio.Lock()

        # Create thread pool for running sync code in async context
        # This is needed for environments using sync libraries (e.g., Playwright)
        self._executor = ThreadPoolExecutor(max_workers=32)

        # Idle session reaper configuration.
        # Timeout is taken from ConcurrencyConfig.session_timeout;
        # None means no timeout (default — reaper is a no-op).
        self._session_idle_timeout_s: Optional[float] = (
            self._concurrency_config.session_timeout
        )
        self._reaper_task: Optional[asyncio.Task[None]] = None
228
+
229
+ def _validate_concurrency_safety(self) -> None:
230
+ """
231
+ Validate that the environment supports the configured concurrency level.
232
+
233
+ Raises:
234
+ ConcurrencyConfigurationError: If max_concurrent_envs > 1 for an
235
+ environment that is not marked as SUPPORTS_CONCURRENT_SESSIONS.
236
+ """
237
+ if self._max_concurrent_envs <= 1:
238
+ return
239
+
240
+ if inspect.isclass(self._env_factory):
241
+ env_cls = self._env_factory
242
+ else:
243
+ _temp_env = self._env_factory()
244
+ env_cls = type(_temp_env)
245
+ _temp_env.close()
246
+ del _temp_env
247
+
248
+ if not getattr(env_cls, "SUPPORTS_CONCURRENT_SESSIONS", False):
249
+ raise ConcurrencyConfigurationError(
250
+ environment_name=env_cls.__name__,
251
+ max_concurrent_envs=self._max_concurrent_envs,
252
+ )
253
+
254
+ def get_capacity_status(self) -> ServerCapacityStatus:
255
+ """
256
+ Get the current capacity status of the server.
257
+
258
+ Returns:
259
+ ServerCapacityStatus with current session counts and availability.
260
+ """
261
+ return ServerCapacityStatus.from_counts(
262
+ active=len(self._sessions),
263
+ max_sessions=self._max_concurrent_envs,
264
+ )
265
+
266
+ async def _run_sync_in_thread_pool(
267
+ self, func: Callable[..., Observation], *args, **kwargs
268
+ ) -> Observation:
269
+ """Run a synchronous function in the thread pool executor."""
270
+ loop = asyncio.get_event_loop()
271
+ return await loop.run_in_executor(self._executor, lambda: func(*args, **kwargs))
272
+
273
+ def _get_valid_kwargs(
274
+ self,
275
+ sig: inspect.Signature,
276
+ kwargs: Dict[str, Any],
277
+ skip_params: Optional[set[str]] = None,
278
+ ) -> Dict[str, Any]:
279
+ """Filter kwargs to only include parameters accepted by the function signature."""
280
+ if skip_params is None:
281
+ skip_params = set()
282
+
283
+ valid_kwargs = {}
284
+
285
+ has_kwargs = any(
286
+ p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
287
+ )
288
+
289
+ for k, v in kwargs.items():
290
+ if k in sig.parameters or has_kwargs:
291
+ if k not in skip_params:
292
+ valid_kwargs[k] = v
293
+
294
+ return valid_kwargs
295
+
296
    async def _create_session(self) -> tuple[str, Environment]:
        """
        Create a new WebSocket session with its own environment instance.

        The slot is reserved (with a ``None`` placeholder) under the lock
        before the environment is constructed, so capacity cannot be exceeded
        while the potentially slow factory call runs outside the lock.

        Returns:
            Tuple of (session_id, environment)

        Raises:
            SessionCapacityError: If max concurrent sessions reached
            EnvironmentFactoryError: If the factory fails to create an environment
        """
        async with self._session_lock:
            if len(self._sessions) >= self._max_concurrent_envs:
                raise SessionCapacityError(
                    active_sessions=len(self._sessions),
                    max_sessions=self._max_concurrent_envs,
                )

            session_id = str(uuid.uuid4())
            current_time = time.time()

            # Create executor and reserve slot so capacity is not exceeded while
            # we create the env outside the lock (avoids blocking other sessions)
            executor = ThreadPoolExecutor(max_workers=1)
            self._session_executors[session_id] = executor
            self._sessions[session_id] = None  # placeholder until env is ready

        try:
            # Create environment in the executor thread (outside lock).
            # The same single-thread executor later runs step/close, which
            # matters for thread-affine libraries (e.g. Playwright).
            loop = asyncio.get_event_loop()
            env = await loop.run_in_executor(executor, self._env_factory)
        except Exception as e:
            # Roll back the reservation so the failed slot does not count
            # against capacity forever.
            async with self._session_lock:
                executor.shutdown(wait=False)
                self._session_executors.pop(session_id, None)
                self._sessions.pop(session_id, None)
                factory_name = getattr(
                    self._env_factory, "__name__", str(self._env_factory)
                )
                raise EnvironmentFactoryError(factory_name) from e

        # Hold the MCP session open for the lifetime of this session,
        # matching the WebSocket path's AsyncExitStack pattern. This
        # prevents per-request MCP transport teardown/reconnection and
        # preserves FastMCP session state (ctx.set_state / ctx.get_state)
        # across HTTP calls within the same OpenEnv session.
        stack = AsyncExitStack()
        try:
            mcp_session_factory = getattr(env, "mcp_session", None)
            if callable(mcp_session_factory):
                mcp_session_cm = cast(AsyncContextManager[Any], mcp_session_factory())
                await stack.enter_async_context(mcp_session_cm)
        except Exception:
            # MCP transport failed to start — clean up the reserved slot,
            # the env, and the executor so they don't leak permanently
            # against _max_concurrent_envs.
            await stack.aclose()  # best-effort
            async with self._session_lock:
                self._sessions.pop(session_id, None)
                self._session_executors.pop(session_id, None)
                self._session_info.pop(session_id, None)
            await self._cleanup_session_resources(env, executor)
            raise

        # Publish the fully-initialized session under the lock.
        async with self._session_lock:
            self._sessions[session_id] = env
            self._session_stacks[session_id] = stack
            now = time.time()
            self._session_info[session_id] = SessionInfo(
                session_id=session_id,
                created_at=current_time,
                last_activity_at=now,
                step_count=0,
                environment_type=type(env).__name__,
            )

        return session_id, env
373
+
374
+ async def _destroy_session(self, session_id: str) -> None:
375
+ """
376
+ Destroy a WebSocket session and cleanup resources.
377
+
378
+ Args:
379
+ session_id: The session ID to destroy
380
+ """
381
+ async with self._session_lock:
382
+ env = self._sessions.pop(session_id, None)
383
+ executor = self._session_executors.pop(session_id, None)
384
+ stack = self._session_stacks.pop(session_id, None)
385
+ self._session_info.pop(session_id, None)
386
+
387
+ await self._cleanup_session_resources(env, executor, stack)
388
+
389
+ async def _cleanup_session_resources(
390
+ self,
391
+ env: Optional[Environment],
392
+ executor: Optional[ThreadPoolExecutor],
393
+ stack: Optional[AsyncExitStack] = None,
394
+ ) -> None:
395
+ """Close an environment and shut down its executor (best-effort)."""
396
+ # Close the MCP session stack first — this gracefully exits the
397
+ # mcp_session() context (and the underlying FastMCP Client session)
398
+ # before we tear down the environment references.
399
+ if stack is not None:
400
+ try:
401
+ await stack.aclose()
402
+ except Exception:
403
+ pass # Best effort cleanup
404
+
405
+ # Run close() in the same executor where the env was created
406
+ # This is required for thread-sensitive libraries like Playwright/greenlet
407
+ if env is not None:
408
+ if executor is not None:
409
+ try:
410
+ loop = asyncio.get_event_loop()
411
+ await loop.run_in_executor(executor, env.close)
412
+ except Exception:
413
+ # If executor close fails, try direct close as fallback
414
+ try:
415
+ env.close()
416
+ except Exception:
417
+ pass # Best effort cleanup
418
+ else:
419
+ try:
420
+ env.close()
421
+ except Exception:
422
+ pass # Best effort cleanup
423
+
424
+ # Shutdown executor after close is done
425
+ if executor is not None:
426
+ executor.shutdown(wait=False)
427
+
428
+ def _update_session_activity(
429
+ self, session_id: str, increment_step: bool = False
430
+ ) -> None:
431
+ """
432
+ Update session activity timestamp and optionally increment step count.
433
+
434
+ Args:
435
+ session_id: The session ID to update
436
+ increment_step: If True, increment the step count
437
+ """
438
+ if session_id in self._session_info:
439
+ self._session_info[session_id].last_activity_at = time.time()
440
+ if increment_step:
441
+ self._session_info[session_id].step_count += 1
442
+
443
    async def _reap_idle_sessions(self) -> None:
        """Background task that periodically destroys sessions idle beyond the timeout."""
        timeout = self._session_idle_timeout_s
        if timeout is None:
            return  # no timeout configured — noop
        # Poll at a quarter of the timeout, but never more often than every
        # 5 seconds, so short timeouts don't spin the loop.
        interval = max(timeout / 4, 5.0)  # check frequently enough
        while True:
            try:
                await asyncio.sleep(interval)
                now = time.time()
                stale_ids: list[str] = []
                # Snapshot candidate ids under the lock; actual destruction
                # happens outside this pass so the lock is not held long.
                async with self._session_lock:
                    for sid, info in self._session_info.items():
                        if now - info.last_activity_at > timeout:
                            stale_ids.append(sid)
                for sid in stale_ids:
                    # Re-check under lock: activity may have arrived since
                    # the snapshot was taken, making this session active again.
                    # Refresh `now` so slow _destroy_session calls don't cause
                    # subsequent entries to be validated against a stale clock.
                    now = time.time()
                    async with self._session_lock:
                        info = self._session_info.get(sid)
                        if info is None or (now - info.last_activity_at) <= timeout:
                            continue
                    # Destroy outside the lock (it re-acquires internally).
                    await self._destroy_session(sid)
            except asyncio.CancelledError:
                break
            except Exception as exc:
                # Keep the reaper alive on unexpected errors; it retries on
                # the next interval.
                logging.getLogger(__name__).warning(
                    "Idle-session reaper encountered an error (will retry): %s",
                    exc,
                )
476
+
477
+ def _start_reaper(self) -> None:
478
+ """Start the idle-session reaper if a timeout is configured."""
479
+ if self._session_idle_timeout_s is not None and self._reaper_task is None:
480
+ self._reaper_task = asyncio.create_task(self._reap_idle_sessions())
481
+
482
+ def _stop_reaper(self) -> None:
483
+ """Cancel the reaper background task."""
484
+ if self._reaper_task is not None:
485
+ self._reaper_task.cancel()
486
+ self._reaper_task = None
487
+
488
+ def get_session_info(self, session_id: str) -> Optional[SessionInfo]:
489
+ """
490
+ Get information about a specific session.
491
+
492
+ Args:
493
+ session_id: The session ID to query
494
+
495
+ Returns:
496
+ SessionInfo if the session exists, None otherwise
497
+ """
498
+ return self._session_info.get(session_id)
499
+
500
+ async def _run_in_session_executor(
501
+ self, session_id: str, func: Callable[..., Observation], *args, **kwargs
502
+ ) -> Observation:
503
+ """Run a synchronous function in the session's thread pool executor."""
504
+ executor = self._session_executors.get(session_id, self._executor)
505
+ loop = asyncio.get_event_loop()
506
+ return await loop.run_in_executor(executor, lambda: func(*args, **kwargs))
507
+
508
    @property
    def active_sessions(self) -> int:
        """Return the number of active WebSocket sessions (including slots
        reserved for sessions whose environment is still being created)."""
        return len(self._sessions)
512
+
513
    @property
    def max_concurrent_envs(self) -> int:
        """Return the maximum number of concurrent environments
        (taken from the server's ConcurrencyConfig)."""
        return self._max_concurrent_envs
517
+
518
+ @property
519
+ def is_concurrency_safe(self) -> bool:
520
+ """Return whether the environment is marked as concurrency safe."""
521
+ import inspect
522
+
523
+ if inspect.isclass(self._env_factory):
524
+ return getattr(self._env_factory, "SUPPORTS_CONCURRENT_SESSIONS", False)
525
+ else:
526
+ _temp_env = self._env_factory()
527
+ result = getattr(_temp_env, "SUPPORTS_CONCURRENT_SESSIONS", False)
528
+ _temp_env.close()
529
+ del _temp_env
530
+ return result
531
+
532
    @property
    def concurrency_config(self) -> ConcurrencyConfig:
        """Return the concurrency configuration resolved at construction time
        (from the explicit config, max_concurrent_envs, or the default)."""
        return self._concurrency_config
536
+
537
+ def register_routes(
538
+ self, app: FastAPI, mode: ServerMode | str = ServerMode.SIMULATION
539
+ ) -> None:
540
+ """
541
+ Register HTTP routes on a FastAPI application.
542
+
543
+ Args:
544
+ app: FastAPI application instance
545
+ mode: Server mode - either SIMULATION or PRODUCTION (or string equivalents).
546
+ In production mode, simulation control endpoints (/reset, /step, /state)
547
+ are NOT registered. Only safe endpoints (/health, /schema, /metadata, /ws)
548
+ are available. Defaults to SIMULATION for backwards compatibility.
549
+
550
+ Raises:
551
+ ValueError: If mode is not a valid ServerMode or string equivalent.
552
+ """
553
+ # Convert string to ServerMode enum for backwards compatibility
554
+ if isinstance(mode, str):
555
+ try:
556
+ mode = ServerMode(mode.lower())
557
+ except ValueError:
558
+ valid_modes = [m.value for m in ServerMode]
559
+ raise ValueError(
560
+ f"Invalid mode: '{mode}'. Must be one of: {valid_modes}"
561
+ )
562
+
563
+ # Wire up idle-session reaper lifecycle via app events
564
+ server_ref = self
565
+
566
+ async def _start_session_reaper() -> None:
567
+ server_ref._start_reaper()
568
+
569
+ async def _stop_session_reaper() -> None:
570
+ server_ref._stop_reaper()
571
+
572
+ if not getattr(app.router, "_openenv_reaper_registered", False):
573
+ app.router.on_startup.append(_start_session_reaper)
574
+ app.router.on_shutdown.append(_stop_session_reaper)
575
+ app.router._openenv_reaper_registered = True # type: ignore[attr-defined]
576
+
577
+ # Helper function to handle reset endpoint
578
+ async def reset_handler(
579
+ request: ResetRequest = Body(default_factory=ResetRequest),
580
+ ) -> ResetResponse:
581
+ """Reset endpoint - returns initial observation."""
582
+ _env = self._env_factory()
583
+
584
+ try:
585
+ kwargs = request.model_dump(exclude_unset=True)
586
+
587
+ is_async = _env.reset_async.__func__ is not Environment.reset_async
588
+
589
+ if is_async:
590
+ sig = inspect.signature(_env.reset_async)
591
+ else:
592
+ sig = inspect.signature(_env.reset)
593
+ valid_kwargs = self._get_valid_kwargs(sig, kwargs)
594
+
595
+ if is_async:
596
+ observation = await _env.reset_async(**valid_kwargs)
597
+ else:
598
+ observation = await self._run_sync_in_thread_pool(
599
+ _env.reset, **valid_kwargs
600
+ )
601
+ return ResetResponse(**serialize_observation(observation))
602
+ finally:
603
+ _env.close()
604
+
605
+ # Helper function to handle step endpoint
606
+ async def step_handler(request: StepRequest) -> StepResponse:
607
+ """Step endpoint - executes action and returns observation."""
608
+ action_data = request.action
609
+
610
+ try:
611
+ action = deserialize_action(action_data, self.action_cls)
612
+ except ValidationError as e:
613
+ raise HTTPException(
614
+ status_code=status.HTTP_422_UNPROCESSABLE_CONTENT, detail=e.errors()
615
+ )
616
+
617
+ _env = self._env_factory()
618
+
619
+ try:
620
+ kwargs = request.model_dump(exclude_unset=True, exclude={"action"})
621
+
622
+ is_async = _env.step_async.__func__ is not Environment.step_async
623
+
624
+ if is_async:
625
+ sig = inspect.signature(_env.step_async)
626
+ else:
627
+ sig = inspect.signature(_env.step)
628
+ valid_kwargs = self._get_valid_kwargs(
629
+ sig, kwargs, skip_params={"action"}
630
+ )
631
+
632
+ if is_async:
633
+ observation = await _env.step_async(action, **valid_kwargs)
634
+ else:
635
+ observation = await self._run_sync_in_thread_pool(
636
+ _env.step, action, **valid_kwargs
637
+ )
638
+
639
+ return StepResponse(**serialize_observation(observation))
640
+ finally:
641
+ _env.close()
642
+
643
+ # Helper function to handle MCP endpoint
644
+ async def mcp_handler(
645
+ request: JsonRpcRequest,
646
+ session_env: Optional[Environment] = None,
647
+ session_id: Optional[str] = None,
648
+ ) -> JsonRpcResponse:
649
+ """
650
+ Handle MCP JSON-RPC requests.
651
+
652
+ Supports tools/list and tools/call methods in JSON-RPC 2.0 format,
653
+ plus OpenEnv session lifecycle methods for HTTP MCP:
654
+ - openenv/session/create
655
+ - openenv/session/close
656
+ """
657
+ method = request.method
658
+ request_id = request.id
659
+ params = request.params
660
+ if not isinstance(params, dict):
661
+ return JsonRpcResponse.error_response(
662
+ JsonRpcErrorCode.INVALID_PARAMS,
663
+ "Params must be an object",
664
+ request_id=request_id,
665
+ )
666
+
667
+ # OpenEnv extension methods for explicit MCP session management.
668
+ # This enables persistent MCP lifecycles over HTTP /mcp, matching WebSocket semantics.
669
+ if method == "openenv/session/create":
670
+ if session_env is not None and session_id is not None:
671
+ return JsonRpcResponse.success(
672
+ result={"session_id": session_id},
673
+ request_id=request_id,
674
+ )
675
+ try:
676
+ created_session_id, _ = await self._create_session()
677
+ except SessionCapacityError as e:
678
+ return JsonRpcResponse.error_response(
679
+ JsonRpcErrorCode.SERVER_ERROR,
680
+ str(e),
681
+ request_id=request_id,
682
+ data={
683
+ "active_sessions": e.active_sessions,
684
+ "max_sessions": e.max_sessions,
685
+ },
686
+ )
687
+ except EnvironmentFactoryError as e:
688
+ return JsonRpcResponse.error_response(
689
+ JsonRpcErrorCode.SERVER_ERROR,
690
+ str(e),
691
+ request_id=request_id,
692
+ data={"factory_name": e.factory_name},
693
+ )
694
+ return JsonRpcResponse.success(
695
+ result={"session_id": created_session_id},
696
+ request_id=request_id,
697
+ )
698
+
699
+ if method == "openenv/session/close":
700
+ target_session_id = params.get("session_id")
701
+ if not target_session_id:
702
+ return JsonRpcResponse.error_response(
703
+ JsonRpcErrorCode.INVALID_PARAMS,
704
+ "Invalid params - 'session_id' is required",
705
+ request_id=request_id,
706
+ )
707
+
708
+ if session_id is not None and target_session_id == session_id:
709
+ return JsonRpcResponse.error_response(
710
+ JsonRpcErrorCode.INVALID_REQUEST,
711
+ "Cannot close active WebSocket-managed session via MCP method",
712
+ request_id=request_id,
713
+ )
714
+
715
+ async with self._session_lock:
716
+ env = self._sessions.pop(target_session_id, _MISSING)
717
+ if env is not _MISSING:
718
+ executor = self._session_executors.pop(target_session_id, None)
719
+ stack = self._session_stacks.pop(target_session_id, None)
720
+ self._session_info.pop(target_session_id, None)
721
+ else:
722
+ executor = None
723
+ stack = None
724
+
725
+ if env is _MISSING:
726
+ return JsonRpcResponse.error_response(
727
+ JsonRpcErrorCode.INVALID_PARAMS,
728
+ f"Unknown session_id: {target_session_id}",
729
+ request_id=request_id,
730
+ )
731
+
732
+ if env is None:
733
+ # Session slot reserved but env factory still running;
734
+ # re-insert the placeholder AND the executor so
735
+ # _create_session can finish and the executor remains
736
+ # tracked for eventual shutdown.
737
+ async with self._session_lock:
738
+ self._sessions[target_session_id] = None
739
+ if executor is not None:
740
+ self._session_executors[target_session_id] = executor
741
+ return JsonRpcResponse.error_response(
742
+ JsonRpcErrorCode.INVALID_REQUEST,
743
+ f"Session {target_session_id} is still initializing; retry shortly",
744
+ request_id=request_id,
745
+ )
746
+
747
+ # env/executor/stack cleanup outside the lock
748
+ await self._cleanup_session_resources(env, executor, stack)
749
+ return JsonRpcResponse.success(
750
+ result={"session_id": target_session_id, "closed": True},
751
+ request_id=request_id,
752
+ )
753
+
754
+ requested_session_id = params.get("session_id")
755
+ managed_session_id = session_id
756
+
757
+ # Use provided session environment or create temporary one
758
+ if session_env is not None:
759
+ _env = session_env
760
+ should_close = False
761
+ elif requested_session_id:
762
+ async with self._session_lock:
763
+ _env = self._sessions.get(requested_session_id, _MISSING)
764
+
765
+ if _env is _MISSING:
766
+ return JsonRpcResponse.error_response(
767
+ JsonRpcErrorCode.INVALID_PARAMS,
768
+ f"Unknown session_id: {requested_session_id}",
769
+ request_id=request_id,
770
+ )
771
+
772
+ if _env is None:
773
+ return JsonRpcResponse.error_response(
774
+ JsonRpcErrorCode.INVALID_REQUEST,
775
+ f"Session {requested_session_id} is still initializing; retry shortly",
776
+ request_id=request_id,
777
+ )
778
+
779
+ should_close = False
780
+ managed_session_id = requested_session_id
781
+ else:
782
+ _env = self._env_factory()
783
+ should_close = True
784
+ try:
785
+ mcp_client = getattr(_env, "mcp_client", None)
786
+ mcp_server = getattr(_env, "mcp_server", None)
787
+ mcp_session_factory = getattr(_env, "mcp_session", None)
788
+
789
+ if method == McpMethod.TOOLS_LIST:
790
+ # Check if environment is MCP-enabled
791
+ if mcp_client is None and mcp_server is None:
792
+ return JsonRpcResponse.error_response(
793
+ JsonRpcErrorCode.INTERNAL_ERROR,
794
+ "Environment does not support MCP",
795
+ request_id=request_id,
796
+ )
797
+
798
+ if mcp_client:
799
+ if managed_session_id and mcp_client.is_connected():
800
+ # Session-managed with live transport — call
801
+ # directly, no redundant re-entry.
802
+ tools = await mcp_client.list_tools()
803
+ elif callable(mcp_session_factory):
804
+ # Stateless request, or session-managed but the
805
+ # background transport was lost: (re-)open.
806
+ mcp_session_cm = cast(
807
+ AsyncContextManager[Any], mcp_session_factory()
808
+ )
809
+ async with mcp_session_cm:
810
+ tools = await mcp_client.list_tools()
811
+ else:
812
+ async with mcp_client:
813
+ tools = await mcp_client.list_tools()
814
+
815
+ return JsonRpcResponse.success(
816
+ result={
817
+ "tools": [
818
+ t.model_dump()
819
+ if hasattr(t, "model_dump")
820
+ else dict(t)
821
+ for t in tools
822
+ ]
823
+ },
824
+ request_id=request_id,
825
+ )
826
+
827
+ if mcp_server:
828
+ tools = []
829
+ for _tool_name, tool in get_server_tools(mcp_server).items():
830
+ tools.append(
831
+ {
832
+ "name": tool.name,
833
+ "description": tool.description or "",
834
+ "inputSchema": tool.parameters or {},
835
+ }
836
+ )
837
+ return JsonRpcResponse.success(
838
+ result={"tools": tools},
839
+ request_id=request_id,
840
+ )
841
+
842
+ return JsonRpcResponse.error_response(
843
+ JsonRpcErrorCode.INTERNAL_ERROR,
844
+ "MCP server not available",
845
+ request_id=request_id,
846
+ )
847
+
848
+ elif method == McpMethod.TOOLS_CALL:
849
+ tool_name = params.get("name")
850
+ arguments = params.get("arguments", {})
851
+
852
+ if mcp_client is None and mcp_server is None:
853
+ return JsonRpcResponse.error_response(
854
+ JsonRpcErrorCode.INTERNAL_ERROR,
855
+ "Environment does not support MCP",
856
+ request_id=request_id,
857
+ )
858
+
859
+ if not tool_name:
860
+ return JsonRpcResponse.error_response(
861
+ JsonRpcErrorCode.INVALID_PARAMS,
862
+ "Missing 'name' in params",
863
+ request_id=request_id,
864
+ )
865
+
866
+ if mcp_client:
867
+ if managed_session_id and mcp_client.is_connected():
868
+ # Session-managed with live transport.
869
+ result = await mcp_client.call_tool(
870
+ name=tool_name, arguments=arguments
871
+ )
872
+ elif callable(mcp_session_factory):
873
+ # Stateless request, or session-managed but the
874
+ # background transport was lost: (re-)open.
875
+ mcp_session_cm = cast(
876
+ AsyncContextManager[Any], mcp_session_factory()
877
+ )
878
+ async with mcp_session_cm:
879
+ result = await mcp_client.call_tool(
880
+ name=tool_name, arguments=arguments
881
+ )
882
+ else:
883
+ async with mcp_client:
884
+ result = await mcp_client.call_tool(
885
+ name=tool_name, arguments=arguments
886
+ )
887
+ elif mcp_server:
888
+ server_tools = get_server_tools(mcp_server)
889
+ if tool_name in server_tools:
890
+ tool = server_tools[tool_name]
891
+ if inspect.iscoroutinefunction(tool.fn):
892
+ result = await tool.fn(**arguments)
893
+ else:
894
+ result = tool.fn(**arguments)
895
+ else:
896
+ return JsonRpcResponse.error_response(
897
+ JsonRpcErrorCode.INVALID_PARAMS,
898
+ f"Tool not found: {tool_name}",
899
+ request_id=request_id,
900
+ )
901
+ else:
902
+ return JsonRpcResponse.error_response(
903
+ JsonRpcErrorCode.INTERNAL_ERROR,
904
+ "MCP server not available",
905
+ request_id=request_id,
906
+ )
907
+
908
+ # Ensure result is JSON serializable
909
+ serializable_result = _make_json_serializable(result)
910
+
911
+ return JsonRpcResponse.success(
912
+ result=serializable_result,
913
+ request_id=request_id,
914
+ )
915
+
916
+ else:
917
+ return JsonRpcResponse.error_response(
918
+ JsonRpcErrorCode.METHOD_NOT_FOUND,
919
+ f"Method not found: {method}",
920
+ request_id=request_id,
921
+ )
922
+
923
+ except Exception as e:
924
+ return JsonRpcResponse.error_response(
925
+ JsonRpcErrorCode.INTERNAL_ERROR,
926
+ str(e),
927
+ request_id=request_id,
928
+ )
929
+ finally:
930
+ if managed_session_id:
931
+ self._update_session_activity(
932
+ managed_session_id,
933
+ increment_step=(method == McpMethod.TOOLS_CALL),
934
+ )
935
+ if should_close:
936
+ _env.close()
937
+
938
        # Register MCP WebSocket endpoint (available in both production and simulation modes)
        @app.websocket("/mcp")
        async def mcp_websocket_endpoint(websocket: WebSocket):
            """
            WebSocket endpoint for MCP JSON-RPC requests.

            Each WebSocket connection gets its own environment instance for MCP operations.

            Message Protocol:
            - Client sends: JSON-RPC 2.0 request (tools/list, tools/call)
            - Server responds: JSON-RPC 2.0 response (result or error)
            """
            await websocket.accept()

            # Initialized before the try block so the finally clause can clean up
            # even when session creation itself fails.
            session_id = None
            session_env = None

            try:
                # Create session with dedicated environment
                session_id, session_env = await self._create_session()
                if session_env is None:
                    raise RuntimeError(
                        "Session environment not initialized for MCP websocket"
                    )

                # If environment has an mcp_session context manager, hold it open
                # for the lifetime of the websocket connection

                async with AsyncExitStack() as stack:
                    mcp_session_factory = getattr(session_env, "mcp_session", None)
                    if callable(mcp_session_factory):
                        mcp_session_cm = cast(
                            AsyncContextManager[Any], mcp_session_factory()
                        )
                        await stack.enter_async_context(mcp_session_cm)

                    while True:
                        # Receive message from client
                        raw_message = await websocket.receive_text()

                        # Malformed frames are reported to the client per-message;
                        # the connection itself stays open.
                        try:
                            jsonrpc_dict = json.loads(raw_message)
                            jsonrpc_request = JsonRpcRequest(**jsonrpc_dict)
                        except json.JSONDecodeError as e:
                            error_resp = JsonRpcResponse.error_response(
                                JsonRpcErrorCode.PARSE_ERROR,
                                f"Parse error: {e}",
                            )
                            await websocket.send_text(error_resp.model_dump_json())
                            continue
                        except ValidationError as e:
                            error_resp = JsonRpcResponse.error_response(
                                JsonRpcErrorCode.INVALID_REQUEST,
                                f"Invalid request: {e}",
                            )
                            await websocket.send_text(error_resp.model_dump_json())
                            continue

                        try:
                            # Call mcp_handler with session environment
                            response = await mcp_handler(
                                jsonrpc_request,
                                session_env=session_env,
                                session_id=session_id,
                            )
                            await websocket.send_text(response.model_dump_json())
                        except Exception as e:
                            # Handler failures become JSON-RPC internal errors tied
                            # to the request id; the loop keeps serving requests.
                            error_resp = JsonRpcResponse.error_response(
                                JsonRpcErrorCode.INTERNAL_ERROR,
                                str(e),
                                request_id=jsonrpc_request.id,
                            )
                            await websocket.send_text(error_resp.model_dump_json())

            except WebSocketDisconnect:
                # Client hung up; nothing to report.
                pass
            except SessionCapacityError as e:
                # Connection-level failures below terminate the loop and are
                # surfaced as a final JSON-RPC error frame.
                error_resp = JsonRpcResponse.error_response(
                    JsonRpcErrorCode.SERVER_ERROR,
                    str(e),
                    data={
                        "active_sessions": e.active_sessions,
                        "max_sessions": e.max_sessions,
                    },
                )
                await websocket.send_text(error_resp.model_dump_json())
            except EnvironmentFactoryError as e:
                error_resp = JsonRpcResponse.error_response(
                    JsonRpcErrorCode.SERVER_ERROR,
                    str(e),
                    data={"factory_name": e.factory_name},
                )
                await websocket.send_text(error_resp.model_dump_json())
            except Exception as e:
                error_resp = JsonRpcResponse.error_response(
                    JsonRpcErrorCode.SERVER_ERROR,
                    str(e),
                )
                await websocket.send_text(error_resp.model_dump_json())
            finally:
                # Always tear down the per-connection session; websocket.close()
                # may raise RuntimeError when the socket is already closed.
                if session_id:
                    await self._destroy_session(session_id)
                try:
                    await websocket.close()
                except RuntimeError:
                    pass
1044
+
1045
        # Register simulation control routes only in simulation mode
        # (production mode exposes MCP endpoints instead of reset/step).
        if mode == ServerMode.SIMULATION:

            @app.post(
                "/reset",
                response_model=ResetResponse,
                tags=["Environment Control"],
                summary="Reset the environment",
                description="""
    Reset the environment to its initial state and return the first observation.

    You can optionally provide a seed for reproducibility and an episode_id for tracking.
    """,
                responses={
                    200: {
                        "description": "Environment reset successfully",
                        "content": {
                            "application/json": {
                                "example": {
                                    "observation": {"status": "ready", "data": {}},
                                    "reward": None,
                                    "done": False,
                                }
                            }
                        },
                    }
                },
            )
            async def reset(
                request: ResetRequest = Body(default_factory=ResetRequest),
            ) -> ResetResponse:
                # Thin wrapper: delegates to the shared reset_handler closure.
                return await reset_handler(request)

            @app.post(
                "/step",
                response_model=StepResponse,
                tags=["Environment Control"],
                summary="Execute an action in the environment",
                description="""
    Execute an action in the environment and receive the resulting observation.

    The action must conform to the environment's action schema, which can be
    retrieved from the `/schema` endpoint. If the action is invalid,
    the endpoint will return HTTP 422 with detailed validation errors.

    The response includes:
    - **observation**: The environment's response to the action
    - **reward**: Optional reward signal (float or None)
    - **done**: Boolean indicating if the episode has terminated
    """,
                responses={
                    200: {
                        "description": "Action executed successfully",
                        "content": {
                            "application/json": {
                                "example": {
                                    "observation": {"status": "success", "data": {}},
                                    "reward": 1.0,
                                    "done": False,
                                }
                            }
                        },
                    },
                    422: {
                        "description": "Validation error - invalid action format or values",
                        "content": {
                            "application/json": {
                                "example": {
                                    "detail": [
                                        {
                                            "type": "string_too_short",
                                            "loc": ["body", "action", "message"],
                                            "msg": "String should have at least 1 character",
                                            "input": "",
                                        }
                                    ]
                                }
                            }
                        },
                    },
                    500: {
                        "description": "Internal server error during action execution"
                    },
                },
            )
            async def step(request: StepRequest) -> StepResponse:
                # Thin wrapper: delegates to the shared step_handler closure.
                return await step_handler(request)
1132
+
1133
+ def get_state_handler() -> State:
1134
+ _env = self._env_factory()
1135
+ try:
1136
+ return _env.state
1137
+ finally:
1138
+ _env.close()
1139
+
1140
+ def get_metadata_handler() -> EnvironmentMetadata:
1141
+ _env = self._env_factory()
1142
+ try:
1143
+ return _env.get_metadata()
1144
+ finally:
1145
+ _env.close()
1146
+
1147
        # Build list of GET endpoints based on mode
        get_endpoints = [
            GetEndpointConfig(
                path="/metadata",
                handler=get_metadata_handler,
                response_model=EnvironmentMetadata,
                tag="Environment Info",
                summary="Get environment metadata",
                description="""
    Get metadata about this environment.

    Returns information about the environment including name, description,
    version, author, and documentation links.
    """,
            ),
            GetEndpointConfig(
                path="/health",
                handler=lambda: HealthResponse(status=HealthStatus.HEALTHY),
                response_model=HealthResponse,
                tag="Health",
                summary="Health check",
                description="Check if the environment server is running and healthy.",
            ),
        ]

        # Only register /state endpoint in simulation mode
        # (inserted first so it appears before /metadata in the docs).
        if mode == ServerMode.SIMULATION:
            get_endpoints.insert(
                0,
                GetEndpointConfig(
                    path="/state",
                    handler=get_state_handler,
                    response_model=State,
                    tag="State Management",
                    summary="Get current environment state",
                    description="""
    Retrieve the current internal state of the environment.

    The structure of the state object is defined by the environment's State model.
    """,
                ),
            )

        # Attach all configured GET routes to the FastAPI app in one pass.
        register_get_endpoints(app, get_endpoints)
1191
+
1192
        # Register combined schema endpoint
        @app.get(
            "/schema",
            response_model=SchemaResponse,
            tags=["Schema"],
            summary="Get all JSON schemas",
            description="""
    Get JSON schemas for actions, observations, and state in a single response.

    Returns a combined schema object containing:
    - **action**: JSON schema for actions accepted by this environment
    - **observation**: JSON schema for observations returned by this environment
    - **state**: JSON schema for environment state objects

    This is more efficient than calling individual schema endpoints and provides
    all schema information needed to interact with the environment.
    """,
            responses={
                200: {
                    "description": "Combined schemas retrieved successfully",
                    "content": {
                        "application/json": {
                            "example": {
                                "action": {
                                    "type": "object",
                                    "properties": {"message": {"type": "string"}},
                                },
                                "observation": {
                                    "type": "object",
                                    "properties": {"response": {"type": "string"}},
                                },
                                "state": {
                                    "type": "object",
                                    "properties": {"step_count": {"type": "integer"}},
                                },
                            }
                        }
                    },
                }
            },
        )
        async def get_schemas() -> SchemaResponse:
            """Return all schemas in one response."""
            # Schemas are derived from the pydantic models configured on this server.
            return SchemaResponse(
                action=self.action_cls.model_json_schema(),
                observation=self.observation_cls.model_json_schema(),
                state=State.model_json_schema(),
            )
1240
+
1241
        # Register MCP endpoint for production mode (direct MCP access)
        @app.post("/mcp")
        async def mcp_endpoint(request_raw: Request) -> Dict[str, Any]:
            """
            MCP JSON-RPC endpoint for production mode.

            Bypasses step() overhead and provides direct access to MCP tools.
            Supports tools/list and tools/call methods.
            """
            # Parse JSON manually to handle parse errors gracefully
            try:
                body = await request_raw.body()
                request_dict = json.loads(body)
                request = JsonRpcRequest(**request_dict)
            except json.JSONDecodeError:
                return JsonRpcResponse.error_response(
                    JsonRpcErrorCode.PARSE_ERROR
                ).model_dump()
            except ValidationError as e:
                return JsonRpcResponse.error_response(
                    JsonRpcErrorCode.INVALID_REQUEST,
                    f"Invalid request: {e}",
                ).model_dump()
            except Exception:
                # Catch-all: any other failure while reading/decoding the body is
                # reported as a generic parse error rather than an HTTP 500.
                return JsonRpcResponse.error_response(
                    JsonRpcErrorCode.PARSE_ERROR
                ).model_dump()

            # Delegate to the shared JSON-RPC dispatcher (tools/list, tools/call).
            response = await mcp_handler(request)
            return response.model_dump()
1271
+
1272
        # Register WebSocket endpoint for persistent sessions
        @app.websocket("/ws")
        async def websocket_endpoint(websocket: WebSocket):
            """
            WebSocket endpoint for persistent environment sessions.

            Each WebSocket connection gets its own environment instance.

            Message Protocol:
            - Client sends: WSResetMessage | WSStepMessage | WSStateMessage | WSCloseMessage
            - Server responds: WSObservationResponse | WSStateResponse | WSErrorResponse
            """
            await websocket.accept()

            # Initialized before the try block so the finally clause can clean up
            # even when session creation itself fails.
            session_id = None
            session_env = None

            try:
                # Create session with dedicated environment
                session_id, session_env = await self._create_session()
                if session_env is None:
                    raise RuntimeError(
                        "Session environment not initialized for websocket"
                    )

                # Keep MCP session open for entire websocket lifetime
                # (avoids reconnect overhead on every message)

                async with AsyncExitStack() as stack:
                    mcp_session_factory = getattr(session_env, "mcp_session", None)
                    if callable(mcp_session_factory):
                        mcp_session_cm = cast(
                            AsyncContextManager[Any], mcp_session_factory()
                        )
                        await stack.enter_async_context(mcp_session_cm)

                    while True:
                        # Receive message from client
                        raw_message = await websocket.receive_text()

                        try:
                            message_dict = json.loads(raw_message)
                        except json.JSONDecodeError as e:
                            # Bad JSON is reported per-message; keep the
                            # connection alive.
                            error_resp = WSErrorResponse(
                                data={
                                    "message": f"Invalid JSON: {e}",
                                    "code": WSErrorCode.INVALID_JSON,
                                }
                            )
                            await websocket.send_text(error_resp.model_dump_json())
                            continue

                        msg_type = message_dict.get("type", "")

                        try:
                            match msg_type:
                                case "reset":
                                    msg = WSResetMessage(**message_dict)

                                    # Detect whether the subclass overrode
                                    # reset_async; if not, run sync reset in the
                                    # per-session executor.
                                    is_async = (
                                        session_env.reset_async.__func__
                                        is not Environment.reset_async
                                    )

                                    if is_async:
                                        sig = inspect.signature(session_env.reset_async)
                                        # Only forward kwargs the target actually accepts.
                                        valid_kwargs = self._get_valid_kwargs(
                                            sig, msg.data
                                        )
                                        observation = await session_env.reset_async(
                                            **valid_kwargs
                                        )
                                    else:
                                        sig = inspect.signature(session_env.reset)
                                        valid_kwargs = self._get_valid_kwargs(
                                            sig, msg.data
                                        )
                                        observation = (
                                            await self._run_in_session_executor(
                                                session_id,
                                                session_env.reset,
                                                **valid_kwargs,
                                            )
                                        )

                                    self._update_session_activity(session_id)

                                    response = WSObservationResponse(
                                        data=serialize_observation(observation),
                                    )

                                case "step":
                                    msg = WSStepMessage(**message_dict)
                                    action = deserialize_action(
                                        msg.data, self.action_cls
                                    )

                                    # Same sync/async dispatch as "reset".
                                    is_async = (
                                        session_env.step_async.__func__
                                        is not Environment.step_async
                                    )

                                    if is_async:
                                        observation = await session_env.step_async(
                                            action
                                        )
                                    else:
                                        observation = (
                                            await self._run_in_session_executor(
                                                session_id, session_env.step, action
                                            )
                                        )

                                    self._update_session_activity(
                                        session_id, increment_step=True
                                    )

                                    response = WSObservationResponse(
                                        data=serialize_observation(observation)
                                    )

                                case "state":
                                    msg = WSStateMessage(**message_dict)
                                    state = session_env.state
                                    # Pydantic states serialize via model_dump;
                                    # otherwise fall back to a plain dict.
                                    if hasattr(state, "model_dump"):
                                        state_data = state.model_dump()
                                    else:
                                        state_data = dict(state) if state else {}

                                    response = WSStateResponse(data=state_data)

                                case "close":
                                    # Client-requested shutdown; exit the receive loop.
                                    msg = WSCloseMessage(**message_dict)
                                    break

                                case "mcp":
                                    msg = WSMCPMessage(**message_dict)
                                    try:
                                        rpc_request = JsonRpcRequest(**msg.data)
                                    # NOTE(review): (ValidationError, Exception) is
                                    # redundant — Exception already subsumes
                                    # ValidationError, so this is a catch-all.
                                    except (ValidationError, Exception) as e:
                                        rpc_response = JsonRpcResponse.error_response(
                                            JsonRpcErrorCode.INVALID_REQUEST,
                                            f"Invalid request: {e}",
                                        )
                                    else:
                                        rpc_response = await mcp_handler(
                                            rpc_request,
                                            session_env=session_env,
                                            session_id=session_id,
                                        )
                                    response = WSMCPResponse(
                                        data=rpc_response.model_dump()
                                    )

                                case _:
                                    response = WSErrorResponse(
                                        data={
                                            "message": f"Unknown message type: {msg_type}",
                                            "code": WSErrorCode.UNKNOWN_TYPE,
                                        }
                                    )

                            await websocket.send_text(response.model_dump_json())

                        except ValidationError as e:
                            # Message matched a known type but failed model
                            # validation; report the detailed errors.
                            error_resp = WSErrorResponse(
                                data={
                                    "message": "Invalid message",
                                    "code": WSErrorCode.VALIDATION_ERROR,
                                    "errors": e.errors(),
                                }
                            )
                            await websocket.send_text(error_resp.model_dump_json())
                        except Exception as e:
                            # Execution failures keep the connection alive.
                            error_resp = WSErrorResponse(
                                data={
                                    "message": str(e),
                                    "code": WSErrorCode.EXECUTION_ERROR,
                                }
                            )
                            await websocket.send_text(error_resp.model_dump_json())

            except WebSocketDisconnect:
                # Client hung up; nothing to report.
                pass
            except SessionCapacityError as e:
                # Connection-level failures below terminate the session and are
                # surfaced as a final error frame.
                error_resp = WSErrorResponse(
                    data={
                        "message": str(e),
                        "code": WSErrorCode.CAPACITY_REACHED,
                        "active_sessions": e.active_sessions,
                        "max_sessions": e.max_sessions,
                    }
                )
                await websocket.send_text(error_resp.model_dump_json())
            except EnvironmentFactoryError as e:
                error_resp = WSErrorResponse(
                    data={
                        "message": str(e),
                        "code": WSErrorCode.FACTORY_ERROR,
                        "factory_name": e.factory_name,
                    }
                )
                await websocket.send_text(error_resp.model_dump_json())
            except Exception as e:
                error_resp = WSErrorResponse(
                    data={"message": str(e), "code": WSErrorCode.SESSION_ERROR}
                )
                await websocket.send_text(error_resp.model_dump_json())
            finally:
                # Always tear down the per-connection session; websocket.close()
                # may raise RuntimeError when the socket is already closed.
                if session_id:
                    await self._destroy_session(session_id)
                try:
                    await websocket.close()
                except RuntimeError:
                    pass
1487
+
1488
+
1489
def create_app(
    env: Callable[[], Environment],
    action_cls: Type[Action],
    observation_cls: Type[Observation],
    env_name: Optional[str] = None,
    max_concurrent_envs: Optional[int] = None,
    concurrency_config: Optional[ConcurrencyConfig] = None,
    gradio_builder: Optional[Callable[..., Any]] = None,
) -> FastAPI:
    """
    Create a FastAPI application with or without web interface.

    The Gradio web interface (including README integration) is opt-in: it is
    enabled only when the ENABLE_WEB_INTERFACE environment variable is set to
    a truthy value ("true", "1", or "yes", case-insensitive). By default it
    is disabled and a plain FastAPI app is returned.

    Args:
        env: Environment factory (callable) that creates new instances
        action_cls: The Action subclass this environment expects
        observation_cls: The Observation subclass this environment returns
        env_name: Optional environment name for README loading
        max_concurrent_envs: Maximum concurrent WebSocket sessions.
            Mutually exclusive with concurrency_config.
        concurrency_config: Optional ConcurrencyConfig for advanced concurrency settings.
            Mutually exclusive with max_concurrent_envs.
        gradio_builder: Optional callable to build a custom Gradio UI at /web.
            Signature: (web_manager, action_fields, metadata, is_chat_env, title,
            quick_start_md) -> gr.Blocks. When None, the default Gradio app is used.
            See docs/customizing-web-ui.md.

    Returns:
        FastAPI application instance, with the web interface and README
        integration attached when ENABLE_WEB_INTERFACE is truthy.
    """
    # Check if web interface should be enabled.
    # Controlled via environment variable or build argument; defaults to OFF.
    enable_web = os.getenv("ENABLE_WEB_INTERFACE", "false").lower() in (
        "true",
        "1",
        "yes",
    )

    if enable_web:
        # Gradio-based web UI (gradio is a core dependency).
        # Imported lazily so the plain-API path never pays the import cost.
        from .web_interface import create_web_interface_app

        return create_web_interface_app(
            cast(Any, env),
            action_cls,
            observation_cls,
            env_name,
            max_concurrent_envs,
            concurrency_config,
            gradio_builder=gradio_builder,
        )
    else:
        # Use standard FastAPI app without web interface
        return create_fastapi_app(
            env, action_cls, observation_cls, max_concurrent_envs, concurrency_config
        )
1547
+
1548
+
1549
def create_fastapi_app(
    env: Callable[[], Environment],
    action_cls: Type[Action],
    observation_cls: Type[Observation],
    max_concurrent_envs: Optional[int] = None,
    concurrency_config: Optional[ConcurrencyConfig] = None,
) -> FastAPI:
    """
    Create a FastAPI application with comprehensive documentation.

    Args:
        env: Environment factory (callable) that creates new instances
        action_cls: The Action subclass this environment expects
        observation_cls: The Observation subclass this environment returns
        max_concurrent_envs: Maximum concurrent WebSocket sessions.
            Mutually exclusive with concurrency_config.
        concurrency_config: Optional ConcurrencyConfig for advanced concurrency settings.
            Mutually exclusive with max_concurrent_envs.

    Returns:
        FastAPI application instance
    """
    # Local import so a missing optional dependency produces a clear,
    # actionable error message instead of a bare ImportError at module load.
    try:
        from fastapi import FastAPI
    except ImportError:
        raise ImportError(
            "FastAPI is required. Install with: pip install fastapi uvicorn"
        )

    # All of the arguments below are static OpenAPI/docs configuration.
    app = FastAPI(
        title="OpenEnv Environment HTTP API",
        version="1.0.0",
        description="""
    # OpenEnv Environment HTTP API

    HTTP API for interacting with OpenEnv environments through a standardized interface.

    ## Features

    * **Environment Reset**: Initialize or restart episodes
    * **Action Execution**: Send actions and receive observations
    * **State Inspection**: Query current environment state
    * **Schema Access**: Retrieve JSON schemas for actions and observations

    ## Workflow

    1. Call `/reset` to start a new episode and get initial observation
    2. Call `/step` repeatedly with actions to interact with environment
    3. Episode ends when observation returns `done: true`
    4. Call `/state` anytime to inspect current environment state

    ## Documentation

    * **Swagger UI**: Available at `/docs`
    * **ReDoc**: Available at `/redoc`
    * **OpenAPI Schema**: Available at `/openapi.json`
    """,
        openapi_tags=[
            {
                "name": "Environment Control",
                "description": "Core operations for environment interaction (reset, step)",
            },
            {
                "name": "State Management",
                "description": "Operations for inspecting environment state",
            },
            {
                "name": "Environment Info",
                "description": "Information about the environment",
            },
            {
                "name": "Schema",
                "description": "JSON Schema endpoints for actions, observations, and state",
            },
            {"name": "Health", "description": "Service health and status checks"},
        ],
        docs_url="/docs",
        redoc_url="/redoc",
        openapi_url="/openapi.json",
        contact={
            "name": "OpenEnv Team",
            "url": "https://github.com/meta-pytorch/OpenEnv",
        },
        license_info={
            "name": "BSD-3-Clause",
            "url": "https://github.com/meta-pytorch/OpenEnv/blob/main/LICENSE",
        },
    )

    # The server object owns all route handlers; register them on the app.
    server = HTTPEnvServer(
        env,
        action_cls,
        observation_cls,
        max_concurrent_envs,
        concurrency_config=concurrency_config,
    )
    server.register_routes(app)
    return app
src/core/env_server/interfaces.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import inspect
8
+ from abc import ABC, abstractmethod
9
+ from typing import Any, Generic, Optional, Protocol, TYPE_CHECKING, TypeVar
10
+
11
+ from typing_extensions import TypedDict
12
+
13
+ from .types import Action, EnvironmentMetadata, Observation, State
14
+
15
+ if TYPE_CHECKING:
16
+ from openenv.core.rubrics import Rubric
17
+
18
+ ActT = TypeVar("ActT", bound=Action)
19
+ ObsT = TypeVar("ObsT", bound=Observation)
20
+ StateT = TypeVar("StateT", bound=State)
21
+
22
+
23
class Message(TypedDict):
    """One turn of a conversation.

    Shaped to match the Huggingface chat-template message format.
    """

    # Speaker identifier for the turn (e.g. "system", "user", "assistant" —
    # exact vocabulary is determined by the consuming chat template).
    role: str
    # Text content of the turn.
    content: str
31
+
32
+
33
class ModelTokenizer(Protocol):
    """Structural interface for chat-template-capable tokenizers.

    Any object exposing these two methods — notably Huggingface
    ``transformers`` tokenizers — satisfies this protocol and can be used
    by chat-based environments.
    """

    def apply_chat_template(
        self,
        conversation: list[Message],
        tokenize: bool = True,
        return_tensors: str | None = None,
        **kwargs: Any,
    ) -> Any:
        """Format a conversation with the chat template, optionally tokenizing it.

        Args:
            conversation: Messages as dicts carrying 'role' and 'content'.
            tokenize: When True, return token IDs instead of a string.
            return_tensors: Tensor format for the output ('pt' for PyTorch).
            **kwargs: Extra tokenizer-specific options.

        Returns:
            The rendered conversation, tokenized if requested.
        """
        ...

    def decode(
        self, token_ids: Any, skip_special_tokens: bool = False, **kwargs: Any
    ) -> str:
        """Convert token IDs back into a text string.

        Args:
            token_ids: The IDs to decode.
            skip_special_tokens: When True, omit special tokens from the result.
            **kwargs: Extra tokenizer-specific options.

        Returns:
            The decoded text.
        """
        ...
75
+
76
+
77
class Transform(ABC, Generic[ObsT]):
    """Callable that rewrites an observation before it is returned.

    Mirrors the TorchRL transform pattern: a transform receives an
    observation and produces a (possibly modified) observation, which makes
    it a convenient hook for reward injection, metric attachment, or other
    augmentation.
    """

    @abstractmethod
    def __call__(self, observation: ObsT) -> ObsT:
        """Produce the transformed observation.

        Args:
            observation: The observation to rewrite.

        Returns:
            The rewritten observation (may be the same object, modified).
        """
        pass
96
+
97
+
98
class Environment(ABC, Generic[ActT, ObsT, StateT]):
    """Abstract base for environment servers, modeled on the Gym/Gymnasium API.

    Args:
        transform: Optional transform applied to observations
        rubric: Optional rubric for reward computation. When provided, the
            rubric's output can be used to set the observation's reward in step().

    Class Attributes:
        SUPPORTS_CONCURRENT_SESSIONS: Whether this environment supports concurrent sessions.
            When True, multiple WebSocket connections can each have their own
            environment instance (up to max_concurrent_envs). When False (default),
            the environment should only be used with a single session at a time.

            Set this to True in your Environment subclass if:
            - The environment uses proper session isolation (e.g., unique working dirs)
            - No shared mutable state exists between instances
            - External resources (databases, APIs) can handle concurrent access

    Attributes:
        rubric: Optional rubric for computing rewards. Environments can set this
            in __init__ and use it in step() to compute observation rewards.
            Training infrastructure can access it for introspection:
                for name, r in env.rubric.named_rubrics():
                    print(f"{name}: {r.last_score}")

    See RFC 004 for rubric design: rfcs/004-rubrics.md
    """

    # Opt-in flag: subclasses that are safe for concurrent sessions override this.
    SUPPORTS_CONCURRENT_SESSIONS: bool = False

    # Optional rubric used for reward computation.
    rubric: Optional["Rubric"]

    def __init__(
        self,
        transform: Optional[Transform[ObsT]] = None,
        rubric: Optional["Rubric"] = None,
    ):
        self.transform = transform
        self.rubric = rubric

    @abstractmethod
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> ObsT:
        """Reset the environment and return the initial observation."""
        pass

    async def reset_async(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> ObsT:
        """Async counterpart of reset().

        The default simply delegates to the synchronous reset(); subclasses
        override this to provide a genuinely asynchronous implementation.
        """
        return self.reset(seed=seed, episode_id=episode_id, **kwargs)

    @abstractmethod
    def step(
        self,
        action: ActT,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> ObsT:
        """Advance the environment by one action and return the observation."""
        pass

    async def step_async(
        self,
        action: ActT,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> ObsT:
        """Async counterpart of step().

        The default simply delegates to the synchronous step(); subclasses
        override this to provide a genuinely asynchronous implementation.
        """
        return self.step(action, timeout_s=timeout_s, **kwargs)

    @property
    @abstractmethod
    def state(self) -> StateT:
        """Current internal state of the environment."""
        pass

    def get_metadata(self) -> EnvironmentMetadata:
        """Describe this environment.

        Subclasses override this to supply richer metadata; the default
        derives a name and description from the class name.

        Returns:
            EnvironmentMetadata describing the environment.
        """
        return EnvironmentMetadata(
            name=self.__class__.__name__,
            description=f"{self.__class__.__name__} environment",
            version="1.0.0",
        )

    def _apply_transform(self, observation: ObsT) -> ObsT:
        """Run the configured transform over the observation, if any."""
        if self.transform is None:
            return observation
        return self.transform(observation)

    def _apply_rubric(self, action: ActT, observation: ObsT) -> float:
        """Score the (action, observation) pair with the rubric, if any.

        Args:
            action: The action taken by the agent.
            observation: The resulting observation.

        Returns:
            Reward value from the rubric, or 0.0 if no rubric is set.

        Usage in step():
            def step(self, action: MyAction, ...) -> MyObservation:
                # ... execute action and create observation ...
                observation.reward = self._apply_rubric(action, observation)
                return observation
        """
        if self.rubric is None:
            return 0.0
        return self.rubric(action, observation)

    async def _apply_rubric_async(self, action: ActT, observation: ObsT) -> float:
        """Score the (action, observation) pair, awaiting async rubrics.

        Args:
            action: The action taken by the agent.
            observation: The resulting observation.

        Returns:
            Reward value from the rubric, or 0.0 if no rubric is set.

        Usage in step_async():
            async def step_async(self, action: MyAction, ...) -> MyObservation:
                # ... execute action and create observation ...
                observation.reward = await self._apply_rubric_async(action, observation)
                return observation
        """
        if self.rubric is None:
            return 0.0
        outcome = self.rubric(action, observation)
        # Rubrics may be sync or async; await only when a coroutine comes back.
        if inspect.iscoroutine(outcome):
            return await outcome
        return outcome

    def _reset_rubric(self) -> None:
        """Clear any trajectory state held by the rubric, if one is set.

        Call this in reset() to discard per-episode rubric state.

        Usage in reset():
            def reset(self, ...) -> MyObservation:
                self._reset_rubric()
                # ... create initial observation ...
                return observation
        """
        if self.rubric is not None:
            self.rubric.reset()

    async def _reset_rubric_async(self) -> None:
        """Clear any trajectory state held by the rubric, awaiting async resets.

        Call this in reset_async() to discard per-episode rubric state.

        Usage in reset_async():
            async def reset_async(self, ...) -> MyObservation:
                await self._reset_rubric_async()
                # ... create initial observation ...
                return observation
        """
        if self.rubric is None:
            return
        # Prefer an async reset when the rubric offers one.
        if hasattr(self.rubric, "reset_async"):
            maybe_coro = self.rubric.reset_async()
            if inspect.iscoroutine(maybe_coro):
                await maybe_coro
        else:
            self.rubric.reset()

    def close(self) -> None:
        """Release resources held by the environment.

        The base implementation is a no-op; subclasses override it to add
        custom cleanup. Invoked when the environment is destroyed or reset.
        """
        pass
+ pass
src/core/env_server/mcp_environment.py ADDED
@@ -0,0 +1,645 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ MCP Environment base class for OpenEnv.
9
+
10
+ This module provides the MCPEnvironment base class that integrates FastMCP servers
11
+ with OpenEnv's Gym-style Environment interface. It handles MCP tool discovery
12
+ and invocation through the step() API, following RFC 003.
13
+
14
+ Key features:
15
+ - Automatic routing of ListToolsAction and CallToolAction to MCP server
16
+ - Reserved tool name validation (reset, step, state, close are protected)
17
+ - Timeout handling for tool calls
18
+ - Proper error categorization (tool not found, execution errors, timeouts)
19
+ - Mode-aware tool registration (production vs simulation)
20
+ - Code mode support via get_callables() and execute_code()
21
+
22
+ Usage:
23
+ from fastmcp import FastMCP
24
+ from openenv.core.env_server.mcp_environment import MCPEnvironment
25
+
26
+ class MyMCPEnv(MCPEnvironment):
27
+ def __init__(self):
28
+ mcp = FastMCP("my-server")
29
+
30
+ # Register mode-specific tools
31
+ @self.tool(mode="production")
32
+ def my_tool(arg: str) -> str:
33
+ return f"Production: {arg}"
34
+
35
+ @self.tool(mode="simulation")
36
+ def my_tool(arg: str) -> str:
37
+ return f"Simulation: {arg}"
38
+
39
+ super().__init__(mcp)
40
+
41
+ def reset(self, seed=None, episode_id=None, **kwargs):
42
+ # Reset logic here
43
+ ...
44
+
45
+ def _step_impl(self, action):
46
+ # Handle non-MCP actions
47
+ ...
48
+
49
+ @property
50
+ def state(self):
51
+ # Return current state
52
+ ...
53
+ """
54
+
55
+ import asyncio
56
+ import inspect
57
+ from abc import abstractmethod
58
+ from collections import defaultdict
59
+ from contextlib import asynccontextmanager
60
+ from typing import Any, Callable, Dict, Optional
61
+
62
+ from fastmcp import Client
63
+ from fastmcp.client.client import CallToolResult
64
+ from mcp.types import TextContent
65
+
66
+ from ..utils import run_async_safely
67
+ from .interfaces import Environment
68
+ from .mcp_types import (
69
+ CallToolAction,
70
+ CallToolObservation,
71
+ ListToolsAction,
72
+ ListToolsObservation,
73
+ RESERVED_TOOL_NAMES,
74
+ Tool,
75
+ ToolError,
76
+ ToolErrorType,
77
+ )
78
+ from .types import Action, Observation
79
+
80
+
81
+ # Default timeout for MCP tool calls in seconds
82
+ MCP_TOOL_CALL_TIMEOUT = 30.0
83
+
84
+ # Valid modes for tool registration
85
+ VALID_MODES = {"production", "simulation"}
86
+
87
+
88
def get_server_tools(mcp_server: Any) -> Dict[str, Any]:
    """
    Get tools from a FastMCP server, compatible with both 2.x and 3.x.

    Returns:
        Dictionary mapping tool names to tool objects.
    """
    # FastMCP 2.x exposes get_tools() -> {name: Tool}.
    getter = getattr(mcp_server, "get_tools", None)
    if getter is not None:
        tools = run_async_safely(getter())
        if isinstance(tools, dict):
            return tools
    # FastMCP 3.x exposes list_tools() -> [Tool, ...].
    lister = getattr(mcp_server, "list_tools", None)
    if lister is not None:
        return {tool.name: tool for tool in run_async_safely(lister())}
    return {}
105
+
106
+
107
class MCPEnvironment(Environment):
    """
    Base class for environments that expose tools via MCP (Model Context Protocol).

    MCPEnvironment bridges FastMCP servers with OpenEnv's Gym-style API, allowing
    agents to discover and invoke MCP tools through the standard step() interface.

    The class automatically handles:
    - ListToolsAction: Returns available tools from the MCP server
    - CallToolAction: Invokes a specific tool with arguments

    All other actions are delegated to the abstract _step_impl() method,
    which subclasses must implement.

    Args:
        mcp_server: A FastMCP server instance containing tool definitions.
            The server's tools will be validated against reserved names.
        transform: Optional transform to apply to observations (inherited from Environment).

    Raises:
        ValueError: If any tool in the MCP server uses a reserved name
            (reset, step, state, close).

    Example:
        >>> from fastmcp import FastMCP
        >>> mcp = FastMCP("calculator")
        >>> @mcp.tool()
        ... def add(a: int, b: int) -> int:
        ...     return a + b
        >>> env = MyMCPEnvironment(mcp)
        >>> obs = env.step(ListToolsAction())
        >>> obs.tools[0].name
        'add'
    """

    def __init__(self, mcp_server: Any, transform: Optional[Any] = None) -> None:
        """
        Initialize the MCP environment.

        Args:
            mcp_server: A FastMCP server instance with tool definitions.
            transform: Optional transform to apply to observations.

        Raises:
            ValueError: If any tool uses a reserved name (reset, step, state, close).
        """
        super().__init__(transform=transform)

        # Validate tool names before storing — fail fast on reserved names.
        self._validate_tool_names(mcp_server)

        self.mcp_server = mcp_server
        self.mcp_client = Client(mcp_server)

        # Track mode-specific tools: {tool_name: {mode: func}}
        # mode can be "production", "simulation", or None (available in all modes)
        self._mode_tools: Dict[str, Dict[Optional[str], Callable]] = defaultdict(dict)

        # Track tool schemas for list_tools: {tool_name: {mode: schema}}
        self._mode_tool_schemas: Dict[str, Dict[Optional[str], Dict[str, Any]]] = defaultdict(dict)
167
+
168
+ def _require_mcp_client(self) -> Any:
169
+ """Return MCP client or raise if environment has been closed."""
170
+ if self.mcp_client is None:
171
+ raise RuntimeError("MCP client is not available; environment is closed")
172
+ return self.mcp_client
173
+
174
+ def _require_mcp_server(self) -> Any:
175
+ """Return MCP server or raise if environment has been closed."""
176
+ if self.mcp_server is None:
177
+ raise RuntimeError("MCP server is not available; environment is closed")
178
+ return self.mcp_server
179
+
180
    @asynccontextmanager
    async def mcp_session(self):
        """
        Context manager for MCP client sessions.

        This wrapper serves two purposes:

        1. **Null guard** — raises a clear error if ``close()`` has already
           been called (``mcp_client`` is ``None``).

        2. **AsyncExitStack adapter** — FastMCP's ``Client.__aenter__``
           creates a background ``asyncio.Task`` for session management.
           When entered directly via ``AsyncExitStack`` in the HTTP session
           path (``_create_session``), this task can be cancelled by ASGI
           harnesses (e.g. Starlette ``TestClient``) between requests,
           corrupting session state. Wrapping in an ``asynccontextmanager``
           generator isolates the task lifecycle: the generator frame keeps
           ``async with client:`` suspended at ``yield``, so cleanup only
           runs when the stack explicitly closes the generator — not when
           the event loop cancels orphaned tasks.

        Delegates to FastMCP's ``Client`` context manager which is
        reentrant: the first entry opens the transport and subsequent
        (nested) entries simply increment an internal reference counter.
        The transport is closed only when the outermost context exits.

        No external lock is needed because ``Client._connect`` /
        ``Client._disconnect`` already serialise connection state changes
        through their own ``anyio.Lock``.
        """
        client = self._require_mcp_client()
        # Reentrant entry: nested mcp_session() calls share one transport.
        async with client:
            yield client
213
+
214
    @property
    def supports_code_mode(self) -> bool:
        """Check if this environment supports code mode (execute_code)."""
        # MCP environments always expose tools as callables via get_callables().
        return True
218
+
219
    def _get_server_tools(self, mcp_server: Any) -> Dict[str, Any]:
        """
        Get tools from a FastMCP server, compatible with both 2.x and 3.x.

        Thin instance-level wrapper around the module-level get_server_tools().

        Returns:
            Dictionary mapping tool names to tool objects.
        """
        return get_server_tools(mcp_server)
227
+
228
+ def get_callables(self) -> Dict[str, Callable]:
229
+ """
230
+ Get callable functions for code mode.
231
+
232
+ Returns tool functions as direct Python callables, enabling code mode
233
+ where agents write Python code that calls tools directly (no JSON-RPC
234
+ overhead). Mode-specific tools are filtered by the current mode.
235
+
236
+ Returns:
237
+ Dictionary mapping tool names to callables.
238
+ """
239
+ callables: Dict[str, Callable] = {}
240
+ current_mode = getattr(self, "_mode", None)
241
+
242
+ # Extract callables from FastMCP server using public API
243
+ for tool_name, tool in self._get_server_tools(self.mcp_server).items():
244
+ if hasattr(tool, "fn") and callable(tool.fn):
245
+ callables[tool_name] = tool.fn
246
+
247
+ # Add mode-specific tools available in current mode
248
+ for tool_name, mode_funcs in self._mode_tools.items():
249
+ if None in mode_funcs:
250
+ # Tool available in all modes (already in FastMCP if registered there)
251
+ if tool_name not in callables:
252
+ callables[tool_name] = mode_funcs[None]
253
+ elif current_mode in mode_funcs:
254
+ # Tool available in current mode only
255
+ callables[tool_name] = mode_funcs[current_mode]
256
+
257
+ return callables
258
+
259
+ def execute_code(self, code: str) -> Observation:
260
+ """
261
+ Execute Python code with tools available as callables.
262
+
263
+ This enables the CodeAct pattern where agents write Python code
264
+ that calls tools directly as functions, avoiding JSON-RPC overhead.
265
+
266
+ Args:
267
+ code: Python code to execute. Tools are available as functions
268
+ in the execution namespace. Set a variable named 'result'
269
+ to capture the return value.
270
+
271
+ Returns:
272
+ Observation with result in metadata["result"] or error in
273
+ metadata["error"].
274
+ """
275
+ namespace = self.get_callables()
276
+
277
+ result_dict: Dict[str, Any] = {}
278
+ try:
279
+ exec(code, namespace, result_dict)
280
+ result = result_dict.get("result")
281
+ return Observation(done=False, reward=0.0, metadata={"result": result})
282
+ except SyntaxError as e:
283
+ return Observation(
284
+ done=False, reward=0.0, metadata={"error": f"Syntax error: {str(e)}"}
285
+ )
286
+ except Exception as e:
287
+ return Observation(done=False, reward=0.0, metadata={"error": str(e)})
288
+
289
+ def _validate_tool_names(self, mcp_server: Any) -> None:
290
+ """
291
+ Validate that no tools use reserved names.
292
+
293
+ Reserved names (reset, step, state, close) are protected to maintain
294
+ the dual API boundary between infrastructure and agent APIs.
295
+
296
+ Args:
297
+ mcp_server: The FastMCP server to validate.
298
+
299
+ Raises:
300
+ ValueError: If any tool uses a reserved name.
301
+ """
302
+ tools_dict = self._get_server_tools(mcp_server)
303
+ if tools_dict:
304
+ tool_names = set(tools_dict.keys())
305
+ conflicts = tool_names & RESERVED_TOOL_NAMES
306
+ if conflicts:
307
+ raise ValueError(
308
+ f"MCP tools cannot use reserved names: {sorted(conflicts)}. "
309
+ f"Reserved names are: {sorted(RESERVED_TOOL_NAMES)}"
310
+ )
311
+
312
+ def tool(self, mode: Optional[str] = None) -> Callable:
313
+ """
314
+ Decorator for registering mode-aware tools.
315
+
316
+ Args:
317
+ mode: Optional mode for the tool ("production" or "simulation").
318
+ If None, tool is available in all modes.
319
+
320
+ Returns:
321
+ A decorator function for registering tools.
322
+
323
+ Raises:
324
+ ValueError: If mode is not None, "production", or "simulation".
325
+ """
326
+ if mode is not None and mode not in VALID_MODES:
327
+ raise ValueError(
328
+ f"Invalid mode '{mode}'. Mode must be 'production', 'simulation', or None."
329
+ )
330
+
331
+ def decorator(func: Callable) -> Callable:
332
+ tool_name = func.__name__
333
+ # Validate tool name is not reserved
334
+ if tool_name in RESERVED_TOOL_NAMES:
335
+ raise ValueError(
336
+ f"Tool name '{tool_name}' is reserved and cannot be used. "
337
+ f"Reserved names are: {sorted(RESERVED_TOOL_NAMES)}"
338
+ )
339
+
340
+ # If mode is None, register with FastMCP as usual
341
+ if mode is None:
342
+ mcp_server = self._require_mcp_server()
343
+ decorated_func = mcp_server.tool()(func)
344
+ self._mode_tools[tool_name][None] = func
345
+ return decorated_func
346
+
347
+ # For mode-specific tools, don't register with FastMCP
348
+ # Instead, track them ourselves
349
+ self._mode_tools[tool_name][mode] = func
350
+
351
+ # Extract schema information from function signature
352
+ sig = inspect.signature(func)
353
+ schema = {
354
+ "type": "object",
355
+ "properties": {},
356
+ "required": [],
357
+ }
358
+
359
+ for param_name, param in sig.parameters.items():
360
+ # Get type annotation
361
+ param_type = param.annotation
362
+ json_type = "string" # default
363
+ if param_type in (int, "int"):
364
+ json_type = "integer"
365
+ elif param_type in (float, "float"):
366
+ json_type = "number"
367
+ elif param_type in (bool, "bool"):
368
+ json_type = "boolean"
369
+
370
+ schema["properties"][param_name] = {"type": json_type}
371
+
372
+ # If no default value, it's required
373
+ if param.default == inspect.Parameter.empty:
374
+ schema["required"].append(param_name)
375
+
376
+ # Store the schema for this mode-specific tool
377
+ self._mode_tool_schemas[tool_name][mode] = {
378
+ "name": tool_name,
379
+ "description": func.__doc__ or "",
380
+ "input_schema": schema,
381
+ }
382
+
383
+ return func
384
+
385
+ return decorator
386
+
387
+ def step(
388
+ self,
389
+ action: Action,
390
+ timeout_s: Optional[float] = None,
391
+ **kwargs: Any,
392
+ ) -> Observation:
393
+ """
394
+ Execute an action in the environment.
395
+
396
+ This method routes MCP-specific actions (ListToolsAction, CallToolAction)
397
+ to the appropriate handlers, while delegating all other actions to
398
+ the subclass's _step_impl() method.
399
+
400
+ Args:
401
+ action: The action to execute. Can be:
402
+ - ListToolsAction: Returns available MCP tools
403
+ - CallToolAction: Invokes a specific MCP tool
404
+ - Any other Action: Delegated to _step_impl()
405
+ timeout_s: Optional timeout in seconds for the action.
406
+ Defaults to MCP_TOOL_CALL_TIMEOUT (30s) for MCP actions.
407
+ **kwargs: Additional arguments passed to handlers.
408
+
409
+ Returns:
410
+ Observation appropriate to the action type:
411
+ - ListToolsObservation for ListToolsAction
412
+ - CallToolObservation for CallToolAction
413
+ - Subclass-defined Observation for other actions
414
+ """
415
+ if isinstance(action, ListToolsAction):
416
+ return self._handle_list_tools()
417
+ elif isinstance(action, CallToolAction):
418
+ return self._handle_call_tool(action, timeout_s=timeout_s)
419
+ else:
420
+ return self._step_impl(action, timeout_s=timeout_s, **kwargs)
421
+
422
    def _handle_list_tools(self) -> ListToolsObservation:
        """Sync wrapper — delegates to the canonical async implementation."""
        # run_async_safely bridges into an event loop whether or not one is
        # already running in this thread.
        return run_async_safely(self._async_handle_list_tools())
425
+
426
    async def _async_list_tools(self) -> list:
        """
        Async helper to list tools from the MCP client.

        Returns:
            List of tool objects from the MCP server.
        """
        # mcp_session guards against a closed environment and shares the
        # transport with any enclosing session.
        async with self.mcp_session() as client:
            return await client.list_tools()
435
+
436
    def _handle_call_tool(
        self,
        action: CallToolAction,
        timeout_s: Optional[float] = None,
    ) -> CallToolObservation:
        """Sync wrapper — delegates to the canonical async implementation."""
        # run_async_safely bridges into an event loop whether or not one is
        # already running in this thread.
        return run_async_safely(
            self._async_handle_call_tool(action, timeout_s=timeout_s)
        )
445
+
446
    async def _async_call_tool(self, tool_name: str, arguments: dict) -> Any:
        """
        Async helper to call a tool on the MCP server.

        Args:
            tool_name: Name of the tool to invoke.
            arguments: Dictionary of arguments to pass to the tool.

        Returns:
            The result from the tool execution.
        """
        async with self.mcp_session() as client:
            return await client.call_tool(tool_name, arguments)
459
+
460
+ async def _async_handle_list_tools(self) -> ListToolsObservation:
461
+ """Async version of _handle_list_tools — avoids run_async_safely."""
462
+ try:
463
+ current_mode = getattr(self, "_mode", None)
464
+ tools_result = await self._async_list_tools()
465
+ tools = []
466
+ for tool in tools_result:
467
+ if tool.name not in self._mode_tool_schemas:
468
+ tools.append(
469
+ Tool(
470
+ name=tool.name,
471
+ description=tool.description or "",
472
+ input_schema=tool.inputSchema
473
+ if hasattr(tool, "inputSchema")
474
+ else {},
475
+ )
476
+ )
477
+ for tool_name, mode_schemas in self._mode_tool_schemas.items():
478
+ if None in mode_schemas:
479
+ schema = mode_schemas[None]
480
+ tools.append(
481
+ Tool(
482
+ name=schema["name"],
483
+ description=schema["description"],
484
+ input_schema=schema["input_schema"],
485
+ )
486
+ )
487
+ elif current_mode in mode_schemas:
488
+ schema = mode_schemas[current_mode]
489
+ tools.append(
490
+ Tool(
491
+ name=schema["name"],
492
+ description=schema["description"],
493
+ input_schema=schema["input_schema"],
494
+ )
495
+ )
496
+ return ListToolsObservation(tools=tools)
497
+ except Exception as e:
498
+ return ListToolsObservation(
499
+ tools=[],
500
+ metadata={"error": str(e), "error_type": "list_tools_failed"},
501
+ )
502
+
503
    async def _async_handle_call_tool(
        self,
        action: CallToolAction,
        timeout_s: Optional[float] = None,
    ) -> CallToolObservation:
        """Async version of _handle_call_tool — avoids run_async_safely."""
        timeout = timeout_s if timeout_s is not None else MCP_TOOL_CALL_TIMEOUT
        tool_name = action.tool_name
        current_mode = getattr(self, "_mode", None)

        # Mode-aware tools tracked by this class are invoked directly,
        # without going through the MCP client.
        if tool_name in self._mode_tools:
            mode_info = self._mode_tools[tool_name]
            # A mode-agnostic (None) registration takes precedence.
            if None in mode_info:
                func = mode_info[None]
            elif current_mode in mode_info:
                func = mode_info[current_mode]
            else:
                # Registered, but not for the active mode.
                return CallToolObservation(
                    tool_name=tool_name,
                    result=None,
                    error=ToolError(
                        error_type=ToolErrorType.TOOL_NOT_FOUND,
                        message=f"Tool '{tool_name}' not available in {current_mode} mode",
                    ),
                )
            # NOTE(review): this direct-call path does not apply `timeout` —
            # only MCP-client calls below are wrapped in wait_for. Confirm
            # whether local tools should also be time-bounded.
            try:
                if inspect.iscoroutinefunction(func):
                    result = await func(**action.arguments)
                else:
                    result = func(**action.arguments)
                # Wrap the raw return value in a CallToolResult so local
                # tools look identical to MCP-served tools to callers.
                return CallToolObservation(
                    tool_name=tool_name,
                    result=CallToolResult(
                        content=[TextContent(type="text", text=str(result))],
                        structured_content={"result": result},
                        meta=None,
                        data=result,
                        is_error=False,
                    ),
                )
            except Exception as e:
                return CallToolObservation(
                    tool_name=tool_name,
                    result=None,
                    error=ToolError(
                        error_type=ToolErrorType.EXECUTION_ERROR,
                        message=str(e),
                    ),
                )

        # Otherwise route the call through the MCP client, bounded by timeout.
        try:
            result = await asyncio.wait_for(
                self._async_call_tool(action.tool_name, action.arguments),
                timeout=timeout,
            )
            return CallToolObservation(tool_name=action.tool_name, result=result)
        except asyncio.TimeoutError:
            return CallToolObservation(
                tool_name=action.tool_name,
                result=None,
                error=ToolError(
                    error_type=ToolErrorType.TIMEOUT,
                    message=f"Tool '{action.tool_name}' timed out after {timeout} seconds",
                ),
            )
        except Exception as e:
            # Best-effort classification of the failure from the message text;
            # heuristic only, since the MCP client raises generic exceptions.
            error_message = str(e)
            if (
                "not found" in error_message.lower()
                or "unknown tool" in error_message.lower()
            ):
                error_type = ToolErrorType.TOOL_NOT_FOUND
            elif (
                "invalid" in error_message.lower()
                or "argument" in error_message.lower()
            ):
                error_type = ToolErrorType.INVALID_ARGS
            else:
                error_type = ToolErrorType.EXECUTION_ERROR
            return CallToolObservation(
                tool_name=action.tool_name,
                result=None,
                error=ToolError(error_type=error_type, message=error_message),
            )
587
+
588
+ async def step_async(
589
+ self,
590
+ action: Action,
591
+ timeout_s: Optional[float] = None,
592
+ **kwargs: Any,
593
+ ) -> Observation:
594
+ """
595
+ Async step that routes MCP actions without going through run_async_safely.
596
+
597
+ The WebSocket handler calls this directly on the outer event loop, where
598
+ the MCP session is already open, avoiding the thread/event-loop deadlock
599
+ that occurs when the sync step() path is used via run_in_executor.
600
+ """
601
+ if isinstance(action, ListToolsAction):
602
+ return await self._async_handle_list_tools()
603
+ elif isinstance(action, CallToolAction):
604
+ return await self._async_handle_call_tool(action, timeout_s=timeout_s)
605
+ else:
606
+ loop = asyncio.get_event_loop()
607
+ return await loop.run_in_executor(
608
+ None, lambda: self._step_impl(action, timeout_s=timeout_s, **kwargs)
609
+ )
610
+
611
    @abstractmethod
    def _step_impl(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """
        Handle non-MCP actions in the environment.

        Subclasses must implement this method to handle any actions that are
        not ListToolsAction or CallToolAction. This is where environment-specific
        action processing should occur.

        Args:
            action: The action to execute (guaranteed not to be an MCP action).
            timeout_s: Optional timeout in seconds.
            **kwargs: Additional arguments.

        Returns:
            An Observation appropriate for the action.
        """
        pass
634
+
635
+ def close(self) -> None:
636
+ """
637
+ Clean up resources used by the environment.
638
+
639
+ This method cleans up the MCP client and any other resources.
640
+ Subclasses should call super().close() if they override this method.
641
+ """
642
+ # The MCP client uses async context manager, so cleanup happens
643
+ # automatically when the context exits. We just clear references.
644
+ self.mcp_client = None
645
+ self.mcp_server = None