Spaces:
Running
Running
File size: 7,215 Bytes
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Integration tests for CodingEnv with Docker.
These tests require Docker to be running and the coding-env image to be built:
docker build -t coding-env:latest -f envs/coding_env/server/Dockerfile .
Run with (every test is skipped unless SKIP_DOCKER_TESTS=0 is set):
SKIP_DOCKER_TESTS=0 PYTHONPATH=src:envs uv run pytest tests/envs/test_coding_env_integration.py -v
"""
import os
import sys
from pathlib import Path
import pytest
# Make the directory two levels above this file, plus its src/ and envs/
# subdirectories, importable. Each entry is pushed to the front of sys.path,
# so the last one listed ends up being searched first.
for _extra_path in (
    os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")),
    str(Path(__file__).parent.parent.parent / "src"),
    str(Path(__file__).parent.parent.parent / "envs"),
):
    sys.path.insert(0, _extra_path)
# Docker-backed tests are opt-in. They are skipped while SKIP_DOCKER_TESTS is
# "1", which is also the value assumed when the variable is unset. Docker
# itself is not probed here.
docker_available = pytest.mark.skipif(
    os.getenv("SKIP_DOCKER_TESTS", "1") == "1",
    reason="Docker tests disabled. Set SKIP_DOCKER_TESTS=0 to enable.",
)
from coding_env import CodeAction, CodingEnv
# ============================================================================
# Fixtures
# ============================================================================
@pytest.fixture(scope="module")
def coding_env_client():
    """Yield a single CodingEnv client shared by the tests in this module.

    The client is created from the ``coding-env:latest`` Docker image. The
    fixture is module-scoped so the container is not started and stopped for
    every test, which is slow. The client is closed during fixture teardown.
    """
    env = CodingEnv.from_docker_image("coding-env:latest")
    yield env
    env.close()
# ============================================================================
# Integration Tests
# ============================================================================
@docker_available
class TestCodingEnvDocker:
    """Integration tests that run against the Docker container.

    Every test that executes code calls ``reset()`` on the shared client
    first, so no test relies on state left behind by another one.
    """

    def test_reset(self, coding_env_client):
        """Test that reset returns a valid observation."""
        result = coding_env_client.reset()

        assert result.observation is not None
        assert result.observation.exit_code == 0
        assert result.observation.stderr == ""

    def test_step_simple_print(self, coding_env_client):
        """Test executing a simple print statement."""
        coding_env_client.reset()

        result = coding_env_client.step(CodeAction(code="print('Hello, World!')"))

        assert result.observation.exit_code == 0
        assert "Hello, World!" in result.observation.stdout
        assert result.reward is not None

    def test_step_calculation(self, coding_env_client):
        """Test executing a calculation."""
        coding_env_client.reset()

        result = coding_env_client.step(
            CodeAction(code="x = 5 + 3\nprint(f'Result: {x}')")
        )

        assert result.observation.exit_code == 0
        assert "Result: 8" in result.observation.stdout

    def test_step_import_math(self, coding_env_client):
        """Test importing and using the math module."""
        coding_env_client.reset()

        result = coding_env_client.step(
            CodeAction(code="import math\nprint(f'Pi: {math.pi:.4f}')")
        )

        assert result.observation.exit_code == 0
        assert "Pi: 3.1416" in result.observation.stdout

    def test_step_multiline(self, coding_env_client):
        """Test executing multi-line code."""
        coding_env_client.reset()
        # The payload is spelled out line by line so the loop body's
        # indentation is explicit; without it the snippet is not valid Python.
        code = (
            "\n"
            "for i in range(1, 4):\n"
            "    print(f'{i} squared is {i**2}')\n"
        )

        result = coding_env_client.step(CodeAction(code=code))

        assert result.observation.exit_code == 0
        assert "1 squared is 1" in result.observation.stdout
        assert "2 squared is 4" in result.observation.stdout
        assert "3 squared is 9" in result.observation.stdout

    def test_error_division_by_zero(self, coding_env_client):
        """Test that division by zero returns an error."""
        coding_env_client.reset()

        result = coding_env_client.step(CodeAction(code="x = 1 / 0"))

        assert result.observation.exit_code == 1
        # Only a non-empty stderr is required. The exact wording of the error
        # text is not pinned by this test.
        assert result.observation.stderr, "expected an error message on stderr"

    def test_error_undefined_variable(self, coding_env_client):
        """Test that undefined variable returns an error."""
        coding_env_client.reset()

        result = coding_env_client.step(CodeAction(code="print(undefined_variable)"))

        assert result.observation.exit_code == 1

    def test_error_syntax_error(self, coding_env_client):
        """Test that syntax error returns an error."""
        coding_env_client.reset()

        # The closing parenthesis is deliberately missing.
        result = coding_env_client.step(CodeAction(code="print('Hello'"))

        assert result.observation.exit_code == 1

    def test_state_tracking(self, coding_env_client):
        """Test that the step count starts at 0 and grows by one per step."""
        coding_env_client.reset()
        state = coding_env_client.state()
        assert state.episode_id is not None
        assert state.step_count == 0

        coding_env_client.step(CodeAction(code="x = 1"))
        state = coding_env_client.state()
        assert state.step_count == 1

        coding_env_client.step(CodeAction(code="y = 2"))
        state = coding_env_client.state()
        assert state.step_count == 2

    def test_reward_safe_code(self, coding_env_client):
        """Test that safe code receives a positive or zero reward."""
        coding_env_client.reset()

        result = coding_env_client.step(CodeAction(code="x = 5"))

        assert result.reward is not None
        assert result.reward >= 0  # Safe code should not be penalized

    def test_reward_dangerous_code(self, coding_env_client):
        """Test that dangerous code receives a negative reward."""
        coding_env_client.reset()

        result = coding_env_client.step(CodeAction(code="import os"))

        assert result.reward is not None
        assert result.reward < 0  # Dangerous code should be penalized

    def test_variable_persistence_within_episode(self, coding_env_client):
        """Test that variables persist within an episode."""
        coding_env_client.reset()

        # Define a variable
        coding_env_client.step(CodeAction(code="my_var = 42"))

        # Use the variable in a subsequent step
        result = coding_env_client.step(CodeAction(code="print(my_var)"))

        assert result.observation.exit_code == 0
        assert "42" in result.observation.stdout

    def test_reset_clears_variables(self, coding_env_client):
        """Test that reset clears variables from previous episode."""
        # Define a variable
        coding_env_client.reset()
        coding_env_client.step(CodeAction(code="my_var = 42"))

        # Reset and try to use the variable
        coding_env_client.reset()
        result = coding_env_client.step(CodeAction(code="print(my_var)"))

        # Should fail because my_var is no longer defined
        assert result.observation.exit_code == 1
|