# Spaces: anugrah55 / opensleuth-env-gemini-cli (Paused)
# File size: 10,442 Bytes
# 31715b5

"""OpenEnv 0.2.x protocol conformance tests for the OpenSleuth env.

These tests are *additive* and orthogonal to the existing legacy contract
covered in ``test_env.py`` / ``test_open_env.py``.

What we verify:

* The OpenEnv ``Environment`` adapter (:class:`OpenSleuthEnvironment`) implements
  all four required members (the ``reset`` / ``step`` / ``get_metadata`` methods
  and the ``state`` attribute) and returns instances of OpenEnv's
  ``Observation`` / ``State`` / ``EnvironmentMetadata`` base classes (so it
  would pass any ``isinstance`` check by an OpenEnv-aware harness).
* The ``/openenv/*`` HTTP sub-app exposes every endpoint OpenEnv 0.2.x
  promises: ``/health``, ``/metadata``, ``/schema``, ``/state``, ``/reset``,
  ``/step``. (The ``/ws`` WebSocket is exercised separately via the
  ``smoke_openenv_client.py`` script run against the live Space.)
* ``/openenv/reset`` returns the canonical ``{"observation", "reward", "done"}``
  envelope (NOT a bare observation, which is the legacy shape).
* ``/openenv/step`` accepts the canonical ``{"action": {...}}`` envelope (NOT
  ``{"episode_id", "action"}``, which is the legacy shape).
* The legacy bare ``/reset`` and ``/step`` routes the trainer uses are
  untouched.
"""

from __future__ import annotations

from collections.abc import Iterator

import pytest

# Skip this whole module when openenv-core is not importable. The guard has to
# run before the ``openenv.core.env_server.types`` import further down, which
# would otherwise fail with ImportError instead of a clean skip.
pytest.importorskip(
    "openenv.core.env_server.types",
    reason="openenv-core not installed; conformance tests skipped.",
)

from fastapi.testclient import TestClient
from openenv.core.env_server.types import (
    EnvironmentMetadata,
    Observation as OEObservation,
    State as OEState,
)

from opensleuth_env.openenv_adapter import (
    OpenSleuthAction,
    OpenSleuthEnvironment,
    OpenSleuthObservation,
    OpenSleuthState,
)


# ---------------------------------------------------------------------------
# Adapter-level: exercises the Environment subclass directly (no HTTP).
# ---------------------------------------------------------------------------


class TestEnvironmentSubclass:
    """Adapter-level checks that call ``OpenSleuthEnvironment`` directly."""

    def test_observation_inherits_openenv_base(self) -> None:
        """A bare ``reset()`` returns an OpenEnv ``Observation`` in its initial state."""
        sleuth_env = OpenSleuthEnvironment()
        first_obs = sleuth_env.reset()
        assert isinstance(first_obs, OEObservation), (
            "OpenSleuthObservation must subclass openenv.core...types.Observation "
            "so OpenEnv tooling (rubrics, evals, web UI) can introspect it."
        )
        # The OpenEnv-required fields must be present with their initial values.
        assert first_obs.done is False
        assert first_obs.reward is None
        assert isinstance(first_obs.metadata, dict)

    def test_state_inherits_openenv_base(self) -> None:
        """After a reset, ``state`` is an OpenEnv ``State`` with an id and zero steps."""
        sleuth_env = OpenSleuthEnvironment()
        sleuth_env.reset()
        snapshot = sleuth_env.state
        assert isinstance(snapshot, OEState)
        assert snapshot.episode_id is not None
        assert snapshot.step_count == 0

    def test_metadata_is_openenv_environment_metadata(self) -> None:
        """``get_metadata()`` returns a populated ``EnvironmentMetadata``."""
        info = OpenSleuthEnvironment().get_metadata()
        assert isinstance(info, EnvironmentMetadata)
        assert info.name == "OpenSleuth"
        # Both remaining descriptive fields only need to be truthy.
        for required_attr in ("description", "version"):
            assert getattr(info, required_attr)

    def test_reset_step_full_loop(self) -> None:
        """Drive one episode end to end: reset, a single probe, then a submit."""
        sleuth_env = OpenSleuthEnvironment()
        sleuth_env.reset(target_name="fibonacci", max_steps=10, seed=0)

        # Probe step: the episode stays open, a positive reward is paid and the
        # newest history entry carries the probed output.
        probe_action = OpenSleuthAction(action_type="probe", input_repr="10")
        after_probe = sleuth_env.step(probe_action)
        assert after_probe.done is False
        assert after_probe.reward is not None
        assert after_probe.reward > 0
        newest_probe = after_probe.probe_history[-1]
        assert newest_probe["output_repr"] == "55"
        assert sleuth_env.state.step_count == 1

        # Submit step: the episode must terminate and report some reward.
        candidate_source = "def fibonacci(n):\n    a,b=0,1\n    for _ in range(n-1):\n        a,b=b,a+b\n    return b\n"
        submit_action = OpenSleuthAction(action_type="submit", code=candidate_source)
        after_submit = sleuth_env.step(submit_action)
        assert after_submit.done is True
        assert after_submit.reward is not None
        assert sleuth_env.state.finished is True

    def test_reset_with_no_args_uses_safe_default(self) -> None:
        """OpenEnv requires ``reset()`` to work with zero arguments.

        'fibonacci' is the implicit default target, so a bare reset always
        produces a valid episode.
        """
        default_obs = OpenSleuthEnvironment().reset()
        assert default_obs.target_function_name == "fibonacci"

    def test_supports_concurrent_sessions_flag(self) -> None:
        """The env opts in to concurrent sessions.

        OpenEnv's HTTPEnvServer refuses max_concurrent_envs > 1 unless the env
        sets SUPPORTS_CONCURRENT_SESSIONS.
        """
        opted_in = OpenSleuthEnvironment.SUPPORTS_CONCURRENT_SESSIONS
        assert opted_in is True

    def test_action_is_extra_forbid(self) -> None:
        """Unknown fields on ``OpenSleuthAction`` are rejected.

        OpenEnv's Action base sets extra='forbid' to catch typo'd fields early;
        OpenSleuthAction must inherit that behavior.
        """
        from pydantic import ValidationError

        kwargs_with_typo = {
            "action_type": "probe",
            "input_repr": "1",
            "made_up_field": 1,
        }
        with pytest.raises(ValidationError):
            OpenSleuthAction(**kwargs_with_typo)


# ---------------------------------------------------------------------------
# HTTP-level: verifies the /openenv/* sub-app routes that judges will hit.
# ---------------------------------------------------------------------------


@pytest.fixture(scope="module")
def http_client() -> Iterator[TestClient]:
    """Yield a ``TestClient`` wrapping the ``app`` object exported by ``server``.

    The fixture is module-scoped, so all HTTP-level tests in this file share
    one client. ``server`` is imported inside the fixture rather than at module
    import time. The client is entered as a context manager, which (per
    Starlette's ``TestClient``) runs the app's startup/shutdown handlers and
    closes the client after the last dependent test.

    Yields:
        The entered ``TestClient`` instance.
    """
    from server import app

    with TestClient(app) as client:
        yield client


class TestOpenEnvHttpSurface:
    """The endpoints the OpenEnv spec / `openenv validate` look for."""

    def test_health(self, http_client: TestClient) -> None:
        """``GET /openenv/health`` answers 200 with exactly ``{"status": "healthy"}``."""
        r = http_client.get("/openenv/health")
        assert r.status_code == 200, r.text
        assert r.json() == {"status": "healthy"}

    def test_metadata(self, http_client: TestClient) -> None:
        """``GET /openenv/metadata`` exposes name, description and version."""
        r = http_client.get("/openenv/metadata")
        assert r.status_code == 200, r.text
        body = r.json()
        for key in ("name", "description", "version"):
            assert key in body, f"missing {key} in /openenv/metadata"
        assert body["name"] == "OpenSleuth"

    def test_schema(self, http_client: TestClient) -> None:
        """``GET /openenv/schema`` returns schemas for action, observation and state."""
        r = http_client.get("/openenv/schema")
        assert r.status_code == 200, r.text
        body = r.json()
        for key in ("action", "observation", "state"):
            assert key in body, f"missing {key} in /openenv/schema"
            # A "properties" mapping is used here as the marker of an object schema.
            assert "properties" in body[key], (
                f"/openenv/schema {key!r} is not a valid JSON schema"
            )
        # action discriminator should be visible in the schema
        assert "action_type" in body["action"]["properties"]

    def test_state(self, http_client: TestClient) -> None:
        """``GET /openenv/state`` reports ``episode_id`` and ``step_count``."""
        r = http_client.get("/openenv/state")
        assert r.status_code == 200, r.text
        body = r.json()
        assert "episode_id" in body
        assert "step_count" in body

    def test_reset_returns_canonical_envelope(self, http_client: TestClient) -> None:
        """``POST /openenv/reset`` returns the OpenEnv envelope, not a bare observation."""
        r = http_client.post("/openenv/reset", json={"target_name": "fibonacci"})
        assert r.status_code == 200, r.text
        body = r.json()
        # Canonical OpenEnv shape: {"observation": {...}, "reward": ..., "done": ...}
        assert set(body.keys()) == {"observation", "reward", "done"}, (
            f"Expected OpenEnv envelope, got keys: {sorted(body)}"
        )
        assert body["done"] is False
        assert body["observation"]["target_function_name"] == "fibonacci"

    def test_reset_with_no_body_works(self, http_client: TestClient) -> None:
        """OpenEnv ResetRequest defaults to an empty body. Must still work."""
        r = http_client.post("/openenv/reset")
        assert r.status_code == 200, r.text
        body = r.json()
        assert "observation" in body

    def test_step_canonical_envelope_with_probe(self, http_client: TestClient) -> None:
        """``POST /openenv/step`` accepts ``{"action": {...}}`` and returns the envelope."""
        r = http_client.post(
            "/openenv/step",
            json={"action": {"action_type": "probe", "input_repr": "10"}},
        )
        assert r.status_code == 200, r.text
        body = r.json()
        assert set(body.keys()) == {"observation", "reward", "done"}
        # Note: under HTTP (stateless), each /openenv/step gets a fresh env;
        # we auto-reset so a probe still produces a valid history.
        assert body["observation"]["probe_history"], "probe should produce history"

    def test_step_rejects_unknown_action_field(self, http_client: TestClient) -> None:
        """An action carrying an undeclared field is rejected with HTTP 422."""
        r = http_client.post(
            "/openenv/step",
            json={"action": {"action_type": "probe", "input_repr": "1", "wat": True}},
        )
        # OpenEnv's deserialize_action raises ValidationError -> 422.
        # Include the response text so a wrong status is diagnosable, matching
        # the other status assertions in this class.
        assert r.status_code == 422, r.text


# ---------------------------------------------------------------------------
# Regression: the legacy trainer-facing routes must still work unchanged.
# ---------------------------------------------------------------------------


class TestLegacyContractPreserved:
    """Regression checks on the bare ``/health``, ``/reset`` and ``/step`` routes."""

    # Reset payload shared by the tests below (same values for every request).
    _RESET_PAYLOAD = {"target_name": "fibonacci", "seed": 0, "max_steps": 5}

    def test_legacy_health(self, http_client: TestClient) -> None:
        """``GET /health`` answers 200 with ``status == "ok"``."""
        r = http_client.get("/health")
        assert r.status_code == 200, r.text
        assert r.json()["status"] == "ok"

    def test_legacy_reset_returns_bare_observation(self, http_client: TestClient) -> None:
        """Trainer expects {episode_id, target_function_name, ...} at the top
        level (NOT wrapped in {observation: ...}). Must NOT regress."""
        r = http_client.post("/reset", json=self._RESET_PAYLOAD)
        assert r.status_code == 200, r.text
        body = r.json()
        assert "episode_id" in body, (
            "Legacy /reset must return a bare observation, not the OpenEnv envelope. "
            "If this fails the trainer will break."
        )
        assert "observation" not in body  # don't accidentally double-wrap

    def test_legacy_step_returns_step_response(self, http_client: TestClient) -> None:
        """``POST /step`` with ``{episode_id, action}`` returns the legacy step shape."""
        reset_resp = http_client.post("/reset", json=self._RESET_PAYLOAD)
        # Fail with the server's own message rather than a KeyError on
        # "episode_id" if the reset itself did not succeed.
        assert reset_resp.status_code == 200, reset_resp.text
        eid = reset_resp.json()["episode_id"]
        r = http_client.post(
            "/step",
            json={
                "episode_id": eid,
                "action": {"action_type": "probe", "input_repr": "5"},
            },
        )
        assert r.status_code == 200, r.text
        body = r.json()
        # Legacy shape: {observation, reward, done, info}
        assert {"observation", "reward", "done", "info"} <= set(body.keys())
        # NOTE(review): this looks at the top-level response keys. If the
        # intent is that a probe's ``info`` carries no ``execution_reward``,
        # the check probably belongs on ``body["info"]`` — confirm against the
        # server's probe response before tightening.
        assert "execution_reward" not in body  # only present on submit info