Spaces:

anugrah55
/

opensleuth-env-gemini-cli

Paused

App Files Files Community

anugrah55 committed 12 days ago

Commit

9030acd

verified ·

1 Parent(s): 77e65fb

env: add GET /tasks/{name}/sample_inputs for trainer-side fuzz delegation

Browse files

Files changed (2) hide show

server.py +38 -1
tests/test_open_env.py +500 -0

server.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import logging
 from typing import Optional
 from fastapi import FastAPI, HTTPException, Query
@@ -17,11 +18,12 @@ from opensleuth_env import (
     SubmitAction,
     TaskCatalog,
 )
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
 log = logging.getLogger("opensleuth.server")
-app = FastAPI(title="OpenSleuth Env", version="0.4.0")
 env = OpenSleuthEnv()
@@ -141,3 +143,38 @@ def probe_once(target_name: str, input_repr: str):
     obs = env.reset(target_name=target_name)
     resp = env.step(obs.episode_id, ProbeAction(input_repr=input_repr))
     return resp

 from __future__ import annotations
 import logging
+import random
 from typing import Optional
 from fastapi import FastAPI, HTTPException, Query
     SubmitAction,
     TaskCatalog,
 )
+from opensleuth_env.task_catalog import TaskResolutionError
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
 log = logging.getLogger("opensleuth.server")
+app = FastAPI(title="OpenSleuth Env", version="0.4.1")
 env = OpenSleuthEnv()
     obs = env.reset(target_name=target_name)
     resp = env.step(obs.episode_id, ProbeAction(input_repr=input_repr))
     return resp
+@app.get("/tasks/{name}/sample_inputs")
+def sample_inputs(
+    name: str,
+    n: int = Query(8, ge=1, le=64, description="How many inputs to draw."),
+    seed: int = Query(0, description="Deterministic seed for the fuzzer."),
+):
+    """Return ``repr`` strings of inputs drawn from the fuzzer of task ``name``.
+
+    Used by the trainer to build in-context probe pools without having to
+    duplicate the auto-fuzzer logic on the trainer side. The strings are
+    meant to be `ast.literal_eval`-safe so they can be POSTed straight back
+    to `/step` as a `ProbeAction.input_repr`; this handler does not itself
+    validate that, it simply applies ``repr`` to whatever the fuzzer yields.
+
+    Parameters:
+        name: Task name, resolved through ``env.catalog``.
+        n: How many inputs to request from the fuzzer (1..64).
+        seed: Seed for the ``random.Random`` instance handed to the fuzzer.
+
+    Returns:
+        A dict with the echoed ``name``/``n``/``seed``, the task's
+        ``unpack_args`` flag (``False`` when the spec has no such
+        attribute) and ``inputs``, the list of ``repr`` strings.
+
+    Raises:
+        HTTPException: 404 when the catalog cannot resolve ``name``;
+            500 when the task's fuzzer raises.
+    """
+    try:
+        spec = env.catalog.resolve(target_name=name)
+    except TaskResolutionError as e:
+        raise HTTPException(status_code=404, detail=str(e)) from e
+    # Fresh RNG per request: the same seed replays the same draw, assuming
+    # the fuzzer takes all of its randomness from the RNG it is handed.
+    rng = random.Random(seed)
+    try:
+        raw_inputs = spec.fuzzer(rng, n)
+    except Exception as e:  # noqa: BLE001
+        # Keep the full traceback server-side; the client only receives the
+        # one-line summary in the 500 response below.
+        log.exception("fuzzer for %r failed (n=%d, seed=%d)", name, n, seed)
+        raise HTTPException(
+            status_code=500,
+            detail=f"fuzzer for {name!r} failed: {type(e).__name__}: {e}",
+        ) from e
+    return {
+        "name": name,
+        "n": n,
+        "seed": seed,
+        "unpack_args": bool(getattr(spec, "unpack_args", False)),
+        "inputs": [repr(x) for x in raw_inputs],
+    }

tests/test_open_env.py ADDED Viewed

	@@ -0,0 +1,500 @@

+"""Tests for OpenSleuth Level 2: auto-fuzzer + TaskCatalog + open /reset.
+These tests do *not* require Hub network access. The Hub-availability test
+is opportunistic: it asserts ``>=15`` total tasks if the dataset loads, but
+is skipped (via ``pytest.skip``) if the Hub is offline / the env is sandboxed.
+"""
+from __future__ import annotations
+import os
+import random
+import typing
+from typing import Optional, Literal
+import pytest
+from fastapi.testclient import TestClient
+from opensleuth_env import (
+    BLACK_BOX_FUNCTIONS,
+    OpenSleuthEnv,
+    ProbeAction,
+    SubmitAction,
+    TaskCatalog,
+    TaskResolutionError,
+    auto_fuzz,
+)
+# ---------------------------------------------------------------------------
+# Auto-fuzzer
+# ---------------------------------------------------------------------------
+class TestAutoFuzzerTypes:
+    """Checks that ``auto_fuzz`` draws values matching each parameter annotation.
+
+    Every test defines a small local function ``f``, asks ``auto_fuzz`` for a
+    batch of inputs using a seeded ``random.Random``, and asserts on the
+    types of the generated argument tuples.
+    """
+    def _rng(self, seed: int = 0) -> random.Random:
+        """Return a ``random.Random`` seeded with ``seed`` (default 0)."""
+        return random.Random(seed)
+    def test_int_inputs_are_ints(self):
+        """``int`` parameter: 50 one-element tuples holding non-bool ints."""
+        def f(n: int) -> int:
+            return n
+        rng = self._rng()
+        outs = auto_fuzz(f, 50, rng)
+        assert len(outs) == 50
+        assert all(isinstance(t, tuple) and len(t) == 1 for t in outs)
+        # bool is a subclass of int, so it is excluded explicitly.
+        assert all(isinstance(t[0], int) and not isinstance(t[0], bool) for t in outs)
+    def test_str_inputs_are_strs(self):
+        """``str`` parameter: every generated argument is a ``str``."""
+        def f(s: str) -> int:
+            return len(s)
+        outs = auto_fuzz(f, 30, self._rng())
+        assert all(isinstance(t[0], str) for t in outs)
+    def test_list_int_inputs_are_lists_of_ints(self):
+        """``list[int]`` parameter: a list whose elements are all ints."""
+        def f(xs: list[int]) -> int:
+            return sum(xs)
+        outs = auto_fuzz(f, 30, self._rng())
+        for (xs,) in outs:
+            assert isinstance(xs, list)
+            # NOTE: unlike test_int_inputs_are_ints, bool elements would
+            # also satisfy this isinstance check.
+            assert all(isinstance(x, int) for x in xs)
+    def test_homogeneous_tuple_inputs(self):
+        """``tuple[int, ...]`` parameter: a tuple whose elements are all ints."""
+        def f(xs: tuple[int, ...]) -> int:
+            return sum(xs)
+        outs = auto_fuzz(f, 30, self._rng())
+        for (xs,) in outs:
+            assert isinstance(xs, tuple)
+            assert all(isinstance(x, int) for x in xs)
+    def test_heterogeneous_tuple_inputs(self):
+        """``tuple[int, str]`` parameter: a 2-tuple of (int, str)."""
+        def f(t: tuple[int, str]) -> int:
+            return len(t[1])
+        outs = auto_fuzz(f, 30, self._rng())
+        for (t,) in outs:
+            assert isinstance(t, tuple) and len(t) == 2
+            assert isinstance(t[0], int)
+            assert isinstance(t[1], str)
+    def test_optional_inputs_sometimes_None(self):
+        """``Optional[int]`` parameter: both ``None`` and ints appear in 200 draws."""
+        def f(x: Optional[int]) -> int:
+            return 0
+        outs = auto_fuzz(f, 200, self._rng(seed=42))
+        seen_none = any(t[0] is None for t in outs)
+        seen_int = any(isinstance(t[0], int) and not isinstance(t[0], bool) for t in outs)
+        assert seen_none, "Optional[int] should occasionally yield None"
+        assert seen_int, "Optional[int] should also yield ints"
+    def test_literal_inputs_only_pick_listed_values(self):
+        """``Literal["a", "b", "c"]`` parameter: only the listed values are drawn."""
+        def f(mode: Literal["a", "b", "c"]) -> int:
+            return 0
+        outs = auto_fuzz(f, 50, self._rng())
+        for (m,) in outs:
+            assert m in ("a", "b", "c")
+    def test_dict_str_int_inputs(self):
+        """``dict[str, int]`` parameter: a dict with str keys and int values."""
+        def f(d: dict[str, int]) -> int:
+            return len(d)
+        outs = auto_fuzz(f, 20, self._rng())
+        for (d,) in outs:
+            assert isinstance(d, dict)
+            for k, v in d.items():
+                assert isinstance(k, str)
+                assert isinstance(v, int)
+    def test_multi_arg_returns_full_tuples(self):
+        """Two-parameter function: each draw is a full ``(int, str)`` tuple."""
+        def f(a: int, b: str) -> int:
+            return 0
+        outs = auto_fuzz(f, 20, self._rng())
+        for t in outs:
+            assert isinstance(t, tuple)
+            assert len(t) == 2
+            assert isinstance(t[0], int)
+            assert isinstance(t[1], str)
+    def test_unannotated_param_falls_back_to_int(self):
+        """Parameter without an annotation: generated values are ints."""
+        def f(x):  # no annotation
+            return x
+        outs = auto_fuzz(f, 30, self._rng())
+        for (x,) in outs:
+            assert isinstance(x, int)
+class TestAutoFuzzerSpecOverride:
+    """Checks that a ``fuzz_spec`` passed to ``auto_fuzz`` constrains the draws.
+
+    ``fuzz_spec`` is keyed by parameter name; each entry carries a ``type``
+    plus the bounds asserted on below (``min``/``max``, ``alphabet``,
+    ``max_len``, ``elem``, ``elems``).
+    """
+    def test_int_min_max_overrides_default_range(self):
+        """``min``/``max`` on an int spec: every draw lies in [1, 5]."""
+        def f(n: int) -> int:
+            return n
+        outs = auto_fuzz(f, 100, random.Random(0), fuzz_spec={"n": {"type": "int", "min": 1, "max": 5}})
+        for (n,) in outs:
+            assert 1 <= n <= 5, f"expected n in [1, 5], got {n}"
+    def test_str_alphabet_override(self):
+        """``alphabet``/``max_len`` on a str spec bound the characters and length."""
+        def f(s: str) -> int:
+            return len(s)
+        outs = auto_fuzz(
+            f, 100, random.Random(0),
+            fuzz_spec={"s": {"type": "str", "alphabet": "ab", "max_len": 4}},
+        )
+        for (s,) in outs:
+            assert len(s) <= 4
+            for ch in s:
+                assert ch in "ab", f"unexpected char {ch!r} in {s!r}"
+    def test_list_elem_override(self):
+        """A list spec's ``elem`` and ``max_len`` bound the elements and length."""
+        def f(xs: list[int]) -> int:
+            return sum(xs)
+        outs = auto_fuzz(
+            f, 80, random.Random(0),
+            fuzz_spec={"xs": {"type": "list", "elem": {"type": "int", "min": 0, "max": 3}, "max_len": 4}},
+        )
+        for (xs,) in outs:
+            assert len(xs) <= 4
+            for v in xs:
+                assert 0 <= v <= 3
+    def test_tuple_elems_override(self):
+        """A tuple spec's ``elems`` list fixes the arity and per-position bounds."""
+        # `t` is deliberately unannotated: the shape comes from fuzz_spec alone.
+        def f(t):
+            return t
+        outs = auto_fuzz(
+            f, 30, random.Random(0),
+            fuzz_spec={"t": {"type": "tuple", "elems": [
+                {"type": "int", "min": 0, "max": 1},
+                {"type": "str", "alphabet": "x", "max_len": 2},
+            ]}},
+        )
+        for (t,) in outs:
+            assert isinstance(t, tuple) and len(t) == 2
+            assert 0 <= t[0] <= 1
+            for ch in t[1]:
+                assert ch == "x"
+# ---------------------------------------------------------------------------
+# TaskCatalog
+# ---------------------------------------------------------------------------
+class TestTaskCatalog:
+    """Exercises ``TaskCatalog.resolve`` and ``list_builtin`` with the Hub disabled.
+
+    Every test builds its own ``TaskCatalog(enable_hub=False)``.
+    """
+    def test_resolves_builtin_by_name(self):
+        """A builtin name resolves to the very object in ``BLACK_BOX_FUNCTIONS``."""
+        cat = TaskCatalog(enable_hub=False)
+        spec = cat.resolve(target_name="fibonacci")
+        assert spec.name == "fibonacci"
+        assert spec is BLACK_BOX_FUNCTIONS["fibonacci"]
+        assert spec.unpack_args is False
+        assert spec.source == "builtin"
+    def test_resolves_caller_supplied_target_code(self):
+        """Two-argument caller code yields a ``user`` spec with ``unpack_args`` set."""
+        cat = TaskCatalog(enable_hub=False)
+        code = "def add(a: int, b: int) -> int:\n    return a + b\n"
+        spec = cat.resolve(target_code=code, target_function_name="add")
+        assert spec.name == "add"
+        assert spec.unpack_args is True  # 2-arg
+        assert spec.source == "user"
+        # The wrapped fuzzer must produce calls that succeed end-to-end.
+        rng = random.Random(0)
+        inputs = spec.fuzzer(rng, 10)
+        for args in inputs:
+            assert isinstance(args, tuple) and len(args) == 2
+            assert spec.fn(*args) == args[0] + args[1]
+    def test_caller_supplied_unary_uses_unwrapped_call(self):
+        """One-argument caller code: the fuzzer yields bare values, not 1-tuples."""
+        cat = TaskCatalog(enable_hub=False)
+        code = "def square(n: int) -> int:\n    return n * n\n"
+        spec = cat.resolve(target_code=code, target_function_name="square")
+        assert spec.unpack_args is False
+        rng = random.Random(0)
+        inputs = spec.fuzzer(rng, 5)
+        for x in inputs:
+            assert isinstance(x, int)
+            assert spec.fn(x) == x * x
+    def test_resolve_with_no_source_raises(self):
+        """``resolve()`` with no arguments raises ``TaskResolutionError``."""
+        cat = TaskCatalog(enable_hub=False)
+        with pytest.raises(TaskResolutionError):
+            cat.resolve()
+    def test_resolve_unknown_name_raises(self):
+        """An unknown ``target_name`` raises ``TaskResolutionError``."""
+        cat = TaskCatalog(enable_hub=False)
+        with pytest.raises(TaskResolutionError):
+            cat.resolve(target_name="this_does_not_exist")
+    def test_target_code_without_function_name_raises(self):
+        """``target_code`` without ``target_function_name`` is rejected."""
+        cat = TaskCatalog(enable_hub=False)
+        with pytest.raises(TaskResolutionError):
+            cat.resolve(target_code="def foo(): return 1\n")
+    def test_rejects_oracle_import(self):
+        """Caller code importing ``opensleuth_env`` (plain or ``from`` form) is rejected."""
+        cat = TaskCatalog(enable_hub=False)
+        bad = (
+            "import opensleuth_env\n"
+            "def f(x): return x\n"
+        )
+        with pytest.raises(TaskResolutionError):
+            cat.resolve(target_code=bad, target_function_name="f")
+        bad2 = (
+            "from opensleuth_env.black_box import _fibonacci\n"
+            "def f(x): return _fibonacci(x)\n"
+        )
+        with pytest.raises(TaskResolutionError):
+            cat.resolve(target_code=bad2, target_function_name="f")
+    def test_target_code_using_open_is_blocked_at_call_time(self):
+        """`open` is not in the safe-builtins whitelist. The catalog will
+        compile the function (since `open` is only resolved at call-time
+        via NameError), but invoking it must fail safely."""
+        cat = TaskCatalog(enable_hub=False)
+        code = (
+            "def f(x):\n"
+            "    open('/tmp/x', 'w')\n"
+            "    return 0\n"
+        )
+        spec = cat.resolve(target_code=code, target_function_name="f")
+        with pytest.raises(NameError):
+            spec.fn(0)
+    def test_caller_supplied_edge_cases_are_parsed(self):
+        """String ``edge_cases`` come back on the spec as parsed Python values."""
+        cat = TaskCatalog(enable_hub=False)
+        spec = cat.resolve(
+            target_code="def neg(n: int) -> int:\n    return -n\n",
+            target_function_name="neg",
+            edge_cases=["0", "1", "-1", "100"],
+        )
+        assert spec.edge_cases == [0, 1, -1, 100]
+    def test_caller_supplied_fuzz_spec_is_used(self):
+        """A ``fuzz_spec`` given to ``resolve`` constrains the spec's fuzzer output."""
+        cat = TaskCatalog(enable_hub=False)
+        spec = cat.resolve(
+            target_code="def f(n: int) -> int:\n    return n\n",
+            target_function_name="f",
+            fuzz_spec={"n": {"type": "int", "min": 7, "max": 9}},
+        )
+        rng = random.Random(0)
+        inputs = spec.fuzzer(rng, 50)
+        for x in inputs:
+            assert 7 <= x <= 9
+    def test_list_builtin_returns_nine_entries(self):
+        """``list_builtin`` returns exactly nine entries with the expected keys."""
+        cat = TaskCatalog(enable_hub=False)
+        builtins_list = cat.list_builtin()
+        # NOTE(review): hard-coded count; presumably needs updating whenever
+        # a builtin task is added or removed — confirm.
+        assert len(builtins_list) == 9
+        for entry in builtins_list:
+            assert entry["source"] == "builtin"
+            assert "name" in entry
+            assert "signature" in entry
+            assert "difficulty" in entry
+# ---------------------------------------------------------------------------
+# End-to-end via OpenSleuthEnv
+# ---------------------------------------------------------------------------
+class TestEnvOpenEnded:
+    """Reset/probe/submit flows driven directly through ``OpenSleuthEnv``.
+
+    Covers a builtin target selected by name as well as caller-supplied
+    ``target_code`` with one and two parameters.
+    """
+    def test_legacy_reset_by_target_name_unchanged(self):
+        """Resetting by builtin name still works and probing returns the output repr."""
+        env = OpenSleuthEnv(fuzz_count=10)
+        obs = env.reset(target_name="fibonacci")
+        assert obs.target_function_name == "fibonacci"
+        assert obs.difficulty == "easy"
+        assert obs.steps_taken == 0
+        # Probe via the same path as before.
+        resp = env.step(obs.episode_id, ProbeAction(input_repr="10"))
+        assert resp.observation.probe_history[-1].output_repr == "55"
+    def test_env_caller_supplied_unary_full_loop(self):
+        """Unary caller code: probe once, then submit a correct implementation."""
+        env = OpenSleuthEnv(fuzz_count=10)
+        obs = env.reset(
+            target_code="def square(n: int) -> int:\n    return n * n\n",
+            target_function_name="square",
+        )
+        assert obs.target_function_name == "square"
+        # Probe.
+        resp = env.step(obs.episode_id, ProbeAction(input_repr="5"))
+        assert resp.observation.probe_history[-1].output_repr == "25"
+        # Submit a perfect implementation.
+        code = "def square(n):\n    return n * n\n"
+        resp = env.step(obs.episode_id, SubmitAction(code=code))
+        assert resp.done is True
+        assert resp.info["execution_reward"] == pytest.approx(100.0)
+        # NOTE(review): the 140.0 threshold presumably reflects the execution
+        # reward plus other reward terms — confirm against the reward shaping.
+        assert resp.reward > 140.0
+    def test_env_caller_supplied_multi_arg_full_loop(self):
+        """Two-argument caller code: probes are tuples that get unpacked into the call."""
+        env = OpenSleuthEnv(fuzz_count=10)
+        obs = env.reset(
+            target_code="def add(a: int, b: int) -> int:\n    return a + b\n",
+            target_function_name="add",
+            edge_cases=["(0, 0)", "(1, -1)", "(100, 0)"],
+        )
+        assert obs.target_function_name == "add"
+        # Probe with a 2-tuple.
+        resp = env.step(obs.episode_id, ProbeAction(input_repr="(2, 3)"))
+        assert resp.observation.probe_history[-1].output_repr == "5"
+        # Submit a perfect implementation.
+        code = "def add(a, b):\n    return a + b\n"
+        resp = env.step(obs.episode_id, SubmitAction(code=code))
+        assert resp.done is True
+        assert resp.info["execution_reward"] == pytest.approx(100.0)
+        assert resp.reward > 140.0
+    def test_env_caller_supplied_buggy_submission_scored_negative(self):
+        """A wrong submission (``a - b`` for ``a + b``) ends the episode with negative reward."""
+        env = OpenSleuthEnv(fuzz_count=10)
+        obs = env.reset(
+            target_code="def add(a: int, b: int) -> int:\n    return a + b\n",
+            target_function_name="add",
+        )
+        bad = "def add(a, b):\n    return a - b\n"
+        resp = env.step(obs.episode_id, SubmitAction(code=bad))
+        assert resp.done is True
+        assert resp.info["execution_reward"] < 50.0
+        assert resp.reward < 0.0
+    def test_env_caller_supplied_oracle_import_rejected(self):
+        """``reset`` refuses caller code that imports ``opensleuth_env``."""
+        env = OpenSleuthEnv()
+        # NOTE(review): the catalog-level test expects TaskResolutionError for
+        # the same code; presumably that is a ValueError subclass or reset()
+        # translates it — confirm.
+        with pytest.raises(ValueError):
+            env.reset(
+                target_code="import opensleuth_env\ndef f(x): return x\n",
+                target_function_name="f",
+            )
+# ---------------------------------------------------------------------------
+# HTTP layer
+# ---------------------------------------------------------------------------
+@pytest.fixture(scope="module")
+def http_client():
+    """Yield a ``TestClient`` wrapping the server's ``app``, shared across the module."""
+    # Imported inside the fixture, so `server` is only imported once a test
+    # actually requests this fixture.
+    from server import app
+    # Context-manager form: the client is closed when the module's tests finish.
+    with TestClient(app) as client:
+        yield client
+class TestHttpLayer:
+    """HTTP-level checks of the server routes via the ``http_client`` fixture.
+
+    Routes touched: ``GET /tasks``, ``GET /tasks/{name}/sample_inputs``,
+    ``POST /reset``, ``POST /step`` and ``GET /functions``.
+    """
+    def test_tasks_endpoint_lists_at_least_nine_builtin(self, http_client):
+        """``/tasks?source=builtin`` lists every name in ``BLACK_BOX_FUNCTIONS``."""
+        r = http_client.get("/tasks?source=builtin")
+        assert r.status_code == 200
+        body = r.json()
+        assert body["count"] >= 9
+        names = [t["name"] for t in body["tasks"]]
+        for name in BLACK_BOX_FUNCTIONS:
+            assert name in names
+    def test_tasks_all_includes_at_least_builtins(self, http_client):
+        """``/tasks?source=all`` has >= 9 tasks, and >= 15 when the Hub is reachable."""
+        r = http_client.get("/tasks?source=all")
+        assert r.status_code == 200
+        body = r.json()
+        # The builtins are always present. If the Hub is reachable we'd
+        # expect 15+, but the test must pass even if Hub is unavailable
+        # (e.g. CI sandboxes block egress).
+        assert body["count"] >= 9
+        # Skipped (not failed) when the Hub is disabled or reported an error.
+        if not body["hub"].get("enabled", False) or body["hub"].get("error"):
+            pytest.skip(f"hub not reachable: {body['hub']}")
+        # Hub reachable -> dataset should have 15+ rows after bootstrap.
+        assert body["count"] >= 15
+    def test_sample_inputs_returns_n_repr_strings_for_builtin(self, http_client):
+        """``sample_inputs`` echoes its parameters and returns ``n`` parseable strings."""
+        r = http_client.get("/tasks/fibonacci/sample_inputs?n=5&seed=7")
+        assert r.status_code == 200, r.text
+        body = r.json()
+        assert body["name"] == "fibonacci"
+        assert body["n"] == 5
+        assert body["seed"] == 7
+        assert isinstance(body["inputs"], list)
+        assert len(body["inputs"]) == 5
+        # Every returned string must be ast.literal_eval-safe so the trainer
+        # can post it straight back to /step as a probe input_repr.
+        import ast
+        for s in body["inputs"]:
+            assert isinstance(s, str)
+            ast.literal_eval(s)
+        # Determinism: same seed -> identical inputs.
+        r2 = http_client.get("/tasks/fibonacci/sample_inputs?n=5&seed=7")
+        assert r2.json()["inputs"] == body["inputs"]
+    def test_sample_inputs_unknown_target_404s(self, http_client):
+        """An unknown task name on ``sample_inputs`` returns 404."""
+        r = http_client.get("/tasks/__nope__/sample_inputs?n=2&seed=0")
+        assert r.status_code == 404
+    def test_reset_legacy_target_name_still_works(self, http_client):
+        """``POST /reset`` with only ``target_name`` returns the builtin's observation."""
+        r = http_client.post("/reset", json={
+            "target_name": "fibonacci", "seed": 0, "max_steps": 10,
+        })
+        assert r.status_code == 200
+        body = r.json()
+        assert body["target_function_name"] == "fibonacci"
+        assert "fibonacci" in body["target_function_signature"]
+    def test_reset_caller_supplied_target_code(self, http_client):
+        """Full reset -> probe -> submit loop over HTTP with caller-supplied code."""
+        payload = {
+            "target_code": "def add(a: int, b: int) -> int:\n    return a + b\n",
+            "target_function_name": "add",
+            "edge_cases": ["(0, 0)", "(1, -1)"],
+            "max_steps": 5,
+        }
+        r = http_client.post("/reset", json=payload)
+        assert r.status_code == 200, r.text
+        body = r.json()
+        assert body["target_function_name"] == "add"
+        eid = body["episode_id"]
+        # Probe -> verify wrapping.
+        r = http_client.post("/step", json={
+            "episode_id": eid,
+            "action": {"action_type": "probe", "input_repr": "(7, 8)"},
+        })
+        assert r.status_code == 200, r.text
+        body = r.json()
+        assert body["observation"]["probe_history"][-1]["output_repr"] == "15"
+        # Submit perfect.
+        r = http_client.post("/step", json={
+            "episode_id": eid,
+            "action": {"action_type": "submit", "code": "def add(a, b):\n    return a + b\n"},
+        })
+        assert r.status_code == 200, r.text
+        body = r.json()
+        assert body["done"] is True
+        assert body["info"]["execution_reward"] == pytest.approx(100.0)
+        assert body["reward"] > 140.0
+    def test_reset_with_neither_target_returns_400(self, http_client):
+        """``POST /reset`` with neither a target name nor target code returns 400."""
+        r = http_client.post("/reset", json={"seed": 0})
+        assert r.status_code == 400
+    def test_reset_with_target_code_only_no_function_name_returns_400(self, http_client):
+        """``target_code`` without ``target_function_name`` returns 400."""
+        r = http_client.post("/reset", json={
+            "target_code": "def f(): return 1\n",
+        })
+        assert r.status_code == 400
+    def test_functions_endpoint_unchanged_for_trainer(self, http_client):
+        """``GET /functions`` still lists every builtin with the v0.3 field set."""
+        r = http_client.get("/functions")
+        assert r.status_code == 200
+        body = r.json()
+        assert "functions" in body
+        names = [f["name"] for f in body["functions"]]
+        for name in BLACK_BOX_FUNCTIONS:
+            assert name in names
+        # The original v0.3 fields must all be present.
+        for entry in body["functions"]:
+            for k in ("name", "signature", "description", "difficulty", "edge_case_count"):
+                assert k in entry