File size: 19,588 Bytes
9030acd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7fc062
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9030acd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
"""Tests for OpenSleuth Level 2: auto-fuzzer + TaskCatalog + open /reset.

These tests do *not* require Hub network access. The Hub-availability test
is opportunistic: it asserts ``>=15`` total tasks if the dataset loads, but
is skipped (via ``pytest.skip``) if the Hub is offline / the env is sandboxed.
"""

from __future__ import annotations

import os
import random
import typing
from typing import Optional, Literal

import pytest
from fastapi.testclient import TestClient

from opensleuth_env import (
    BLACK_BOX_FUNCTIONS,
    OpenSleuthEnv,
    ProbeAction,
    SubmitAction,
    TaskCatalog,
    TaskResolutionError,
    auto_fuzz,
)


# ---------------------------------------------------------------------------
# Auto-fuzzer
# ---------------------------------------------------------------------------


class TestAutoFuzzerTypes:
    """Type-driven fuzzing: generated inputs must match each parameter annotation."""

    def _rng(self, seed: int = 0) -> random.Random:
        # Fresh, deterministic RNG per test so runs are reproducible.
        return random.Random(seed)

    def test_int_inputs_are_ints(self):
        def f(n: int) -> int:
            return n

        samples = auto_fuzz(f, 50, self._rng())
        assert len(samples) == 50
        for item in samples:
            assert isinstance(item, tuple) and len(item) == 1
            # bool is a subclass of int; the fuzzer must not emit it here.
            assert isinstance(item[0], int) and not isinstance(item[0], bool)

    def test_str_inputs_are_strs(self):
        def f(s: str) -> int:
            return len(s)

        for (value,) in auto_fuzz(f, 30, self._rng()):
            assert isinstance(value, str)

    def test_list_int_inputs_are_lists_of_ints(self):
        def f(xs: list[int]) -> int:
            return sum(xs)

        for (seq,) in auto_fuzz(f, 30, self._rng()):
            assert isinstance(seq, list)
            assert all(isinstance(v, int) for v in seq)

    def test_homogeneous_tuple_inputs(self):
        def f(xs: tuple[int, ...]) -> int:
            return sum(xs)

        for (seq,) in auto_fuzz(f, 30, self._rng()):
            assert isinstance(seq, tuple)
            assert all(isinstance(v, int) for v in seq)

    def test_heterogeneous_tuple_inputs(self):
        def f(t: tuple[int, str]) -> int:
            return len(t[1])

        for (pair,) in auto_fuzz(f, 30, self._rng()):
            assert isinstance(pair, tuple) and len(pair) == 2
            assert isinstance(pair[0], int)
            assert isinstance(pair[1], str)

    def test_optional_inputs_sometimes_None(self):
        def f(x: Optional[int]) -> int:
            return 0

        samples = auto_fuzz(f, 200, self._rng(seed=42))
        got_none = any(item[0] is None for item in samples)
        got_int = any(
            isinstance(item[0], int) and not isinstance(item[0], bool)
            for item in samples
        )
        assert got_none, "Optional[int] should occasionally yield None"
        assert got_int, "Optional[int] should also yield ints"

    def test_literal_inputs_only_pick_listed_values(self):
        def f(mode: Literal["a", "b", "c"]) -> int:
            return 0

        for (choice,) in auto_fuzz(f, 50, self._rng()):
            assert choice in ("a", "b", "c")

    def test_dict_str_int_inputs(self):
        def f(d: dict[str, int]) -> int:
            return len(d)

        for (mapping,) in auto_fuzz(f, 20, self._rng()):
            assert isinstance(mapping, dict)
            assert all(isinstance(k, str) for k in mapping)
            assert all(isinstance(v, int) for v in mapping.values())

    def test_multi_arg_returns_full_tuples(self):
        def f(a: int, b: str) -> int:
            return 0

        for args in auto_fuzz(f, 20, self._rng()):
            assert isinstance(args, tuple)
            assert len(args) == 2
            assert isinstance(args[0], int)
            assert isinstance(args[1], str)

    def test_unannotated_param_falls_back_to_int(self):
        def f(x):  # no annotation
            return x

        for (value,) in auto_fuzz(f, 30, self._rng()):
            assert isinstance(value, int)


class TestAutoFuzzerSpecOverride:
    """A caller-supplied ``fuzz_spec`` must override annotation-derived defaults."""

    def test_int_min_max_overrides_default_range(self):
        def f(n: int) -> int:
            return n

        spec = {"n": {"type": "int", "min": 1, "max": 5}}
        for (n,) in auto_fuzz(f, 100, random.Random(0), fuzz_spec=spec):
            assert 1 <= n <= 5, f"expected n in [1, 5], got {n}"

    def test_str_alphabet_override(self):
        def f(s: str) -> int:
            return len(s)

        spec = {"s": {"type": "str", "alphabet": "ab", "max_len": 4}}
        for (s,) in auto_fuzz(f, 100, random.Random(0), fuzz_spec=spec):
            assert len(s) <= 4
            for ch in s:
                assert ch in "ab", f"unexpected char {ch!r} in {s!r}"

    def test_list_elem_override(self):
        def f(xs: list[int]) -> int:
            return sum(xs)

        spec = {
            "xs": {"type": "list", "elem": {"type": "int", "min": 0, "max": 3}, "max_len": 4},
        }
        for (values,) in auto_fuzz(f, 80, random.Random(0), fuzz_spec=spec):
            assert len(values) <= 4
            assert all(0 <= v <= 3 for v in values)

    def test_tuple_elems_override(self):
        def f(t):
            return t

        # Per-element sub-specs for a heterogeneous 2-tuple.
        spec = {"t": {"type": "tuple", "elems": [
            {"type": "int", "min": 0, "max": 1},
            {"type": "str", "alphabet": "x", "max_len": 2},
        ]}}
        for (pair,) in auto_fuzz(f, 30, random.Random(0), fuzz_spec=spec):
            assert isinstance(pair, tuple) and len(pair) == 2
            assert 0 <= pair[0] <= 1
            assert all(ch == "x" for ch in pair[1])


# ---------------------------------------------------------------------------
# TaskCatalog
# ---------------------------------------------------------------------------


class TestTaskCatalog:
    """Task resolution: builtins, caller-supplied code, and safety checks."""

    def test_resolves_builtin_by_name(self):
        catalog = TaskCatalog(enable_hub=False)
        spec = catalog.resolve(target_name="fibonacci")
        assert spec.name == "fibonacci"
        assert spec is BLACK_BOX_FUNCTIONS["fibonacci"]
        assert spec.unpack_args is False
        assert spec.source == "builtin"

    def test_resolves_caller_supplied_target_code(self):
        catalog = TaskCatalog(enable_hub=False)
        source = "def add(a: int, b: int) -> int:\n    return a + b\n"
        spec = catalog.resolve(target_code=source, target_function_name="add")
        assert spec.name == "add"
        assert spec.unpack_args is True  # 2-arg
        assert spec.source == "user"
        # The wrapped fuzzer must produce calls that succeed end-to-end.
        for args in spec.fuzzer(random.Random(0), 10):
            assert isinstance(args, tuple) and len(args) == 2
            assert spec.fn(*args) == args[0] + args[1]

    def test_caller_supplied_unary_uses_unwrapped_call(self):
        catalog = TaskCatalog(enable_hub=False)
        source = "def square(n: int) -> int:\n    return n * n\n"
        spec = catalog.resolve(target_code=source, target_function_name="square")
        assert spec.unpack_args is False
        for value in spec.fuzzer(random.Random(0), 5):
            assert isinstance(value, int)
            assert spec.fn(value) == value * value

    def test_resolve_with_no_source_raises(self):
        with pytest.raises(TaskResolutionError):
            TaskCatalog(enable_hub=False).resolve()

    def test_resolve_unknown_name_raises(self):
        with pytest.raises(TaskResolutionError):
            TaskCatalog(enable_hub=False).resolve(target_name="this_does_not_exist")

    def test_target_code_without_function_name_raises(self):
        with pytest.raises(TaskResolutionError):
            TaskCatalog(enable_hub=False).resolve(target_code="def foo(): return 1\n")

    def test_rejects_oracle_import(self):
        catalog = TaskCatalog(enable_hub=False)

        # Direct import of the oracle package.
        direct = "import opensleuth_env\ndef f(x): return x\n"
        with pytest.raises(TaskResolutionError):
            catalog.resolve(target_code=direct, target_function_name="f")

        # From-import of a private oracle implementation.
        aliased = "from opensleuth_env.black_box import _fibonacci\ndef f(x): return _fibonacci(x)\n"
        with pytest.raises(TaskResolutionError):
            catalog.resolve(target_code=aliased, target_function_name="f")

    def test_target_code_using_open_is_blocked_at_call_time(self):
        """`open` is not in the safe-builtins whitelist. The catalog will
        compile the function (since `open` is only resolved at call-time
        via NameError), but invoking it must fail safely."""
        source = "def f(x):\n    open('/tmp/x', 'w')\n    return 0\n"
        spec = TaskCatalog(enable_hub=False).resolve(
            target_code=source, target_function_name="f",
        )
        with pytest.raises(NameError):
            spec.fn(0)

    def test_caller_supplied_edge_cases_are_parsed(self):
        spec = TaskCatalog(enable_hub=False).resolve(
            target_code="def neg(n: int) -> int:\n    return -n\n",
            target_function_name="neg",
            edge_cases=["0", "1", "-1", "100"],
        )
        # String reprs are parsed into real values.
        assert spec.edge_cases == [0, 1, -1, 100]

    def test_caller_supplied_fuzz_spec_is_used(self):
        spec = TaskCatalog(enable_hub=False).resolve(
            target_code="def f(n: int) -> int:\n    return n\n",
            target_function_name="f",
            fuzz_spec={"n": {"type": "int", "min": 7, "max": 9}},
        )
        for value in spec.fuzzer(random.Random(0), 50):
            assert 7 <= value <= 9

    def test_list_builtin_returns_nine_entries(self):
        entries = TaskCatalog(enable_hub=False).list_builtin()
        assert len(entries) == 9
        for entry in entries:
            assert entry["source"] == "builtin"
            for key in ("name", "signature", "difficulty"):
                assert key in entry


# ---------------------------------------------------------------------------
# End-to-end via OpenSleuthEnv
# ---------------------------------------------------------------------------


class TestEnvOpenEnded:
    """End-to-end episodes through OpenSleuthEnv, builtin and caller-supplied."""

    def test_legacy_reset_by_target_name_unchanged(self):
        env = OpenSleuthEnv(fuzz_count=10)
        obs = env.reset(target_name="fibonacci")
        assert obs.target_function_name == "fibonacci"
        assert obs.difficulty == "easy"
        assert obs.steps_taken == 0

        # Probe via the same path as before.
        result = env.step(obs.episode_id, ProbeAction(input_repr="10"))
        assert result.observation.probe_history[-1].output_repr == "55"

    def test_env_caller_supplied_unary_full_loop(self):
        env = OpenSleuthEnv(fuzz_count=10)
        obs = env.reset(
            target_code="def square(n: int) -> int:\n    return n * n\n",
            target_function_name="square",
        )
        assert obs.target_function_name == "square"

        # Probe.
        result = env.step(obs.episode_id, ProbeAction(input_repr="5"))
        assert result.observation.probe_history[-1].output_repr == "25"

        # Submit a perfect implementation.
        result = env.step(
            obs.episode_id,
            SubmitAction(code="def square(n):\n    return n * n\n"),
        )
        assert result.done is True
        assert result.info["execution_reward"] == pytest.approx(100.0)
        assert result.reward > 140.0

    def test_env_caller_supplied_multi_arg_full_loop(self):
        env = OpenSleuthEnv(fuzz_count=10)
        obs = env.reset(
            target_code="def add(a: int, b: int) -> int:\n    return a + b\n",
            target_function_name="add",
            edge_cases=["(0, 0)", "(1, -1)", "(100, 0)"],
        )
        assert obs.target_function_name == "add"

        # Probe with a 2-tuple.
        result = env.step(obs.episode_id, ProbeAction(input_repr="(2, 3)"))
        assert result.observation.probe_history[-1].output_repr == "5"

        # Submit a perfect implementation.
        result = env.step(
            obs.episode_id,
            SubmitAction(code="def add(a, b):\n    return a + b\n"),
        )
        assert result.done is True
        assert result.info["execution_reward"] == pytest.approx(100.0)
        assert result.reward > 140.0

    def test_env_caller_supplied_buggy_submission_scored_negative(self):
        env = OpenSleuthEnv(fuzz_count=10)
        obs = env.reset(
            target_code="def add(a: int, b: int) -> int:\n    return a + b\n",
            target_function_name="add",
        )
        result = env.step(
            obs.episode_id,
            SubmitAction(code="def add(a, b):\n    return a - b\n"),
        )
        assert result.done is True
        assert result.info["execution_reward"] < 50.0
        assert result.reward < 0.0

    def test_env_caller_supplied_oracle_import_rejected(self):
        env = OpenSleuthEnv()
        with pytest.raises(ValueError):
            env.reset(
                target_code="import opensleuth_env\ndef f(x): return x\n",
                target_function_name="f",
            )


# ---------------------------------------------------------------------------
# HTTP layer
# ---------------------------------------------------------------------------


@pytest.fixture(scope="module")
def http_client():
    """Module-scoped FastAPI TestClient bound to the server app."""
    from server import app

    client = TestClient(app)
    # Entering the context runs startup/shutdown events around the module.
    with client:
        yield client


class TestHttpLayer:
    """Exercise the FastAPI endpoints end-to-end through the TestClient."""

    def test_tasks_endpoint_lists_at_least_nine_builtin(self, http_client):
        resp = http_client.get("/tasks?source=builtin")
        assert resp.status_code == 200
        data = resp.json()
        assert data["count"] >= 9
        listed = {t["name"] for t in data["tasks"]}
        for name in BLACK_BOX_FUNCTIONS:
            assert name in listed

    def test_tasks_all_includes_at_least_builtins(self, http_client):
        resp = http_client.get("/tasks?source=all")
        assert resp.status_code == 200
        data = resp.json()
        # The builtins are always present. If the Hub is reachable we'd
        # expect 15+, but the test must pass even if Hub is unavailable
        # (e.g. CI sandboxes block egress).
        assert data["count"] >= 9
        hub = data["hub"]
        if not hub.get("enabled", False) or hub.get("error"):
            pytest.skip(f"hub not reachable: {hub}")
        # Hub reachable -> dataset should have 15+ rows after bootstrap.
        assert data["count"] >= 15

    def test_sample_inputs_returns_n_repr_strings_for_builtin(self, http_client):
        import ast

        resp = http_client.get("/tasks/fibonacci/sample_inputs?n=5&seed=7")
        assert resp.status_code == 200, resp.text
        data = resp.json()
        assert data["name"] == "fibonacci"
        assert data["n"] == 5
        assert data["seed"] == 7
        inputs = data["inputs"]
        assert isinstance(inputs, list)
        assert len(inputs) == 5
        # Every returned string must be ast.literal_eval-safe so the trainer
        # can post it straight back to /step as a probe input_repr.
        for item in inputs:
            assert isinstance(item, str)
            ast.literal_eval(item)
        # Determinism: same seed -> identical inputs.
        again = http_client.get("/tasks/fibonacci/sample_inputs?n=5&seed=7")
        assert again.json()["inputs"] == inputs

    def test_sample_inputs_unknown_target_404s(self, http_client):
        resp = http_client.get("/tasks/__nope__/sample_inputs?n=2&seed=0")
        assert resp.status_code == 404

    def test_obviously_wrong_submission_scores_low_under_thread_pool(self, http_client):
        """Regression: TestClient uses a worker thread, exercising the
        same `signal.signal` -> ValueError path that uvicorn workers hit
        in production. Before the verifier fix, this returned 100/100 for
        any defined function (incl. ``def fibonacci(n): return n``).
        After the fix, an obviously-wrong submission should score near
        zero and trigger the floor penalty.
        """
        reset_payload = {"target_name": "fibonacci", "seed": 42, "max_steps": 2}
        eid = http_client.post("/reset", json=reset_payload).json()["episode_id"]
        step_payload = {
            "episode_id": eid,
            "action": {"action_type": "submit", "code": "def fibonacci(n):\n    return n\n"},
        }
        resp = http_client.post("/step", json=step_payload)
        assert resp.status_code == 200, resp.text
        info = resp.json()["info"]
        # ``return n`` matches at most a couple of fixed points (n=1, n=2)
        # out of 100+ random inputs; execution_reward should be tiny.
        assert info["execution_reward"] < 20.0, info
        assert info["matches"] < info["fuzz_count"] // 4, info
        # Floor penalty should kick in.
        assert info["floor_penalty"] == 25.0, info
        # And the perfect-bonus must NOT fire.
        assert info["perfect_bonus"] == 0.0, info

    def test_reset_legacy_target_name_still_works(self, http_client):
        resp = http_client.post(
            "/reset",
            json={"target_name": "fibonacci", "seed": 0, "max_steps": 10},
        )
        assert resp.status_code == 200
        data = resp.json()
        assert data["target_function_name"] == "fibonacci"
        assert "fibonacci" in data["target_function_signature"]

    def test_reset_caller_supplied_target_code(self, http_client):
        resp = http_client.post("/reset", json={
            "target_code": "def add(a: int, b: int) -> int:\n    return a + b\n",
            "target_function_name": "add",
            "edge_cases": ["(0, 0)", "(1, -1)"],
            "max_steps": 5,
        })
        assert resp.status_code == 200, resp.text
        data = resp.json()
        assert data["target_function_name"] == "add"
        eid = data["episode_id"]

        # Probe -> verify wrapping.
        resp = http_client.post("/step", json={
            "episode_id": eid,
            "action": {"action_type": "probe", "input_repr": "(7, 8)"},
        })
        assert resp.status_code == 200, resp.text
        data = resp.json()
        assert data["observation"]["probe_history"][-1]["output_repr"] == "15"

        # Submit perfect.
        resp = http_client.post("/step", json={
            "episode_id": eid,
            "action": {"action_type": "submit", "code": "def add(a, b):\n    return a + b\n"},
        })
        assert resp.status_code == 200, resp.text
        data = resp.json()
        assert data["done"] is True
        assert data["info"]["execution_reward"] == pytest.approx(100.0)
        assert data["reward"] > 140.0

    def test_reset_with_neither_target_returns_400(self, http_client):
        resp = http_client.post("/reset", json={"seed": 0})
        assert resp.status_code == 400

    def test_reset_with_target_code_only_no_function_name_returns_400(self, http_client):
        resp = http_client.post("/reset", json={"target_code": "def f(): return 1\n"})
        assert resp.status_code == 400

    def test_functions_endpoint_unchanged_for_trainer(self, http_client):
        resp = http_client.get("/functions")
        assert resp.status_code == 200
        data = resp.json()
        assert "functions" in data
        listed = [f["name"] for f in data["functions"]]
        for name in BLACK_BOX_FUNCTIONS:
            assert name in listed
        # The original v0.3 fields must all be present.
        required = ("name", "signature", "description", "difficulty", "edge_case_count")
        for entry in data["functions"]:
            for key in required:
                assert key in entry