Spaces:

anugrah55
/

opensleuth-env-gemini-cli

Paused

App Files Files Community

anugrah55 committed on 12 days ago

Commit

e7fc062

verified ·

1 Parent(s): 9030acd

verifier: fix SIGALRM-in-worker-thread bug that scored every well-formed submission 100/100 under uvicorn (fall back to no-timeout call when signal.signal raises). Trainer was training on a saturated reward landscape; this restores real per-submission scoring.

Browse files

Files changed (2) hide show

opensleuth_env/verifier.py +40 -8
tests/test_open_env.py +28 -0

opensleuth_env/verifier.py CHANGED Viewed

@@ -181,21 +181,53 @@ class _CallTimeout(Exception):
 def _call_with_timeout(fn: Callable, arg: Any, timeout_s: float, *, unpack: bool = False):
-    def _handler(signum, frame):  # noqa: ARG001
-        raise _CallTimeout()
-    old = signal.signal(signal.SIGALRM, _handler)
-    signal.setitimer(signal.ITIMER_REAL, timeout_s)
-    try:
         if unpack:
             if not isinstance(arg, tuple):
                 # Defensive: a multi-param target should always receive a
                 # tuple, but if the agent's probe input_repr happens to
                 # parse to a single value, treat it as a 1-tuple so we get
                 # a clear TypeError rather than a confusing call shape.
-                arg = (arg,)
-            return fn(*arg)
         return fn(arg)
     finally:
         signal.setitimer(signal.ITIMER_REAL, 0)
         signal.signal(signal.SIGALRM, old)

 def _call_with_timeout(fn: Callable, arg: Any, timeout_s: float, *, unpack: bool = False):
+    """Call ``fn(arg)`` (or ``fn(*arg)`` if ``unpack``) with a wall-clock
+    timeout when possible.
+    SIGALRM only works in the main thread of the main interpreter. When
+    the verifier runs inside a uvicorn worker thread (FastAPI request
+    handler), ``signal.signal`` raises ``ValueError`` and -- prior to this
+    fix -- both ref and submission calls would short-circuit through the
+    ``except Exception`` branch in ``_safe_call`` with the SAME ValueError,
+    which ``_outputs_equivalent`` then read as a "match", silently
+    awarding 100/100 to *every* submission regardless of correctness.
+    Fix: per-call probe. If SIGALRM isn't installable from the current
+    thread, fall back to a direct call with no in-thread timeout. The
+    definition timeout is still enforced by the multiprocessing-based
+    ``_can_define`` ahead of fuzz scoring, so a malformed submission
+    that hangs at import time is caught there. For the OpenSleuth
+    trainer/eval use case (cooperative, not adversarial), letting a
+    pathological while-True submission stall a single request worker
+    is an acceptable trade-off relative to the current "all submissions
+    are perfect" failure mode.
+    """
+    def _do_call():
         if unpack:
             if not isinstance(arg, tuple):
                 # Defensive: a multi-param target should always receive a
                 # tuple, but if the agent's probe input_repr happens to
                 # parse to a single value, treat it as a 1-tuple so we get
                 # a clear TypeError rather than a confusing call shape.
+                a = (arg,)
+            else:
+                a = arg
+            return fn(*a)
         return fn(arg)
+    def _handler(signum, frame):  # noqa: ARG001
+        raise _CallTimeout()
+    try:
+        old = signal.signal(signal.SIGALRM, _handler)
+    except (ValueError, OSError):
+        # Not in the main thread (uvicorn worker, threadpool, ...).
+        # SIGALRM isn't available; do the unsafe-but-correct thing.
+        return _do_call()
+    signal.setitimer(signal.ITIMER_REAL, timeout_s)
+    try:
+        return _do_call()
     finally:
         signal.setitimer(signal.ITIMER_REAL, 0)
         signal.signal(signal.SIGALRM, old)

tests/test_open_env.py CHANGED Viewed

@@ -434,6 +434,34 @@ class TestHttpLayer:
         r = http_client.get("/tasks/__nope__/sample_inputs?n=2&seed=0")
         assert r.status_code == 404
     def test_reset_legacy_target_name_still_works(self, http_client):
         r = http_client.post("/reset", json={
             "target_name": "fibonacci", "seed": 0, "max_steps": 10,

         r = http_client.get("/tasks/__nope__/sample_inputs?n=2&seed=0")
         assert r.status_code == 404
+    def test_obviously_wrong_submission_scores_low_under_thread_pool(self, http_client):
+        """Regression: TestClient uses a worker thread, exercising the
+        same `signal.signal` -> ValueError path that uvicorn workers hit
+        in production. Before the verifier fix, this returned 100/100 for
+        any defined function (incl. ``def fibonacci(n): return n``).
+        After the fix, an obviously-wrong submission should score near
+        zero and trigger the floor penalty.
+        """
+        ep = http_client.post("/reset", json={
+            "target_name": "fibonacci", "seed": 42, "max_steps": 2,
+        }).json()
+        eid = ep["episode_id"]
+        r = http_client.post("/step", json={
+            "episode_id": eid,
+            "action": {"action_type": "submit", "code": "def fibonacci(n):\n    return n\n"},
+        })
+        assert r.status_code == 200, r.text
+        body = r.json()
+        info = body["info"]
+        # ``return n`` matches at most a couple of fixed points (n=1, n=2)
+        # out of 100+ random inputs; execution_reward should be tiny.
+        assert info["execution_reward"] < 20.0, info
+        assert info["matches"] < info["fuzz_count"] // 4, info
+        # Floor penalty should kick in.
+        assert info["floor_penalty"] == 25.0, info
+        # And the perfect-bonus must NOT fire.
+        assert info["perfect_bonus"] == 0.0, info
     def test_reset_legacy_target_name_still_works(self, http_client):
         r = http_client.post("/reset", json={
             "target_name": "fibonacci", "seed": 0, "max_steps": 10,