verifier: fix SIGALRM-in-worker-thread bug that scored every well-formed submission 100/100 under uvicorn (fall back to no-timeout call when signal.signal raises). Trainer was training on a saturated reward landscape; this restores real per-submission scoring.
Browse files- opensleuth_env/verifier.py +40 -8
- tests/test_open_env.py +28 -0
opensleuth_env/verifier.py
CHANGED
|
@@ -181,21 +181,53 @@ class _CallTimeout(Exception):
|
|
| 181 |
|
| 182 |
|
| 183 |
def _call_with_timeout(fn: Callable, arg: Any, timeout_s: float, *, unpack: bool = False):
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
if unpack:
|
| 191 |
if not isinstance(arg, tuple):
|
| 192 |
# Defensive: a multi-param target should always receive a
|
| 193 |
# tuple, but if the agent's probe input_repr happens to
|
| 194 |
# parse to a single value, treat it as a 1-tuple so we get
|
| 195 |
# a clear TypeError rather than a confusing call shape.
|
| 196 |
-
|
| 197 |
-
|
|
|
|
|
|
|
| 198 |
return fn(arg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
finally:
|
| 200 |
signal.setitimer(signal.ITIMER_REAL, 0)
|
| 201 |
signal.signal(signal.SIGALRM, old)
|
|
|
|
| 181 |
|
| 182 |
|
| 183 |
def _call_with_timeout(fn: Callable, arg: Any, timeout_s: float, *, unpack: bool = False):
|
| 184 |
+
"""Call ``fn(arg)`` (or ``fn(*arg)`` if ``unpack``) with a wall-clock
|
| 185 |
+
timeout when possible.
|
| 186 |
+
|
| 187 |
+
SIGALRM only works in the main thread of the main interpreter. When
|
| 188 |
+
the verifier runs inside a uvicorn worker thread (FastAPI request
|
| 189 |
+
handler), ``signal.signal`` raises ``ValueError`` and -- prior to this
|
| 190 |
+
fix -- both ref and submission calls would short-circuit through the
|
| 191 |
+
``except Exception`` branch in ``_safe_call`` with the SAME ValueError,
|
| 192 |
+
which ``_outputs_equivalent`` then read as a "match", silently
|
| 193 |
+
awarding 100/100 to *every* submission regardless of correctness.
|
| 194 |
+
|
| 195 |
+
Fix: per-call probe. If SIGALRM isn't installable from the current
|
| 196 |
+
thread, fall back to a direct call with no in-thread timeout. The
|
| 197 |
+
definition timeout is still enforced by the multiprocessing-based
|
| 198 |
+
``_can_define`` ahead of fuzz scoring, so a malformed submission
|
| 199 |
+
that hangs at import time is caught there. For the OpenSleuth
|
| 200 |
+
trainer/eval use case (cooperative, not adversarial), letting a
|
| 201 |
+
pathological while-True submission stall a single request worker
|
| 202 |
+
is an acceptable trade-off relative to the current "all submissions
|
| 203 |
+
are perfect" failure mode.
|
| 204 |
+
"""
|
| 205 |
+
def _do_call():
|
| 206 |
if unpack:
|
| 207 |
if not isinstance(arg, tuple):
|
| 208 |
# Defensive: a multi-param target should always receive a
|
| 209 |
# tuple, but if the agent's probe input_repr happens to
|
| 210 |
# parse to a single value, treat it as a 1-tuple so we get
|
| 211 |
# a clear TypeError rather than a confusing call shape.
|
| 212 |
+
a = (arg,)
|
| 213 |
+
else:
|
| 214 |
+
a = arg
|
| 215 |
+
return fn(*a)
|
| 216 |
return fn(arg)
|
| 217 |
+
|
| 218 |
+
def _handler(signum, frame): # noqa: ARG001
|
| 219 |
+
raise _CallTimeout()
|
| 220 |
+
|
| 221 |
+
try:
|
| 222 |
+
old = signal.signal(signal.SIGALRM, _handler)
|
| 223 |
+
except (ValueError, OSError):
|
| 224 |
+
# Not in the main thread (uvicorn worker, threadpool, ...).
|
| 225 |
+
# SIGALRM isn't available; do the unsafe-but-correct thing.
|
| 226 |
+
return _do_call()
|
| 227 |
+
|
| 228 |
+
signal.setitimer(signal.ITIMER_REAL, timeout_s)
|
| 229 |
+
try:
|
| 230 |
+
return _do_call()
|
| 231 |
finally:
|
| 232 |
signal.setitimer(signal.ITIMER_REAL, 0)
|
| 233 |
signal.signal(signal.SIGALRM, old)
|
tests/test_open_env.py
CHANGED
|
@@ -434,6 +434,34 @@ class TestHttpLayer:
|
|
| 434 |
r = http_client.get("/tasks/__nope__/sample_inputs?n=2&seed=0")
|
| 435 |
assert r.status_code == 404
|
| 436 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
def test_reset_legacy_target_name_still_works(self, http_client):
|
| 438 |
r = http_client.post("/reset", json={
|
| 439 |
"target_name": "fibonacci", "seed": 0, "max_steps": 10,
|
|
|
|
| 434 |
r = http_client.get("/tasks/__nope__/sample_inputs?n=2&seed=0")
|
| 435 |
assert r.status_code == 404
|
| 436 |
|
| 437 |
+
def test_obviously_wrong_submission_scores_low_under_thread_pool(self, http_client):
|
| 438 |
+
"""Regression: TestClient uses a worker thread, exercising the
|
| 439 |
+
same `signal.signal` -> ValueError path that uvicorn workers hit
|
| 440 |
+
in production. Before the verifier fix, this returned 100/100 for
|
| 441 |
+
any defined function (incl. ``def fibonacci(n): return n``).
|
| 442 |
+
After the fix, an obviously-wrong submission should score near
|
| 443 |
+
zero and trigger the floor penalty.
|
| 444 |
+
"""
|
| 445 |
+
ep = http_client.post("/reset", json={
|
| 446 |
+
"target_name": "fibonacci", "seed": 42, "max_steps": 2,
|
| 447 |
+
}).json()
|
| 448 |
+
eid = ep["episode_id"]
|
| 449 |
+
r = http_client.post("/step", json={
|
| 450 |
+
"episode_id": eid,
|
| 451 |
+
"action": {"action_type": "submit", "code": "def fibonacci(n):\n return n\n"},
|
| 452 |
+
})
|
| 453 |
+
assert r.status_code == 200, r.text
|
| 454 |
+
body = r.json()
|
| 455 |
+
info = body["info"]
|
| 456 |
+
# ``return n`` matches at most a couple of fixed points (n=1, n=2)
|
| 457 |
+
# out of 100+ random inputs; execution_reward should be tiny.
|
| 458 |
+
assert info["execution_reward"] < 20.0, info
|
| 459 |
+
assert info["matches"] < info["fuzz_count"] // 4, info
|
| 460 |
+
# Floor penalty should kick in.
|
| 461 |
+
assert info["floor_penalty"] == 25.0, info
|
| 462 |
+
# And the perfect-bonus must NOT fire.
|
| 463 |
+
assert info["perfect_bonus"] == 0.0, info
|
| 464 |
+
|
| 465 |
def test_reset_legacy_target_name_still_works(self, http_client):
|
| 466 |
r = http_client.post("/reset", json={
|
| 467 |
"target_name": "fibonacci", "seed": 0, "max_steps": 10,
|