anugrah55 commited on
Commit
e7fc062
·
verified ·
1 Parent(s): 9030acd

verifier: fix SIGALRM-in-worker-thread bug that scored every well-formed submission 100/100 under uvicorn (fall back to no-timeout call when signal.signal raises). Trainer was training on a saturated reward landscape; this restores real per-submission scoring.

Browse files
opensleuth_env/verifier.py CHANGED
@@ -181,21 +181,53 @@ class _CallTimeout(Exception):
181
 
182
 
183
  def _call_with_timeout(fn: Callable, arg: Any, timeout_s: float, *, unpack: bool = False):
184
- def _handler(signum, frame): # noqa: ARG001
185
- raise _CallTimeout()
186
-
187
- old = signal.signal(signal.SIGALRM, _handler)
188
- signal.setitimer(signal.ITIMER_REAL, timeout_s)
189
- try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  if unpack:
191
  if not isinstance(arg, tuple):
192
  # Defensive: a multi-param target should always receive a
193
  # tuple, but if the agent's probe input_repr happens to
194
  # parse to a single value, treat it as a 1-tuple so we get
195
  # a clear TypeError rather than a confusing call shape.
196
- arg = (arg,)
197
- return fn(*arg)
 
 
198
  return fn(arg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  finally:
200
  signal.setitimer(signal.ITIMER_REAL, 0)
201
  signal.signal(signal.SIGALRM, old)
 
181
 
182
 
183
  def _call_with_timeout(fn: Callable, arg: Any, timeout_s: float, *, unpack: bool = False):
184
+ """Call ``fn(arg)`` (or ``fn(*arg)`` if ``unpack``) with a wall-clock
185
+ timeout when possible.
186
+
187
+ SIGALRM only works in the main thread of the main interpreter. When
188
+ the verifier runs inside a uvicorn worker thread (FastAPI request
189
+ handler), ``signal.signal`` raises ``ValueError`` and -- prior to this
190
+ fix -- both ref and submission calls would short-circuit through the
191
+ ``except Exception`` branch in ``_safe_call`` with the SAME ValueError,
192
+ which ``_outputs_equivalent`` then read as a "match", silently
193
+ awarding 100/100 to *every* submission regardless of correctness.
194
+
195
+ Fix: per-call probe. If SIGALRM isn't installable from the current
196
+ thread, fall back to a direct call with no in-thread timeout. The
197
+ definition timeout is still enforced by the multiprocessing-based
198
+ ``_can_define`` ahead of fuzz scoring, so a malformed submission
199
+ that hangs at import time is caught there. For the OpenSleuth
200
+ trainer/eval use case (cooperative, not adversarial), letting a
201
+ pathological while-True submission stall a single request worker
202
+ is an acceptable trade-off relative to the current "all submissions
203
+ are perfect" failure mode.
204
+ """
205
+ def _do_call():
206
  if unpack:
207
  if not isinstance(arg, tuple):
208
  # Defensive: a multi-param target should always receive a
209
  # tuple, but if the agent's probe input_repr happens to
210
  # parse to a single value, treat it as a 1-tuple so we get
211
  # a clear TypeError rather than a confusing call shape.
212
+ a = (arg,)
213
+ else:
214
+ a = arg
215
+ return fn(*a)
216
  return fn(arg)
217
+
218
+ def _handler(signum, frame): # noqa: ARG001
219
+ raise _CallTimeout()
220
+
221
+ try:
222
+ old = signal.signal(signal.SIGALRM, _handler)
223
+ except (ValueError, OSError):
224
+ # Not in the main thread (uvicorn worker, threadpool, ...).
225
+ # SIGALRM isn't available; do the unsafe-but-correct thing.
226
+ return _do_call()
227
+
228
+ signal.setitimer(signal.ITIMER_REAL, timeout_s)
229
+ try:
230
+ return _do_call()
231
  finally:
232
  signal.setitimer(signal.ITIMER_REAL, 0)
233
  signal.signal(signal.SIGALRM, old)
tests/test_open_env.py CHANGED
@@ -434,6 +434,34 @@ class TestHttpLayer:
434
  r = http_client.get("/tasks/__nope__/sample_inputs?n=2&seed=0")
435
  assert r.status_code == 404
436
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  def test_reset_legacy_target_name_still_works(self, http_client):
438
  r = http_client.post("/reset", json={
439
  "target_name": "fibonacci", "seed": 0, "max_steps": 10,
 
434
  r = http_client.get("/tasks/__nope__/sample_inputs?n=2&seed=0")
435
  assert r.status_code == 404
436
 
437
+ def test_obviously_wrong_submission_scores_low_under_thread_pool(self, http_client):
438
+ """Regression: TestClient uses a worker thread, exercising the
439
+ same `signal.signal` -> ValueError path that uvicorn workers hit
440
+ in production. Before the verifier fix, this returned 100/100 for
441
+ any defined function (incl. ``def fibonacci(n): return n``).
442
+ After the fix, an obviously-wrong submission should score near
443
+ zero and trigger the floor penalty.
444
+ """
445
+ ep = http_client.post("/reset", json={
446
+ "target_name": "fibonacci", "seed": 42, "max_steps": 2,
447
+ }).json()
448
+ eid = ep["episode_id"]
449
+ r = http_client.post("/step", json={
450
+ "episode_id": eid,
451
+ "action": {"action_type": "submit", "code": "def fibonacci(n):\n return n\n"},
452
+ })
453
+ assert r.status_code == 200, r.text
454
+ body = r.json()
455
+ info = body["info"]
456
+ # ``return n`` matches at most a couple of fixed points (n=1, n=2)
457
+ # out of 100+ random inputs; execution_reward should be tiny.
458
+ assert info["execution_reward"] < 20.0, info
459
+ assert info["matches"] < info["fuzz_count"] // 4, info
460
+ # Floor penalty should kick in.
461
+ assert info["floor_penalty"] == 25.0, info
462
+ # And the perfect-bonus must NOT fire.
463
+ assert info["perfect_bonus"] == 0.0, info
464
+
465
  def test_reset_legacy_target_name_still_works(self, http_client):
466
  r = http_client.post("/reset", json={
467
  "target_name": "fibonacci", "seed": 0, "max_steps": 10,