Spaces:
Building on CPU Upgrade
Building on CPU Upgrade
File size: 3,393 Bytes
"""Regression test for doom-loop false-positive on legitimate polling.
Reproduces the failure mode in observatory sessions 40fcb414 ($32.59),
8e90352e ($62.63), and 403178bf ($5.71) on 2026-04-25: the agent polled a
long-running job with `bash sleep 300 && wc -l output` four times in a
row. The arguments were byte-identical, but the results moved (27210 →
36454 → 45770 → 55138 — actual progress). The detector hashed args only
and false-fired DOOM LOOP, which made the agent abandon perfectly valid
polling.
After the fix, the signature includes the tool-result hash, so identical
args with different results no longer trip the detector.
"""
from litellm import ChatCompletionMessageToolCall, Message
from agent.core.doom_loop import check_for_doom_loop
def _assistant(call_id: str, name: str, args: str) -> Message:
    """Build an assistant message that carries exactly one function tool call.

    Args:
        call_id: Identifier assigned to the tool call.
        name: Name of the function being called.
        args: Argument string stored verbatim under the call's "arguments" key.

    Returns:
        A ``Message`` with role "assistant", no content, and a single tool call.
    """
    tool_call = ChatCompletionMessageToolCall(
        id=call_id,
        type="function",
        function={"name": name, "arguments": args},
    )
    return Message(role="assistant", content=None, tool_calls=[tool_call])
def _tool(call_id: str, name: str, content: str) -> Message:
    """Build a tool-role message holding the result for a given tool call.

    Args:
        call_id: Identifier of the tool call this result belongs to.
        name: Name of the tool that produced the result.
        content: Result text to store as the message content.

    Returns:
        A ``Message`` with role "tool" linked to ``call_id``.
    """
    fields = {
        "role": "tool",
        "content": content,
        "tool_call_id": call_id,
        "name": name,
    }
    return Message(**fields)
# JSON-encoded bash tool arguments shared by the polling tests in this module;
# reusing one constant keeps every poll call's argument string byte-identical.
_POLL_ARGS = '{"command": "sleep 300 && ls /app/images/ | wc -l"}'
def test_polling_with_progressing_results_does_not_fire():
    """Four identical poll calls whose results keep changing must not be flagged."""
    progressing_counts = ("27210", "36454", "45770", "55138")

    msgs = [Message(role="user", content="run the job")]
    for position, count in enumerate(progressing_counts, start=1):
        call_id = f"c{position}"
        # Same arguments every time; only the tool result differs.
        msgs.append(_assistant(call_id, "bash", _POLL_ARGS))
        msgs.append(_tool(call_id, "bash", count))

    assert check_for_doom_loop(msgs) is None
def test_truly_stuck_polling_with_identical_results_still_fires():
    """Three identical polls that all return the same number mean the job is
    genuinely stuck, so the detector SHOULD fire."""
    msgs = []
    for call_id in ("c1", "c2", "c3"):
        # Identical arguments AND identical result on every iteration.
        msgs.append(_assistant(call_id, "bash", _POLL_ARGS))
        msgs.append(_tool(call_id, "bash", "55138"))

    prompt = check_for_doom_loop(msgs)

    assert prompt is not None
    for expected_fragment in ("DOOM LOOP", "bash"):
        assert expected_fragment in prompt
def test_identical_calls_with_no_results_yet_still_fires():
    """Three identical calls with no recorded tool results (e.g. all cancelled
    or errored before a result was stored) are still treated as a real loop."""
    write_args = '{"path": "/tmp/x", "content": "..."}'
    # Only assistant messages: no tool-role results follow any of the calls.
    msgs = [_assistant(call_id, "write", write_args) for call_id in ("c1", "c2", "c3")]

    prompt = check_for_doom_loop(msgs)

    assert prompt is not None
    for expected_fragment in ("DOOM LOOP", "write"):
        assert expected_fragment in prompt
def test_different_args_does_not_fire():
    """Calls to the same tool with differing arguments must not be flagged,
    even when every call returns the same result text."""
    calls = (
        ("c1", '{"command": "ls /a"}'),
        ("c2", '{"command": "ls /b"}'),
        ("c3", '{"command": "ls /c"}'),
    )

    msgs = []
    for call_id, command_args in calls:
        msgs.extend(
            (
                _assistant(call_id, "bash", command_args),
                _tool(call_id, "bash", "ok"),
            )
        )

    assert check_for_doom_loop(msgs) is None
|