Spaces:

smolagents
/

ml-intern

Building on CPU Upgrade

File size: 3,393 Bytes

3eec386

"""Regression test for doom-loop false-positive on legitimate polling.

Reproduces the failure mode in observatory sessions 40fcb414 ($32.59),
8e90352e ($62.63), and 403178bf ($5.71) on 2026-04-25: the agent polled a
long-running job with `bash sleep 300 && wc -l output` four times in a
row. The arguments were byte-identical, but the results moved (27210 →
36454 → 45770 → 55138 — actual progress). The detector hashed args only
and false-fired DOOM LOOP, which made the agent abandon perfectly valid
polling.

After the fix the signature includes the tool result hash, so identical
args with different results no longer trip the detector.
"""

from litellm import ChatCompletionMessageToolCall, Message

from agent.core.doom_loop import check_for_doom_loop


def _assistant(call_id: str, name: str, args: str) -> Message:
    """Build an assistant message that carries exactly one function tool call.

    ``call_id`` is used as the tool call id, ``name`` as the function name and
    ``args`` as the function's arguments string, passed through unmodified.
    The message itself has no text content.
    """
    call = ChatCompletionMessageToolCall(
        id=call_id,
        type="function",
        function={"name": name, "arguments": args},
    )
    return Message(role="assistant", content=None, tool_calls=[call])


def _tool(call_id: str, name: str, content: str) -> Message:
    """Build the tool-role message holding ``content`` as the result of the
    call identified by ``call_id`` for the tool ``name``."""
    fields = {
        "role": "tool",
        "content": content,
        "tool_call_id": call_id,
        "name": name,
    }
    return Message(**fields)


# Arguments string shared by every simulated `bash` polling call in the tests
# below, so the call arguments never vary from one poll to the next.
_POLL_ARGS: str = '{"command": "sleep 300 && ls /app/images/ | wc -l"}'


def test_polling_with_progressing_results_does_not_fire():
    """Four polls with identical arguments but a different result each time
    must not be reported as a doom loop."""
    observed_counts = ("27210", "36454", "45770", "55138")

    msgs = [Message(role="user", content="run the job")]
    for idx, count in enumerate(observed_counts, start=1):
        call_id = f"c{idx}"
        # Each poll is an assistant tool call followed by its tool result.
        msgs.append(_assistant(call_id, "bash", _POLL_ARGS))
        msgs.append(_tool(call_id, "bash", count))

    assert check_for_doom_loop(msgs) is None


def test_truly_stuck_polling_with_identical_results_still_fires():
    """Three identical polls that all return the same number mean the job is
    genuinely stuck, so the detector is expected to fire."""
    msgs = []
    for call_id in ("c1", "c2", "c3"):
        msgs += [
            _assistant(call_id, "bash", _POLL_ARGS),
            _tool(call_id, "bash", "55138"),
        ]

    warning = check_for_doom_loop(msgs)

    assert warning is not None
    assert "DOOM LOOP" in warning
    assert "bash" in warning


def test_identical_calls_with_no_results_yet_still_fires():
    """Three identical calls without any tool results (e.g. all cancelled or
    errored before a result was recorded) should be treated as a real loop."""
    write_args = '{"path": "/tmp/x", "content": "..."}'
    msgs = [_assistant(f"c{n}", "write", write_args) for n in (1, 2, 3)]

    warning = check_for_doom_loop(msgs)

    assert warning is not None
    assert "DOOM LOOP" in warning
    assert "write" in warning


def test_different_args_does_not_fire():
    """Calls to the same tool with different arguments each time must not be
    reported, even though every call returns the same result."""
    commands = (
        '{"command": "ls /a"}',
        '{"command": "ls /b"}',
        '{"command": "ls /c"}',
    )

    msgs = []
    for idx, command in enumerate(commands, start=1):
        call_id = f"c{idx}"
        msgs.append(_assistant(call_id, "bash", command))
        msgs.append(_tool(call_id, "bash", "ok"))

    assert check_for_doom_loop(msgs) is None