Spaces:
Running on CPU Upgrade
Fix premature agent loop exit in long-running autonomous tasks
- Fix context_length initialization bug: was set to max_context (180K) on
startup, which is above the compaction threshold (170K), causing spurious
compaction on the very first iteration. Now starts at 0.
- Add diagnostics at loop exit point: log finish_reason, token_count,
context_length, and truncated response text whenever the agent loop
breaks due to no tool calls. Also emit a tool_log event for CLI visibility.
- Add logging at compaction and ContextWindowExceededError points so we
can trace context pressure in headless run logs.
- Add autonomous mode guidance to system prompt: instruct the model to
always include tool calls (text-only response kills the loop), continue
after training to evaluate/iterate/save, and check the timer.
- Add action hint to truncated bash output so the model doesn't stall
after receiving large training output.
- agent/context_manager/manager.py +1 -1
- agent/core/agent_loop.py +46 -0
- agent/main.py +2 -0
- agent/prompts/system_prompt_v3.yaml +20 -0
- agent/tools/local_tools.py +1 -0
|
@@ -89,7 +89,7 @@ class ContextManager:
|
|
| 89 |
)
|
| 90 |
self.max_context = max_context - 10000
|
| 91 |
self.compact_size = int(max_context * compact_size)
|
| 92 |
-
self.context_length =
|
| 93 |
self.untouched_messages = untouched_messages
|
| 94 |
self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
|
| 95 |
|
|
|
|
| 89 |
)
|
| 90 |
self.max_context = max_context - 10000
|
| 91 |
self.compact_size = int(max_context * compact_size)
|
| 92 |
+
self.context_length = 0 # Updated after each LLM call with actual usage
|
| 93 |
self.untouched_messages = untouched_messages
|
| 94 |
self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
|
| 95 |
|
|
@@ -204,6 +204,11 @@ async def _compact_and_notify(session: Session) -> None:
|
|
| 204 |
model_name=session.config.model_name,
|
| 205 |
)
|
| 206 |
old_length = session.context_manager.context_length
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
tool_specs = session.tool_router.get_tool_specs_for_llm()
|
| 208 |
await session.context_manager.compact(
|
| 209 |
model_name=session.config.model_name,
|
|
@@ -211,6 +216,11 @@ async def _compact_and_notify(session: Session) -> None:
|
|
| 211 |
)
|
| 212 |
new_length = session.context_manager.context_length
|
| 213 |
if new_length != old_length:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
await session.send_event(
|
| 215 |
Event(
|
| 216 |
event_type="compacted",
|
|
@@ -582,6 +592,34 @@ class Handlers:
|
|
| 582 |
|
| 583 |
# If no tool calls, add assistant message and we're done
|
| 584 |
if not tool_calls:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
if content:
|
| 586 |
assistant_msg = Message(role="assistant", content=content)
|
| 587 |
session.context_manager.add_message(assistant_msg, token_count)
|
|
@@ -788,6 +826,14 @@ class Handlers:
|
|
| 788 |
|
| 789 |
except ContextWindowExceededError:
|
| 790 |
# Force compact and retry this iteration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 791 |
session.context_manager.context_length = (
|
| 792 |
session.context_manager.max_context + 1
|
| 793 |
)
|
|
|
|
| 204 |
model_name=session.config.model_name,
|
| 205 |
)
|
| 206 |
old_length = session.context_manager.context_length
|
| 207 |
+
max_ctx = session.context_manager.max_context
|
| 208 |
+
logger.debug(
|
| 209 |
+
"Compaction check: context_length=%d, max_context=%d, needs_compact=%s",
|
| 210 |
+
old_length, max_ctx, old_length > max_ctx,
|
| 211 |
+
)
|
| 212 |
tool_specs = session.tool_router.get_tool_specs_for_llm()
|
| 213 |
await session.context_manager.compact(
|
| 214 |
model_name=session.config.model_name,
|
|
|
|
| 216 |
)
|
| 217 |
new_length = session.context_manager.context_length
|
| 218 |
if new_length != old_length:
|
| 219 |
+
logger.warning(
|
| 220 |
+
"Context compacted: %d -> %d tokens (max=%d, %d messages)",
|
| 221 |
+
old_length, new_length, max_ctx,
|
| 222 |
+
len(session.context_manager.items),
|
| 223 |
+
)
|
| 224 |
await session.send_event(
|
| 225 |
Event(
|
| 226 |
event_type="compacted",
|
|
|
|
| 592 |
|
| 593 |
# If no tool calls, add assistant message and we're done
|
| 594 |
if not tool_calls:
|
| 595 |
+
logger.warning(
|
| 596 |
+
"Agent loop ending: no tool calls. "
|
| 597 |
+
"finish_reason=%s, token_count=%d, "
|
| 598 |
+
"context_length=%d, max_context=%d, "
|
| 599 |
+
"iteration=%d/%d, "
|
| 600 |
+
"response_text=%s",
|
| 601 |
+
finish_reason,
|
| 602 |
+
token_count,
|
| 603 |
+
session.context_manager.context_length,
|
| 604 |
+
session.context_manager.max_context,
|
| 605 |
+
iteration,
|
| 606 |
+
effective_max,
|
| 607 |
+
(content or "")[:500],
|
| 608 |
+
)
|
| 609 |
+
await session.send_event(
|
| 610 |
+
Event(
|
| 611 |
+
event_type="tool_log",
|
| 612 |
+
data={
|
| 613 |
+
"tool": "system",
|
| 614 |
+
"log": (
|
| 615 |
+
f"Loop exit: no tool calls. "
|
| 616 |
+
f"finish_reason={finish_reason}, "
|
| 617 |
+
f"tokens={token_count}/{session.context_manager.max_context}, "
|
| 618 |
+
f"iter={iteration}/{effective_max}"
|
| 619 |
+
),
|
| 620 |
+
},
|
| 621 |
+
)
|
| 622 |
+
)
|
| 623 |
if content:
|
| 624 |
assistant_msg = Message(role="assistant", content=content)
|
| 625 |
session.context_manager.add_message(assistant_msg, token_count)
|
|
|
|
| 826 |
|
| 827 |
except ContextWindowExceededError:
|
| 828 |
# Force compact and retry this iteration
|
| 829 |
+
logger.warning(
|
| 830 |
+
"ContextWindowExceededError at iteration %d — forcing compaction "
|
| 831 |
+
"(context_length=%d, max_context=%d, messages=%d)",
|
| 832 |
+
iteration,
|
| 833 |
+
session.context_manager.context_length,
|
| 834 |
+
session.context_manager.max_context,
|
| 835 |
+
len(session.context_manager.items),
|
| 836 |
+
)
|
| 837 |
session.context_manager.context_length = (
|
| 838 |
session.context_manager.max_context + 1
|
| 839 |
)
|
|
@@ -973,6 +973,8 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
|
|
| 973 |
elif event.event_type in ("turn_complete", "interrupted"):
|
| 974 |
shimmer.stop()
|
| 975 |
stream_buf.discard()
|
|
|
|
|
|
|
| 976 |
break
|
| 977 |
|
| 978 |
# Shutdown
|
|
|
|
| 973 |
elif event.event_type in ("turn_complete", "interrupted"):
|
| 974 |
shimmer.stop()
|
| 975 |
stream_buf.discard()
|
| 976 |
+
history_size = event.data.get("history_size", "?") if event.data else "?"
|
| 977 |
+
print(f"\n--- Agent {event.event_type} (history_size={history_size}) ---", file=sys.stderr)
|
| 978 |
break
|
| 979 |
|
| 980 |
# Shutdown
|
|
@@ -112,6 +112,26 @@ system_prompt: |
|
|
| 112 |
Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done.
|
| 113 |
Do not mark plan tasks as completed if they failed or are only partially done.
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
# Communication
|
| 116 |
|
| 117 |
- Be concise and direct. No filler, no restating what the user said.
|
|
|
|
| 112 |
Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done.
|
| 113 |
Do not mark plan tasks as completed if they failed or are only partially done.
|
| 114 |
|
| 115 |
+
# Autonomous / headless mode
|
| 116 |
+
|
| 117 |
+
When running autonomously (no human in the loop), you MUST follow these rules:
|
| 118 |
+
|
| 119 |
+
NEVER respond with only text. Every response MUST include at least one tool call. If you have nothing to do, check the plan, check the timer, or verify outputs. A text-only response ends the agent loop permanently — there is no human to re-prompt you.
|
| 120 |
+
|
| 121 |
+
After training completes:
|
| 122 |
+
1. Check the output for errors or warnings
|
| 123 |
+
2. Copy/save the trained model to the required output location (e.g. final_model/)
|
| 124 |
+
3. Run evaluation to measure performance
|
| 125 |
+
4. If time remains and performance can improve: iterate (adjust hyperparameters, train longer, try different data)
|
| 126 |
+
5. Verify the final output exists and is valid before stopping
|
| 127 |
+
|
| 128 |
+
Check the remaining time periodically with the timer command specified in the task prompt. Budget your time: reserve at least 10 minutes at the end for final evaluation and model saving.
|
| 129 |
+
|
| 130 |
+
The task is NOT done until:
|
| 131 |
+
- The required output directory exists (e.g. final_model/) with a valid model
|
| 132 |
+
- You have evaluated the model and confirmed it works
|
| 133 |
+
- You have used all available time productively
|
| 134 |
+
|
| 135 |
# Communication
|
| 136 |
|
| 137 |
- Be concise and direct. No filler, no restating what the user said.
|
|
@@ -78,6 +78,7 @@ def _truncate_output(output: str, max_chars: int = MAX_OUTPUT_CHARS, head_ratio:
|
|
| 78 |
meta = f"\n\n... ({omitted:,} of {total:,} chars omitted, showing first {head_budget:,} + last {tail_budget:,}) ...\n"
|
| 79 |
if spill_path:
|
| 80 |
meta += f"Full output saved to {spill_path} — use the read tool with offset/limit to inspect specific sections.\n"
|
|
|
|
| 81 |
return head + meta + tail
|
| 82 |
|
| 83 |
|
|
|
|
| 78 |
meta = f"\n\n... ({omitted:,} of {total:,} chars omitted, showing first {head_budget:,} + last {tail_budget:,}) ...\n"
|
| 79 |
if spill_path:
|
| 80 |
meta += f"Full output saved to {spill_path} — use the read tool with offset/limit to inspect specific sections.\n"
|
| 81 |
+
meta += "IMPORTANT: The command has finished. Analyze the output above and continue with your next action.\n"
|
| 82 |
return head + meta + tail
|
| 83 |
|
| 84 |
|