akseljoonas HF Staff commited on
Commit
73882d9
·
1 Parent(s): 477a013

Improve agent loop, tools, and research subagent for long-running autonomous tasks

Browse files

- Add --no-stream and --max-iterations CLI flags for headless operation
- Simplify context manager: remove staged pruning, track actual token usage
- Add doom-loop detection and context budget to research subagent
- Improve file tools: better enforcement, error messages, descriptions
- Improve bash tool: longer timeouts, background execution guidance
- Add training logging and HP sweep guidance to system prompt
- Relax Python requirement to >=3.11
- Remove unused file_content_cache

agent/config.py CHANGED
@@ -23,8 +23,7 @@ class Config(BaseModel):
23
  session_dataset_repo: str = "akseljoonas/hf-agent-sessions"
24
  auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
25
  yolo_mode: bool = False # Auto-approve all tool calls without confirmation
26
- max_tool_failures_per_turn: int = 3 # Disable a tool after this many failures in one turn
27
- max_requests_per_turn: int = 50 # Hard cap on LLM requests per agent turn
28
 
29
  # Permission control parameters
30
  confirm_cpu_jobs: bool = True
 
23
  session_dataset_repo: str = "akseljoonas/hf-agent-sessions"
24
  auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
25
  yolo_mode: bool = False # Auto-approve all tool calls without confirmation
26
+ max_iterations: int = 300 # Max LLM calls per agent turn (-1 = unlimited)
 
27
 
28
  # Permission control parameters
29
  confirm_cpu_jobs: bool = True
agent/context_manager/manager.py CHANGED
@@ -89,7 +89,7 @@ class ContextManager:
89
  )
90
  self.max_context = max_context - 10000
91
  self.compact_size = int(max_context * compact_size)
92
- self.context_length = max_context
93
  self.untouched_messages = untouched_messages
94
  self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
95
 
@@ -243,114 +243,10 @@ class ContextManager:
243
 
244
  return False
245
 
246
- # Tools whose outputs should never be pruned (too valuable to summarise)
247
- _PRUNE_SKIP_TOOLS: set[str] = {"research", "plan_tool"}
248
-
249
- # Tools whose outputs are pruned via a cheap LLM call instead of
250
- # deterministic truncation (the output structure is too complex for
251
- # a fixed head-slice to capture the answer reliably).
252
- _LLM_PRUNE_TOOLS: set[str] = {"hf_jobs"}
253
-
254
- async def prune_old_tool_outputs(self, model_name: str | None = None) -> None:
255
- """Stage 1 compaction: shrink old tool outputs.
256
-
257
- For any tool message older than the last 6 messages whose content
258
- exceeds 500 chars:
259
- - Tools in _LLM_PRUNE_TOOLS get a cheap LLM summarisation (≤600 tokens).
260
- - All other tools get a deterministic one-line summary.
261
- tool_call_id and name are always preserved.
262
- """
263
- if len(self.items) <= 6:
264
- return
265
-
266
- cutoff = len(self.items) - 6
267
-
268
- # Find the preceding assistant tool_call arguments so the LLM
269
- # knows what question the tool output was answering.
270
- def _find_tool_call_args(tool_call_id: str) -> str | None:
271
- for msg in self.items:
272
- if getattr(msg, "role", None) != "assistant":
273
- continue
274
- for tc in getattr(msg, "tool_calls", None) or []:
275
- tc_id = tc.id if hasattr(tc, "id") else tc.get("id")
276
- if tc_id == tool_call_id:
277
- fn = tc.function if hasattr(tc, "function") else tc.get("function", {})
278
- return fn.arguments if hasattr(fn, "arguments") else fn.get("arguments", "")
279
- return None
280
-
281
- for i in range(cutoff - 1, -1, -1):
282
- msg = self.items[i]
283
- if getattr(msg, "role", None) != "tool":
284
- continue
285
- content = getattr(msg, "content", None) or ""
286
- if len(content) <= 500:
287
- continue
288
-
289
- tool_name = getattr(msg, "name", None) or "tool"
290
- if tool_name in self._PRUNE_SKIP_TOOLS:
291
- continue
292
-
293
- # --- LLM-based pruning for complex tool outputs ---
294
- if tool_name in self._LLM_PRUNE_TOOLS and model_name:
295
- call_args = _find_tool_call_args(getattr(msg, "tool_call_id", ""))
296
- context_line = (
297
- f"The tool was called with: {call_args}\n\n" if call_args else ""
298
- )
299
- try:
300
- hf_key = os.environ.get("INFERENCE_TOKEN")
301
- resp = await acompletion(
302
- model=model_name,
303
- messages=[
304
- Message(
305
- role="user",
306
- content=(
307
- f"{context_line}"
308
- f"Below is the raw output of the '{tool_name}' tool.\n"
309
- "Give the answer to the original request unchanged — "
310
- "preserve all job IDs, numbers, status values, error "
311
- "messages, and metrics exactly. Omit filler/boilerplate. "
312
- "Stay under 600 tokens.\n\n"
313
- f"{content}"
314
- ),
315
- )
316
- ],
317
- max_completion_tokens=600,
318
- api_key=hf_key
319
- if hf_key and model_name.startswith("huggingface/")
320
- else None,
321
- )
322
- msg.content = resp.choices[0].message.content
323
- continue
324
- except Exception:
325
- logger.warning(
326
- "LLM prune failed for %s, falling back to deterministic",
327
- tool_name,
328
- )
329
- # fall through to deterministic pruning below
330
-
331
- # --- Deterministic pruning ---
332
- preview = content[:80]
333
- total = len(content)
334
-
335
- if tool_name == "bash":
336
- exit_code_part = ""
337
- if "exit_code" in content[:200]:
338
- for line in content[:200].splitlines():
339
- if "exit_code" in line:
340
- exit_code_part = "exit_code visible if present, "
341
- break
342
- summary = f"[bash: {exit_code_part}{preview}... ({total} chars)]"
343
- else:
344
- summary = f"[{tool_name}: {preview}... ({total} chars)]"
345
-
346
- msg.content = summary
347
-
348
  async def compact(
349
  self, model_name: str, tool_specs: list[dict] | None = None
350
  ) -> None:
351
  """Remove old messages to keep history under target size"""
352
- await self.prune_old_tool_outputs(model_name=model_name)
353
-
354
  if (self.context_length <= self.max_context) or not self.items:
355
  return
356
 
@@ -358,6 +254,15 @@ class ContextManager:
358
  self.items[0] if self.items and self.items[0].role == "system" else None
359
  )
360
 
 
 
 
 
 
 
 
 
 
361
  # Don't summarize a certain number of just-preceding messages
362
  # Walk back to find a user message to make sure we keep an assistant -> user ->
363
  # assistant general conversation structure
@@ -366,7 +271,7 @@ class ContextManager:
366
  idx -= 1
367
 
368
  recent_messages = self.items[idx:]
369
- messages_to_summarize = self.items[1:idx]
370
 
371
  # improbable, messages would have to be very long
372
  if not messages_to_summarize:
@@ -393,11 +298,11 @@ class ContextManager:
393
  role="assistant", content=response.choices[0].message.content
394
  )
395
 
396
- # Reconstruct: system + summary + recent messages (includes tools)
397
- if system_msg:
398
- self.items = [system_msg, summarized_message] + recent_messages
399
- else:
400
- self.items = [summarized_message] + recent_messages
401
 
402
  self.context_length = (
403
  len(self.system_prompt) // 4 + response.usage.completion_tokens
 
89
  )
90
  self.max_context = max_context - 10000
91
  self.compact_size = int(max_context * compact_size)
92
+ self.context_length = 0 # Updated after each LLM call with actual usage
93
  self.untouched_messages = untouched_messages
94
  self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
95
 
 
243
 
244
  return False
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  async def compact(
247
  self, model_name: str, tool_specs: list[dict] | None = None
248
  ) -> None:
249
  """Remove old messages to keep history under target size"""
 
 
250
  if (self.context_length <= self.max_context) or not self.items:
251
  return
252
 
 
254
  self.items[0] if self.items and self.items[0].role == "system" else None
255
  )
256
 
257
+ # Preserve the first user message (task prompt) — never summarize it
258
+ first_user_msg = None
259
+ first_user_idx = 0  # 0 = no user msg found: slice below degrades to items[1:idx] as before
260
+ for i in range(1, len(self.items)):
261
+ if getattr(self.items[i], "role", None) == "user":
262
+ first_user_msg = self.items[i]
263
+ first_user_idx = i
264
+ break
265
+
266
  # Don't summarize a certain number of just-preceding messages
267
  # Walk back to find a user message to make sure we keep an assistant -> user ->
268
  # assistant general conversation structure
 
271
  idx -= 1
272
 
273
  recent_messages = self.items[idx:]
274
+ messages_to_summarize = self.items[first_user_idx + 1:idx]
275
 
276
  # improbable, messages would have to very long
277
  if not messages_to_summarize:
 
298
  role="assistant", content=response.choices[0].message.content
299
  )
300
 
301
+ # Reconstruct: system + first user msg + summary + recent messages
302
+ head = [system_msg] if system_msg else []
303
+ if first_user_msg:
304
+ head.append(first_user_msg)
305
+ self.items = head + [summarized_message] + recent_messages
306
 
307
  self.context_length = (
308
  len(self.system_prompt) // 4 + response.usage.completion_tokens
agent/core/agent_loop.py CHANGED
@@ -153,35 +153,6 @@ _MAX_LLM_RETRIES = 3
153
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
154
 
155
 
156
- def _append_failure_warning(
157
- output: str,
158
- tool_name: str,
159
- tool_error_counts: dict[str, int],
160
- max_failures: int,
161
- ) -> str:
162
- """Track a tool failure and append a warning to the output.
163
-
164
- Returns the output with an appended warning indicating how many
165
- failures have occurred and whether the LLM should switch approach.
166
- """
167
- tool_error_counts[tool_name] = tool_error_counts.get(tool_name, 0) + 1
168
- count = tool_error_counts[tool_name]
169
- if count >= max_failures:
170
- return output + (
171
- f"\n\n⚠ Tool '{tool_name}' has now failed "
172
- f"{count} times this turn. You should try a "
173
- f"different approach instead of calling this "
174
- f"tool again."
175
- )
176
- remaining = max_failures - count
177
- return output + (
178
- f"\n\n⚠ Tool '{tool_name}' has failed "
179
- f"{count}/{max_failures} times this turn. "
180
- f"{remaining} attempt(s) before you should "
181
- f"switch to a different approach."
182
- )
183
-
184
-
185
  def _is_transient_error(error: Exception) -> bool:
186
  """Return True for errors that are likely transient and worth retrying."""
187
  err_str = str(error).lower()
@@ -200,10 +171,12 @@ def _is_transient_error(error: Exception) -> bool:
200
 
201
  async def _compact_and_notify(session: Session) -> None:
202
  """Run compaction and send event if context was reduced."""
203
- await session.context_manager.prune_old_tool_outputs(
204
- model_name=session.config.model_name,
205
- )
206
  old_length = session.context_manager.context_length
 
 
 
 
 
207
  tool_specs = session.tool_router.get_tool_specs_for_llm()
208
  await session.context_manager.compact(
209
  model_name=session.config.model_name,
@@ -211,6 +184,11 @@ async def _compact_and_notify(session: Session) -> None:
211
  )
212
  new_length = session.context_manager.context_length
213
  if new_length != old_length:
 
 
 
 
 
214
  await session.send_event(
215
  Event(
216
  event_type="compacted",
@@ -446,7 +424,7 @@ class Handlers:
446
 
447
  @staticmethod
448
  async def run_agent(
449
- session: Session, text: str, max_iterations: int = 300
450
  ) -> str | None:
451
  """
452
  Handle user input (like user_input_or_turn in codex.rs:1291)
@@ -474,10 +452,9 @@ class Handlers:
474
  iteration = 0
475
  final_response = None
476
  errored = False
477
- tool_error_counts: dict[str, int] = {}
478
 
479
- effective_max = min(max_iterations, session.config.max_requests_per_turn)
480
- while iteration < effective_max:
481
  # ── Cancellation check: before LLM call ──
482
  if session.is_cancelled:
483
  break
@@ -582,6 +559,34 @@ class Handlers:
582
 
583
  # If no tool calls, add assistant message and we're done
584
  if not tool_calls:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585
  if content:
586
  assistant_msg = Message(role="assistant", content=content)
587
  session.context_manager.add_message(assistant_msg, token_count)
@@ -722,15 +727,7 @@ class Handlers:
722
  results = gather_task.result()
723
 
724
  # 4. Record results and send outputs (order preserved)
725
- max_failures = session.config.max_tool_failures_per_turn
726
  for tc, tool_name, tool_args, output, success in results:
727
- if not success:
728
- output = _append_failure_warning(
729
- output, tool_name, tool_error_counts, max_failures,
730
- )
731
- else:
732
- tool_error_counts.pop(tool_name, None)
733
-
734
  tool_msg = Message(
735
  role="tool",
736
  content=output,
@@ -788,6 +785,14 @@ class Handlers:
788
 
789
  except ContextWindowExceededError:
790
  # Force compact and retry this iteration
 
 
 
 
 
 
 
 
791
  session.context_manager.context_length = (
792
  session.context_manager.max_context + 1
793
  )
 
153
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
154
 
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def _is_transient_error(error: Exception) -> bool:
157
  """Return True for errors that are likely transient and worth retrying."""
158
  err_str = str(error).lower()
 
171
 
172
  async def _compact_and_notify(session: Session) -> None:
173
  """Run compaction and send event if context was reduced."""
 
 
 
174
  old_length = session.context_manager.context_length
175
+ max_ctx = session.context_manager.max_context
176
+ logger.debug(
177
+ "Compaction check: context_length=%d, max_context=%d, needs_compact=%s",
178
+ old_length, max_ctx, old_length > max_ctx,
179
+ )
180
  tool_specs = session.tool_router.get_tool_specs_for_llm()
181
  await session.context_manager.compact(
182
  model_name=session.config.model_name,
 
184
  )
185
  new_length = session.context_manager.context_length
186
  if new_length != old_length:
187
+ logger.warning(
188
+ "Context compacted: %d -> %d tokens (max=%d, %d messages)",
189
+ old_length, new_length, max_ctx,
190
+ len(session.context_manager.items),
191
+ )
192
  await session.send_event(
193
  Event(
194
  event_type="compacted",
 
424
 
425
  @staticmethod
426
  async def run_agent(
427
+ session: Session, text: str,
428
  ) -> str | None:
429
  """
430
  Handle user input (like user_input_or_turn in codex.rs:1291)
 
452
  iteration = 0
453
  final_response = None
454
  errored = False
455
+ max_iterations = session.config.max_iterations
456
 
457
+ while max_iterations == -1 or iteration < max_iterations:
 
458
  # ── Cancellation check: before LLM call ──
459
  if session.is_cancelled:
460
  break
 
559
 
560
  # If no tool calls, add assistant message and we're done
561
  if not tool_calls:
562
+ logger.warning(
563
+ "Agent loop ending: no tool calls. "
564
+ "finish_reason=%s, token_count=%d, "
565
+ "context_length=%d, max_context=%d, "
566
+ "iteration=%d/%d, "
567
+ "response_text=%s",
568
+ finish_reason,
569
+ token_count,
570
+ session.context_manager.context_length,
571
+ session.context_manager.max_context,
572
+ iteration,
573
+ max_iterations,
574
+ (content or "")[:500],
575
+ )
576
+ await session.send_event(
577
+ Event(
578
+ event_type="tool_log",
579
+ data={
580
+ "tool": "system",
581
+ "log": (
582
+ f"Loop exit: no tool calls. "
583
+ f"finish_reason={finish_reason}, "
584
+ f"tokens={token_count}/{session.context_manager.max_context}, "
585
+ f"iter={iteration}/{max_iterations}"
586
+ ),
587
+ },
588
+ )
589
+ )
590
  if content:
591
  assistant_msg = Message(role="assistant", content=content)
592
  session.context_manager.add_message(assistant_msg, token_count)
 
727
  results = gather_task.result()
728
 
729
  # 4. Record results and send outputs (order preserved)
 
730
  for tc, tool_name, tool_args, output, success in results:
 
 
 
 
 
 
 
731
  tool_msg = Message(
732
  role="tool",
733
  content=output,
 
785
 
786
  except ContextWindowExceededError:
787
  # Force compact and retry this iteration
788
+ logger.warning(
789
+ "ContextWindowExceededError at iteration %d — forcing compaction "
790
+ "(context_length=%d, max_context=%d, messages=%d)",
791
+ iteration,
792
+ session.context_manager.context_length,
793
+ session.context_manager.max_context,
794
+ len(session.context_manager.items),
795
+ )
796
  session.context_manager.context_length = (
797
  session.context_manager.max_context + 1
798
  )
agent/core/session.py CHANGED
@@ -12,7 +12,6 @@ from typing import Any, Optional
12
 
13
  from agent.config import Config
14
  from agent.context_manager.manager import ContextManager
15
- from agent.tools.file_content_cache import FileContentCache
16
 
17
  logger = logging.getLogger(__name__)
18
 
@@ -110,8 +109,6 @@ class Session:
110
  self.sandbox = None
111
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
112
 
113
- self.file_content_cache = FileContentCache()
114
-
115
  # Session trajectory logging
116
  self.logged_events: list[dict] = []
117
  self.session_start_time = datetime.now().isoformat()
 
12
 
13
  from agent.config import Config
14
  from agent.context_manager.manager import ContextManager
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
 
109
  self.sandbox = None
110
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
111
 
 
 
112
  # Session trajectory logging
113
  self.logged_events: list[dict] = []
114
  self.session_start_time = datetime.now().isoformat()
agent/main.py CHANGED
@@ -858,7 +858,12 @@ async def main():
858
  get_console().print("\n[dim]Bye.[/dim]\n")
859
 
860
 
861
- async def headless_main(prompt: str, model: str | None = None) -> None:
 
 
 
 
 
862
  """Run a single prompt headlessly and exit."""
863
  import logging
864
 
@@ -876,12 +881,13 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
876
  config.yolo_mode = True # Auto-approve everything in headless mode
877
 
878
  if model:
879
- if model not in VALID_MODEL_IDS:
880
- print(f"ERROR: Unknown model '{model}'. Valid: {', '.join(VALID_MODEL_IDS)}", file=sys.stderr)
881
- sys.exit(1)
882
  config.model_name = model
883
 
 
 
 
884
  print(f"Model: {config.model_name}", file=sys.stderr)
 
885
  print(f"Prompt: {prompt}", file=sys.stderr)
886
  print("---", file=sys.stderr)
887
 
@@ -900,7 +906,7 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
900
  session_holder=session_holder,
901
  hf_token=hf_token,
902
  local_mode=True,
903
- stream=True,
904
  )
905
  )
906
 
@@ -922,6 +928,7 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
922
  shimmer = _ThinkingShimmer(console)
923
  stream_buf = _StreamBuffer(console)
924
  _hl_last_tool = [None]
 
925
  shimmer.start()
926
 
927
  while True:
@@ -960,6 +967,26 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
960
  log = event.data.get("log", "") if event.data else ""
961
  if log:
962
  print_tool_log(tool, log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963
  elif event.event_type == "compacted":
964
  old_tokens = event.data.get("old_tokens", 0) if event.data else 0
965
  new_tokens = event.data.get("new_tokens", 0) if event.data else 0
@@ -973,6 +1000,8 @@ async def headless_main(prompt: str, model: str | None = None) -> None:
973
  elif event.event_type in ("turn_complete", "interrupted"):
974
  shimmer.stop()
975
  stream_buf.discard()
 
 
976
  break
977
 
978
  # Shutdown
@@ -999,11 +1028,18 @@ if __name__ == "__main__":
999
  parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
1000
  parser.add_argument("prompt", nargs="?", default=None, help="Run headlessly with this prompt")
1001
  parser.add_argument("--model", "-m", default=None, help=f"Model to use (default: from config)")
 
 
 
 
1002
  args = parser.parse_args()
1003
 
1004
  try:
1005
  if args.prompt:
1006
- asyncio.run(headless_main(args.prompt, model=args.model))
 
 
 
1007
  else:
1008
  asyncio.run(main())
1009
  except KeyboardInterrupt:
 
858
  get_console().print("\n[dim]Bye.[/dim]\n")
859
 
860
 
861
+ async def headless_main(
862
+ prompt: str,
863
+ model: str | None = None,
864
+ max_iterations: int | None = None,
865
+ stream: bool = True,
866
+ ) -> None:
867
  """Run a single prompt headlessly and exit."""
868
  import logging
869
 
 
881
  config.yolo_mode = True # Auto-approve everything in headless mode
882
 
883
  if model:
 
 
 
884
  config.model_name = model
885
 
886
+ if max_iterations is not None:
887
+ config.max_iterations = max_iterations
888
+
889
  print(f"Model: {config.model_name}", file=sys.stderr)
890
+ print(f"Max iterations: {config.max_iterations}", file=sys.stderr)
891
  print(f"Prompt: {prompt}", file=sys.stderr)
892
  print("---", file=sys.stderr)
893
 
 
906
  session_holder=session_holder,
907
  hf_token=hf_token,
908
  local_mode=True,
909
+ stream=stream,
910
  )
911
  )
912
 
 
928
  shimmer = _ThinkingShimmer(console)
929
  stream_buf = _StreamBuffer(console)
930
  _hl_last_tool = [None]
931
+ _hl_sub_id = [1]
932
  shimmer.start()
933
 
934
  while True:
 
967
  log = event.data.get("log", "") if event.data else ""
968
  if log:
969
  print_tool_log(tool, log)
970
+ elif event.event_type == "approval_required":
971
+ # Auto-approve everything in headless mode (safety net if yolo_mode
972
+ # didn't prevent the approval event for some reason)
973
+ tools_data = event.data.get("tools", []) if event.data else []
974
+ approvals = [
975
+ {
976
+ "tool_call_id": t.get("tool_call_id", ""),
977
+ "approved": True,
978
+ "feedback": None,
979
+ }
980
+ for t in tools_data
981
+ ]
982
+ _hl_sub_id[0] += 1
983
+ await submission_queue.put(Submission(
984
+ id=f"hl_approval_{_hl_sub_id[0]}",
985
+ operation=Operation(
986
+ op_type=OpType.EXEC_APPROVAL,
987
+ data={"approvals": approvals},
988
+ ),
989
+ ))
990
  elif event.event_type == "compacted":
991
  old_tokens = event.data.get("old_tokens", 0) if event.data else 0
992
  new_tokens = event.data.get("new_tokens", 0) if event.data else 0
 
1000
  elif event.event_type in ("turn_complete", "interrupted"):
1001
  shimmer.stop()
1002
  stream_buf.discard()
1003
+ history_size = event.data.get("history_size", "?") if event.data else "?"
1004
+ print(f"\n--- Agent {event.event_type} (history_size={history_size}) ---", file=sys.stderr)
1005
  break
1006
 
1007
  # Shutdown
 
1028
  parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
1029
  parser.add_argument("prompt", nargs="?", default=None, help="Run headlessly with this prompt")
1030
  parser.add_argument("--model", "-m", default=None, help=f"Model to use (default: from config)")
1031
+ parser.add_argument("--max-iterations", type=int, default=None,
1032
+ help="Max LLM requests per turn (default: 300, use -1 for unlimited)")
1033
+ parser.add_argument("--no-stream", action="store_true",
1034
+ help="Disable token streaming (use non-streaming LLM calls)")
1035
  args = parser.parse_args()
1036
 
1037
  try:
1038
  if args.prompt:
1039
+ max_iter = args.max_iterations
1040
+ if max_iter is not None and max_iter < 0:
1041
+ max_iter = -1  # run_agent treats -1 as unlimited — pass it through
1042
+ asyncio.run(headless_main(args.prompt, model=args.model, max_iterations=max_iter, stream=not args.no_stream))
1043
  else:
1044
  asyncio.run(main())
1045
  except KeyboardInterrupt:
agent/prompts/system_prompt_v3.yaml CHANGED
@@ -46,6 +46,8 @@ system_prompt: |
46
  2. Validate dataset: hf_inspect_dataset or hub_repo_details to confirm column names and format
47
  3. Validate model: hub_repo_details to confirm model exists, correct architecture/size/tokenizer
48
 
 
 
49
  Dataset format requirements by training method:
50
  SFT: "messages", "text", or "prompt"/"completion"
51
  DPO: "prompt", "chosen", "rejected"
@@ -112,6 +114,35 @@ system_prompt: |
112
  Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done.
113
  Do not mark plan tasks as completed if they failed or are only partially done.
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  # Communication
116
 
117
  - Be concise and direct. No filler, no restating what the user said.
 
46
  2. Validate dataset: hf_inspect_dataset or hub_repo_details to confirm column names and format
47
  3. Validate model: hub_repo_details to confirm model exists, correct architecture/size/tokenizer
48
 
49
+ Training logging: always set disable_tqdm=True, logging_strategy="steps", and logging_first_step=True in your TrainingArguments/SFTConfig so loss values are printed as plain text lines you can grep, not hidden inside tqdm progress bars.
50
+
51
  Dataset format requirements by training method:
52
  SFT: "messages", "text", or "prompt"/"completion"
53
  DPO: "prompt", "chosen", "rejected"
 
114
  Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done.
115
  Do not mark plan tasks as completed if they failed or are only partially done.
116
 
117
+ # Autonomous / headless mode
118
+
119
+ When running autonomously (no human in the loop), you MUST follow these rules:
120
+
121
+ NEVER respond with only text. Every response MUST include at least one tool call. If you have nothing to do, check the plan, check the timer, or verify outputs. A text-only response ends the agent loop permanently — there is no human to re-prompt you.
122
+
123
+ NEVER STOP WORKING. Do NOT decide you are "done" while time remains. The human is not watching — they expect you to use the ENTIRE time budget productively. Do NOT ask "should I continue?" or "is this a good stopping point?" — there is nobody to answer. You run until the timer expires or you are manually killed.
124
+
125
+ Your workflow is a loop, not a checklist. Once you have a working result, KEEP ITERATING:
126
+
127
+ LOOP UNTIL TIME RUNS OUT:
128
+ 1. Research the approach (read docs, find examples, check current APIs)
129
+ 2. Implement the solution (write code, set up training)
130
+ 3. Train and evaluate
131
+ 4. Save the model to the required output location
132
+ 5. Check the timer — if time remains, improve: tune hyperparameters, try different data preprocessing, adjust the training recipe, try a different approach entirely
133
+ 6. Go to step 1
134
+
135
+ HYPERPARAMETER TUNING: Do not tune hyperparameters by hand one-at-a-time. Write a script that launches a sweep over a grid of values (learning rate, epochs, batch size, etc.) and evaluates each run automatically. One well-designed sweep script beats ten manual experiments.
136
+
137
+ If you run out of ideas: research. Use the research tool to find papers on the task or technique — look for recent methods, ablation results, tricks that worked for similar problems. Re-read the task prompt for angles you missed. Re-read the training logs for clues. Try combining approaches from different papers. Try a fundamentally different strategy from the literature. There is always a paper you haven't read yet.
138
+
139
+ Check the remaining time periodically with the timer command specified in the task prompt. Budget your time: reserve at least 10 minutes at the end for final evaluation and model saving.
140
+
141
+ The task is NOT done until:
142
+ - The required output directory exists (e.g. final_model/) with a valid model
143
+ - You have evaluated the model and confirmed it works
144
+ - The timer has expired or is about to expire
145
+
146
  # Communication
147
 
148
  - Be concise and direct. No filler, no restating what the user said.
agent/tools/edit_utils.py CHANGED
@@ -181,7 +181,11 @@ def apply_edit(
181
  if old_str not in content:
182
  original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
183
  if original_match is None:
184
- raise ValueError("old_str not found in file.")
 
 
 
 
185
  old_str = original_match
186
 
187
  count = content.count(old_str)
@@ -189,8 +193,10 @@ def apply_edit(
189
  if mode == "replace":
190
  if count > 1 and not replace_all:
191
  raise ValueError(
192
- f"old_str appears {count} times. Use replace_all=true to replace all, "
193
- "or provide a more specific old_str."
 
 
194
  )
195
  if replace_all:
196
  new_content = content.replace(old_str, new_str)
 
181
  if old_str not in content:
182
  original_match, fuzzy_note = fuzzy_find_original_match(content, old_str)
183
  if original_match is None:
184
+ raise ValueError(
185
+ "old_str was not found in the file. Make sure old_str matches "
186
+ "the file contents exactly, including whitespace and indentation. "
187
+ "Use the read tool to verify the current file contents before retrying."
188
+ )
189
  old_str = original_match
190
 
191
  count = content.count(old_str)
 
193
  if mode == "replace":
194
  if count > 1 and not replace_all:
195
  raise ValueError(
196
+ f"Found {count} matches of old_str in the file, but replace_all is "
197
+ f"false. To replace all occurrences, set replace_all to true. To "
198
+ f"replace only one, provide a larger old_str with more surrounding "
199
+ f"context to uniquely identify the instance."
200
  )
201
  if replace_all:
202
  new_content = content.replace(old_str, new_str)
agent/tools/file_content_cache.py DELETED
@@ -1,40 +0,0 @@
1
- """Cache for detecting unchanged local file re-reads."""
2
-
3
- from __future__ import annotations
4
-
5
- import hashlib
6
-
7
-
8
- def _short_hash(content: str) -> str:
9
- return hashlib.sha256(content.encode()).hexdigest()[:16]
10
-
11
-
12
- def _resolve(path: str) -> str:
13
- try:
14
- from pathlib import Path
15
- return str(Path(path).resolve())
16
- except Exception:
17
- return path
18
-
19
-
20
- class FileContentCache:
21
- """Tracks file content hashes to skip re-reading unchanged files."""
22
-
23
- def __init__(self) -> None:
24
- self._cache: dict[str, tuple[str, int]] = {}
25
-
26
- def record_read(self, path: str, content: str, turn: int) -> None:
27
- key = _resolve(path)
28
- self._cache[key] = (_short_hash(content), turn)
29
-
30
- def check_unchanged(self, path: str, content: str) -> tuple[bool, int | None]:
31
- key = _resolve(path)
32
- cached = self._cache.get(key)
33
- if cached is None:
34
- return False, None
35
- cached_hash, turn = cached
36
- return _short_hash(content) == cached_hash, turn
37
-
38
- def clear_path(self, path: str) -> None:
39
- key = _resolve(path)
40
- self._cache.pop(key, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent/tools/local_tools.py CHANGED
@@ -15,16 +15,25 @@ import tempfile
15
  from pathlib import Path
16
  from typing import Any
17
 
18
- from agent.tools.sandbox_client import Sandbox
19
 
20
  MAX_OUTPUT_CHARS = 25_000
21
- MAX_LINE_LENGTH = 2000
22
  DEFAULT_READ_LINES = 2000
23
  DEFAULT_TIMEOUT = 120
24
- MAX_TIMEOUT = 600
25
 
26
  _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]|\x1b\].*?\x07')
27
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def _atomic_write(path: Path, content: str) -> None:
30
  """Write file atomically via temp file + os.replace().
@@ -78,6 +87,7 @@ def _truncate_output(output: str, max_chars: int = MAX_OUTPUT_CHARS, head_ratio:
78
  meta = f"\n\n... ({omitted:,} of {total:,} chars omitted, showing first {head_budget:,} + last {tail_budget:,}) ...\n"
79
  if spill_path:
80
  meta += f"Full output saved to {spill_path} — use the read tool with offset/limit to inspect specific sections.\n"
 
81
  return head + meta + tail
82
 
83
 
@@ -104,7 +114,14 @@ async def _bash_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
104
  output = "(no output)"
105
  return output, result.returncode == 0
106
  except subprocess.TimeoutExpired:
107
- return f"Command timed out after {timeout}s.", False
 
 
 
 
 
 
 
108
  except Exception as e:
109
  return f"bash error: {e}", False
110
 
@@ -123,17 +140,7 @@ async def _read_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
123
  except Exception as e:
124
  return f"read error: {e}", False
125
 
126
- # Check if file is unchanged since last read
127
- session = _kw.get("session")
128
- if session is not None:
129
- is_unchanged, last_turn = session.file_content_cache.check_unchanged(
130
- file_path, raw_content
131
- )
132
- if is_unchanged:
133
- return (
134
- f"[File unchanged since turn {last_turn}, "
135
- f"content already in context.]"
136
- ), True
137
 
138
  lines = raw_content.splitlines()
139
  offset = max((args.get("offset") or 1), 1)
@@ -146,11 +153,6 @@ async def _read_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
146
  line = line[:MAX_LINE_LENGTH] + "..."
147
  numbered.append(f"{i:>6}\t{line}")
148
 
149
- if session is not None:
150
- session.file_content_cache.record_read(
151
- file_path, raw_content, session.turn_count
152
- )
153
-
154
  return "\n".join(numbered), True
155
 
156
 
@@ -160,11 +162,14 @@ async def _write_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
160
  if not file_path:
161
  return "No path provided.", False
162
  p = Path(file_path)
 
 
 
 
 
163
  try:
164
  _atomic_write(p, content)
165
- session = _kw.get("session")
166
- if session is not None:
167
- session.file_content_cache.clear_path(file_path)
168
  msg = f"Wrote {len(content)} bytes to {file_path}"
169
  # Syntax validation for Python files
170
  if p.suffix == ".py":
@@ -194,6 +199,11 @@ async def _edit_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
194
  p = Path(file_path)
195
  if not p.exists():
196
  return f"File not found: {file_path}", False
 
 
 
 
 
197
 
198
  try:
199
  text = p.read_text()
@@ -212,10 +222,6 @@ async def _edit_handler(args: dict[str, Any], **_kw) -> tuple[str, bool]:
212
  except Exception as e:
213
  return f"edit write error: {e}", False
214
 
215
- session = _kw.get("session")
216
- if session is not None:
217
- session.file_content_cache.clear_path(file_path)
218
-
219
  msg = f"Edited {file_path} ({replacements} replacement{'s' if replacements > 1 else ''})"
220
  if fuzzy_note:
221
  msg += f" {fuzzy_note}"
@@ -234,18 +240,22 @@ _LOCAL_TOOL_SPECS = {
234
  "description": (
235
  "Run a shell command on the local machine and return stdout/stderr.\n"
236
  "\n"
237
- "Commands run in a shell at the working directory (default: current directory). "
238
- "Each invocation is independent.\n"
239
- "\n"
240
- "AVOID using bash for operations covered by specialized tools:\n"
241
- "- File reading: use read (not cat/head/tail)\n"
242
- "- File editing: use edit (not sed/awk)\n"
243
- "- File writing: use write (not echo/cat <<EOF)\n"
244
  "\n"
 
245
  "Chain dependent commands with &&. Independent commands should be "
246
  "separate bash calls (they can run in parallel).\n"
247
  "\n"
248
- "Timeout default 120s, max 600s."
 
 
 
 
 
 
249
  ),
250
  "parameters": {
251
  "type": "object",
@@ -266,22 +276,125 @@ _LOCAL_TOOL_SPECS = {
266
  },
267
  "timeout": {
268
  "type": "integer",
269
- "description": "Timeout in seconds (default: 120, max: 600).",
270
  },
271
  },
272
  },
273
  },
274
  "read": {
275
- "description": Sandbox.TOOLS["read"]["description"],
276
- "parameters": Sandbox.TOOLS["read"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  },
278
  "write": {
279
- "description": Sandbox.TOOLS["write"]["description"],
280
- "parameters": Sandbox.TOOLS["write"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  },
282
  "edit": {
283
- "description": Sandbox.TOOLS["edit"]["description"],
284
- "parameters": Sandbox.TOOLS["edit"]["parameters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  },
286
  }
287
 
 
15
  from pathlib import Path
16
  from typing import Any
17
 
 
18
 
19
  MAX_OUTPUT_CHARS = 25_000
20
+ MAX_LINE_LENGTH = 4000
21
  DEFAULT_READ_LINES = 2000
22
  DEFAULT_TIMEOUT = 120
23
+ MAX_TIMEOUT = 36000 # 10 hours — needed for long training runs (e.g. PostTrainBench)
24
 
25
  _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]|\x1b\].*?\x07')
26
 
27
+ # Track files that have been read this session (enforces read-before-write/edit)
28
+ _files_read: set[str] = set()
29
+
30
+
31
+ def _resolve_path(path: str) -> str:
32
+ try:
33
+ return str(Path(path).resolve())
34
+ except Exception:
35
+ return path
36
+
37
 
38
  def _atomic_write(path: Path, content: str) -> None:
39
  """Write file atomically via temp file + os.replace().
 
87
  meta = f"\n\n... ({omitted:,} of {total:,} chars omitted, showing first {head_budget:,} + last {tail_budget:,}) ...\n"
88
  if spill_path:
89
  meta += f"Full output saved to {spill_path} — use the read tool with offset/limit to inspect specific sections.\n"
90
+ meta += "IMPORTANT: The command has finished. Analyze the output above and continue with your next action.\n"
91
  return head + meta + tail
92
 
93
 
 
114
  output = "(no output)"
115
  return output, result.returncode == 0
116
  except subprocess.TimeoutExpired:
117
+ return (
118
+ f"Command timed out after {timeout}s and was killed.\n\n"
119
+ f"For long-running commands, run in the background and poll:\n"
120
+ f" nohup <command> > /tmp/output.log 2>&1 & echo $!\n"
121
+ f"Then check status with:\n"
122
+ f" kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
123
+ f" tail -n 50 /tmp/output.log"
124
+ ), False
125
  except Exception as e:
126
  return f"bash error: {e}", False
127
 
 
140
  except Exception as e:
141
  return f"read error: {e}", False
142
 
143
+ _files_read.add(_resolve_path(file_path))
 
 
 
 
 
 
 
 
 
 
144
 
145
  lines = raw_content.splitlines()
146
  offset = max((args.get("offset") or 1), 1)
 
153
  line = line[:MAX_LINE_LENGTH] + "..."
154
  numbered.append(f"{i:>6}\t{line}")
155
 
 
 
 
 
 
156
  return "\n".join(numbered), True
157
 
158
 
 
162
  if not file_path:
163
  return "No path provided.", False
164
  p = Path(file_path)
165
+ if p.exists() and _resolve_path(file_path) not in _files_read:
166
+ return (
167
+ f"You must read {file_path} before overwriting it. "
168
+ f"Use the read tool first to see current contents."
169
+ ), False
170
  try:
171
  _atomic_write(p, content)
172
+ _files_read.add(_resolve_path(file_path))
 
 
173
  msg = f"Wrote {len(content)} bytes to {file_path}"
174
  # Syntax validation for Python files
175
  if p.suffix == ".py":
 
199
  p = Path(file_path)
200
  if not p.exists():
201
  return f"File not found: {file_path}", False
202
+ if _resolve_path(file_path) not in _files_read:
203
+ return (
204
+ f"You must read {file_path} before editing it. "
205
+ f"Use the read tool first to see current contents."
206
+ ), False
207
 
208
  try:
209
  text = p.read_text()
 
222
  except Exception as e:
223
  return f"edit write error: {e}", False
224
 
 
 
 
 
225
  msg = f"Edited {file_path} ({replacements} replacement{'s' if replacements > 1 else ''})"
226
  if fuzzy_note:
227
  msg += f" {fuzzy_note}"
 
240
  "description": (
241
  "Run a shell command on the local machine and return stdout/stderr.\n"
242
  "\n"
243
+ "IMPORTANT: Do NOT use bash for file operations use the dedicated tools instead:\n"
244
+ "- To read files: use read (not cat/head/tail)\n"
245
+ "- To edit files: use edit (not sed/awk)\n"
246
+ "- To write files: use write (not echo/cat <<EOF)\n"
 
 
 
247
  "\n"
248
+ "Commands run in a shell at the working directory. Each invocation is independent.\n"
249
  "Chain dependent commands with &&. Independent commands should be "
250
  "separate bash calls (they can run in parallel).\n"
251
  "\n"
252
+ "For long-running commands (training, evaluation), run in the background and poll:\n"
253
+ " nohup <command> > /tmp/output.log 2>&1 & echo $!\n"
254
+ "Then check status:\n"
255
+ " kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
256
+ " tail -n 50 /tmp/output.log\n"
257
+ "\n"
258
+ "Timeout default 120s, max 36000s."
259
  ),
260
  "parameters": {
261
  "type": "object",
 
276
  },
277
  "timeout": {
278
  "type": "integer",
279
+ "description": "Optional timeout in seconds (default: 120, max: 36000).",
280
  },
281
  },
282
  },
283
  },
284
  "read": {
285
+ "description": (
286
+ "Reads a file from the local filesystem. Returns contents with line numbers "
287
+ "(cat -n format).\n"
288
+ "\n"
289
+ "Usage:\n"
290
+ "- By default, reads up to 2000 lines from the beginning of the file.\n"
291
+ "- You can optionally specify offset and limit for large files, but prefer "
292
+ "reading the whole file first.\n"
293
+ "- Lines longer than 4000 chars are truncated.\n"
294
+ "- Cannot read directories — use bash with 'ls' instead.\n"
295
+ "- You should read multiple potentially useful files in parallel when possible.\n"
296
+ "- IMPORTANT: Always read a file before editing or overwriting it. The edit and "
297
+ "write tools will reject operations on files you haven't read."
298
+ ),
299
+ "parameters": {
300
+ "type": "object",
301
+ "required": ["path"],
302
+ "additionalProperties": False,
303
+ "properties": {
304
+ "path": {
305
+ "type": "string",
306
+ "description": "Absolute path to the file to read.",
307
+ },
308
+ "offset": {
309
+ "type": "integer",
310
+ "description": "The line number to start reading from (1-based). Only provide if the file is too large to read at once.",
311
+ },
312
+ "limit": {
313
+ "type": "integer",
314
+ "description": "The number of lines to read. Only provide if the file is too large to read at once.",
315
+ },
316
+ },
317
+ },
318
  },
319
  "write": {
320
+ "description": (
321
+ "Writes a file to the local filesystem. Overwrites the existing file if one "
322
+ "exists at the path.\n"
323
+ "\n"
324
+ "- If this is an existing file, you MUST use the read tool first. This tool "
325
+ "will fail if you did not read the file first.\n"
326
+ "- ALWAYS prefer editing existing files with the edit tool over overwriting "
327
+ "with write.\n"
328
+ "- Creates parent directories as needed."
329
+ ),
330
+ "parameters": {
331
+ "type": "object",
332
+ "required": ["path", "content"],
333
+ "additionalProperties": False,
334
+ "properties": {
335
+ "path": {
336
+ "type": "string",
337
+ "description": "Absolute path to the file to write.",
338
+ },
339
+ "content": {
340
+ "type": "string",
341
+ "description": "The complete file content to write.",
342
+ },
343
+ },
344
+ },
345
  },
346
  "edit": {
347
+ "description": (
348
+ "Performs string replacements in files. Supports exact matching with "
349
+ "fuzzy fallback.\n"
350
+ "\n"
351
+ "Usage:\n"
352
+ "- You must read the file at least once before editing. This tool will "
353
+ "error if you attempt an edit without reading the file.\n"
354
+ "- The edit will FAIL if old_str is not unique in the file. Either provide "
355
+ "a larger string with more surrounding context to make it unique, or set "
356
+ "replace_all to true.\n"
357
+ "- old_str and new_str must differ.\n"
358
+ "- Preserve indentation exactly as it appears in the file.\n"
359
+ "- Do NOT include line number prefixes from read output in old_str or new_str.\n"
360
+ "- To delete code, set new_str to empty string.\n"
361
+ "- Use replace_all for renaming variables or strings across the file.\n"
362
+ "\n"
363
+ "Modes:\n"
364
+ "- replace (default): replace first occurrence of old_str with new_str.\n"
365
+ "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
366
+ "- prepend_before: insert new_str immediately before old_str (old_str is kept)."
367
+ ),
368
+ "parameters": {
369
+ "type": "object",
370
+ "required": ["path", "old_str", "new_str"],
371
+ "additionalProperties": False,
372
+ "properties": {
373
+ "path": {
374
+ "type": "string",
375
+ "description": "Absolute path to the file to edit.",
376
+ },
377
+ "old_str": {
378
+ "type": "string",
379
+ "description": "The text to find in the file. Must match exactly (fuzzy matching is used as fallback).",
380
+ },
381
+ "new_str": {
382
+ "type": "string",
383
+ "description": "The replacement text. For append_after/prepend_before modes, the text to insert.",
384
+ },
385
+ "replace_all": {
386
+ "type": "boolean",
387
+ "description": "Replace all occurrences of old_str (default: false).",
388
+ "default": False,
389
+ },
390
+ "mode": {
391
+ "type": "string",
392
+ "enum": ["replace", "append_after", "prepend_before"],
393
+ "description": "Edit mode (default: replace).",
394
+ "default": "replace",
395
+ },
396
+ },
397
+ },
398
  },
399
  }
400
 
agent/tools/research_tool.py CHANGED
@@ -14,10 +14,17 @@ from typing import Any
14
 
15
  from litellm import Message, acompletion
16
 
 
17
  from agent.core.session import Event
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
 
 
 
21
  # Tools the research agent can use (read-only subset)
22
  RESEARCH_TOOL_NAMES = {
23
  "read",
@@ -171,7 +178,7 @@ def _resolve_llm_params(model_name: str) -> dict:
171
  def _get_research_model(main_model: str) -> str:
172
  """Pick a cheaper model for research based on the main model."""
173
  if "anthropic/" in main_model:
174
- return "anthropic/claude-haiku-4-5-20251001"
175
  # For non-Anthropic models (HF router etc.), use the same model
176
  return main_model
177
 
@@ -221,12 +228,60 @@ async def research_handler(
221
 
222
  _tool_uses = 0
223
  _total_tokens = 0
 
224
 
225
  await _log("Starting research sub-agent...")
226
 
227
- # Run the research loop (max 20 iterations research should be focused)
228
- max_iterations = 20
229
  for _iteration in range(max_iterations):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  try:
231
  response = await acompletion(
232
  messages=messages,
@@ -242,7 +297,7 @@ async def research_handler(
242
 
243
  # Track tokens
244
  if response.usage:
245
- _total_tokens += response.usage.total_tokens
246
  await _log(f"tokens:{_total_tokens}")
247
 
248
  choice = response.choices[0]
@@ -308,8 +363,31 @@ async def research_handler(
308
  )
309
  )
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  return (
312
- "Research agent hit iteration limit (20). "
313
  "Partial findings may be incomplete — try a more focused task.",
314
  False,
315
  )
 
14
 
15
  from litellm import Message, acompletion
16
 
17
+ from agent.core.doom_loop import check_for_doom_loop
18
  from agent.core.session import Event
19
 
20
  logger = logging.getLogger(__name__)
21
 
22
+ # Context budget for the research subagent (tokens).
23
+ # When usage exceeds WARN threshold, the subagent is told to wrap up.
24
+ # At MAX, the loop is force-stopped and whatever content exists is returned.
25
+ _RESEARCH_CONTEXT_WARN = 170_000 # 85% of 200k
26
+ _RESEARCH_CONTEXT_MAX = 190_000
27
+
28
  # Tools the research agent can use (read-only subset)
29
  RESEARCH_TOOL_NAMES = {
30
  "read",
 
178
  def _get_research_model(main_model: str) -> str:
179
  """Pick a cheaper model for research based on the main model."""
180
  if "anthropic/" in main_model:
181
+ return "anthropic/claude-sonnet-4-6"
182
  # For non-Anthropic models (HF router etc.), use the same model
183
  return main_model
184
 
 
228
 
229
  _tool_uses = 0
230
  _total_tokens = 0
231
+ _warned_context = False
232
 
233
  await _log("Starting research sub-agent...")
234
 
235
+ # Run the research loop context budget is the real limiter
236
+ max_iterations = 60
237
  for _iteration in range(max_iterations):
238
+ # ── Doom-loop detection ──
239
+ doom_prompt = check_for_doom_loop(messages)
240
+ if doom_prompt:
241
+ logger.warning("Research sub-agent doom loop detected at iteration %d", _iteration)
242
+ await _log("Doom loop detected — injecting corrective prompt")
243
+ messages.append(Message(role="user", content=doom_prompt))
244
+
245
+ # ── Context budget: warn at 75%, hard-stop at 95% ──
246
+ if _total_tokens >= _RESEARCH_CONTEXT_MAX:
247
+ logger.warning(
248
+ "Research sub-agent hit context max (%d tokens) — forcing summary",
249
+ _total_tokens,
250
+ )
251
+ await _log(f"Context limit reached ({_total_tokens} tokens) — forcing wrap-up")
252
+ # Ask for a final summary with no tools
253
+ messages.append(Message(
254
+ role="user",
255
+ content=(
256
+ "[SYSTEM: CONTEXT LIMIT REACHED] You have used all available context. "
257
+ "Summarize your findings NOW. Do NOT call any more tools."
258
+ ),
259
+ ))
260
+ try:
261
+ response = await acompletion(
262
+ messages=messages,
263
+ tools=None, # no tools — force text response
264
+ stream=False,
265
+ timeout=120,
266
+ **llm_params,
267
+ )
268
+ content = response.choices[0].message.content or ""
269
+ return content or "Research context exhausted — no summary produced.", bool(content)
270
+ except Exception:
271
+ return "Research context exhausted and summary call failed.", False
272
+
273
+ if not _warned_context and _total_tokens >= _RESEARCH_CONTEXT_WARN:
274
+ _warned_context = True
275
+ await _log(f"Context at {_total_tokens} tokens — nudging to wrap up")
276
+ messages.append(Message(
277
+ role="user",
278
+ content=(
279
+ "[SYSTEM: You have used 75% of your context budget. "
280
+ "Start wrapping up: finish any critical lookups, then "
281
+ "produce your final summary within the next 1-2 iterations.]"
282
+ ),
283
+ ))
284
+
285
  try:
286
  response = await acompletion(
287
  messages=messages,
 
297
 
298
  # Track tokens
299
  if response.usage:
300
+ _total_tokens = response.usage.total_tokens
301
  await _log(f"tokens:{_total_tokens}")
302
 
303
  choice = response.choices[0]
 
363
  )
364
  )
365
 
366
+ # ── Iteration limit: try to salvage findings ──
367
+ await _log("Iteration limit reached — extracting summary")
368
+ messages.append(Message(
369
+ role="user",
370
+ content=(
371
+ "[SYSTEM: ITERATION LIMIT] You have reached the maximum number of research "
372
+ "iterations. Summarize ALL findings so far. Do NOT call any more tools."
373
+ ),
374
+ ))
375
+ try:
376
+ response = await acompletion(
377
+ messages=messages,
378
+ tools=None,
379
+ stream=False,
380
+ timeout=120,
381
+ **llm_params,
382
+ )
383
+ content = response.choices[0].message.content or ""
384
+ if content:
385
+ return content, True
386
+ except Exception as e:
387
+ logger.error("Research summary call failed: %s", e)
388
+
389
  return (
390
+ "Research agent hit iteration limit (60). "
391
  "Partial findings may be incomplete — try a more focused task.",
392
  False,
393
  )
agent/tools/sandbox_client.py CHANGED
@@ -57,7 +57,7 @@ HARDWARE_OPTIONS = [
57
  "a100-large",
58
  ]
59
  OUTPUT_LIMIT = 25000
60
- LINE_LIMIT = 2000
61
  DEFAULT_READ_LIMIT = 2000
62
  DEFAULT_TIMEOUT = 240
63
  MAX_TIMEOUT = 1200
@@ -855,22 +855,23 @@ class Sandbox:
855
  "description": (
856
  "Run a shell command in the remote sandbox and return stdout/stderr.\n"
857
  "\n"
858
- "Commands run in a shell at the working directory (default /app). "
859
- "Each invocation is independent use files in /app to persist state.\n"
860
- "\n"
861
- "AVOID using bash for operations covered by specialized tools:\n"
862
- "- File reading: use read (not cat/head/tail)\n"
863
- "- File editing: use edit (not sed/awk)\n"
864
- "- File writing: use write (not echo/cat <<EOF)\n"
865
- "\n"
866
- "For long-running tasks, background them:\n"
867
- " nohup uv run train.py > /app/train.log 2>&1 &\n"
868
- "Then check with read on the log file.\n"
869
  "\n"
 
 
870
  "Chain dependent commands with &&. Independent commands should be "
871
  "separate bash calls (they can run in parallel).\n"
872
  "\n"
873
- "Timeout default 120s, max 600s."
 
 
 
 
 
 
874
  ),
875
  "parameters": {
876
  "type": "object",
@@ -883,7 +884,7 @@ class Sandbox:
883
  },
884
  "description": {
885
  "type": "string",
886
- "description": "Short description (5-10 words, active voice). E.g. 'Install dependencies', 'Run training script'.",
887
  },
888
  "work_dir": {
889
  "type": "string",
@@ -891,20 +892,25 @@ class Sandbox:
891
  },
892
  "timeout": {
893
  "type": "integer",
894
- "description": "Timeout in seconds (default: 240, max: 1200).",
895
  },
896
  },
897
  },
898
  },
899
  "read": {
900
  "description": (
901
- "Read file contents with line numbers (cat -n format).\n"
902
- "\n"
903
- "Returns the first 2000 lines by default. For large files, use offset/limit "
904
- "to read a specific range. Line numbers always match the original file.\n"
905
  "\n"
906
- "Lines longer than 2000 chars are truncated.\n"
907
- "Cannot read directories use bash with 'ls' instead."
 
 
 
 
 
 
 
908
  ),
909
  "parameters": {
910
  "type": "object",
@@ -917,21 +923,25 @@ class Sandbox:
917
  },
918
  "offset": {
919
  "type": "integer",
920
- "description": "Start from this line (1-based). Only if file is too large.",
921
  },
922
  "limit": {
923
  "type": "integer",
924
- "description": "Number of lines to read. Only if file is too large.",
925
  },
926
  },
927
  },
928
  },
929
  "write": {
930
  "description": (
931
- "Create or overwrite a file. Creates parent directories as needed.\n"
 
932
  "\n"
933
- "For existing files, you MUST read the file first (system enforced). "
934
- "Prefer edit for modifications."
 
 
 
935
  ),
936
  "parameters": {
937
  "type": "object",
@@ -944,32 +954,32 @@ class Sandbox:
944
  },
945
  "content": {
946
  "type": "string",
947
- "description": "Complete file content.",
948
  },
949
  },
950
  },
951
  },
952
  "edit": {
953
  "description": (
954
- "Targeted edit via string replacement with fuzzy matching fallback.\n"
 
955
  "\n"
956
- "Modes:\n"
957
- "- replace (default): replace first occurrence of old_str with new_str.\n"
958
- "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
959
- "- prepend_before: insert new_str immediately before old_str (old_str is kept).\n"
960
- "\n"
961
- "Rules:\n"
962
- "- old_str must appear EXACTLY once (unless replace_all is true).\n"
963
- "- Include enough context in old_str for uniqueness.\n"
964
  "- old_str and new_str must differ.\n"
965
- "- Preserve indentation exactly.\n"
 
966
  "- To delete code, set new_str to empty string.\n"
967
- "- File MUST have been read this session (system enforced).\n"
968
- "- Do NOT include line number prefixes in old_str/new_str.\n"
969
  "\n"
970
- "If exact match fails, the tool automatically tries trimmed/normalized matching.\n"
971
- "Use replace_all=true for batch operations like variable renaming.\n"
972
- "Use append_after/prepend_before to insert code without replacing existing code."
 
973
  ),
974
  "parameters": {
975
  "type": "object",
@@ -978,16 +988,19 @@ class Sandbox:
978
  "properties": {
979
  "path": {
980
  "type": "string",
981
- "description": "Absolute path to the file.",
982
  },
983
  "old_str": {
984
  "type": "string",
985
- "description": "Text to find (fuzzy matching used as fallback).",
 
 
 
 
986
  },
987
- "new_str": {"type": "string", "description": "Replacement text (or text to insert for append_after/prepend_before)."},
988
  "replace_all": {
989
  "type": "boolean",
990
- "description": "Replace all occurrences (default: false).",
991
  "default": False,
992
  },
993
  "mode": {
 
57
  "a100-large",
58
  ]
59
  OUTPUT_LIMIT = 25000
60
+ LINE_LIMIT = 4000
61
  DEFAULT_READ_LIMIT = 2000
62
  DEFAULT_TIMEOUT = 240
63
  MAX_TIMEOUT = 1200
 
855
  "description": (
856
  "Run a shell command in the remote sandbox and return stdout/stderr.\n"
857
  "\n"
858
+ "IMPORTANT: Do NOT use bash for file operations use the dedicated tools instead:\n"
859
+ "- To read files: use read (not cat/head/tail)\n"
860
+ "- To edit files: use edit (not sed/awk)\n"
861
+ "- To write files: use write (not echo/cat <<EOF)\n"
 
 
 
 
 
 
 
862
  "\n"
863
+ "Commands run in a shell at /app. Each invocation is independent — "
864
+ "use files in /app to persist state.\n"
865
  "Chain dependent commands with &&. Independent commands should be "
866
  "separate bash calls (they can run in parallel).\n"
867
  "\n"
868
+ "For long-running commands (training, evaluation), run in the background and poll:\n"
869
+ " nohup <command> > /app/output.log 2>&1 & echo $!\n"
870
+ "Then check status:\n"
871
+ " kill -0 <PID> 2>/dev/null && echo 'running' || echo 'done'\n"
872
+ " tail -n 50 /app/output.log\n"
873
+ "\n"
874
+ "Timeout default 240s, max 1200s."
875
  ),
876
  "parameters": {
877
  "type": "object",
 
884
  },
885
  "description": {
886
  "type": "string",
887
+ "description": "Short description (5-10 words, active voice).",
888
  },
889
  "work_dir": {
890
  "type": "string",
 
892
  },
893
  "timeout": {
894
  "type": "integer",
895
+ "description": "Optional timeout in seconds (default: 240, max: 1200).",
896
  },
897
  },
898
  },
899
  },
900
  "read": {
901
  "description": (
902
+ "Reads a file from the sandbox filesystem. Returns contents with line "
903
+ "numbers (cat -n format).\n"
 
 
904
  "\n"
905
+ "Usage:\n"
906
+ "- By default, reads up to 2000 lines from the beginning of the file.\n"
907
+ "- You can optionally specify offset and limit for large files, but prefer "
908
+ "reading the whole file first.\n"
909
+ "- Lines longer than 4000 chars are truncated.\n"
910
+ "- Cannot read directories — use bash with 'ls' instead.\n"
911
+ "- You should read multiple potentially useful files in parallel when possible.\n"
912
+ "- IMPORTANT: Always read a file before editing or overwriting it. The edit and "
913
+ "write tools will reject operations on files you haven't read."
914
  ),
915
  "parameters": {
916
  "type": "object",
 
923
  },
924
  "offset": {
925
  "type": "integer",
926
+ "description": "The line number to start reading from (1-based). Only provide if the file is too large to read at once.",
927
  },
928
  "limit": {
929
  "type": "integer",
930
+ "description": "The number of lines to read. Only provide if the file is too large to read at once.",
931
  },
932
  },
933
  },
934
  },
935
  "write": {
936
  "description": (
937
+ "Writes a file to the sandbox filesystem. Overwrites the existing file if "
938
+ "one exists at the path.\n"
939
  "\n"
940
+ "- If this is an existing file, you MUST use the read tool first. This tool "
941
+ "will fail if you did not read the file first.\n"
942
+ "- ALWAYS prefer editing existing files with the edit tool over overwriting "
943
+ "with write.\n"
944
+ "- Creates parent directories as needed."
945
  ),
946
  "parameters": {
947
  "type": "object",
 
954
  },
955
  "content": {
956
  "type": "string",
957
+ "description": "The complete file content to write.",
958
  },
959
  },
960
  },
961
  },
962
  "edit": {
963
  "description": (
964
+ "Performs string replacements in files. Supports exact matching with "
965
+ "fuzzy fallback.\n"
966
  "\n"
967
+ "Usage:\n"
968
+ "- You must read the file at least once before editing. This tool will "
969
+ "error if you attempt an edit without reading the file.\n"
970
+ "- The edit will FAIL if old_str is not unique in the file. Either provide "
971
+ "a larger string with more surrounding context to make it unique, or set "
972
+ "replace_all to true.\n"
 
 
973
  "- old_str and new_str must differ.\n"
974
+ "- Preserve indentation exactly as it appears in the file.\n"
975
+ "- Do NOT include line number prefixes from read output in old_str or new_str.\n"
976
  "- To delete code, set new_str to empty string.\n"
977
+ "- Use replace_all for renaming variables or strings across the file.\n"
 
978
  "\n"
979
+ "Modes:\n"
980
+ "- replace (default): replace first occurrence of old_str with new_str.\n"
981
+ "- append_after: insert new_str immediately after old_str (old_str is kept).\n"
982
+ "- prepend_before: insert new_str immediately before old_str (old_str is kept)."
983
  ),
984
  "parameters": {
985
  "type": "object",
 
988
  "properties": {
989
  "path": {
990
  "type": "string",
991
+ "description": "Absolute path to the file to edit.",
992
  },
993
  "old_str": {
994
  "type": "string",
995
+ "description": "The text to find in the file. Must match exactly (fuzzy matching is used as fallback).",
996
+ },
997
+ "new_str": {
998
+ "type": "string",
999
+ "description": "The replacement text. For append_after/prepend_before modes, the text to insert.",
1000
  },
 
1001
  "replace_all": {
1002
  "type": "boolean",
1003
+ "description": "Replace all occurrences of old_str (default: false).",
1004
  "default": False,
1005
  },
1006
  "mode": {
agent/tools/sandbox_tool.py CHANGED
@@ -245,25 +245,6 @@ def _make_tool_handler(sandbox_tool_name: str):
245
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
246
  if result.success:
247
  output = result.output or "(no output)"
248
- cache = getattr(session, "file_content_cache", None)
249
- file_path = args.get("path", "")
250
-
251
- if sandbox_tool_name == "read" and cache and file_path:
252
- is_unchanged, last_turn = cache.check_unchanged(
253
- f"sandbox:{file_path}", output
254
- )
255
- if is_unchanged:
256
- return (
257
- f"[File unchanged since turn {last_turn}, "
258
- f"content already in context.]"
259
- ), True
260
- cache.record_read(
261
- f"sandbox:{file_path}", output, session.turn_count
262
- )
263
-
264
- if sandbox_tool_name in ("write", "edit") and cache and file_path:
265
- cache.clear_path(f"sandbox:{file_path}")
266
-
267
  return output, True
268
  else:
269
  error_msg = result.error or "Unknown error"
 
245
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
246
  if result.success:
247
  output = result.output or "(no output)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  return output, True
249
  else:
250
  error_msg = result.error or "Unknown error"
pyproject.toml CHANGED
@@ -3,7 +3,7 @@ name = "hf-agent"
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
6
- requires-python = ">=3.12"
7
  dependencies = [
8
  "datasets>=4.4.1",
9
  # Core dependencies (always required)
@@ -49,3 +49,13 @@ dev = [
49
  all = [
50
  "hf-agent[agent,eval,dev]",
51
  ]
 
 
 
 
 
 
 
 
 
 
 
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
6
+ requires-python = ">=3.11"
7
  dependencies = [
8
  "datasets>=4.4.1",
9
  # Core dependencies (always required)
 
49
  all = [
50
  "hf-agent[agent,eval,dev]",
51
  ]
52
+
53
+ [build-system]
54
+ requires = ["setuptools>=64"]
55
+ build-backend = "setuptools.build_meta"
56
+
57
+ [tool.setuptools.packages.find]
58
+ include = ["agent*"]
59
+
60
+ [tool.uv]
61
+ package = true
uv.lock CHANGED
@@ -871,7 +871,7 @@ wheels = [
871
  [[package]]
872
  name = "hf-agent"
873
  version = "0.1.0"
874
- source = { virtual = "." }
875
  dependencies = [
876
  { name = "datasets" },
877
  { name = "pydantic" },
@@ -890,6 +890,7 @@ agent = [
890
  { name = "nbformat" },
891
  { name = "prompt-toolkit" },
892
  { name = "requests" },
 
893
  { name = "thefuzz" },
894
  { name = "uvicorn", extra = ["standard"] },
895
  { name = "websockets" },
@@ -909,6 +910,7 @@ all = [
909
  { name = "prompt-toolkit" },
910
  { name = "pytest" },
911
  { name = "requests" },
 
912
  { name = "tenacity" },
913
  { name = "thefuzz" },
914
  { name = "uvicorn", extra = ["standard"] },
@@ -945,6 +947,7 @@ requires-dist = [
945
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
946
  { name = "python-dotenv", specifier = ">=1.2.1" },
947
  { name = "requests", marker = "extra == 'agent'", specifier = ">=2.32.5" },
 
948
  { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
949
  { name = "thefuzz", marker = "extra == 'agent'", specifier = ">=0.22.1" },
950
  { name = "uvicorn", extras = ["standard"], marker = "extra == 'agent'", specifier = ">=0.32.0" },
 
871
  [[package]]
872
  name = "hf-agent"
873
  version = "0.1.0"
874
+ source = { editable = "." }
875
  dependencies = [
876
  { name = "datasets" },
877
  { name = "pydantic" },
 
890
  { name = "nbformat" },
891
  { name = "prompt-toolkit" },
892
  { name = "requests" },
893
+ { name = "rich" },
894
  { name = "thefuzz" },
895
  { name = "uvicorn", extra = ["standard"] },
896
  { name = "websockets" },
 
910
  { name = "prompt-toolkit" },
911
  { name = "pytest" },
912
  { name = "requests" },
913
+ { name = "rich" },
914
  { name = "tenacity" },
915
  { name = "thefuzz" },
916
  { name = "uvicorn", extra = ["standard"] },
 
947
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
948
  { name = "python-dotenv", specifier = ">=1.2.1" },
949
  { name = "requests", marker = "extra == 'agent'", specifier = ">=2.32.5" },
950
+ { name = "rich", marker = "extra == 'agent'", specifier = ">=13.0.0" },
951
  { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
952
  { name = "thefuzz", marker = "extra == 'agent'", specifier = ">=0.22.1" },
953
  { name = "uvicorn", extras = ["standard"], marker = "extra == 'agent'", specifier = ">=0.32.0" },