akseljoonas HF Staff Claude Opus 4.6 committed on
Commit
3c91fc8
·
1 Parent(s): e229826

feat: kill sandbox processes and cancel HF jobs on user interrupt

Browse files

When the user clicks cancel, long-running sandbox commands and HF jobs
now actually stop instead of running to completion in the background.

- Sandbox server: track PIDs with Popen, add /api/kill endpoint
- Sandbox client: add kill_all() method
- HF jobs: track running job IDs in session, cancel on interrupt
- Agent loop: cleanup on cancel in both run_agent and exec_approval
- exec_approval: add cancellation support (was completely missing)
- Sandbox title: set descriptive name instead of inheriting template

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

agent/core/agent_loop.py CHANGED
@@ -146,6 +146,27 @@ def _needs_approval(
146
  return False
147
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  async def _compact_and_notify(session: Session) -> None:
150
  """Run compaction and send event if context was reduced."""
151
  old_length = session.context_manager.context_length
@@ -164,6 +185,32 @@ async def _compact_and_notify(session: Session) -> None:
164
  )
165
 
166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  class Handlers:
168
  """Handler functions for each operation type"""
169
 
@@ -247,17 +294,37 @@ class Handlers:
247
  messages = session.context_manager.get_messages()
248
  tools = session.tool_router.get_tool_specs_for_llm()
249
  try:
250
- # ── Stream the LLM response ──────────────────────────
251
  llm_params = _resolve_hf_router_params(session.config.model_name)
252
- response = await acompletion(
253
- messages=messages,
254
- tools=tools,
255
- tool_choice="auto",
256
- stream=True,
257
- stream_options={"include_usage": True},
258
- timeout=600, # 10 min β€” long tool-use turns can take a while
259
- **llm_params,
260
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  full_content = ""
263
  tool_calls_acc: dict[int, dict] = {}
@@ -355,8 +422,8 @@ class Handlers:
355
  )
356
  await session.send_event(
357
  Event(
358
- event_type="error",
359
- data={"error": f"Output truncated β€” retrying with smaller content ({dropped_names})"},
360
  )
361
  )
362
  iteration += 1
@@ -510,6 +577,7 @@ class Handlers:
510
  await gather_task
511
  except asyncio.CancelledError:
512
  pass
 
513
  break
514
 
515
  cancel_task.cancel()
@@ -593,6 +661,7 @@ class Handlers:
593
  break
594
 
595
  if session.is_cancelled:
 
596
  await session.send_event(Event(event_type="interrupted"))
597
  elif not errored:
598
  await session.send_event(
@@ -743,16 +812,37 @@ class Handlers:
743
 
744
  return (tc, tool_name, output, success, was_edited)
745
 
746
- # Execute all approved tools concurrently and wait for ALL to complete
747
  if approved_tasks:
748
- results = await asyncio.gather(
749
  *[
750
  execute_tool(tc, tool_name, tool_args, was_edited)
751
  for tc, tool_name, tool_args, was_edited in approved_tasks
752
  ],
753
  return_exceptions=True,
 
 
 
 
 
 
754
  )
755
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
756
  # Process results and add to context
757
  for result in results:
758
  if isinstance(result, Exception):
 
146
  return False
147
 
148
 
149
+ # -- LLM retry constants --------------------------------------------------
150
+ _MAX_LLM_RETRIES = 3
151
+ _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
152
+
153
+
154
+ def _is_transient_error(error: Exception) -> bool:
155
+ """Return True for errors that are likely transient and worth retrying."""
156
+ err_str = str(error).lower()
157
+ transient_patterns = [
158
+ "timeout", "timed out",
159
+ "429", "rate limit", "rate_limit",
160
+ "503", "service unavailable",
161
+ "502", "bad gateway",
162
+ "500", "internal server error",
163
+ "overloaded", "capacity",
164
+ "connection reset", "connection refused", "connection error",
165
+ "eof", "broken pipe",
166
+ ]
167
+ return any(pattern in err_str for pattern in transient_patterns)
168
+
169
+
170
  async def _compact_and_notify(session: Session) -> None:
171
  """Run compaction and send event if context was reduced."""
172
  old_length = session.context_manager.context_length
 
185
  )
186
 
187
 
188
async def _cleanup_on_cancel(session: Session) -> None:
    """Best-effort cleanup when the user interrupts a turn.

    Stops any bash processes still running in the sandbox, then cancels
    every Hugging Face job the session has in flight. Failures are logged
    and swallowed so cleanup never masks the interrupt itself.
    """
    # Stop anything still executing in the sandbox first.
    sb = getattr(session, "sandbox", None)
    if sb:
        try:
            await asyncio.to_thread(sb.kill_all)
        except Exception as exc:
            logger.warning("Failed to kill sandbox processes: %s", exc)
        else:
            logger.info("Killed sandbox processes on cancel")

    # Then cancel any HF jobs that were launched during this turn.
    pending = list(session._running_job_ids)
    if pending:
        from huggingface_hub import HfApi

        hf_api = HfApi(token=session.hf_token)
        for jid in pending:
            try:
                await asyncio.to_thread(hf_api.cancel_job, job_id=jid)
                logger.info("Cancelled HF job %s on interrupt", jid)
            except Exception as exc:
                logger.warning("Failed to cancel HF job %s: %s", jid, exc)
    session._running_job_ids.clear()
212
+
213
+
214
  class Handlers:
215
  """Handler functions for each operation type"""
216
 
 
294
  messages = session.context_manager.get_messages()
295
  tools = session.tool_router.get_tool_specs_for_llm()
296
  try:
297
+ # ── Stream the LLM response (with retry for transient errors) ──
298
  llm_params = _resolve_hf_router_params(session.config.model_name)
299
+ response = None
300
+ for _llm_attempt in range(_MAX_LLM_RETRIES):
301
+ try:
302
+ response = await acompletion(
303
+ messages=messages,
304
+ tools=tools,
305
+ tool_choice="auto",
306
+ stream=True,
307
+ stream_options={"include_usage": True},
308
+ timeout=600,
309
+ **llm_params,
310
+ )
311
+ break
312
+ except ContextWindowExceededError:
313
+ raise
314
+ except Exception as e:
315
+ if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
316
+ _delay = _LLM_RETRY_DELAYS[_llm_attempt]
317
+ logger.warning(
318
+ "Transient LLM error (attempt %d/%d): %s β€” retrying in %ds",
319
+ _llm_attempt + 1, _MAX_LLM_RETRIES, e, _delay,
320
+ )
321
+ await session.send_event(Event(
322
+ event_type="tool_log",
323
+ data={"tool": "system", "log": f"LLM connection error, retrying in {_delay}s..."},
324
+ ))
325
+ await asyncio.sleep(_delay)
326
+ continue
327
+ raise
328
 
329
  full_content = ""
330
  tool_calls_acc: dict[int, dict] = {}
 
422
  )
423
  await session.send_event(
424
  Event(
425
+ event_type="tool_log",
426
+ data={"tool": "system", "log": f"Output truncated β€” retrying with smaller content ({dropped_names})"},
427
  )
428
  )
429
  iteration += 1
 
577
  await gather_task
578
  except asyncio.CancelledError:
579
  pass
580
+ await _cleanup_on_cancel(session)
581
  break
582
 
583
  cancel_task.cancel()
 
661
  break
662
 
663
  if session.is_cancelled:
664
+ await _cleanup_on_cancel(session)
665
  await session.send_event(Event(event_type="interrupted"))
666
  elif not errored:
667
  await session.send_event(
 
812
 
813
  return (tc, tool_name, output, success, was_edited)
814
 
815
+ # Execute all approved tools concurrently (cancellable)
816
  if approved_tasks:
817
+ gather_task = asyncio.ensure_future(asyncio.gather(
818
  *[
819
  execute_tool(tc, tool_name, tool_args, was_edited)
820
  for tc, tool_name, tool_args, was_edited in approved_tasks
821
  ],
822
  return_exceptions=True,
823
+ ))
824
+ cancel_task = asyncio.ensure_future(session._cancelled.wait())
825
+
826
+ done, _ = await asyncio.wait(
827
+ [gather_task, cancel_task],
828
+ return_when=asyncio.FIRST_COMPLETED,
829
  )
830
 
831
+ if cancel_task in done:
832
+ gather_task.cancel()
833
+ try:
834
+ await gather_task
835
+ except asyncio.CancelledError:
836
+ pass
837
+ await _cleanup_on_cancel(session)
838
+ await session.send_event(Event(event_type="interrupted"))
839
+ session.increment_turn()
840
+ await session.auto_save_if_needed()
841
+ return
842
+
843
+ cancel_task.cancel()
844
+ results = gather_task.result()
845
+
846
  # Process results and add to context
847
  for result in results:
848
  if isinstance(result, Exception):
agent/core/session.py CHANGED
@@ -103,6 +103,7 @@ class Session:
103
  self._cancelled = asyncio.Event()
104
  self.pending_approval: Optional[dict[str, Any]] = None
105
  self.sandbox = None
 
106
 
107
  # Session trajectory logging
108
  self.logged_events: list[dict] = []
 
103
  self._cancelled = asyncio.Event()
104
  self.pending_approval: Optional[dict[str, Any]] = None
105
  self.sandbox = None
106
+ self._running_job_ids: set[str] = set() # HF job IDs currently executing
107
 
108
  # Session trajectory logging
109
  self.logged_events: list[dict] = []
agent/tools/jobs_tool.py CHANGED
@@ -531,6 +531,10 @@ class HfJobsTool:
531
  namespace=self.namespace,
532
  )
533
 
 
 
 
 
534
  # Send job URL immediately after job creation (before waiting for completion)
535
  if self.session and self.tool_call_id:
536
  await self.session.send_event(
@@ -554,6 +558,10 @@ class HfJobsTool:
554
  namespace=self.namespace,
555
  )
556
 
 
 
 
 
557
  # Notify frontend of final status
558
  if self.session and self.tool_call_id:
559
  await self.session.send_event(
 
531
  namespace=self.namespace,
532
  )
533
 
534
+ # Track job ID for cancellation on interrupt
535
+ if self.session:
536
+ self.session._running_job_ids.add(job.id)
537
+
538
  # Send job URL immediately after job creation (before waiting for completion)
539
  if self.session and self.tool_call_id:
540
  await self.session.send_event(
 
558
  namespace=self.namespace,
559
  )
560
 
561
+ # Untrack job ID (completed or failed, no longer needs cancellation)
562
+ if self.session:
563
+ self.session._running_job_ids.discard(job.id)
564
+
565
  # Notify frontend of final status
566
  if self.session and self.tool_call_id:
567
  await self.session.send_event(
agent/tools/sandbox_client.py CHANGED
@@ -97,7 +97,7 @@ CMD ["python", "sandbox_server.py"]
97
 
98
  _SANDBOX_SERVER = '''\
99
  """Minimal FastAPI server for sandbox operations."""
100
- import os, subprocess, pathlib
101
  from fastapi import FastAPI
102
  from pydantic import BaseModel
103
  from typing import Optional
@@ -105,6 +105,10 @@ import uvicorn
105
 
106
  app = FastAPI()
107
 
 
 
 
 
108
  class BashReq(BaseModel):
109
  command: str
110
  work_dir: str = "/app"
@@ -135,19 +139,49 @@ def health():
135
  @app.post("/api/bash")
136
  def bash(req: BashReq):
137
  try:
138
- r = subprocess.run(
139
- req.command, shell=True, capture_output=True, text=True,
140
- cwd=req.work_dir, timeout=req.timeout,
141
  )
142
- output = r.stdout + r.stderr
143
- if len(output) > 30000:
144
- output = output[:30000] + "\\n... (truncated)"
145
- return {"success": r.returncode == 0, "output": output, "error": "" if r.returncode == 0 else f"Exit code {r.returncode}"}
146
- except subprocess.TimeoutExpired:
147
- return {"success": False, "output": "", "error": f"Timeout after {req.timeout}s"}
 
 
 
 
 
 
 
 
 
 
 
 
148
  except Exception as e:
149
  return {"success": False, "output": "", "error": str(e)}
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  @app.post("/api/read")
152
  def read(req: ReadReq):
153
  try:
@@ -566,6 +600,10 @@ class Sandbox:
566
  },
567
  )
568
 
 
 
 
 
569
  # ── Tool schemas & dispatch ───────────────────────────────────
570
 
571
  TOOLS = {
 
97
 
98
  _SANDBOX_SERVER = '''\
99
  """Minimal FastAPI server for sandbox operations."""
100
+ import os, subprocess, pathlib, signal, threading
101
  from fastapi import FastAPI
102
  from pydantic import BaseModel
103
  from typing import Optional
 
105
 
106
  app = FastAPI()
107
 
108
+ # Track active bash processes so they can be killed on cancel
109
+ _active_procs = {} # pid -> subprocess.Popen
110
+ _proc_lock = threading.Lock()
111
+
112
  class BashReq(BaseModel):
113
  command: str
114
  work_dir: str = "/app"
 
139
@app.post("/api/bash")
def bash(req: BashReq):
    """Run a shell command, tracking its PID so /api/kill can stop it."""
    try:
        # start_new_session=True puts the command in its own process group,
        # so the whole group (command plus any children) can be signalled
        # at once via os.killpg.
        proc = subprocess.Popen(
            req.command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            text=True, cwd=req.work_dir, start_new_session=True,
        )
        # Register the process so a concurrent /api/kill request can find it.
        with _proc_lock:
            _active_procs[proc.pid] = proc
        try:
            stdout, stderr = proc.communicate(timeout=req.timeout)
            output = stdout + stderr
            # Cap the payload returned to the agent.
            if len(output) > 30000:
                output = output[:30000] + "\\n... (truncated)"
            return {"success": proc.returncode == 0, "output": output, "error": "" if proc.returncode == 0 else f"Exit code {proc.returncode}"}
        except subprocess.TimeoutExpired:
            # Kill the whole process group; fall back to just the child if
            # the group lookup fails (e.g. the process already exited).
            try:
                os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
            except OSError:
                proc.kill()
            proc.wait()  # reap the child so it doesn't linger as a zombie
            return {"success": False, "output": "", "error": f"Timeout after {req.timeout}s"}
        finally:
            # Always unregister, whether the command finished or timed out.
            with _proc_lock:
                _active_procs.pop(proc.pid, None)
    except Exception as e:
        return {"success": False, "output": "", "error": str(e)}
166
 
167
@app.post("/api/kill")
def kill_all():
    """Kill all active bash processes. Called when user cancels."""
    # Snapshot the PIDs under the lock; signalling happens afterwards so a
    # slow kill cannot block /api/bash handlers updating the registry.
    with _proc_lock:
        pids = list(_active_procs.keys())
    killed = []
    for pid in pids:
        try:
            # Commands run in their own session (start_new_session in
            # /api/bash), so killing the group takes down children too.
            os.killpg(os.getpgid(pid), signal.SIGTERM)
            killed.append(pid)
        except OSError:
            # Group lookup/signal failed -- fall back to the single PID.
            try:
                os.kill(pid, signal.SIGKILL)
                killed.append(pid)
            except OSError:
                pass  # process already gone; nothing to do
    return {"success": True, "output": f"Killed {len(killed)} process(es): {killed}", "error": ""}
184
+
185
  @app.post("/api/read")
186
  def read(req: ReadReq):
187
  try:
 
600
  },
601
  )
602
 
603
+ def kill_all(self) -> ToolResult:
604
+ """Kill all active bash processes on the sandbox. Used on cancellation."""
605
+ return self._call("kill", {})
606
+
607
  # ── Tool schemas & dispatch ───────────────────────────────────
608
 
609
  TOOLS = {
agent/tools/sandbox_tool.py CHANGED
@@ -27,11 +27,17 @@ def _looks_like_path(script: str) -> bool:
27
  isinstance(script, str)
28
  and script.strip() == script
29
  and not any(c in script for c in "\r\n\0")
30
- and (script.startswith("/") or script.startswith("./") or script.startswith("../"))
 
 
 
 
31
  )
32
 
33
 
34
- async def resolve_sandbox_script(sandbox: Any, script: str) -> tuple[str | None, str | None]:
 
 
35
  """Read a file from the sandbox if *script* looks like a path.
36
 
37
  Returns:
@@ -42,15 +48,14 @@ async def resolve_sandbox_script(sandbox: Any, script: str) -> tuple[str | None,
42
  if not sandbox or not _looks_like_path(script):
43
  return None, None
44
  try:
45
- result = await asyncio.to_thread(
46
- sandbox.bash, f"cat {shlex.quote(script)}"
47
- )
48
  if result.success and result.output:
49
  return result.output, None
50
  return None, f"Failed to read {script} from sandbox: {result.error}"
51
  except Exception as e:
52
  return None, f"Failed to read {script} from sandbox: {e}"
53
 
 
54
  # ── Tool name mapping (short agent names β†’ Sandbox client names) ──────
55
 
56
 
@@ -98,12 +103,30 @@ async def _ensure_sandbox(
98
  Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
99
  )
100
 
101
- kwargs = {"owner": owner, "hardware": hardware, "token": token, "log": _log, **create_kwargs}
 
 
 
 
 
 
102
  if hardware != "cpu-basic":
103
  kwargs["sleep_time"] = 2700
104
  sb = await asyncio.to_thread(Sandbox.create, **kwargs)
105
  session.sandbox = sb
106
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Inject the OAuth token into the sandbox so Hub operations work inside it
108
  await asyncio.to_thread(api.add_space_secret, sb.space_id, "HF_TOKEN", token)
109
 
 
27
  isinstance(script, str)
28
  and script.strip() == script
29
  and not any(c in script for c in "\r\n\0")
30
+ and (
31
+ script.startswith("/")
32
+ or script.startswith("./")
33
+ or script.startswith("../")
34
+ )
35
  )
36
 
37
 
38
+ async def resolve_sandbox_script(
39
+ sandbox: Any, script: str
40
+ ) -> tuple[str | None, str | None]:
41
  """Read a file from the sandbox if *script* looks like a path.
42
 
43
  Returns:
 
48
  if not sandbox or not _looks_like_path(script):
49
  return None, None
50
  try:
51
+ result = await asyncio.to_thread(sandbox.bash, f"cat {shlex.quote(script)}")
 
 
52
  if result.success and result.output:
53
  return result.output, None
54
  return None, f"Failed to read {script} from sandbox: {result.error}"
55
  except Exception as e:
56
  return None, f"Failed to read {script} from sandbox: {e}"
57
 
58
+
59
  # ── Tool name mapping (short agent names β†’ Sandbox client names) ──────
60
 
61
 
 
103
  Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
104
  )
105
 
106
+ kwargs = {
107
+ "owner": owner,
108
+ "hardware": hardware,
109
+ "token": token,
110
+ "log": _log,
111
+ **create_kwargs,
112
+ }
113
  if hardware != "cpu-basic":
114
  kwargs["sleep_time"] = 2700
115
  sb = await asyncio.to_thread(Sandbox.create, **kwargs)
116
  session.sandbox = sb
117
 
118
+ # Set a descriptive title (template title is inherited on duplicate)
119
+ from huggingface_hub import metadata_update
120
+
121
+ await asyncio.to_thread(
122
+ metadata_update,
123
+ sb.space_id,
124
+ {"title": "ml-agent sandbox"},
125
+ repo_type="space",
126
+ overwrite=True,
127
+ token=token,
128
+ )
129
+
130
  # Inject the OAuth token into the sandbox so Hub operations work inside it
131
  await asyncio.to_thread(api.add_space_secret, sb.space_id, "HF_TOKEN", token)
132