akseljoonas HF Staff committed on
Commit
da5e0c7
·
1 Parent(s): 571b292

stream sandbox creation logs to tool output, fix hf_jobs final status race

Browse files
agent/tools/jobs_tool.py CHANGED
@@ -466,11 +466,17 @@ class HfJobsTool:
466
  await asyncio.sleep(retry_delay)
467
  continue
468
 
469
- # Fetch final job status
470
- job_info = await _async_call(
471
- self.api.inspect_job, job_id=job_id, namespace=namespace
472
- )
473
- final_status = job_info.status.stage
 
 
 
 
 
 
474
 
475
  return final_status, all_logs
476
 
@@ -548,6 +554,20 @@ class HfJobsTool:
548
  namespace=self.namespace,
549
  )
550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  # Filter out UV package installation output
552
  filtered_logs = _filter_uv_install_output(all_logs)
553
 
 
466
  await asyncio.sleep(retry_delay)
467
  continue
468
 
469
+ # Fetch final job status — retry briefly if still RUNNING
470
+ # (the API may lag a few seconds behind the log stream ending)
471
+ final_status = "UNKNOWN"
472
+ for _ in range(6):
473
+ job_info = await _async_call(
474
+ self.api.inspect_job, job_id=job_id, namespace=namespace
475
+ )
476
+ final_status = job_info.status.stage
477
+ if final_status in terminal_states:
478
+ break
479
+ await asyncio.sleep(2.5)
480
 
481
  return final_status, all_logs
482
 
 
554
  namespace=self.namespace,
555
  )
556
 
557
+ # Notify frontend of final status
558
+ if self.session and self.tool_call_id:
559
+ await self.session.send_event(
560
+ Event(
561
+ event_type="tool_state_change",
562
+ data={
563
+ "tool_call_id": self.tool_call_id,
564
+ "tool": "hf_jobs",
565
+ "state": final_status.lower(),
566
+ "jobUrl": job.url,
567
+ },
568
+ )
569
+ )
570
+
571
  # Filter out UV package installation output
572
  filtered_logs = _filter_uv_install_output(all_logs)
573
 
agent/tools/sandbox_client.py CHANGED
@@ -41,7 +41,7 @@ import sys
41
  import time
42
  import uuid
43
  from dataclasses import dataclass, field
44
- from typing import Any
45
 
46
  import httpx
47
  from huggingface_hub import CommitOperationAdd, HfApi
@@ -265,6 +265,7 @@ class Sandbox:
265
  sleep_time: int | None = None,
266
  token: str | None = None,
267
  wait_timeout: int = WAIT_TIMEOUT,
 
268
  ) -> Sandbox:
269
  """
270
  Create a new sandbox by duplicating the template Space.
@@ -286,13 +287,14 @@ class Sandbox:
286
  Returns:
287
  A Sandbox instance connected to the running Space.
288
  """
 
289
  api = HfApi(token=token)
290
 
291
  base = name or "sandbox"
292
  suffix = uuid.uuid4().hex[:8]
293
  space_id = f"{owner}/{base}-{suffix}"
294
 
295
- print(f"Creating sandbox: {space_id} (from {template})...")
296
 
297
  kwargs = {
298
  "from_id": template,
@@ -304,25 +306,25 @@ class Sandbox:
304
  kwargs["sleep_time"] = sleep_time
305
 
306
  api.duplicate_space(**kwargs)
307
- print(f"Space created: https://huggingface.co/spaces/{space_id}")
308
 
309
  # Upload sandbox server and Dockerfile (triggers rebuild)
310
- cls._setup_server(space_id, api)
311
 
312
  # Wait for it to come online (rebuild + start)
313
- print(f"Waiting for Space to start (timeout: {wait_timeout}s)...")
314
  deadline = time.time() + wait_timeout
315
  while time.time() < deadline:
316
  runtime = api.get_space_runtime(space_id)
317
  if runtime.stage == "RUNNING":
318
- print(f"Space is running (hardware: {runtime.hardware})")
319
  break
320
  if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
321
  raise RuntimeError(
322
  f"Space failed to start: {runtime.stage}. "
323
  f"Check https://huggingface.co/spaces/{space_id}"
324
  )
325
- print(f" {runtime.stage}...")
326
  time.sleep(WAIT_INTERVAL)
327
  else:
328
  raise TimeoutError(
@@ -333,17 +335,17 @@ class Sandbox:
333
  # Wait for the API server to be responsive (non-fatal)
334
  sb = cls(space_id=space_id, token=token, _owns_space=True)
335
  try:
336
- sb._wait_for_api(timeout=API_WAIT_TIMEOUT)
337
  except TimeoutError as e:
338
- print(
339
  f"Warning: API health check timed out ({e}), but Space is RUNNING. Continuing."
340
  )
341
  return sb
342
 
343
  @staticmethod
344
- def _setup_server(space_id: str, api: HfApi) -> None:
345
  """Upload embedded sandbox server + Dockerfile to the Space (single commit)."""
346
- print(f"Uploading sandbox server to {space_id}...")
347
  api.create_commit(
348
  repo_id=space_id,
349
  repo_type="space",
@@ -359,7 +361,7 @@ class Sandbox:
359
  ],
360
  commit_message="Setup sandbox server",
361
  )
362
- print("Server files uploaded, rebuild triggered.")
363
 
364
  @classmethod
365
  def connect(cls, space_id: str, *, token: str | None = None) -> Sandbox:
@@ -372,7 +374,7 @@ class Sandbox:
372
  sb._wait_for_api(timeout=60)
373
  return sb
374
 
375
- def _wait_for_api(self, timeout: int = API_WAIT_TIMEOUT):
376
  """Poll the health endpoint until the server responds."""
377
  deadline = time.time() + timeout
378
  last_err = None
@@ -382,7 +384,7 @@ class Sandbox:
382
  resp = self._client.get("health", timeout=10)
383
  last_status = resp.status_code
384
  if resp.status_code == 200:
385
- print(f"API is responsive at {self._base_url}")
386
  return
387
  except Exception as e:
388
  last_err = e
 
41
  import time
42
  import uuid
43
  from dataclasses import dataclass, field
44
+ from typing import Any, Callable
45
 
46
  import httpx
47
  from huggingface_hub import CommitOperationAdd, HfApi
 
265
  sleep_time: int | None = None,
266
  token: str | None = None,
267
  wait_timeout: int = WAIT_TIMEOUT,
268
+ log: "Callable[[str], object] | None" = None,
269
  ) -> Sandbox:
270
  """
271
  Create a new sandbox by duplicating the template Space.
 
287
  Returns:
288
  A Sandbox instance connected to the running Space.
289
  """
290
+ _log = log or print
291
  api = HfApi(token=token)
292
 
293
  base = name or "sandbox"
294
  suffix = uuid.uuid4().hex[:8]
295
  space_id = f"{owner}/{base}-{suffix}"
296
 
297
+ _log(f"Creating sandbox: {space_id} (from {template})...")
298
 
299
  kwargs = {
300
  "from_id": template,
 
306
  kwargs["sleep_time"] = sleep_time
307
 
308
  api.duplicate_space(**kwargs)
309
+ _log(f"Space created: https://huggingface.co/spaces/{space_id}")
310
 
311
  # Upload sandbox server and Dockerfile (triggers rebuild)
312
+ cls._setup_server(space_id, api, log=_log)
313
 
314
  # Wait for it to come online (rebuild + start)
315
+ _log(f"Waiting for Space to start (timeout: {wait_timeout}s)...")
316
  deadline = time.time() + wait_timeout
317
  while time.time() < deadline:
318
  runtime = api.get_space_runtime(space_id)
319
  if runtime.stage == "RUNNING":
320
+ _log(f"Space is running (hardware: {runtime.hardware})")
321
  break
322
  if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
323
  raise RuntimeError(
324
  f"Space failed to start: {runtime.stage}. "
325
  f"Check https://huggingface.co/spaces/{space_id}"
326
  )
327
+ _log(f" {runtime.stage}...")
328
  time.sleep(WAIT_INTERVAL)
329
  else:
330
  raise TimeoutError(
 
335
  # Wait for the API server to be responsive (non-fatal)
336
  sb = cls(space_id=space_id, token=token, _owns_space=True)
337
  try:
338
+ sb._wait_for_api(timeout=API_WAIT_TIMEOUT, log=_log)
339
  except TimeoutError as e:
340
+ _log(
341
  f"Warning: API health check timed out ({e}), but Space is RUNNING. Continuing."
342
  )
343
  return sb
344
 
345
  @staticmethod
346
+ def _setup_server(space_id: str, api: HfApi, *, log: Callable[[str], object] = print) -> None:
347
  """Upload embedded sandbox server + Dockerfile to the Space (single commit)."""
348
+ log(f"Uploading sandbox server to {space_id}...")
349
  api.create_commit(
350
  repo_id=space_id,
351
  repo_type="space",
 
361
  ],
362
  commit_message="Setup sandbox server",
363
  )
364
+ log("Server files uploaded, rebuild triggered.")
365
 
366
  @classmethod
367
  def connect(cls, space_id: str, *, token: str | None = None) -> Sandbox:
 
374
  sb._wait_for_api(timeout=60)
375
  return sb
376
 
377
+ def _wait_for_api(self, timeout: int = API_WAIT_TIMEOUT, log: Callable[[str], object] = print):
378
  """Poll the health endpoint until the server responds."""
379
  deadline = time.time() + timeout
380
  last_err = None
 
384
  resp = self._client.get("health", timeout=10)
385
  last_status = resp.status_code
386
  if resp.status_code == 200:
387
+ log(f"API is responsive at {self._base_url}")
388
  return
389
  except Exception as e:
390
  last_err = e
agent/tools/sandbox_tool.py CHANGED
@@ -89,7 +89,16 @@ async def _ensure_sandbox(
89
  )
90
  )
91
 
92
- kwargs = {"owner": owner, "hardware": hardware, "token": token, **create_kwargs}
 
 
 
 
 
 
 
 
 
93
  if hardware != "cpu-basic":
94
  kwargs["sleep_time"] = 1500
95
  sb = await asyncio.to_thread(Sandbox.create, **kwargs)
@@ -122,6 +131,10 @@ SANDBOX_CREATE_TOOL_SPEC = {
122
  "or the script is copied from a verified working example with minimal changes.\n\n"
123
  "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
124
  "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
 
 
 
 
125
  "Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
126
  ),
127
  "parameters": {
 
89
  )
90
  )
91
 
92
+ # Thread-safe log callback: posts tool_log events from the worker thread
93
+ loop = asyncio.get_running_loop()
94
+
95
+ def _log(msg: str) -> None:
96
+ loop.call_soon_threadsafe(
97
+ session.event_queue.put_nowait,
98
+ Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
99
+ )
100
+
101
+ kwargs = {"owner": owner, "hardware": hardware, "token": token, "log": _log, **create_kwargs}
102
  if hardware != "cpu-basic":
103
  kwargs["sleep_time"] = 1500
104
  sb = await asyncio.to_thread(Sandbox.create, **kwargs)
 
131
  "or the script is copied from a verified working example with minimal changes.\n\n"
132
  "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
133
  "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
134
+ "Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
135
+ "fp32 ≈ 4 bytes/param, plus ~20% overhead for optimizer states during training.\n"
136
+ "Common picks: t4-small (16GB VRAM, fits ≤1-3B), a10g-small (24GB, ≤7B), a100-large (80GB, ≤30B). "
137
+ "If the model won't fit, pick larger hardware upfront — OOM on a sandbox wastes time.\n\n"
138
  "Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
139
  ),
140
  "parameters": {
frontend/src/components/Chat/ActivityStatusBar.tsx CHANGED
@@ -9,8 +9,8 @@ const shimmer = keyframes`
9
  `;
10
 
11
  const TOOL_LABELS: Record<string, string> = {
12
- sandbox_create: 'Creating sandbox, this might take 1-2 minutes',
13
- hf_jobs: 'Running job',
14
  hf_repo_files: 'Uploading file',
15
  hf_repo_git: 'Git operation',
16
  hf_inspect_dataset: 'Inspecting dataset',
 
9
  `;
10
 
11
  const TOOL_LABELS: Record<string, string> = {
12
+ sandbox_create: 'Creating sandbox for code development, this might take 1-2 minutes',
13
+ hf_jobs: 'Running a GPU job, this might take a while',
14
  hf_repo_files: 'Uploading file',
15
  hf_repo_git: 'Git operation',
16
  hf_inspect_dataset: 'Inspecting dataset',
frontend/src/hooks/useAgentChat.ts CHANGED
@@ -98,12 +98,13 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
98
  },
99
  onToolLog: (tool: string, log: string) => {
100
  if (!isActiveRef.current) return;
101
- if (tool === 'hf_jobs') {
102
  const state = useAgentStore.getState();
103
  const existingOutput = state.panelData?.output?.content || '';
 
104
  const newContent = existingOutput
105
  ? existingOutput + '\n' + log
106
- : '--- Job execution started ---\n' + log;
107
 
108
  setPanelOutput({ content: newContent, language: 'text' });
109
 
 
98
  },
99
  onToolLog: (tool: string, log: string) => {
100
  if (!isActiveRef.current) return;
101
+ if (tool === 'hf_jobs' || tool === 'sandbox') {
102
  const state = useAgentStore.getState();
103
  const existingOutput = state.panelData?.output?.content || '';
104
+ const header = tool === 'sandbox' ? '--- Sandbox creation ---' : '--- Job execution started ---';
105
  const newContent = existingOutput
106
  ? existingOutput + '\n' + log
107
+ : header + '\n' + log;
108
 
109
  setPanelOutput({ content: newContent, language: 'text' });
110