Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
da5e0c7
1
Parent(s): 571b292
stream sandbox creation logs to tool output, fix hf_jobs final status race
Browse files
agent/tools/jobs_tool.py
CHANGED
|
@@ -466,11 +466,17 @@ class HfJobsTool:
|
|
| 466 |
await asyncio.sleep(retry_delay)
|
| 467 |
continue
|
| 468 |
|
| 469 |
-
# Fetch final job status
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
)
|
| 473 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
return final_status, all_logs
|
| 476 |
|
|
@@ -548,6 +554,20 @@ class HfJobsTool:
|
|
| 548 |
namespace=self.namespace,
|
| 549 |
)
|
| 550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
# Filter out UV package installation output
|
| 552 |
filtered_logs = _filter_uv_install_output(all_logs)
|
| 553 |
|
|
|
|
| 466 |
await asyncio.sleep(retry_delay)
|
| 467 |
continue
|
| 468 |
|
| 469 |
+
# Fetch final job status — retry briefly if still RUNNING
|
| 470 |
+
# (the API may lag a few seconds behind the log stream ending)
|
| 471 |
+
final_status = "UNKNOWN"
|
| 472 |
+
for _ in range(6):
|
| 473 |
+
job_info = await _async_call(
|
| 474 |
+
self.api.inspect_job, job_id=job_id, namespace=namespace
|
| 475 |
+
)
|
| 476 |
+
final_status = job_info.status.stage
|
| 477 |
+
if final_status in terminal_states:
|
| 478 |
+
break
|
| 479 |
+
await asyncio.sleep(2.5)
|
| 480 |
|
| 481 |
return final_status, all_logs
|
| 482 |
|
|
|
|
| 554 |
namespace=self.namespace,
|
| 555 |
)
|
| 556 |
|
| 557 |
+
# Notify frontend of final status
|
| 558 |
+
if self.session and self.tool_call_id:
|
| 559 |
+
await self.session.send_event(
|
| 560 |
+
Event(
|
| 561 |
+
event_type="tool_state_change",
|
| 562 |
+
data={
|
| 563 |
+
"tool_call_id": self.tool_call_id,
|
| 564 |
+
"tool": "hf_jobs",
|
| 565 |
+
"state": final_status.lower(),
|
| 566 |
+
"jobUrl": job.url,
|
| 567 |
+
},
|
| 568 |
+
)
|
| 569 |
+
)
|
| 570 |
+
|
| 571 |
# Filter out UV package installation output
|
| 572 |
filtered_logs = _filter_uv_install_output(all_logs)
|
| 573 |
|
agent/tools/sandbox_client.py
CHANGED
|
@@ -41,7 +41,7 @@ import sys
|
|
| 41 |
import time
|
| 42 |
import uuid
|
| 43 |
from dataclasses import dataclass, field
|
| 44 |
-
from typing import Any
|
| 45 |
|
| 46 |
import httpx
|
| 47 |
from huggingface_hub import CommitOperationAdd, HfApi
|
|
@@ -265,6 +265,7 @@ class Sandbox:
|
|
| 265 |
sleep_time: int | None = None,
|
| 266 |
token: str | None = None,
|
| 267 |
wait_timeout: int = WAIT_TIMEOUT,
|
|
|
|
| 268 |
) -> Sandbox:
|
| 269 |
"""
|
| 270 |
Create a new sandbox by duplicating the template Space.
|
|
@@ -286,13 +287,14 @@ class Sandbox:
|
|
| 286 |
Returns:
|
| 287 |
A Sandbox instance connected to the running Space.
|
| 288 |
"""
|
|
|
|
| 289 |
api = HfApi(token=token)
|
| 290 |
|
| 291 |
base = name or "sandbox"
|
| 292 |
suffix = uuid.uuid4().hex[:8]
|
| 293 |
space_id = f"{owner}/{base}-{suffix}"
|
| 294 |
|
| 295 |
-
|
| 296 |
|
| 297 |
kwargs = {
|
| 298 |
"from_id": template,
|
|
@@ -304,25 +306,25 @@ class Sandbox:
|
|
| 304 |
kwargs["sleep_time"] = sleep_time
|
| 305 |
|
| 306 |
api.duplicate_space(**kwargs)
|
| 307 |
-
|
| 308 |
|
| 309 |
# Upload sandbox server and Dockerfile (triggers rebuild)
|
| 310 |
-
cls._setup_server(space_id, api)
|
| 311 |
|
| 312 |
# Wait for it to come online (rebuild + start)
|
| 313 |
-
|
| 314 |
deadline = time.time() + wait_timeout
|
| 315 |
while time.time() < deadline:
|
| 316 |
runtime = api.get_space_runtime(space_id)
|
| 317 |
if runtime.stage == "RUNNING":
|
| 318 |
-
|
| 319 |
break
|
| 320 |
if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
|
| 321 |
raise RuntimeError(
|
| 322 |
f"Space failed to start: {runtime.stage}. "
|
| 323 |
f"Check https://huggingface.co/spaces/{space_id}"
|
| 324 |
)
|
| 325 |
-
|
| 326 |
time.sleep(WAIT_INTERVAL)
|
| 327 |
else:
|
| 328 |
raise TimeoutError(
|
|
@@ -333,17 +335,17 @@ class Sandbox:
|
|
| 333 |
# Wait for the API server to be responsive (non-fatal)
|
| 334 |
sb = cls(space_id=space_id, token=token, _owns_space=True)
|
| 335 |
try:
|
| 336 |
-
sb._wait_for_api(timeout=API_WAIT_TIMEOUT)
|
| 337 |
except TimeoutError as e:
|
| 338 |
-
|
| 339 |
f"Warning: API health check timed out ({e}), but Space is RUNNING. Continuing."
|
| 340 |
)
|
| 341 |
return sb
|
| 342 |
|
| 343 |
@staticmethod
|
| 344 |
-
def _setup_server(space_id: str, api: HfApi) -> None:
|
| 345 |
"""Upload embedded sandbox server + Dockerfile to the Space (single commit)."""
|
| 346 |
-
|
| 347 |
api.create_commit(
|
| 348 |
repo_id=space_id,
|
| 349 |
repo_type="space",
|
|
@@ -359,7 +361,7 @@ class Sandbox:
|
|
| 359 |
],
|
| 360 |
commit_message="Setup sandbox server",
|
| 361 |
)
|
| 362 |
-
|
| 363 |
|
| 364 |
@classmethod
|
| 365 |
def connect(cls, space_id: str, *, token: str | None = None) -> Sandbox:
|
|
@@ -372,7 +374,7 @@ class Sandbox:
|
|
| 372 |
sb._wait_for_api(timeout=60)
|
| 373 |
return sb
|
| 374 |
|
| 375 |
-
def _wait_for_api(self, timeout: int = API_WAIT_TIMEOUT):
|
| 376 |
"""Poll the health endpoint until the server responds."""
|
| 377 |
deadline = time.time() + timeout
|
| 378 |
last_err = None
|
|
@@ -382,7 +384,7 @@ class Sandbox:
|
|
| 382 |
resp = self._client.get("health", timeout=10)
|
| 383 |
last_status = resp.status_code
|
| 384 |
if resp.status_code == 200:
|
| 385 |
-
|
| 386 |
return
|
| 387 |
except Exception as e:
|
| 388 |
last_err = e
|
|
|
|
| 41 |
import time
|
| 42 |
import uuid
|
| 43 |
from dataclasses import dataclass, field
|
| 44 |
+
from typing import Any, Callable
|
| 45 |
|
| 46 |
import httpx
|
| 47 |
from huggingface_hub import CommitOperationAdd, HfApi
|
|
|
|
| 265 |
sleep_time: int | None = None,
|
| 266 |
token: str | None = None,
|
| 267 |
wait_timeout: int = WAIT_TIMEOUT,
|
| 268 |
+
log: "Callable[[str], object] | None" = None,
|
| 269 |
) -> Sandbox:
|
| 270 |
"""
|
| 271 |
Create a new sandbox by duplicating the template Space.
|
|
|
|
| 287 |
Returns:
|
| 288 |
A Sandbox instance connected to the running Space.
|
| 289 |
"""
|
| 290 |
+
_log = log or print
|
| 291 |
api = HfApi(token=token)
|
| 292 |
|
| 293 |
base = name or "sandbox"
|
| 294 |
suffix = uuid.uuid4().hex[:8]
|
| 295 |
space_id = f"{owner}/{base}-{suffix}"
|
| 296 |
|
| 297 |
+
_log(f"Creating sandbox: {space_id} (from {template})...")
|
| 298 |
|
| 299 |
kwargs = {
|
| 300 |
"from_id": template,
|
|
|
|
| 306 |
kwargs["sleep_time"] = sleep_time
|
| 307 |
|
| 308 |
api.duplicate_space(**kwargs)
|
| 309 |
+
_log(f"Space created: https://huggingface.co/spaces/{space_id}")
|
| 310 |
|
| 311 |
# Upload sandbox server and Dockerfile (triggers rebuild)
|
| 312 |
+
cls._setup_server(space_id, api, log=_log)
|
| 313 |
|
| 314 |
# Wait for it to come online (rebuild + start)
|
| 315 |
+
_log(f"Waiting for Space to start (timeout: {wait_timeout}s)...")
|
| 316 |
deadline = time.time() + wait_timeout
|
| 317 |
while time.time() < deadline:
|
| 318 |
runtime = api.get_space_runtime(space_id)
|
| 319 |
if runtime.stage == "RUNNING":
|
| 320 |
+
_log(f"Space is running (hardware: {runtime.hardware})")
|
| 321 |
break
|
| 322 |
if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
|
| 323 |
raise RuntimeError(
|
| 324 |
f"Space failed to start: {runtime.stage}. "
|
| 325 |
f"Check https://huggingface.co/spaces/{space_id}"
|
| 326 |
)
|
| 327 |
+
_log(f" {runtime.stage}...")
|
| 328 |
time.sleep(WAIT_INTERVAL)
|
| 329 |
else:
|
| 330 |
raise TimeoutError(
|
|
|
|
| 335 |
# Wait for the API server to be responsive (non-fatal)
|
| 336 |
sb = cls(space_id=space_id, token=token, _owns_space=True)
|
| 337 |
try:
|
| 338 |
+
sb._wait_for_api(timeout=API_WAIT_TIMEOUT, log=_log)
|
| 339 |
except TimeoutError as e:
|
| 340 |
+
_log(
|
| 341 |
f"Warning: API health check timed out ({e}), but Space is RUNNING. Continuing."
|
| 342 |
)
|
| 343 |
return sb
|
| 344 |
|
| 345 |
@staticmethod
|
| 346 |
+
def _setup_server(space_id: str, api: HfApi, *, log: Callable[[str], object] = print) -> None:
|
| 347 |
"""Upload embedded sandbox server + Dockerfile to the Space (single commit)."""
|
| 348 |
+
log(f"Uploading sandbox server to {space_id}...")
|
| 349 |
api.create_commit(
|
| 350 |
repo_id=space_id,
|
| 351 |
repo_type="space",
|
|
|
|
| 361 |
],
|
| 362 |
commit_message="Setup sandbox server",
|
| 363 |
)
|
| 364 |
+
log("Server files uploaded, rebuild triggered.")
|
| 365 |
|
| 366 |
@classmethod
|
| 367 |
def connect(cls, space_id: str, *, token: str | None = None) -> Sandbox:
|
|
|
|
| 374 |
sb._wait_for_api(timeout=60)
|
| 375 |
return sb
|
| 376 |
|
| 377 |
+
def _wait_for_api(self, timeout: int = API_WAIT_TIMEOUT, log: Callable[[str], object] = print):
|
| 378 |
"""Poll the health endpoint until the server responds."""
|
| 379 |
deadline = time.time() + timeout
|
| 380 |
last_err = None
|
|
|
|
| 384 |
resp = self._client.get("health", timeout=10)
|
| 385 |
last_status = resp.status_code
|
| 386 |
if resp.status_code == 200:
|
| 387 |
+
log(f"API is responsive at {self._base_url}")
|
| 388 |
return
|
| 389 |
except Exception as e:
|
| 390 |
last_err = e
|
agent/tools/sandbox_tool.py
CHANGED
|
@@ -89,7 +89,16 @@ async def _ensure_sandbox(
|
|
| 89 |
)
|
| 90 |
)
|
| 91 |
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
if hardware != "cpu-basic":
|
| 94 |
kwargs["sleep_time"] = 1500
|
| 95 |
sb = await asyncio.to_thread(Sandbox.create, **kwargs)
|
|
@@ -122,6 +131,10 @@ SANDBOX_CREATE_TOOL_SPEC = {
|
|
| 122 |
"or the script is copied from a verified working example with minimal changes.\n\n"
|
| 123 |
"For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
|
| 124 |
"CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
"Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
|
| 126 |
),
|
| 127 |
"parameters": {
|
|
|
|
| 89 |
)
|
| 90 |
)
|
| 91 |
|
| 92 |
+
# Thread-safe log callback: posts tool_log events from the worker thread
|
| 93 |
+
loop = asyncio.get_running_loop()
|
| 94 |
+
|
| 95 |
+
def _log(msg: str) -> None:
|
| 96 |
+
loop.call_soon_threadsafe(
|
| 97 |
+
session.event_queue.put_nowait,
|
| 98 |
+
Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
kwargs = {"owner": owner, "hardware": hardware, "token": token, "log": _log, **create_kwargs}
|
| 102 |
if hardware != "cpu-basic":
|
| 103 |
kwargs["sleep_time"] = 1500
|
| 104 |
sb = await asyncio.to_thread(Sandbox.create, **kwargs)
|
|
|
|
| 131 |
"or the script is copied from a verified working example with minimal changes.\n\n"
|
| 132 |
"For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
|
| 133 |
"CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
|
| 134 |
+
"Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
|
| 135 |
+
"fp32 ≈ 4 bytes/param, plus ~20% overhead for optimizer states during training.\n"
|
| 136 |
+
"Common picks: t4-small (16GB VRAM, fits ≤1-3B), a10g-small (24GB, ≤7B), a100-large (80GB, ≤30B). "
|
| 137 |
+
"If the model won't fit, pick larger hardware upfront — OOM on a sandbox wastes time.\n\n"
|
| 138 |
"Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
|
| 139 |
),
|
| 140 |
"parameters": {
|
frontend/src/components/Chat/ActivityStatusBar.tsx
CHANGED
|
@@ -9,8 +9,8 @@ const shimmer = keyframes`
|
|
| 9 |
`;
|
| 10 |
|
| 11 |
const TOOL_LABELS: Record<string, string> = {
|
| 12 |
-
sandbox_create: 'Creating sandbox, this might take 1-2 minutes',
|
| 13 |
-
hf_jobs: 'Running job',
|
| 14 |
hf_repo_files: 'Uploading file',
|
| 15 |
hf_repo_git: 'Git operation',
|
| 16 |
hf_inspect_dataset: 'Inspecting dataset',
|
|
|
|
| 9 |
`;
|
| 10 |
|
| 11 |
const TOOL_LABELS: Record<string, string> = {
|
| 12 |
+
sandbox_create: 'Creating sandbox for code development, this might take 1-2 minutes',
|
| 13 |
+
hf_jobs: 'Running a GPU job, this might take a while',
|
| 14 |
hf_repo_files: 'Uploading file',
|
| 15 |
hf_repo_git: 'Git operation',
|
| 16 |
hf_inspect_dataset: 'Inspecting dataset',
|
frontend/src/hooks/useAgentChat.ts
CHANGED
|
@@ -98,12 +98,13 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
|
|
| 98 |
},
|
| 99 |
onToolLog: (tool: string, log: string) => {
|
| 100 |
if (!isActiveRef.current) return;
|
| 101 |
-
if (tool === 'hf_jobs') {
|
| 102 |
const state = useAgentStore.getState();
|
| 103 |
const existingOutput = state.panelData?.output?.content || '';
|
|
|
|
| 104 |
const newContent = existingOutput
|
| 105 |
? existingOutput + '\n' + log
|
| 106 |
-
:
|
| 107 |
|
| 108 |
setPanelOutput({ content: newContent, language: 'text' });
|
| 109 |
|
|
|
|
| 98 |
},
|
| 99 |
onToolLog: (tool: string, log: string) => {
|
| 100 |
if (!isActiveRef.current) return;
|
| 101 |
+
if (tool === 'hf_jobs' || tool === 'sandbox') {
|
| 102 |
const state = useAgentStore.getState();
|
| 103 |
const existingOutput = state.panelData?.output?.content || '';
|
| 104 |
+
const header = tool === 'sandbox' ? '--- Sandbox creation ---' : '--- Job execution started ---';
|
| 105 |
const newContent = existingOutput
|
| 106 |
? existingOutput + '\n' + log
|
| 107 |
+
: header + '\n' + log;
|
| 108 |
|
| 109 |
setPanelOutput({ content: newContent, language: 'text' });
|
| 110 |
|