Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
c86aaa8
1
Parent(s): 47d6895
fix: retry sandbox API calls on transient failures
Browse files
The _call method now makes up to 3 attempts with backoff (3s, then 6s)
on non-JSON responses and connection errors, which are common when the
sandbox is waking from sleep. Timeouts and unexpected exceptions are
not retried. Error messages now include HTTP status and response preview
instead of raw JSON parse errors.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- agent/tools/sandbox_client.py +51 -26
agent/tools/sandbox_client.py
CHANGED
|
@@ -443,34 +443,59 @@ class Sandbox:
|
|
| 443 |
) -> ToolResult:
|
| 444 |
# Strip leading slash for correct httpx base_url resolution
|
| 445 |
endpoint = endpoint.lstrip("/")
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
return ToolResult(
|
| 455 |
-
success=
|
| 456 |
-
|
| 457 |
-
error=data.get("error", ""),
|
| 458 |
)
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
|
|
|
|
|
|
| 474 |
|
| 475 |
# ── Tools ─────────────────────────────────────────────────────
|
| 476 |
|
|
|
|
| 443 |
) -> ToolResult:
|
| 444 |
# Strip leading slash for correct httpx base_url resolution
|
| 445 |
endpoint = endpoint.lstrip("/")
|
| 446 |
+
effective_timeout = timeout or self.timeout
|
| 447 |
+
last_error = ""
|
| 448 |
+
|
| 449 |
+
# Retry up to 3 times for transient failures (sandbox waking from
|
| 450 |
+
# sleep returns empty / non-JSON responses while it starts up).
|
| 451 |
+
for attempt in range(3):
|
| 452 |
+
try:
|
| 453 |
+
resp = self._client.post(
|
| 454 |
+
endpoint,
|
| 455 |
+
json=payload,
|
| 456 |
+
timeout=effective_timeout,
|
| 457 |
+
)
|
| 458 |
+
try:
|
| 459 |
+
data = resp.json()
|
| 460 |
+
except (ValueError, UnicodeDecodeError):
|
| 461 |
+
# Non-JSON response — sandbox is likely still starting up.
|
| 462 |
+
body_preview = resp.text[:200] if resp.text else "(empty)"
|
| 463 |
+
last_error = (
|
| 464 |
+
f"Sandbox returned non-JSON response (HTTP {resp.status_code}): "
|
| 465 |
+
f"{body_preview}"
|
| 466 |
+
)
|
| 467 |
+
if attempt < 2:
|
| 468 |
+
time.sleep(3 * (attempt + 1))
|
| 469 |
+
continue
|
| 470 |
+
return ToolResult(success=False, error=last_error)
|
| 471 |
+
|
| 472 |
+
if resp.status_code == 200:
|
| 473 |
+
return ToolResult(
|
| 474 |
+
success=data.get("success", True),
|
| 475 |
+
output=data.get("output", ""),
|
| 476 |
+
error=data.get("error", ""),
|
| 477 |
+
)
|
| 478 |
return ToolResult(
|
| 479 |
+
success=False,
|
| 480 |
+
error=data.get("error", f"HTTP {resp.status_code}"),
|
|
|
|
| 481 |
)
|
| 482 |
+
except httpx.TimeoutException:
|
| 483 |
+
return ToolResult(
|
| 484 |
+
success=False, error=f"Timeout after {effective_timeout}s"
|
| 485 |
+
)
|
| 486 |
+
except httpx.ConnectError:
|
| 487 |
+
last_error = (
|
| 488 |
+
f"Cannot connect to sandbox. Is {self.space_id} running? "
|
| 489 |
+
f"Status: {self.status}"
|
| 490 |
+
)
|
| 491 |
+
if attempt < 2:
|
| 492 |
+
time.sleep(3 * (attempt + 1))
|
| 493 |
+
continue
|
| 494 |
+
return ToolResult(success=False, error=last_error)
|
| 495 |
+
except Exception as e:
|
| 496 |
+
return ToolResult(success=False, error=str(e))
|
| 497 |
+
|
| 498 |
+
return ToolResult(success=False, error=last_error or "Unknown error")
|
| 499 |
|
| 500 |
# ── Tools ─────────────────────────────────────────────────────
|
| 501 |
|