akseljoonas HF Staff Claude Opus 4.6 committed on
Commit
28bdef8
·
1 Parent(s): c88804e

fix: retry sandbox API calls on transient failures

Browse files

The _call method now makes up to 3 attempts, waiting 3s and then 6s
between them, on non-JSON responses and connection errors, which are
common when the sandbox is waking from sleep. Timeouts and unexpected
exceptions are not retried. Error messages now include the HTTP status
and a response preview instead of raw JSON parse errors.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. agent/tools/sandbox_client.py +51 -26
agent/tools/sandbox_client.py CHANGED
@@ -443,34 +443,59 @@ class Sandbox:
443
  ) -> ToolResult:
444
  # Strip leading slash for correct httpx base_url resolution
445
  endpoint = endpoint.lstrip("/")
446
- try:
447
- resp = self._client.post(
448
- endpoint,
449
- json=payload,
450
- timeout=timeout or self.timeout,
451
- )
452
- data = resp.json()
453
- if resp.status_code == 200:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  return ToolResult(
455
- success=data.get("success", True),
456
- output=data.get("output", ""),
457
- error=data.get("error", ""),
458
  )
459
- return ToolResult(
460
- success=False,
461
- error=data.get("error", f"HTTP {resp.status_code}"),
462
- )
463
- except httpx.TimeoutException:
464
- return ToolResult(
465
- success=False, error=f"Timeout after {timeout or self.timeout}s"
466
- )
467
- except httpx.ConnectError:
468
- return ToolResult(
469
- success=False,
470
- error=f"Cannot connect to sandbox. Is {self.space_id} running? Status: {self.status}",
471
- )
472
- except Exception as e:
473
- return ToolResult(success=False, error=str(e))
 
 
474
 
475
  # ── Tools ─────────────────────────────────────────────────────
476
 
 
443
  ) -> ToolResult:
444
  # Strip leading slash for correct httpx base_url resolution
445
  endpoint = endpoint.lstrip("/")
446
+ effective_timeout = timeout or self.timeout
447
+ last_error = ""
448
+
449
+ # Retry up to 3 times for transient failures (sandbox waking from
450
+ # sleep returns empty / non-JSON responses while it starts up).
451
+ for attempt in range(3):
452
+ try:
453
+ resp = self._client.post(
454
+ endpoint,
455
+ json=payload,
456
+ timeout=effective_timeout,
457
+ )
458
+ try:
459
+ data = resp.json()
460
+ except (ValueError, UnicodeDecodeError):
461
+ # Non-JSON response — sandbox is likely still starting up.
462
+ body_preview = resp.text[:200] if resp.text else "(empty)"
463
+ last_error = (
464
+ f"Sandbox returned non-JSON response (HTTP {resp.status_code}): "
465
+ f"{body_preview}"
466
+ )
467
+ if attempt < 2:
468
+ time.sleep(3 * (attempt + 1))
469
+ continue
470
+ return ToolResult(success=False, error=last_error)
471
+
472
+ if resp.status_code == 200:
473
+ return ToolResult(
474
+ success=data.get("success", True),
475
+ output=data.get("output", ""),
476
+ error=data.get("error", ""),
477
+ )
478
  return ToolResult(
479
+ success=False,
480
+ error=data.get("error", f"HTTP {resp.status_code}"),
 
481
  )
482
+ except httpx.TimeoutException:
483
+ return ToolResult(
484
+ success=False, error=f"Timeout after {effective_timeout}s"
485
+ )
486
+ except httpx.ConnectError:
487
+ last_error = (
488
+ f"Cannot connect to sandbox. Is {self.space_id} running? "
489
+ f"Status: {self.status}"
490
+ )
491
+ if attempt < 2:
492
+ time.sleep(3 * (attempt + 1))
493
+ continue
494
+ return ToolResult(success=False, error=last_error)
495
+ except Exception as e:
496
+ return ToolResult(success=False, error=str(e))
497
+
498
+ return ToolResult(success=False, error=last_error or "Unknown error")
499
 
500
  # ── Tools ─────────────────────────────────────────────────────
501