# surrogate-1/bin/lib/bridge_retry.py
# Author: Ashira Pitchayapakayakul
# feat: migrate $HOME/.claude/* to $HOME/.surrogate/* (clean separation from Claude Code)
# commit e36381e
"""Shared HTTP retry library for all cloud bridges.
Handles: exponential backoff + jitter + Retry-After + circuit breaker.
Import at top of any bridge: exec(open(...).read())
Exports: request_with_retry(url, data, headers, max_retries=4, base_delay=2.0)
"""
import json as _json
import os as _os
import random as _random
import time as _time
import urllib.request as _urlreq
import urllib.error as _urlerr
# Circuit breaker state — persisted in /tmp so all bridge invocations share
# one view of each host's health (each bridge call may be a separate process).
_CB_DIR = "/tmp/bridge-circuits"
_os.makedirs(_CB_DIR, exist_ok=True)  # idempotent; runs once at import time
def _cb_state_path(host):
    """Return the on-disk path of *host*'s circuit-breaker state file."""
    # Slashes can't appear in a filename, so flatten them to underscores.
    sanitized = host.replace("/", "_")
    return f"{_CB_DIR}/{sanitized}.json"
def _circuit_open(host):
    """Return (is_open, seconds_remaining) for *host*'s circuit breaker."""
    state_file = _cb_state_path(host)
    try:
        with open(state_file) as fh:
            state = _json.load(fh)
        now = _time.time()
        open_until = state.get("open_until", 0)
        if now > open_until:
            # Cooldown window has elapsed — circuit is closed again.
            return False, 0
        return True, int(state["open_until"] - now)
    except Exception:
        # Missing or corrupt state file is treated as "circuit closed".
        return False, 0
def _record_failure(host, open_seconds=60):
    """Record one failure (429/5xx) for *host*; open circuit after 3 in a row.

    Args:
        host: network host (netloc) whose counter to bump.
        open_seconds: how long the circuit stays open once tripped.

    Persistence is best-effort: this is called immediately before a
    `raise` in request_with_retry, so a write error (read-only /tmp,
    disk full) must never replace the real HTTP error.  The success
    path (_record_success) already swallows write errors; this now
    does the same for consistency.
    """
    p = _cb_state_path(host)
    try:
        with open(p) as f:
            s = _json.load(f)
    except Exception:
        # No state yet (or corrupt) — start from a clean slate.
        s = {"consec_fails": 0, "open_until": 0}
    s["consec_fails"] = s.get("consec_fails", 0) + 1
    # Open circuit after 3 consecutive failures
    if s["consec_fails"] >= 3:
        s["open_until"] = _time.time() + open_seconds
    try:
        with open(p, "w") as f:
            _json.dump(s, f)
    except Exception:
        pass  # best-effort, mirroring _record_success
def _record_success(host):
    """Reset *host*'s breaker state after a 2xx response (best-effort)."""
    fresh = {"consec_fails": 0, "open_until": 0}
    try:
        with open(_cb_state_path(host), "w") as fh:
            _json.dump(fresh, fh)
    except Exception:
        # The state file is advisory only — ignore write failures.
        pass
def _parse_retry_after(headers, default_delay):
"""Honor Retry-After header (seconds) or x-ratelimit-reset-after."""
for h in ("Retry-After", "retry-after", "x-ratelimit-reset-after", "x-ratelimit-reset"):
val = headers.get(h)
if val:
try:
n = int(val)
# x-ratelimit-reset may be absolute epoch β€” convert to delta
if n > 10_000_000_000: # way in future = epoch ms
n = n // 1000 - int(_time.time())
elif n > 1_000_000_000: # epoch seconds
n = n - int(_time.time())
return max(1, min(n, 300)) # clamp 1..300s
except (ValueError, TypeError):
pass
return default_delay
def request_with_retry(url, data, headers, timeout=120, max_retries=4, base_delay=2.0, open_seconds=60):
    """Make an HTTP request with exp-backoff retry + circuit breaker.

    Args:
        url: full request URL.
        data: request body bytes (or None).
        headers: dict of request headers.
        timeout: per-attempt socket timeout in seconds.
        max_retries: total number of attempts.
        base_delay: first backoff delay in seconds; doubles each attempt.
        open_seconds: how long to open circuit after 3 consecutive failures.
            Default 60s. Callers with strict per-minute rate limits (Cloudflare,
            SambaNova) should use 120-180s so we don't hammer during cooldown.

    Returns: parsed JSON response.
    Raises: Exception if circuit open or max retries exhausted; the original
        HTTPError/URLError propagates for non-retryable failures.
    """
    from urllib.parse import urlparse
    host = urlparse(url).netloc

    def _jittered(seconds):
        # ±20% jitter so parallel bridge processes don't retry in lockstep.
        return seconds * (1 + _random.uniform(-0.2, 0.2))

    # Circuit breaker check — fail fast while the host is cooling down.
    is_open, remaining = _circuit_open(host)
    if is_open:
        raise Exception(f"circuit-open for {host} ({remaining}s remaining)")

    last_err = None
    for attempt in range(max_retries):
        try:
            req = _urlreq.Request(url, data=data, headers=headers)
            with _urlreq.urlopen(req, timeout=timeout) as r:
                result = _json.load(r)
            _record_success(host)
            return result
        except _urlerr.HTTPError as e:
            last_err = e
            if e.code == 429:
                # Rate-limited — honor Retry-After, fall back to exp backoff.
                delay = _jittered(_parse_retry_after(e.headers, base_delay * (2 ** attempt)))
                if attempt < max_retries - 1:
                    e.close()  # release the held response/socket before sleeping
                    _time.sleep(min(delay, 60))
                    continue
                _record_failure(host, open_seconds=open_seconds)
                raise Exception(f"HTTP 429 after {max_retries} retries (last Retry-After: {delay:.0f}s)")
            if 500 <= e.code < 600:
                # Server error — exp backoff with jitter.
                delay = _jittered(base_delay * (2 ** attempt))
                if attempt < max_retries - 1:
                    e.close()  # don't hold the fd across the backoff sleep
                    _time.sleep(min(delay, 30))
                    continue
                _record_failure(host, open_seconds=open_seconds)
                raise Exception(f"HTTP {e.code} after {max_retries} retries")
            # Any other 4xx is a client error — not retryable.
            _record_failure(host, open_seconds=open_seconds)
            raise
        except (_urlerr.URLError, OSError) as e:
            last_err = e
            # Network error — retry with backoff.
            delay = _jittered(base_delay * (2 ** attempt))
            if attempt < max_retries - 1:
                _time.sleep(min(delay, 30))
                continue
            _record_failure(host, open_seconds=open_seconds)
            raise
    # Unreachable (the final attempt always raises) — kept as a safety net.
    raise Exception(f"max retries ({max_retries}) exhausted: {last_err}")