Elysiadev11 committed
Commit 529d4d4 · verified · 1 Parent(s): 79ff08c

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +792 -0
app.py ADDED
@@ -0,0 +1,792 @@
"""
Cerebras Proxy Server
- OpenAI-compatible endpoint: /v1/chat/completions
- Anthropic-compatible endpoint: /v1/messages
- Token limiting: max 30,000 request tokens (oldest messages auto-truncated)
- Multi-key round-robin with failover
"""

import os
import json
import time
import uuid
import asyncio
import httpx
import tiktoken

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from starlette.requests import ClientDisconnect

app = FastAPI()

# =====================================================
# CONFIG
# =====================================================
MASTER_API_KEY = os.getenv("MASTER_API_KEY", "olla")
CEREBRAS_BASE_URL = os.getenv("CEREBRAS_BASE_URL", "https://api.cerebras.ai/v1")
MAX_REQUEST_TOKENS = int(os.getenv("MAX_REQUEST_TOKENS", "30000"))

# Default model for Cerebras
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "llama-4-scout-17b-16e-instruct")

# Model mapping: incoming model name -> Cerebras model name
DEFAULT_MODEL_MAPPING = {
    # Claude models -> Cerebras
    "claude-opus-4-7": "llama-4-scout-17b-16e-instruct",
    "claude-opus-4-6": "llama-4-scout-17b-16e-instruct",
    "claude-opus-4-5": "llama-4-scout-17b-16e-instruct",
    "claude-opus-4-1": "llama-4-scout-17b-16e-instruct",
    "claude-opus-4-20250514": "llama-4-scout-17b-16e-instruct",
    "claude-sonnet-4-6": "llama-4-scout-17b-16e-instruct",
    "claude-sonnet-4-5": "llama-4-scout-17b-16e-instruct",
    "claude-sonnet-4-20250514": "llama-4-scout-17b-16e-instruct",
    "claude-haiku-4-5": "llama-4-scout-17b-16e-instruct",
    "claude-haiku-4-5-20251001": "llama-4-scout-17b-16e-instruct",
    # GPT models -> Cerebras
    "gpt-4": "llama-4-scout-17b-16e-instruct",
    "gpt-4o": "llama-4-scout-17b-16e-instruct",
    "gpt-4o-mini": "llama-4-scout-17b-16e-instruct",
    "gpt-4-turbo": "llama-4-scout-17b-16e-instruct",
    "gpt-3.5-turbo": "llama-4-scout-17b-16e-instruct",
}

def load_model_mapping():
    mapping = DEFAULT_MODEL_MAPPING.copy()
    env_map = os.getenv("MODEL_MAP")
    if env_map:
        for pair in env_map.split(","):
            if ":" in pair:
                name, target = pair.split(":", 1)
                mapping[name.strip()] = target.strip()
    return mapping

def map_model(model_name: str) -> str:
    mapping = load_model_mapping()
    if model_name in mapping:
        return mapping[model_name]
    # If the model is already a Cerebras model name, pass it through
    return model_name
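
# Examples (illustrative): with MODEL_MAP unset, names resolve through
# DEFAULT_MODEL_MAPPING and unknown names pass through unchanged:
#   map_model("gpt-4o")       -> "llama-4-scout-17b-16e-instruct"
#   map_model("llama3.3-70b") -> "llama3.3-70b"
# MODEL_MAP entries override/extend the table, e.g.:
#   MODEL_MAP="gpt-4o:llama3.3-70b,claude-sonnet-4-5:qwen-3-32b"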

# =====================================================
# API KEYS - Load from env: CEREBRAS_KEY_1, CEREBRAS_KEY_2, ...
# =====================================================
API_KEYS = []

for i in range(1, 101):
    key = os.getenv(f"CEREBRAS_KEY_{i}")
    if key:
        API_KEYS.append(key)

if not API_KEYS:
    # Fallback: check CEREBRAS_API_KEY
    fallback = os.getenv("CEREBRAS_API_KEY", "")
    if fallback:
        API_KEYS.append(fallback)
    else:
        API_KEYS.append("dummy_key")
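
# Example environment setup (the key values here are placeholders):
#   CEREBRAS_KEY_1=...  CEREBRAS_KEY_2=...  (checked up to CEREBRAS_KEY_100)
# or a single CEREBRAS_API_KEY. With nothing set, the "dummy_key" fallback
# lets the app boot, but every upstream call will fail authentication.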

# =====================================================
# KEY STATUS & ROUND ROBIN
# =====================================================
key_status = {}
for idx, k in enumerate(API_KEYS, 1):
    key_status[k] = {
        "index": idx,
        "prefix": k[:8] + "..." if len(k) > 8 else k,
        "healthy": True,
        "busy": False,
        "success": 0,
        "fail": 0,
    }

rr_index = 0
_key_lock = asyncio.Lock()

# =====================================================
# TOKEN COUNTING
# =====================================================
# Use cl100k_base (GPT-4 tokenizer) as a reasonable approximation
try:
    _encoder = tiktoken.get_encoding("cl100k_base")
except Exception:
    _encoder = None

def count_tokens(text: str) -> int:
    """Count tokens in text using tiktoken; fall back to a chars/4 estimate."""
    if _encoder is None:
        return len(text) // 4
    return len(_encoder.encode(text, disallowed_special=()))

def count_messages_tokens(messages: list) -> int:
    """Count total tokens in a list of messages."""
    total = 0
    for msg in messages:
        content = msg.get("content", "")
        if isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    total += count_tokens(block.get("text", ""))
        elif isinstance(content, str):
            total += count_tokens(content)
        # Add overhead for role, etc.
        total += 4  # ~4 tokens per message overhead
    return total
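
# Worked example (approximate, cl100k_base tokenization): the list
#   [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi"}]
# counts as 1 + 4 + 1 + 4 = 10 tokens: one token per one-word message plus
# the flat 4-token-per-message overhead added above.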

def truncate_messages(messages: list, max_tokens: int) -> list:
    """
    Truncate messages to fit within max_tokens.
    Strategy:
    1. Always keep the system message (first message if role=system)
    2. Always keep the last user message
    3. Remove oldest non-system messages first
    4. If still over limit, truncate the content of remaining messages
    """
    if not messages:
        return messages

    total = count_messages_tokens(messages)
    if total <= max_tokens:
        return messages

    log(f"⚠️ Token count {total} exceeds limit {max_tokens}. Truncating...")

    # Separate system messages from the rest
    system_msgs = []
    other_msgs = []

    for msg in messages:
        if msg.get("role") == "system":
            system_msgs.append(msg)
        else:
            other_msgs.append(msg)

    # Always keep the last message (usually the latest user message)
    if not other_msgs:
        return messages

    last_msg = other_msgs[-1]
    middle_msgs = other_msgs[:-1]

    # Try removing middle messages, oldest first
    result = system_msgs.copy()
    remaining_budget = max_tokens - count_messages_tokens(system_msgs) - count_messages_tokens([last_msg])

    if remaining_budget < 0:
        # Even system + last message exceeds the limit:
        # truncate the system message content first
        if system_msgs:
            sys_content = system_msgs[0].get("content", "")
            if isinstance(sys_content, str):
                # Keep only the first ~2000 tokens of the system prompt
                max_sys = min(2000, max_tokens // 4)
                if _encoder:
                    tokens = _encoder.encode(sys_content, disallowed_special=())
                    if len(tokens) > max_sys:
                        sys_content = _encoder.decode(tokens[:max_sys])
                else:
                    sys_content = sys_content[:max_sys * 4]
                system_msgs[0] = {**system_msgs[0], "content": sys_content}

        # Truncate the last message content too if needed
        last_content = last_msg.get("content", "")
        if isinstance(last_content, str):
            max_last = max_tokens - count_messages_tokens(system_msgs) - 10
            if max_last > 0:
                last_tokens = count_tokens(last_content)
                if last_tokens > max_last:
                    if _encoder:
                        tokens = _encoder.encode(last_content, disallowed_special=())
                        last_content = _encoder.decode(tokens[:max_last])
                    else:
                        last_content = last_content[:max_last * 4]
                    last_msg = {**last_msg, "content": last_content}

        result = system_msgs + [last_msg]
        final_count = count_messages_tokens(result)
        log(f"✂️ Truncated to {final_count} tokens (heavy truncation)")
        return result

    # Add middle messages from newest to oldest until the budget is exhausted
    kept_middle = []
    for msg in reversed(middle_msgs):
        msg_tokens = count_messages_tokens([msg])
        if remaining_budget >= msg_tokens:
            kept_middle.insert(0, msg)
            remaining_budget -= msg_tokens
        else:
            # Try to fit a truncated version of the first message that no
            # longer fits, then stop
            if remaining_budget > 50:  # Only bother if we have meaningful space
                content = msg.get("content", "")
                if isinstance(content, str) and remaining_budget > 10:
                    if _encoder:
                        tokens = _encoder.encode(content, disallowed_special=())
                        truncated = _encoder.decode(tokens[:remaining_budget - 10])
                    else:
                        truncated = content[:(remaining_budget - 10) * 4]
                    kept_middle.insert(0, {**msg, "content": truncated + "\n[...truncated]"})
                    remaining_budget = 0
            break

    result = system_msgs + kept_middle + [last_msg]
    final_count = count_messages_tokens(result)
    removed = len(middle_msgs) - len(kept_middle)
    log(f"✂️ Truncated: removed {removed} messages, final {final_count} tokens")
    return result
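
# Illustration (assuming a small token budget for brevity): given
#   [system, user1, assistant1, user2]
# where only system + user2 fit, the middle turns user1 and assistant1 are
# dropped (middle messages are re-added newest-first until the budget runs
# out), yielding [system, user2]. The heavy-truncation branch above triggers
# only when even system + user2 exceed the budget.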

# =====================================================
# UTILITY
# =====================================================
def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

def sse(obj):
    return "data: " + json.dumps(obj, ensure_ascii=False) + "\n\n"
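
# Example: sse({"type": "ping"}) produces the SSE frame
#   'data: {"type": "ping"}\n\n'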

def auth_ok(req: Request):
    # Accept the key via the OpenAI-style "Authorization: Bearer <key>" header
    # or the Anthropic-style "x-api-key" header (used by /v1/messages clients)
    token = req.headers.get("Authorization", "").replace("Bearer ", "")
    if not token:
        token = req.headers.get("x-api-key", "")
    return token == MASTER_API_KEY

async def get_key(exclude=None):
    global rr_index
    if exclude is None:
        exclude = set()

    async with _key_lock:
        # Check if all keys are unhealthy, reset if so
        if not any(v["healthy"] for v in key_status.values()):
            log("⚠️ All API Keys unhealthy. Resetting all...")
            for v in key_status.values():
                v["fail"] = 0
                v["healthy"] = True

        for _ in range(len(API_KEYS)):
            rr_index = (rr_index + 1) % len(API_KEYS)
            key = API_KEYS[rr_index]
            st = key_status[key]

            if st["healthy"] and not st["busy"] and key not in exclude:
                st["busy"] = True
                return key

        return None

async def release_key(key):
    async with _key_lock:
        if key in key_status:
            key_status[key]["busy"] = False

async def mark_fail(key):
    async with _key_lock:
        if key in key_status:
            key_status[key]["fail"] += 1
            if key_status[key]["fail"] >= 3:
                key_status[key]["healthy"] = False

async def mark_ok(key):
    async with _key_lock:
        if key in key_status:
            key_status[key]["success"] += 1
            key_status[key]["fail"] = 0
            key_status[key]["healthy"] = True

async def wait_for_free_key(exclude=None, max_wait=60.0, interval=0.3):
    elapsed = 0.0
    while elapsed < max_wait:
        key = await get_key(exclude)
        if key:
            return key
        await asyncio.sleep(interval)
        elapsed += interval
    return None

def is_rate_limited(status_code: int, text: str) -> bool:
    t = text.lower()
    return status_code == 429 or "rate limit" in t or "too many requests" in t or "usage limit" in t
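
# Examples:
#   is_rate_limited(429, "")                       -> True  (status code)
#   is_rate_limited(400, "Daily usage limit hit")  -> True  (body text match)
#   is_rate_limited(200, '{"ok": true}')           -> False
# The text match is a heuristic: a response body that merely mentions
# "rate limit" would also trip it.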

# =====================================================
# ROOT / STATUS
# =====================================================
@app.get("/")
async def root():
    async with _key_lock:
        keys_info = {}
        for k, v in key_status.items():
            keys_info[v["prefix"]] = {
                "status": "BUSY" if v["busy"] else "IDLE",
                "healthy": v["healthy"],
                "success": v["success"],
                "fail": v["fail"],
            }

    return {
        "status": "ok",
        "backend": "cerebras",
        "base_url": CEREBRAS_BASE_URL,
        "default_model": DEFAULT_MODEL,
        "max_request_tokens": MAX_REQUEST_TOKENS,
        "total_keys": len(API_KEYS),
        "keys": keys_info,
    }

# =====================================================
# /v1/models
# =====================================================
@app.get("/v1/models")
async def list_models(req: Request):
    if not auth_ok(req):
        return JSONResponse({"error": "Unauthorized"}, status_code=401)

    # Try to fetch the live model list from the Cerebras API
    key = API_KEYS[0] if API_KEYS else ""
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.get(
                f"{CEREBRAS_BASE_URL}/models",
                headers={"Authorization": f"Bearer {key}"}
            )
            if r.status_code == 200:
                return Response(content=r.content, media_type="application/json")
    except Exception as e:
        log(f"[/v1/models] Error fetching from Cerebras: {e}")

    # Fallback: return known models
    now = int(time.time())
    known_models = [
        "llama-4-scout-17b-16e-instruct",
        "llama-4-maverick-17b-128e-instruct",
        "llama3.3-70b",
        "llama3.1-8b",
        "qwen-3-32b",
        "deepseek-r1-distill-llama-70b",
    ]
    data = [
        {"id": m, "object": "model", "created": now, "owned_by": "cerebras"}
        for m in known_models
    ]
    return {"object": "list", "data": data}

# =====================================================
# /v1/chat/completions (OpenAI-compatible)
# =====================================================
@app.post("/v1/chat/completions")
async def chat(req: Request):
    if not auth_ok(req):
        return JSONResponse({"error": "Unauthorized"}, status_code=401)

    try:
        body = await req.json()
    except ClientDisconnect:
        log("Client disconnected before reading body.")
        return Response(status_code=499)
    except json.JSONDecodeError:
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    is_stream = body.get("stream", False)
    original_model = body.get("model", DEFAULT_MODEL)
    cerebras_model = map_model(original_model)

    # Token limiting: truncate messages
    messages = body.get("messages", [])
    messages = truncate_messages(messages, MAX_REQUEST_TOKENS)

    # Build Cerebras request body
    cerebras_body = {
        "model": cerebras_model,
        "messages": messages,
        "stream": is_stream,
    }

    # Forward optional parameters
    for param in ["max_tokens", "max_completion_tokens", "temperature", "top_p", "stop", "frequency_penalty", "presence_penalty"]:
        if param in body:
            cerebras_body[param] = body[param]

    # Default the completion cap if the client didn't set one,
    # to avoid blowing Cerebras limits
    if "max_tokens" not in cerebras_body and "max_completion_tokens" not in cerebras_body:
        cerebras_body["max_completion_tokens"] = 8192

    # -----------------------------------------
    # NON STREAM
    # -----------------------------------------
    if not is_stream:
        tried = set()

        for _ in range(len(API_KEYS)):
            key = await wait_for_free_key(exclude=tried)
            if not key:
                break

            tried.add(key)
            ki = key_status[key]
            log(f"NON-STREAM: Using key#{ki['index']}")

            try:
                async with httpx.AsyncClient(timeout=180) as client:
                    r = await client.post(
                        f"{CEREBRAS_BASE_URL}/chat/completions",
                        json=cerebras_body,
                        headers={
                            "Authorization": f"Bearer {key}",
                            "Content-Type": "application/json",
                        }
                    )

                    if is_rate_limited(r.status_code, r.text):
                        log(f"RATE LIMITED: key#{ki['index']}, trying next")
                        await mark_fail(key)
                        continue

                    if r.status_code != 200:
                        log(f"HTTP {r.status_code}: key#{ki['index']}, trying next")
                        await mark_fail(key)
                        continue

                    await mark_ok(key)

                    # Cerebras returns OpenAI-compatible JSON, forward directly
                    return Response(content=r.content, media_type="application/json")

            except Exception as e:
                log(f"Exception: key#{ki['index']} - {e}")
                await mark_fail(key)

            finally:
                await release_key(key)

        return JSONResponse({"error": "All keys failed"}, status_code=500)

    # -----------------------------------------
    # STREAM
    # -----------------------------------------
    async def stream_gen():
        tried = set()

        for _ in range(len(API_KEYS)):
            key = await wait_for_free_key(exclude=tried)
            if not key:
                break

            tried.add(key)
            ki = key_status[key]
            log(f"STREAM: Using key#{ki['index']}")

            try:
                async with httpx.AsyncClient(timeout=None) as client:
                    async with client.stream(
                        "POST",
                        f"{CEREBRAS_BASE_URL}/chat/completions",
                        json=cerebras_body,
                        headers={
                            "Authorization": f"Bearer {key}",
                            "Content-Type": "application/json",
                        }
                    ) as r:

                        if is_rate_limited(r.status_code, ""):
                            log(f"STREAM RATE LIMITED: key#{ki['index']}, trying next")
                            await mark_fail(key)
                            continue

                        if r.status_code != 200:
                            log(f"STREAM HTTP {r.status_code}: key#{ki['index']}, trying next")
                            await mark_fail(key)
                            continue

                        hit_limit = False

                        async for line in r.aiter_lines():
                            if not line:
                                continue

                            if line.strip() == "data: [DONE]":
                                break

                            raw = line[6:] if line.startswith("data: ") else line
                            if is_rate_limited(0, raw):
                                log(f"MID-STREAM LIMIT: key#{ki['index']}, switching")
                                hit_limit = True
                                break

                            # Cerebras SSE is already OpenAI-compatible, pipe directly
                            yield line + "\n\n"

                        # Note: chunks already sent to the client before a
                        # mid-stream limit are not rolled back on failover
                        if hit_limit:
                            await mark_fail(key)
                            continue

                        yield "data: [DONE]\n\n"
                        await mark_ok(key)
                        return

            except Exception as e:
                log(f"STREAM EXCEPTION: key#{ki['index']} - {e}")
                await mark_fail(key)

            finally:
                await release_key(key)

        yield sse({"error": "All keys failed"})
        yield "data: [DONE]\n\n"

    return StreamingResponse(stream_gen(), media_type="text/event-stream")
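
# Streaming variant of the OpenAI-style request (illustrative, same host/key
# assumptions as the example near the top): set "stream": true and read the
# SSE frames, e.g.
#   curl -N http://localhost:7860/v1/chat/completions \
#     -H "Authorization: Bearer olla" -H "Content-Type: application/json" \
#     -d '{"model": "gpt-4o", "stream": true, "messages": [{"role": "user", "content": "Hi"}]}'
# Upstream Cerebras chunks are piped through unchanged, so any OpenAI SSE
# client can consume them.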

# =====================================================
# /v1/messages (Anthropic-compatible)
# =====================================================
@app.post("/v1/messages")
async def anthropic_messages(req: Request):
    if not auth_ok(req):
        return JSONResponse(
            {"type": "error", "error": {"type": "authentication_error", "message": "Unauthorized"}},
            status_code=401
        )

    try:
        body = await req.json()
    except ClientDisconnect:
        return Response(status_code=499)
    except Exception:
        return JSONResponse(
            {"type": "error", "error": {"type": "invalid_request_error", "message": "Bad JSON"}},
            status_code=400
        )

    is_stream = body.get("stream", False)
    original_model = body.get("model", DEFAULT_MODEL)
    cerebras_model = map_model(original_model)
    max_tokens = body.get("max_tokens", 4096)

    # Convert Anthropic messages -> OpenAI format
    messages = []

    if body.get("system"):
        sys_content = body["system"]
        if isinstance(sys_content, list):
            # Anthropic system can be a list of content blocks
            txt = "".join(x.get("text", "") for x in sys_content if x.get("type") == "text")
            sys_content = txt
        messages.append({"role": "system", "content": sys_content})

    for m in body.get("messages", []):
        content = m.get("content", "")
        if isinstance(content, list):
            txt = ""
            for block in content:
                if block.get("type") == "text":
                    txt += block.get("text", "")
                elif block.get("type") == "tool_result":
                    # tool_result content may itself be a list of blocks
                    c = block.get("content", "")
                    txt += c if isinstance(c, str) else json.dumps(c)
                elif block.get("type") == "tool_use":
                    txt += json.dumps(block)
            content = txt
        messages.append({"role": m["role"], "content": content})

    # Token limiting
    messages = truncate_messages(messages, MAX_REQUEST_TOKENS)

    cerebras_body = {
        "model": cerebras_model,
        "messages": messages,
        "stream": is_stream,
        "max_completion_tokens": min(max_tokens, 8192),
    }

    # Forward optional params
    if "temperature" in body:
        cerebras_body["temperature"] = body["temperature"]
    if "top_p" in body:
        cerebras_body["top_p"] = body["top_p"]

    # -----------------------------------------
    # NON STREAM
    # -----------------------------------------
    if not is_stream:
        tried = set()

        for _ in range(len(API_KEYS)):
            key = await wait_for_free_key(exclude=tried)
            if not key:
                break

            tried.add(key)
            ki = key_status[key]
            log(f"ANTHROPIC NON-STREAM: key#{ki['index']}")

            try:
                async with httpx.AsyncClient(timeout=180) as client:
                    r = await client.post(
                        f"{CEREBRAS_BASE_URL}/chat/completions",
                        json=cerebras_body,
                        headers={
                            "Authorization": f"Bearer {key}",
                            "Content-Type": "application/json",
                        }
                    )

                    if is_rate_limited(r.status_code, r.text):
                        log(f"RATE LIMITED: key#{ki['index']}")
                        await mark_fail(key)
                        continue

                    if r.status_code != 200:
                        log(f"HTTP {r.status_code}: key#{ki['index']}")
                        await mark_fail(key)
                        continue

                    data = r.json()

                    # Convert OpenAI response -> Anthropic format
                    content_text = data["choices"][0]["message"]["content"]
                    usage = data.get("usage", {})
                    finish = data["choices"][0].get("finish_reason", "stop")

                    stop_map = {"stop": "end_turn", "length": "max_tokens", "eos": "end_turn"}

                    out = {
                        "id": "msg_" + uuid.uuid4().hex[:10],
                        "type": "message",
                        "role": "assistant",
                        "model": original_model,
                        "content": [{"type": "text", "text": content_text}],
                        "stop_reason": stop_map.get(finish, "end_turn"),
                        "stop_sequence": None,
                        "usage": {
                            "input_tokens": usage.get("prompt_tokens", 0),
                            "output_tokens": usage.get("completion_tokens", 0),
                        }
                    }

                    await mark_ok(key)
                    return JSONResponse(out)

            except Exception as e:
                log(f"Exception: key#{ki['index']} - {e}")
                await mark_fail(key)

            finally:
                await release_key(key)

        return JSONResponse(
            {"type": "error", "error": {"type": "api_error", "message": "All keys failed"}},
            status_code=500
        )

    # -----------------------------------------
    # STREAM (Anthropic SSE envelope)
    # -----------------------------------------
    async def anthropic_stream_gen():
        tried = set()
        msg_id = "msg_" + uuid.uuid4().hex[:10]
        sent_header = False
        output_tokens = 0

        for _ in range(len(API_KEYS)):
            key = await wait_for_free_key(exclude=tried)
            if not key:
                break

            tried.add(key)
            ki = key_status[key]
            log(f"ANTHROPIC STREAM: key#{ki['index']}")

            try:
                async with httpx.AsyncClient(timeout=None) as client:
                    async with client.stream(
                        "POST",
                        f"{CEREBRAS_BASE_URL}/chat/completions",
                        json=cerebras_body,
                        headers={
                            "Authorization": f"Bearer {key}",
                            "Content-Type": "application/json",
                        }
                    ) as r:

                        if is_rate_limited(r.status_code, ""):
                            log(f"STREAM RATE LIMITED: key#{ki['index']}")
                            await mark_fail(key)
                            continue

                        if r.status_code != 200:
                            log(f"STREAM HTTP {r.status_code}: key#{ki['index']}")
                            await mark_fail(key)
                            continue

                        # Send the Anthropic envelope header (once)
                        if not sent_header:
                            yield sse({
                                "type": "message_start",
                                "message": {
                                    "id": msg_id,
                                    "type": "message",
                                    "role": "assistant",
                                    "model": original_model,
                                    "content": [],
                                    "stop_reason": None,
                                    "stop_sequence": None,
                                    "usage": {"input_tokens": 0, "output_tokens": 0}
                                }
                            })
                            yield sse({
                                "type": "content_block_start",
                                "index": 0,
                                "content_block": {"type": "text", "text": ""}
                            })
                            sent_header = True

                        hit_limit = False

                        async for line in r.aiter_lines():
                            if not line:
                                continue
                            if line.strip() == "data: [DONE]":
                                break

                            raw = line[6:] if line.startswith("data: ") else line

                            if is_rate_limited(0, raw):
                                log(f"MID-STREAM LIMIT: key#{ki['index']}")
                                hit_limit = True
                                break

                            try:
                                j = json.loads(raw)
                                # Read usage first: the final usage chunk may
                                # carry an empty choices list
                                if j.get("usage"):
                                    output_tokens = j["usage"].get("completion_tokens", output_tokens)
                                choices = j.get("choices") or []
                                token = choices[0]["delta"].get("content", "") if choices else ""
                            except Exception:
                                continue

                            if token:
                                yield sse({
                                    "type": "content_block_delta",
                                    "index": 0,
                                    "delta": {"type": "text_delta", "text": token}
                                })

                        if hit_limit:
                            await mark_fail(key)
                            continue

                        await mark_ok(key)
                        break  # success, exit retry loop

            except Exception as e:
                log(f"STREAM EXCEPTION: key#{ki['index']} - {e}")
                await mark_fail(key)

            finally:
                await release_key(key)

        if sent_header:
            # Close the Anthropic SSE envelope, reporting the last seen
            # completion-token count
            yield sse({"type": "content_block_stop", "index": 0})
            yield sse({
                "type": "message_delta",
                "delta": {"stop_reason": "end_turn", "stop_sequence": None},
                "usage": {"output_tokens": output_tokens}
            })
            yield sse({"type": "message_stop"})
        else:
            # No upstream stream ever started: emit an Anthropic-style error
            yield sse({"type": "error", "error": {"type": "api_error", "message": "All keys failed"}})

    return StreamingResponse(anthropic_stream_gen(), media_type="text/event-stream")
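
# Illustrative Anthropic-style call (same host/key assumptions as the
# examples above):
#   curl http://localhost:7860/v1/messages \
#     -H "Authorization: Bearer olla" -H "Content-Type: application/json" \
#     -d '{"model": "claude-sonnet-4-5", "max_tokens": 256, "messages": [{"role": "user", "content": "Hi"}]}'
#
# To run locally (assumption: serving with uvicorn, the usual FastAPI choice;
# port 7860 follows the Hugging Face Spaces convention):
#   uvicorn app:app --host 0.0.0.0 --port 7860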