Elysiadev11 committed on
Commit
4ba202a
·
verified ·
1 Parent(s): 3f83b4e

Update proxy_cerebras.py

Browse files
Files changed (1) hide show
  1. proxy_cerebras.py +276 -206
proxy_cerebras.py CHANGED
@@ -14,24 +14,40 @@ app = FastAPI()
14
  # =====================================================
15
  # CONFIG
16
  # =====================================================
17
- BASE_URL = os.getenv("BASE_URL", "https://ollama.com")
18
  MASTER_API_KEY = os.getenv("MASTER_API_KEY", "olla")
19
 
 
 
 
20
  # =====================================================
21
- # LOAD KEYS
 
22
  # =====================================================
23
- OLLAMA_KEYS = []
24
- for i in range(1, 101):
25
- k = os.getenv(f"OLLAMA_KEY_{i}")
26
- if k:
27
- OLLAMA_KEYS.append(k)
28
 
29
- if not OLLAMA_KEYS:
30
- OLLAMA_KEYS.append("dummy")
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
 
 
 
32
  key_status = {}
33
- for idx, k in enumerate(OLLAMA_KEYS, 1):
34
- key_status[k] = {
 
35
  "index": idx,
36
  "healthy": True,
37
  "busy": False,
@@ -40,8 +56,6 @@ for idx, k in enumerate(OLLAMA_KEYS, 1):
40
  }
41
 
42
  rr_index = 0
43
-
44
- # Global async lock to prevent race condition on rr_index & busy flag
45
  _key_lock = asyncio.Lock()
46
 
47
 
@@ -61,63 +75,67 @@ def auth_ok(req: Request):
61
  return token == MASTER_API_KEY
62
 
63
 
 
 
 
 
64
  async def get_key(exclude=None):
65
- """
66
- Thread-safe round-robin key picker.
67
- Returns the key string, or None if all are busy/excluded.
68
- """
69
  global rr_index
70
-
71
  if exclude is None:
72
  exclude = set()
73
 
74
  async with _key_lock:
75
- for _ in range(len(OLLAMA_KEYS)):
76
- rr_index = (rr_index + 1) % len(OLLAMA_KEYS)
77
- k = OLLAMA_KEYS[rr_index]
78
- st = key_status[k]
 
79
 
80
- if st["healthy"] and not st["busy"] and k not in exclude:
81
  st["busy"] = True
82
- return k
83
 
84
  return None
85
 
86
 
87
- async def release_key(k):
88
  async with _key_lock:
89
- if k in key_status:
90
- key_status[k]["busy"] = False
 
91
 
92
 
93
- async def mark_fail(k):
94
  async with _key_lock:
95
- if k in key_status:
96
- key_status[k]["fail"] += 1
 
97
 
98
 
99
- async def mark_ok(k):
100
  async with _key_lock:
101
- if k in key_status:
102
- key_status[k]["success"] += 1
103
- key_status[k]["fail"] = 0
 
104
 
105
 
106
  async def wait_for_free_key(exclude=None, max_wait=30.0, interval=0.3):
107
- """
108
- Polls until a free key is available or max_wait seconds pass.
109
- Returns the key or None on timeout.
110
- """
111
  elapsed = 0.0
112
  while elapsed < max_wait:
113
- key = await get_key(exclude)
114
- if key:
115
- return key
116
  await asyncio.sleep(interval)
117
  elapsed += interval
118
  return None
119
 
120
 
 
 
 
 
 
121
  # =====================================================
122
  # ROOT
123
  # =====================================================
@@ -125,8 +143,8 @@ async def wait_for_free_key(exclude=None, max_wait=30.0, interval=0.3):
125
  async def root():
126
  async with _key_lock:
127
  safe = {}
128
- for k, v in key_status.items():
129
- masked = k[:4] + "****" + k[-4:]
130
  safe[masked] = {
131
  "index": v["index"],
132
  "healthy": v["healthy"],
@@ -137,47 +155,41 @@ async def root():
137
 
138
  return {
139
  "status": "ok",
140
- "keys": len(OLLAMA_KEYS),
 
141
  "detail": safe
142
  }
143
 
144
 
145
  # =====================================================
146
- # /v1/models
147
  # =====================================================
148
  @app.get("/v1/models")
149
  async def models(req: Request):
150
  if not auth_ok(req):
151
  return JSONResponse({"error": "Unauthorized"}, status_code=401)
152
 
153
- key = OLLAMA_KEYS[0]
154
-
155
- async with httpx.AsyncClient(timeout=60) as client:
156
- r = await client.get(
157
- f"{BASE_URL}/api/tags",
158
- headers={"Authorization": f"Bearer {key}"}
159
- )
160
-
161
- if r.status_code != 200:
162
- return JSONResponse({"error": r.text}, status_code=r.status_code)
163
-
164
- data = r.json()
165
  now = int(time.time())
166
- out = []
 
 
 
 
 
 
 
 
167
 
168
- for m in data.get("models", []):
169
- out.append({
170
- "id": m.get("name"),
171
- "object": "model",
172
- "created": now,
173
- "owned_by": "ollama"
174
- })
175
 
176
- return {"object": "list", "data": out}
177
 
178
 
179
  # =====================================================
180
- # OPENAI CHAT /v1/chat/completions
181
  # =====================================================
182
  @app.post("/v1/chat/completions")
183
  async def chat(req: Request):
@@ -190,6 +202,15 @@ async def chat(req: Request):
190
  return JSONResponse({"error": "Bad JSON"}, status_code=400)
191
 
192
  is_stream = body.get("stream", False)
 
 
 
 
 
 
 
 
 
193
 
194
  # -----------------------------------------
195
  # NON STREAM
@@ -197,109 +218,193 @@ async def chat(req: Request):
197
  if not is_stream:
198
  tried = set()
199
 
200
- for _ in range(len(OLLAMA_KEYS)):
201
- key = await wait_for_free_key(exclude=tried)
202
 
203
- if not key:
204
  break
205
 
206
- tried.add(key)
207
 
208
  try:
209
  async with httpx.AsyncClient(timeout=180) as client:
210
  r = await client.post(
211
- f"{BASE_URL}/v1/chat/completions",
212
- json=body,
213
- headers={"Authorization": f"Bearer {key}"}
 
 
 
214
  )
215
 
216
- txt = r.text.lower()
 
 
 
217
 
218
- if "weekly usage limit" in txt or r.status_code == 429:
219
- log(f"Key {key[:8]}... rate limited (non-stream chat), trying next")
220
- await mark_fail(key)
221
  continue
222
 
223
- await mark_ok(key)
 
 
 
 
 
 
224
 
225
- return Response(
226
- content=r.content,
227
- media_type=r.headers.get("content-type", "application/json")
228
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  except Exception as e:
231
- log(f"Key {key[:8]}... exception: {e}")
232
- await mark_fail(key)
233
 
234
  finally:
235
- await release_key(key)
236
 
237
- return JSONResponse({"error": "All keys failed"}, status_code=500)
238
 
239
  # -----------------------------------------
240
  # STREAM
 
 
 
241
  # -----------------------------------------
242
  async def gen():
243
  tried = set()
 
 
244
 
245
- for _ in range(len(OLLAMA_KEYS)):
246
- key = await wait_for_free_key(exclude=tried)
247
 
248
- if not key:
249
  break
250
 
251
- tried.add(key)
252
 
253
  try:
254
  async with httpx.AsyncClient(timeout=None) as client:
255
  async with client.stream(
256
  "POST",
257
- f"{BASE_URL}/v1/chat/completions",
258
- json=body,
259
- headers={"Authorization": f"Bearer {key}"}
 
 
 
260
  ) as r:
261
 
262
- if r.status_code == 429:
263
- log(f"Key {key[:8]}... rate limited (stream chat), trying next")
264
- await mark_fail(key)
265
  continue
266
 
267
- hit_limit_mid_stream = False
 
 
 
 
 
268
 
269
  async for line in r.aiter_lines():
 
270
  if not line:
271
  continue
272
 
273
- # Detect mid-stream rate limit signal in data payload
274
- if "429" in line or "usage limit" in line.lower():
275
- log(f"Key {key[:8]}... mid-stream limit detected, aborting chunk")
276
- hit_limit_mid_stream = True
277
  break
278
 
279
- yield line + "\n\n"
 
 
 
 
 
 
 
280
 
281
- if hit_limit_mid_stream:
282
- await mark_fail(key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  continue
284
 
285
- await mark_ok(key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  return
287
 
288
  except Exception as e:
289
- log(f"Key {key[:8]}... stream exception: {e}")
290
- await mark_fail(key)
291
 
292
  finally:
293
- await release_key(key)
294
 
295
- yield sse({"error": "All keys failed"})
296
  yield "data: [DONE]\n\n"
297
 
298
  return StreamingResponse(gen(), media_type="text/event-stream")
299
 
300
 
301
  # =====================================================
302
- # ANTHROPIC /v1/messages
303
  # =====================================================
304
  @app.post("/v1/messages")
305
  async def anthropic(req: Request):
@@ -314,8 +419,10 @@ async def anthropic(req: Request):
314
  return JSONResponse({"error": "Bad JSON"}, status_code=400)
315
 
316
  stream = body.get("stream", False)
 
 
317
 
318
- # Build messages list for proxy
319
  messages = []
320
 
321
  if body.get("system"):
@@ -323,20 +430,18 @@ async def anthropic(req: Request):
323
 
324
  for m in body.get("messages", []):
325
  content = m.get("content", "")
326
-
327
  if isinstance(content, list):
328
  txt = ""
329
  for x in content:
330
  if x.get("type") == "text":
331
  txt += x.get("text", "")
332
  content = txt
333
-
334
  messages.append({"role": m["role"], "content": content})
335
 
336
- proxy_body = {
337
- "model": "minimax-m2.7:cloud",
338
  "messages": messages,
339
- "stream": stream
 
340
  }
341
 
342
  # -----------------------------------------
@@ -345,91 +450,101 @@ async def anthropic(req: Request):
345
  if not stream:
346
  tried = set()
347
 
348
- for _ in range(len(OLLAMA_KEYS)):
349
- key = await wait_for_free_key(exclude=tried)
350
 
351
- if not key:
352
  break
353
 
354
- tried.add(key)
355
 
356
  try:
357
  async with httpx.AsyncClient(timeout=180) as client:
358
  r = await client.post(
359
- f"{BASE_URL}/v1/chat/completions",
360
- json=proxy_body,
361
- headers={"Authorization": f"Bearer {key}"}
 
 
 
362
  )
363
 
364
- txt = r.text.lower()
 
 
 
365
 
366
- if "weekly usage limit" in txt or r.status_code == 429:
367
- log(f"Key {key[:8]}... rate limited (non-stream anthropic), trying next")
368
- await mark_fail(key)
369
  continue
370
 
371
  data = r.json()
372
- ans = data["choices"][0]["message"]["content"]
 
373
 
374
  out = {
375
  "id": "msg_" + uuid.uuid4().hex[:10],
376
  "type": "message",
377
  "role": "assistant",
378
- "model": body.get("model", "claude-opus-4-7"),
379
- "content": [{"type": "text", "text": ans}],
380
  "stop_reason": "end_turn",
381
  "stop_sequence": None,
382
  "usage": {"input_tokens": 0, "output_tokens": 0}
383
  }
384
 
385
- await mark_ok(key)
386
  return JSONResponse(out)
387
 
388
  except Exception as e:
389
- log(f"Key {key[:8]}... exception: {e}")
390
- await mark_fail(key)
391
 
392
  finally:
393
- await release_key(key)
394
 
395
- return JSONResponse({"error": "All keys failed"}, status_code=500)
396
 
397
  # -----------------------------------------
398
- # STREAM (Anthropic SSE format)
399
  # -----------------------------------------
400
  async def agen():
401
  tried = set()
402
  msg_id = "msg_" + uuid.uuid4().hex[:10]
403
  sent_any_delta = False
404
 
405
- # Send Anthropic envelope headers ONCE before first key attempt
406
- # We defer these until we have a successful connection to avoid
407
- # sending headers before knowing if any key works.
408
- # Instead we buffer and yield only on confirmed success.
409
 
410
- for _ in range(len(OLLAMA_KEYS)):
411
- key = await wait_for_free_key(exclude=tried)
412
-
413
- if not key:
414
  break
415
 
416
- tried.add(key)
417
 
418
  try:
419
  async with httpx.AsyncClient(timeout=None) as client:
420
  async with client.stream(
421
  "POST",
422
- f"{BASE_URL}/v1/chat/completions",
423
- json=proxy_body,
424
- headers={"Authorization": f"Bearer {key}"}
 
 
 
425
  ) as r:
426
 
427
- if r.status_code == 429:
428
- log(f"Key {key[:8]}... rate limited (stream anthropic), trying next")
429
- await mark_fail(key)
430
  continue
431
 
432
- # Only emit Anthropic envelope on first successful key
 
 
 
 
 
433
  if not sent_any_delta:
434
  yield sse({
435
  "type": "message_start",
@@ -437,7 +552,7 @@ async def anthropic(req: Request):
437
  "id": msg_id,
438
  "type": "message",
439
  "role": "assistant",
440
- "model": body.get("model", "claude-opus-4-7"),
441
  "content": [],
442
  "stop_reason": None,
443
  "stop_sequence": None,
@@ -450,62 +565,17 @@ async def anthropic(req: Request):
450
  "content_block": {"type": "text"}
451
  })
452
 
453
- hit_limit_mid_stream = False
454
 
455
  async for line in r.aiter_lines():
456
- if not line.startswith("data: "):
 
457
  continue
458
 
459
- raw = line[6:].strip()
460
-
461
- if raw == "[DONE]":
462
  break
463
 
464
- # Detect mid-stream 429 / limit payload
465
- if "429" in raw or "usage limit" in raw.lower():
466
- log(f"Key {key[:8]}... mid-stream limit in anthropic, aborting chunk")
467
- hit_limit_mid_stream = True
468
- break
469
-
470
- try:
471
- j = json.loads(raw)
472
- except Exception:
473
- continue
474
 
475
- delta = j["choices"][0]["delta"]
476
- txt = delta.get("content", "")
477
-
478
- if txt:
479
- sent_any_delta = True
480
- yield sse({
481
- "type": "content_block_delta",
482
- "index": 0,
483
- "delta": {"type": "text_delta", "text": txt}
484
- })
485
-
486
- if hit_limit_mid_stream:
487
- await mark_fail(key)
488
- # Continue to next key — retry restarts generation on a fresh key;
489
- # Note: deltas already sent are kept, but output is not resumed mid-stream
490
- continue
491
-
492
- await mark_ok(key)
493
- break # Success — exit key retry loop
494
-
495
- except Exception as e:
496
- log(f"Key {key[:8]}... agen exception: {e}")
497
- await mark_fail(key)
498
-
499
- finally:
500
- await release_key(key)
501
-
502
- # Close Anthropic SSE envelope
503
- yield sse({"type": "content_block_stop", "index": 0})
504
- yield sse({
505
- "type": "message_delta",
506
- "delta": {"stop_reason": "end_turn", "stop_sequence": None},
507
- "usage": {"output_tokens": 0}
508
- })
509
- yield sse({"type": "message_stop"})
510
-
511
- return StreamingResponse(agen(), media_type="text/event-stream")
 
14
  # =====================================================
15
  # CONFIG
16
  # =====================================================
 
17
  MASTER_API_KEY = os.getenv("MASTER_API_KEY", "olla")
18
 
19
+ # Default CF Workers AI model (can override via request body)
20
+ DEFAULT_CF_MODEL = os.getenv("DEFAULT_CF_MODEL", "@cf/meta/llama-3.3-70b-instruct-fp8-fast")
21
+
22
  # =====================================================
23
+ # LOAD CF CREDENTIALS
24
+ # Format env: CF_1=account_id,api_key
25
  # =====================================================
26
+ CF_ACCOUNTS = [] # list of {"account_id": ..., "api_key": ...}
 
 
 
 
27
 
28
+ for i in range(1, 101):
29
+ raw = os.getenv(f"CF_{i}")
30
+ if not raw:
31
+ continue
32
+ parts = raw.split(",", 1)
33
+ if len(parts) != 2:
34
+ print(f"[WARN] CF_{i} format invalid, expected 'account_id,api_key' — skipped")
35
+ continue
36
+ account_id, api_key = parts[0].strip(), parts[1].strip()
37
+ if account_id and api_key:
38
+ CF_ACCOUNTS.append({"account_id": account_id, "api_key": api_key})
39
+
40
+ if not CF_ACCOUNTS:
41
+ print("[WARN] No CF credentials found, inserting dummy")
42
+ CF_ACCOUNTS.append({"account_id": "dummy", "api_key": "dummy"})
43
 
44
+ # =====================================================
45
+ # KEY STATUS
46
+ # =====================================================
47
  key_status = {}
48
+ for idx, acc in enumerate(CF_ACCOUNTS, 1):
49
+ kid = acc["account_id"]
50
+ key_status[kid] = {
51
  "index": idx,
52
  "healthy": True,
53
  "busy": False,
 
56
  }
57
 
58
  rr_index = 0
 
 
59
  _key_lock = asyncio.Lock()
60
 
61
 
 
75
  return token == MASTER_API_KEY
76
 
77
 
78
+ def cf_url(account_id: str, model: str) -> str:
79
+ return f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model}"
80
+
81
+
82
  async def get_key(exclude=None):
 
 
 
 
83
  global rr_index
 
84
  if exclude is None:
85
  exclude = set()
86
 
87
  async with _key_lock:
88
+ for _ in range(len(CF_ACCOUNTS)):
89
+ rr_index = (rr_index + 1) % len(CF_ACCOUNTS)
90
+ acc = CF_ACCOUNTS[rr_index]
91
+ kid = acc["account_id"]
92
+ st = key_status[kid]
93
 
94
+ if st["healthy"] and not st["busy"] and kid not in exclude:
95
  st["busy"] = True
96
+ return acc # returns dict {"account_id": ..., "api_key": ...}
97
 
98
  return None
99
 
100
 
101
+ async def release_key(acc):
102
  async with _key_lock:
103
+ kid = acc["account_id"]
104
+ if kid in key_status:
105
+ key_status[kid]["busy"] = False
106
 
107
 
108
+ async def mark_fail(acc):
109
  async with _key_lock:
110
+ kid = acc["account_id"]
111
+ if kid in key_status:
112
+ key_status[kid]["fail"] += 1
113
 
114
 
115
+ async def mark_ok(acc):
116
  async with _key_lock:
117
+ kid = acc["account_id"]
118
+ if kid in key_status:
119
+ key_status[kid]["success"] += 1
120
+ key_status[kid]["fail"] = 0
121
 
122
 
123
  async def wait_for_free_key(exclude=None, max_wait=30.0, interval=0.3):
 
 
 
 
124
  elapsed = 0.0
125
  while elapsed < max_wait:
126
+ acc = await get_key(exclude)
127
+ if acc:
128
+ return acc
129
  await asyncio.sleep(interval)
130
  elapsed += interval
131
  return None
132
 
133
 
134
+ def is_rate_limited(status_code: int, text: str) -> bool:
135
+ t = text.lower()
136
+ return status_code == 429 or "rate limit" in t or "too many requests" in t or "usage limit" in t
137
+
138
+
139
  # =====================================================
140
  # ROOT
141
  # =====================================================
 
143
  async def root():
144
  async with _key_lock:
145
  safe = {}
146
+ for kid, v in key_status.items():
147
+ masked = kid[:6] + "****" + kid[-4:]
148
  safe[masked] = {
149
  "index": v["index"],
150
  "healthy": v["healthy"],
 
155
 
156
  return {
157
  "status": "ok",
158
+ "accounts": len(CF_ACCOUNTS),
159
+ "default_model": DEFAULT_CF_MODEL,
160
  "detail": safe
161
  }
162
 
163
 
164
  # =====================================================
165
+ # /v1/models — static list of popular CF models
166
  # =====================================================
167
  @app.get("/v1/models")
168
  async def models(req: Request):
169
  if not auth_ok(req):
170
  return JSONResponse({"error": "Unauthorized"}, status_code=401)
171
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  now = int(time.time())
173
+ cf_models = [
174
+ "@cf/meta/llama-3.3-70b-instruct-fp8-fast",
175
+ "@cf/meta/llama-3.1-8b-instruct",
176
+ "@cf/meta/llama-3.1-70b-instruct",
177
+ "@cf/mistral/mistral-7b-instruct-v0.1",
178
+ "@cf/google/gemma-7b-it",
179
+ "@cf/qwen/qwen1.5-14b-chat-awq",
180
+ "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b",
181
+ ]
182
 
183
+ data = [
184
+ {"id": m, "object": "model", "created": now, "owned_by": "cloudflare"}
185
+ for m in cf_models
186
+ ]
 
 
 
187
 
188
+ return {"object": "list", "data": data}
189
 
190
 
191
  # =====================================================
192
+ # /v1/chat/completions — OpenAI-compatible endpoint
193
  # =====================================================
194
  @app.post("/v1/chat/completions")
195
  async def chat(req: Request):
 
202
  return JSONResponse({"error": "Bad JSON"}, status_code=400)
203
 
204
  is_stream = body.get("stream", False)
205
+ model = body.get("model", DEFAULT_CF_MODEL)
206
+ messages = body.get("messages", [])
207
+ max_tokens = body.get("max_tokens", 2048)
208
+
209
+ cf_body = {
210
+ "messages": messages,
211
+ "stream": is_stream,
212
+ "max_tokens": max_tokens,
213
+ }
214
 
215
  # -----------------------------------------
216
  # NON STREAM
 
218
  if not is_stream:
219
  tried = set()
220
 
221
+ for _ in range(len(CF_ACCOUNTS)):
222
+ acc = await wait_for_free_key(exclude=tried)
223
 
224
+ if not acc:
225
  break
226
 
227
+ tried.add(acc["account_id"])
228
 
229
  try:
230
  async with httpx.AsyncClient(timeout=180) as client:
231
  r = await client.post(
232
+ cf_url(acc["account_id"], model),
233
+ json=cf_body,
234
+ headers={
235
+ "Authorization": f"Bearer {acc['api_key']}",
236
+ "Content-Type": "application/json",
237
+ }
238
  )
239
 
240
+ if is_rate_limited(r.status_code, r.text):
241
+ log(f"Account {acc['account_id'][:8]}... rate limited (non-stream), trying next")
242
+ await mark_fail(acc)
243
+ continue
244
 
245
+ if r.status_code != 200:
246
+ log(f"Account {acc['account_id'][:8]}... HTTP {r.status_code}, trying next")
247
+ await mark_fail(acc)
248
  continue
249
 
250
+ data = r.json()
251
+
252
+ # CF Workers AI response format:
253
+ # {"result": {"response": "..."}, "success": true, ...}
254
+ # Convert to OpenAI format
255
+ cf_result = data.get("result", {})
256
+ content = cf_result.get("response", "")
257
 
258
+ out = {
259
+ "id": "chatcmpl-" + uuid.uuid4().hex[:10],
260
+ "object": "chat.completion",
261
+ "created": int(time.time()),
262
+ "model": model,
263
+ "choices": [
264
+ {
265
+ "index": 0,
266
+ "message": {"role": "assistant", "content": content},
267
+ "finish_reason": "stop",
268
+ }
269
+ ],
270
+ "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
271
+ }
272
+
273
+ await mark_ok(acc)
274
+ return JSONResponse(out)
275
 
276
  except Exception as e:
277
+ log(f"Account {acc['account_id'][:8]}... exception: {e}")
278
+ await mark_fail(acc)
279
 
280
  finally:
281
+ await release_key(acc)
282
 
283
+ return JSONResponse({"error": "All accounts failed"}, status_code=500)
284
 
285
  # -----------------------------------------
286
  # STREAM
287
+ # CF Workers AI streams NDJSON lines:
288
+ # {"response":"token"} or {"p":"...","response":"token"} and ends with [DONE]
289
+ # We convert to OpenAI SSE format
290
  # -----------------------------------------
291
  async def gen():
292
  tried = set()
293
+ cid = "chatcmpl-" + uuid.uuid4().hex[:10]
294
+ sent_any = False
295
 
296
+ for _ in range(len(CF_ACCOUNTS)):
297
+ acc = await wait_for_free_key(exclude=tried)
298
 
299
+ if not acc:
300
  break
301
 
302
+ tried.add(acc["account_id"])
303
 
304
  try:
305
  async with httpx.AsyncClient(timeout=None) as client:
306
  async with client.stream(
307
  "POST",
308
+ cf_url(acc["account_id"], model),
309
+ json=cf_body,
310
+ headers={
311
+ "Authorization": f"Bearer {acc['api_key']}",
312
+ "Content-Type": "application/json",
313
+ }
314
  ) as r:
315
 
316
+ if is_rate_limited(r.status_code, ""):
317
+ log(f"Account {acc['account_id'][:8]}... rate limited (stream), trying next")
318
+ await mark_fail(acc)
319
  continue
320
 
321
+ if r.status_code != 200:
322
+ log(f"Account {acc['account_id'][:8]}... HTTP {r.status_code} (stream), trying next")
323
+ await mark_fail(acc)
324
+ continue
325
+
326
+ hit_limit = False
327
 
328
  async for line in r.aiter_lines():
329
+ line = line.strip()
330
  if not line:
331
  continue
332
 
333
+ if line == "data: [DONE]" or line == "[DONE]":
 
 
 
334
  break
335
 
336
+ # Strip "data: " prefix if present
337
+ raw = line[6:] if line.startswith("data: ") else line
338
+
339
+ # Detect mid-stream rate limit
340
+ if is_rate_limited(0, raw):
341
+ log(f"Account {acc['account_id'][:8]}... mid-stream limit, switching key")
342
+ hit_limit = True
343
+ break
344
 
345
+ try:
346
+ j = json.loads(raw)
347
+ except Exception:
348
+ continue
349
+
350
+ token = j.get("response", "")
351
+
352
+ if token:
353
+ sent_any = True
354
+ chunk = {
355
+ "id": cid,
356
+ "object": "chat.completion.chunk",
357
+ "created": int(time.time()),
358
+ "model": model,
359
+ "choices": [
360
+ {
361
+ "index": 0,
362
+ "delta": {"role": "assistant", "content": token},
363
+ "finish_reason": None,
364
+ }
365
+ ]
366
+ }
367
+ yield sse(chunk)
368
+
369
+ if hit_limit:
370
+ await mark_fail(acc)
371
  continue
372
 
373
+ # Send finish chunk
374
+ finish_chunk = {
375
+ "id": cid,
376
+ "object": "chat.completion.chunk",
377
+ "created": int(time.time()),
378
+ "model": model,
379
+ "choices": [
380
+ {
381
+ "index": 0,
382
+ "delta": {},
383
+ "finish_reason": "stop",
384
+ }
385
+ ]
386
+ }
387
+ yield sse(finish_chunk)
388
+ yield "data: [DONE]\n\n"
389
+
390
+ await mark_ok(acc)
391
  return
392
 
393
  except Exception as e:
394
+ log(f"Account {acc['account_id'][:8]}... stream exception: {e}")
395
+ await mark_fail(acc)
396
 
397
  finally:
398
+ await release_key(acc)
399
 
400
+ yield sse({"error": "All accounts failed"})
401
  yield "data: [DONE]\n\n"
402
 
403
  return StreamingResponse(gen(), media_type="text/event-stream")
404
 
405
 
406
  # =====================================================
407
+ # /v1/messages — Anthropic-compatible endpoint
408
  # =====================================================
409
  @app.post("/v1/messages")
410
  async def anthropic(req: Request):
 
419
  return JSONResponse({"error": "Bad JSON"}, status_code=400)
420
 
421
  stream = body.get("stream", False)
422
+ model = body.get("model", DEFAULT_CF_MODEL)
423
+ max_tokens = body.get("max_tokens", 2048)
424
 
425
+ # Convert Anthropic message format to CF/OpenAI format
426
  messages = []
427
 
428
  if body.get("system"):
 
430
 
431
  for m in body.get("messages", []):
432
  content = m.get("content", "")
 
433
  if isinstance(content, list):
434
  txt = ""
435
  for x in content:
436
  if x.get("type") == "text":
437
  txt += x.get("text", "")
438
  content = txt
 
439
  messages.append({"role": m["role"], "content": content})
440
 
441
+ cf_body = {
 
442
  "messages": messages,
443
+ "stream": stream,
444
+ "max_tokens": max_tokens,
445
  }
446
 
447
  # -----------------------------------------
 
450
  if not stream:
451
  tried = set()
452
 
453
+ for _ in range(len(CF_ACCOUNTS)):
454
+ acc = await wait_for_free_key(exclude=tried)
455
 
456
+ if not acc:
457
  break
458
 
459
+ tried.add(acc["account_id"])
460
 
461
  try:
462
  async with httpx.AsyncClient(timeout=180) as client:
463
  r = await client.post(
464
+ cf_url(acc["account_id"], model),
465
+ json=cf_body,
466
+ headers={
467
+ "Authorization": f"Bearer {acc['api_key']}",
468
+ "Content-Type": "application/json",
469
+ }
470
  )
471
 
472
+ if is_rate_limited(r.status_code, r.text):
473
+ log(f"Account {acc['account_id'][:8]}... rate limited (anthropic non-stream), trying next")
474
+ await mark_fail(acc)
475
+ continue
476
 
477
+ if r.status_code != 200:
478
+ log(f"Account {acc['account_id'][:8]}... HTTP {r.status_code}, trying next")
479
+ await mark_fail(acc)
480
  continue
481
 
482
  data = r.json()
483
+ cf_result = data.get("result", {})
484
+ content = cf_result.get("response", "")
485
 
486
  out = {
487
  "id": "msg_" + uuid.uuid4().hex[:10],
488
  "type": "message",
489
  "role": "assistant",
490
+ "model": body.get("model", DEFAULT_CF_MODEL),
491
+ "content": [{"type": "text", "text": content}],
492
  "stop_reason": "end_turn",
493
  "stop_sequence": None,
494
  "usage": {"input_tokens": 0, "output_tokens": 0}
495
  }
496
 
497
+ await mark_ok(acc)
498
  return JSONResponse(out)
499
 
500
  except Exception as e:
501
+ log(f"Account {acc['account_id'][:8]}... exception: {e}")
502
+ await mark_fail(acc)
503
 
504
  finally:
505
+ await release_key(acc)
506
 
507
+ return JSONResponse({"error": "All accounts failed"}, status_code=500)
508
 
509
  # -----------------------------------------
510
+ # STREAM (Anthropic SSE envelope)
511
  # -----------------------------------------
512
  async def agen():
513
  tried = set()
514
  msg_id = "msg_" + uuid.uuid4().hex[:10]
515
  sent_any_delta = False
516
 
517
+ for _ in range(len(CF_ACCOUNTS)):
518
+ acc = await wait_for_free_key(exclude=tried)
 
 
519
 
520
+ if not acc:
 
 
 
521
  break
522
 
523
+ tried.add(acc["account_id"])
524
 
525
  try:
526
  async with httpx.AsyncClient(timeout=None) as client:
527
  async with client.stream(
528
  "POST",
529
+ cf_url(acc["account_id"], model),
530
+ json=cf_body,
531
+ headers={
532
+ "Authorization": f"Bearer {acc['api_key']}",
533
+ "Content-Type": "application/json",
534
+ }
535
  ) as r:
536
 
537
+ if is_rate_limited(r.status_code, ""):
538
+ log(f"Account {acc['account_id'][:8]}... rate limited (anthropic stream), trying next")
539
+ await mark_fail(acc)
540
  continue
541
 
542
+ if r.status_code != 200:
543
+ log(f"Account {acc['account_id'][:8]}... HTTP {r.status_code} (anthropic stream), trying next")
544
+ await mark_fail(acc)
545
+ continue
546
+
547
+ # Emit Anthropic envelope only once on first successful key
548
  if not sent_any_delta:
549
  yield sse({
550
  "type": "message_start",
 
552
  "id": msg_id,
553
  "type": "message",
554
  "role": "assistant",
555
+ "model": body.get("model", DEFAULT_CF_MODEL),
556
  "content": [],
557
  "stop_reason": None,
558
  "stop_sequence": None,
 
565
  "content_block": {"type": "text"}
566
  })
567
 
568
+ hit_limit = False
569
 
570
  async for line in r.aiter_lines():
571
+ line = line.strip()
572
+ if not line:
573
  continue
574
 
575
+ if line == "data: [DONE]" or line == "[DONE]":
 
 
576
  break
577
 
578
+ raw = line[6:] if line.startswith("data: ") else line
 
 
 
 
 
 
 
 
 
579
 
580
+ if is_rate_limited(0, raw):
581
+ log(f"Account {acc['account_id'][:8]}... mid-stream limit (anthr