Elysiadev11 committed on
Commit
bff9611
Β·
verified Β·
1 Parent(s): 3f83b4e

Update proxy_cerebras.py

Browse files
Files changed (1) hide show
  1. proxy_cerebras.py +443 -240
proxy_cerebras.py CHANGED
@@ -1,3 +1,13 @@
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import time
@@ -16,6 +26,30 @@ app = FastAPI()
16
  # =====================================================
17
  BASE_URL = os.getenv("BASE_URL", "https://ollama.com")
18
  MASTER_API_KEY = os.getenv("MASTER_API_KEY", "olla")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # =====================================================
21
  # LOAD KEYS
@@ -33,16 +67,15 @@ key_status = {}
33
  for idx, k in enumerate(OLLAMA_KEYS, 1):
34
  key_status[k] = {
35
  "index": idx,
36
- "healthy": True,
37
  "busy": False,
38
  "success": 0,
39
  "fail": 0,
 
40
  }
41
 
42
  rr_index = 0
43
-
44
- # Global async lock to prevent race condition on rr_index & busy flag
45
- _key_lock = asyncio.Lock()
46
 
47
 
48
  # =====================================================
@@ -51,71 +84,241 @@ _key_lock = asyncio.Lock()
51
  def log(x):
52
  print(f"[{time.strftime('%H:%M:%S')}] {x}", flush=True)
53
 
54
-
55
  def sse(obj):
56
  return "data: " + json.dumps(obj, ensure_ascii=False) + "\n\n"
57
 
58
-
59
  def auth_ok(req: Request):
60
- token = req.headers.get("Authorization", "").replace("Bearer ", "")
61
- return token == MASTER_API_KEY
62
-
63
-
64
- async def get_key(exclude=None):
65
- """
66
- Thread-safe round-robin key picker.
67
- Returns the key string, or None if all are busy/excluded.
68
- """
69
- global rr_index
70
-
71
- if exclude is None:
72
- exclude = set()
73
 
74
- async with _key_lock:
75
- for _ in range(len(OLLAMA_KEYS)):
76
- rr_index = (rr_index + 1) % len(OLLAMA_KEYS)
77
- k = OLLAMA_KEYS[rr_index]
78
- st = key_status[k]
79
 
80
- if st["healthy"] and not st["busy"] and k not in exclude:
81
- st["busy"] = True
82
- return k
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  return None
85
 
 
 
 
 
 
 
 
 
 
86
 
87
  async def release_key(k):
88
- async with _key_lock:
89
  if k in key_status:
90
  key_status[k]["busy"] = False
91
 
 
 
 
 
 
 
 
 
92
 
93
  async def mark_fail(k):
94
- async with _key_lock:
95
  if k in key_status:
96
  key_status[k]["fail"] += 1
97
 
98
-
99
  async def mark_ok(k):
100
- async with _key_lock:
101
  if k in key_status:
102
  key_status[k]["success"] += 1
103
  key_status[k]["fail"] = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
 
106
- async def wait_for_free_key(exclude=None, max_wait=30.0, interval=0.3):
 
 
 
107
  """
108
- Polls until a free key is available or max_wait seconds pass.
109
- Returns the key or None on timeout.
110
  """
111
- elapsed = 0.0
112
- while elapsed < max_wait:
113
- key = await get_key(exclude)
114
- if key:
115
- return key
116
- await asyncio.sleep(interval)
117
- elapsed += interval
118
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
 
121
  # =====================================================
@@ -123,23 +326,19 @@ async def wait_for_free_key(exclude=None, max_wait=30.0, interval=0.3):
123
  # =====================================================
124
  @app.get("/")
125
  async def root():
126
- async with _key_lock:
 
127
  safe = {}
128
  for k, v in key_status.items():
129
- masked = k[:4] + "****" + k[-4:]
130
- safe[masked] = {
131
  "index": v["index"],
132
- "healthy": v["healthy"],
133
- "busy": v["busy"],
134
  "success": v["success"],
135
  "fail": v["fail"],
136
  }
137
-
138
- return {
139
- "status": "ok",
140
- "keys": len(OLLAMA_KEYS),
141
- "detail": safe
142
- }
143
 
144
 
145
  # =====================================================
@@ -151,33 +350,22 @@ async def models(req: Request):
151
  return JSONResponse({"error": "Unauthorized"}, status_code=401)
152
 
153
  key = OLLAMA_KEYS[0]
154
-
155
- async with httpx.AsyncClient(timeout=60) as client:
156
- r = await client.get(
157
- f"{BASE_URL}/api/tags",
158
- headers={"Authorization": f"Bearer {key}"}
159
- )
160
-
161
- if r.status_code != 200:
162
- return JSONResponse({"error": r.text}, status_code=r.status_code)
163
-
164
- data = r.json()
165
- now = int(time.time())
166
- out = []
167
-
168
- for m in data.get("models", []):
169
- out.append({
170
- "id": m.get("name"),
171
- "object": "model",
172
- "created": now,
173
- "owned_by": "ollama"
174
- })
175
-
176
- return {"object": "list", "data": out}
177
 
178
 
179
  # =====================================================
180
- # OPENAI CHAT /v1/chat/completions
181
  # =====================================================
182
  @app.post("/v1/chat/completions")
183
  async def chat(req: Request):
@@ -196,15 +384,12 @@ async def chat(req: Request):
196
  # -----------------------------------------
197
  if not is_stream:
198
  tried = set()
199
-
200
  for _ in range(len(OLLAMA_KEYS)):
201
- key = await wait_for_free_key(exclude=tried)
202
-
203
  if not key:
204
- break
205
-
206
  tried.add(key)
207
-
208
  try:
209
  async with httpx.AsyncClient(timeout=180) as client:
210
  r = await client.post(
@@ -212,94 +397,71 @@ async def chat(req: Request):
212
  json=body,
213
  headers={"Authorization": f"Bearer {key}"}
214
  )
215
-
216
- txt = r.text.lower()
217
-
218
- if "weekly usage limit" in txt or r.status_code == 429:
219
- log(f"Key {key[:8]}... rate limited (non-stream chat), trying next")
220
- await mark_fail(key)
221
  continue
222
-
223
  await mark_ok(key)
224
-
225
- return Response(
226
- content=r.content,
227
- media_type=r.headers.get("content-type", "application/json")
228
- )
229
-
230
  except Exception as e:
231
- log(f"Key {key[:8]}... exception: {e}")
232
  await mark_fail(key)
233
-
234
  finally:
235
  await release_key(key)
236
-
237
  return JSONResponse({"error": "All keys failed"}, status_code=500)
238
 
239
  # -----------------------------------------
240
- # STREAM
241
  # -----------------------------------------
242
  async def gen():
243
- tried = set()
244
-
245
- for _ in range(len(OLLAMA_KEYS)):
246
- key = await wait_for_free_key(exclude=tried)
247
-
248
- if not key:
249
- break
250
-
251
- tried.add(key)
252
-
253
  try:
254
  async with httpx.AsyncClient(timeout=None) as client:
255
  async with client.stream(
256
- "POST",
257
- f"{BASE_URL}/v1/chat/completions",
258
- json=body,
259
- headers={"Authorization": f"Bearer {key}"}
260
  ) as r:
261
-
262
- if r.status_code == 429:
263
- log(f"Key {key[:8]}... rate limited (stream chat), trying next")
264
- await mark_fail(key)
265
  continue
266
 
267
- hit_limit_mid_stream = False
268
-
269
  async for line in r.aiter_lines():
270
  if not line:
271
  continue
272
-
273
- # Detect mid-stream rate limit signal in data payload
274
- if "429" in line or "usage limit" in line.lower():
275
- log(f"Key {key[:8]}... mid-stream limit detected, aborting chunk")
276
- hit_limit_mid_stream = True
277
  break
278
-
 
 
 
 
 
 
 
279
  yield line + "\n\n"
280
 
281
- if hit_limit_mid_stream:
282
- await mark_fail(key)
283
  continue
284
 
 
285
  await mark_ok(key)
286
  return
287
 
288
  except Exception as e:
289
- log(f"Key {key[:8]}... stream exception: {e}")
290
  await mark_fail(key)
291
-
292
  finally:
293
  await release_key(key)
294
 
295
- yield sse({"error": "All keys failed"})
296
- yield "data: [DONE]\n\n"
297
-
298
  return StreamingResponse(gen(), media_type="text/event-stream")
299
 
300
 
301
  # =====================================================
302
- # ANTHROPIC /v1/messages
 
303
  # =====================================================
304
  @app.post("/v1/messages")
305
  async def anthropic(req: Request):
@@ -314,45 +476,50 @@ async def anthropic(req: Request):
314
  return JSONResponse({"error": "Bad JSON"}, status_code=400)
315
 
316
  stream = body.get("stream", False)
 
 
317
 
318
- # Build messages list for proxy
319
  messages = []
320
-
321
  if body.get("system"):
322
- messages.append({"role": "system", "content": body["system"]})
323
-
324
- for m in body.get("messages", []):
325
- content = m.get("content", "")
326
-
327
- if isinstance(content, list):
328
- txt = ""
329
- for x in content:
330
- if x.get("type") == "text":
331
- txt += x.get("text", "")
332
- content = txt
333
 
334
- messages.append({"role": m["role"], "content": content})
 
335
 
336
  proxy_body = {
337
- "model": "minimax-m2.7:cloud",
338
  "messages": messages,
339
- "stream": stream
340
  }
341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  # -----------------------------------------
343
  # NON STREAM
344
  # -----------------------------------------
345
  if not stream:
346
  tried = set()
347
-
348
  for _ in range(len(OLLAMA_KEYS)):
349
- key = await wait_for_free_key(exclude=tried)
350
-
351
  if not key:
352
- break
353
-
354
  tried.add(key)
355
-
356
  try:
357
  async with httpx.AsyncClient(timeout=180) as client:
358
  r = await client.post(
@@ -360,152 +527,188 @@ async def anthropic(req: Request):
360
  json=proxy_body,
361
  headers={"Authorization": f"Bearer {key}"}
362
  )
363
-
364
- txt = r.text.lower()
365
-
366
- if "weekly usage limit" in txt or r.status_code == 429:
367
- log(f"Key {key[:8]}... rate limited (non-stream anthropic), trying next")
368
  await mark_fail(key)
369
  continue
370
 
371
  data = r.json()
372
- ans = data["choices"][0]["message"]["content"]
373
-
374
- out = {
375
- "id": "msg_" + uuid.uuid4().hex[:10],
376
- "type": "message",
377
- "role": "assistant",
378
- "model": body.get("model", "claude-opus-4-7"),
379
- "content": [{"type": "text", "text": ans}],
380
- "stop_reason": "end_turn",
381
- "stop_sequence": None,
382
- "usage": {"input_tokens": 0, "output_tokens": 0}
383
- }
384
-
385
  await mark_ok(key)
386
  return JSONResponse(out)
387
 
388
  except Exception as e:
389
- log(f"Key {key[:8]}... exception: {e}")
390
  await mark_fail(key)
391
-
392
  finally:
393
  await release_key(key)
394
 
395
  return JSONResponse({"error": "All keys failed"}, status_code=500)
396
 
397
  # -----------------------------------------
398
- # STREAM (Anthropic SSE format)
 
399
  # -----------------------------------------
400
  async def agen():
401
- tried = set()
402
  msg_id = "msg_" + uuid.uuid4().hex[:10]
403
- sent_any_delta = False
404
-
405
- # Send Anthropic envelope headers ONCE before first key attempt
406
- # We defer these until we have a successful connection to avoid
407
- # sending headers before knowing if any key works.
408
- # Instead we buffer and yield only on confirmed success.
409
-
410
- for _ in range(len(OLLAMA_KEYS)):
411
- key = await wait_for_free_key(exclude=tried)
412
-
413
- if not key:
414
- break
415
-
416
- tried.add(key)
417
 
 
 
418
  try:
419
  async with httpx.AsyncClient(timeout=None) as client:
420
  async with client.stream(
421
- "POST",
422
- f"{BASE_URL}/v1/chat/completions",
423
- json=proxy_body,
424
- headers={"Authorization": f"Bearer {key}"}
425
  ) as r:
426
 
427
- if r.status_code == 429:
428
- log(f"Key {key[:8]}... rate limited (stream anthropic), trying next")
 
 
 
 
429
  await mark_fail(key)
430
  continue
431
 
432
- # Only emit Anthropic envelope on first successful key
433
- if not sent_any_delta:
 
434
  yield sse({
435
  "type": "message_start",
436
  "message": {
437
- "id": msg_id,
438
- "type": "message",
439
- "role": "assistant",
440
- "model": body.get("model", "claude-opus-4-7"),
441
- "content": [],
442
- "stop_reason": None,
443
- "stop_sequence": None,
444
  "usage": {"input_tokens": 0, "output_tokens": 0}
445
  }
446
  })
447
  yield sse({
448
- "type": "content_block_start",
449
- "index": 0,
450
- "content_block": {"type": "text"}
451
  })
452
 
453
- hit_limit_mid_stream = False
 
 
 
 
454
 
455
  async for line in r.aiter_lines():
456
- if not line.startswith("data: "):
457
  continue
458
-
459
- raw = line[6:].strip()
460
-
461
- if raw == "[DONE]":
462
  break
463
 
464
- # Detect mid-stream 429 / limit payload
465
- if "429" in raw or "usage limit" in raw.lower():
466
- log(f"Key {key[:8]}... mid-stream limit in anthropic, aborting chunk")
467
- hit_limit_mid_stream = True
468
- break
469
 
470
  try:
471
  j = json.loads(raw)
472
  except Exception:
473
  continue
474
 
475
- delta = j["choices"][0]["delta"]
476
- txt = delta.get("content", "")
 
 
 
 
 
 
 
 
 
 
 
477
 
 
 
478
  if txt:
479
- sent_any_delta = True
480
  yield sse({
481
- "type": "content_block_delta",
482
- "index": 0,
483
  "delta": {"type": "text_delta", "text": txt}
484
  })
485
 
486
- if hit_limit_mid_stream:
487
- await mark_fail(key)
488
- # Continue to next key β€” stream resumes from where it broke
489
- # Note: client will receive continued deltas seamlessly
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  continue
491
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  await mark_ok(key)
493
- break # Success β€” exit key retry loop
494
 
495
  except Exception as e:
496
- log(f"Key {key[:8]}... agen exception: {e}")
497
  await mark_fail(key)
498
-
499
  finally:
500
  await release_key(key)
501
 
502
- # Close Anthropic SSE envelope
 
 
 
 
 
 
 
 
 
 
503
  yield sse({"type": "content_block_stop", "index": 0})
504
- yield sse({
505
- "type": "message_delta",
506
- "delta": {"stop_reason": "end_turn", "stop_sequence": None},
507
- "usage": {"output_tokens": 0}
508
- })
509
  yield sse({"type": "message_stop"})
510
 
511
- return StreamingResponse(agen(), media_type="text/event-stream")
 
1
+ """
2
+ proxy_cerebras.py β€” Proxy ke Ollama backend dengan Anthropic + OpenAI compatible API
3
+ FIXED:
4
+ - Tool calling support penuh (Anthropic <-> OpenAI)
5
+ - Non-stream tidak crash saat finish_reason=tool_calls
6
+ - Stream handle delta.tool_calls
7
+ - Model tidak hardcoded, diteruskan dari request
8
+ - Infinite loop dengan smart cooldown (tidak hammering)
9
+ """
10
+
11
  import os
12
  import json
13
  import time
 
26
  # =====================================================
27
  BASE_URL = os.getenv("BASE_URL", "https://ollama.com")
28
  MASTER_API_KEY = os.getenv("MASTER_API_KEY", "olla")
29
+ DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "minimax-m2.7:cloud")
30
+ RATE_LIMIT_COOLDOWN = int(os.getenv("RATE_LIMIT_COOLDOWN", "62"))
31
+
32
+ # Model mapping Claude/GPT β†’ Ollama model
33
+ MODEL_MAP = {
34
+ "claude-opus-4-7": DEFAULT_MODEL,
35
+ "claude-opus-4-6": DEFAULT_MODEL,
36
+ "claude-opus-4-5": DEFAULT_MODEL,
37
+ "claude-opus-4-1": DEFAULT_MODEL,
38
+ "claude-opus-4-20250514": DEFAULT_MODEL,
39
+ "claude-sonnet-4-6": DEFAULT_MODEL,
40
+ "claude-sonnet-4-5": DEFAULT_MODEL,
41
+ "claude-sonnet-4-20250514": DEFAULT_MODEL,
42
+ "claude-haiku-4-5": DEFAULT_MODEL,
43
+ "claude-haiku-4-5-20251001": DEFAULT_MODEL,
44
+ "gpt-4": DEFAULT_MODEL,
45
+ "gpt-4o": DEFAULT_MODEL,
46
+ "gpt-4o-mini": DEFAULT_MODEL,
47
+ "gpt-4-turbo": DEFAULT_MODEL,
48
+ "gpt-3.5-turbo": DEFAULT_MODEL,
49
+ }
50
+
51
def map_model(name: str) -> str:
    """Resolve a client-supplied model name to the backend Ollama model.

    Names absent from MODEL_MAP pass through unchanged.
    """
    try:
        return MODEL_MAP[name]
    except KeyError:
        return name
53
 
54
  # =====================================================
55
  # LOAD KEYS
 
67
  for idx, k in enumerate(OLLAMA_KEYS, 1):
68
  key_status[k] = {
69
  "index": idx,
70
+ "prefix": k[:6] + "..." if len(k) > 6 else k,
71
  "busy": False,
72
  "success": 0,
73
  "fail": 0,
74
+ "rate_limited_until": 0.0,
75
  }
76
 
77
  rr_index = 0
78
+ _lock = asyncio.Lock()
 
 
79
 
80
 
81
  # =====================================================
 
84
def log(x):
    """Print *x* to stdout with an HH:MM:SS timestamp, flushing immediately."""
    print(f"[{time.strftime('%H:%M:%S')}] {x}", flush=True)
86
 
 
87
def sse(obj):
    """Serialize *obj* as a single Server-Sent-Events data frame."""
    payload = json.dumps(obj, ensure_ascii=False)
    return "data: " + payload + "\n\n"
89
 
 
90
  def auth_ok(req: Request):
91
+ return req.headers.get("Authorization", "").replace("Bearer ", "") == MASTER_API_KEY
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
def is_rate_limited(status: int, body_text: str = "") -> bool:
    """Return True when an HTTP status or response body signals rate limiting.

    A 429 status is always rate limiting; otherwise the body text is scanned
    (case-insensitively) for known limit phrases.
    """
    if status == 429:
        return True
    lowered = body_text.lower()
    markers = ("weekly usage limit", "rate limit", "too many requests")
    return any(marker in lowered for marker in markers)
98
 
 
 
 
99
 
100
+ # =====================================================
101
+ # KEY MANAGEMENT
102
+ # =====================================================
103
def _pick_key(exclude: set):
    """Synchronous helper; must be called while holding ``_lock``.

    Advances the shared round-robin cursor over OLLAMA_KEYS and reserves the
    first key that is not busy, not in cooldown, and not in *exclude*.

    Returns the key string, or None when no key is currently ready.
    """
    global rr_index
    now = time.time()
    for _ in range(len(OLLAMA_KEYS)):
        # Move the cursor first so consecutive calls rotate through keys.
        rr_index = (rr_index + 1) % len(OLLAMA_KEYS)
        k = OLLAMA_KEYS[rr_index]
        st = key_status[k]
        if not st["busy"] and now >= st["rate_limited_until"] and k not in exclude:
            st["busy"] = True  # reserve while still under the caller's lock
            return k
    return None
115
 
116
def _next_ready_time() -> float:
    """Return the epoch timestamp when the earliest cooled-down key is ready.

    Falls back to the current time when no key is in cooldown.
    """
    now = time.time()
    pending = [
        st["rate_limited_until"]
        for st in key_status.values()
        if st["rate_limited_until"] > now
    ]
    if not pending:
        return now
    return min(pending)
121
+
122
async def get_key(exclude=None):
    """Atomically pick and reserve a ready key, or return None if none is free.

    exclude: optional set of keys to skip (e.g. keys already tried).
    """
    async with _lock:
        return _pick_key(exclude or set())
125
 
126
async def release_key(k):
    """Mark key *k* as no longer busy so other requests can pick it up."""
    async with _lock:
        if k in key_status:
            key_status[k]["busy"] = False
130
 
131
async def mark_rate_limited(k):
    """Put key *k* into cooldown for RATE_LIMIT_COOLDOWN seconds and count the failure."""
    async with _lock:
        if k in key_status:
            until = time.time() + RATE_LIMIT_COOLDOWN
            key_status[k]["rate_limited_until"] = until
            key_status[k]["fail"] += 1
            idx = key_status[k]["index"]
            log(f"⏳ key#{idx} cooldown {RATE_LIMIT_COOLDOWN}s (ready {time.strftime('%H:%M:%S', time.localtime(until))})")
139
 
140
async def mark_fail(k):
    """Increment key *k*'s failure counter (non-rate-limit failure)."""
    async with _lock:
        if k in key_status:
            key_status[k]["fail"] += 1
144
 
 
145
async def mark_ok(k):
    """Record a success for key *k*: bump the counter and clear fail/cooldown state."""
    async with _lock:
        if k in key_status:
            key_status[k]["success"] += 1
            key_status[k]["fail"] = 0
            key_status[k]["rate_limited_until"] = 0.0
151
+
152
async def get_key_infinite(exclude=None):
    """Wait indefinitely for a usable key.

    When every key is busy or cooling down, sleep until the earliest key
    leaves cooldown (instead of hammering the backend), then clear the
    exclude set so previously rate-limited keys get another chance.

    Returns: (key, exclude_set)
    """
    local_exclude = set(exclude) if exclude else set()
    cycle = 0

    while True:
        async with _lock:
            k = _pick_key(local_exclude)
            if k:
                return k, local_exclude

        now = time.time()
        next_ready = _next_ready_time()
        # Never busy-wait: sleep at least half a second.
        wait_sec = max(0.5, next_ready - now)
        all_cooldown = all(
            st["rate_limited_until"] > now or st["busy"]
            for st in key_status.values()
        )

        if all_cooldown:
            cycle += 1
            log(f"⏳ Semua key cooldown. Tunggu {wait_sec:.1f}s... (cycle #{cycle})")
            local_exclude.clear()  # allow retrying every key after the wait
            await asyncio.sleep(wait_sec)
        else:
            # Some key is merely busy; poll again shortly.
            await asyncio.sleep(0.3)
181
 
182
 
183
+ # =====================================================
184
+ # TOOL CONVERSION: Anthropic ↔ OpenAI
185
+ # =====================================================
186
def anthropic_tools_to_openai(tools: list) -> list:
    """Translate Anthropic tool definitions into OpenAI function-tool format.

    Anthropic shape: {"name", "description", "input_schema"}
    OpenAI shape:    {"type": "function", "function": {"name", "description", "parameters"}}
    """
    converted = []
    for tool in tools:
        schema = tool.get("input_schema", {"type": "object", "properties": {}})
        converted.append({
            "type": "function",
            "function": {
                "name": tool.get("name", ""),
                "description": tool.get("description", ""),
                "parameters": schema,
            },
        })
    return converted
202
+
203
def anthropic_tool_choice_to_openai(tc):
    """Map an Anthropic ``tool_choice`` value to its OpenAI equivalent.

    None passes through; strings and {"type": ...} dicts map
    auto->auto, any->required, none->none (default "auto");
    {"type": "tool", "name": N} becomes an OpenAI forced-function choice.
    """
    mapping = {"auto": "auto", "any": "required", "none": "none"}
    if tc is None:
        return None
    if isinstance(tc, str):
        return mapping.get(tc, "auto")
    if not isinstance(tc, dict):
        return "auto"
    choice_type = tc.get("type", "")
    if choice_type == "tool":
        return {"type": "function", "function": {"name": tc.get("name", "")}}
    return mapping.get(choice_type, "auto")
214
+
215
def convert_anthropic_messages(messages: list) -> list:
    """Convert an Anthropic-format message list into OpenAI chat format.

    Supported content blocks: plain text, assistant ``tool_use`` (becomes an
    OpenAI ``tool_calls`` entry), and user ``tool_result`` (becomes one
    ``role="tool"`` message per result block).
    """
    converted = []
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")

        # Plain string content maps straight through.
        if isinstance(content, str):
            converted.append({"role": role, "content": content})
        elif not isinstance(content, list):
            # Unknown shape: coerce to text rather than drop it.
            converted.append({"role": role, "content": str(content)})
        else:
            uses = [b for b in content if b.get("type") == "tool_use"]
            results = [b for b in content if b.get("type") == "tool_result"]
            texts = [b for b in content if b.get("type") == "text"]
            joined_text = "".join(b.get("text", "") for b in texts)

            if role == "assistant" and uses:
                # Assistant tool_use blocks become OpenAI tool_calls.
                calls = []
                for b in uses:
                    calls.append({
                        "id": b.get("id", "call_" + uuid.uuid4().hex[:8]),
                        "type": "function",
                        "function": {
                            "name": b.get("name", ""),
                            "arguments": json.dumps(b.get("input", {})),
                        },
                    })
                converted.append({
                    "role": "assistant",
                    "content": joined_text or None,
                    "tool_calls": calls,
                })
            elif role == "user" and results:
                # Each tool_result becomes its own role="tool" message.
                for b in results:
                    payload = b.get("content", "")
                    if isinstance(payload, list):
                        pieces = []
                        for item in payload:
                            if isinstance(item, dict):
                                pieces.append(item.get("text", ""))
                            else:
                                pieces.append(str(item))
                        payload = "".join(pieces)
                    converted.append({
                        "role": "tool",
                        "tool_call_id": b.get("tool_use_id", ""),
                        "content": str(payload),
                    })
                # Any accompanying text follows as a normal user message.
                if joined_text:
                    converted.append({"role": "user", "content": joined_text})
            else:
                # Fallback: merge all text blocks into one message.
                converted.append({"role": role, "content": joined_text})

    return converted
276
+
277
def openai_to_anthropic_response(data: dict, original_model: str) -> dict:
    """Convert an OpenAI-style non-streaming chat response to Anthropic format.

    Emits a text block for any assistant text plus one ``tool_use`` block per
    tool call, and maps ``finish_reason`` onto Anthropic ``stop_reason``.
    """
    first_choice = data["choices"][0]
    message = first_choice.get("message", {})
    usage = data.get("usage", {})

    reason_map = {
        "stop": "end_turn",
        "length": "max_tokens",
        "eos": "end_turn",
        "tool_calls": "tool_use",
    }
    stop_reason = reason_map.get(first_choice.get("finish_reason", "stop"), "end_turn")

    blocks = []
    text = message.get("content") or ""
    if text:
        blocks.append({"type": "text", "text": text})

    for call in (message.get("tool_calls") or []):
        fn = call.get("function", {})
        raw_args = fn.get("arguments", "{}")
        try:
            parsed = json.loads(raw_args)
        except Exception:
            # Arguments were not valid JSON; preserve them for debugging.
            parsed = {"_raw": fn.get("arguments", "")}
        blocks.append({
            "type": "tool_use",
            "id": call.get("id", "toolu_" + uuid.uuid4().hex[:10]),
            "name": fn.get("name", ""),
            "input": parsed,
        })

    return {
        "id": "msg_" + uuid.uuid4().hex[:10],
        "type": "message",
        "role": "assistant",
        "model": original_model,
        "content": blocks,
        "stop_reason": stop_reason,
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    }
322
 
323
 
324
  # =====================================================
 
326
  # =====================================================
327
  @app.get("/")
328
  async def root():
329
+ now = time.time()
330
+ async with _lock:
331
  safe = {}
332
  for k, v in key_status.items():
333
+ cd = max(0, v["rate_limited_until"] - now)
334
+ safe[v["prefix"]] = {
335
  "index": v["index"],
336
+ "status": "BUSY" if v["busy"] else ("COOLDOWN" if cd > 0 else "IDLE"),
337
+ "cooldown_sec": round(cd, 1) if cd > 0 else 0,
338
  "success": v["success"],
339
  "fail": v["fail"],
340
  }
341
+ return {"status": "ok", "base_url": BASE_URL, "default_model": DEFAULT_MODEL, "keys": safe}
 
 
 
 
 
342
 
343
 
344
  # =====================================================
 
350
  return JSONResponse({"error": "Unauthorized"}, status_code=401)
351
 
352
  key = OLLAMA_KEYS[0]
353
+ try:
354
+ async with httpx.AsyncClient(timeout=60) as client:
355
+ r = await client.get(f"{BASE_URL}/api/tags", headers={"Authorization": f"Bearer {key}"})
356
+ if r.status_code == 200:
357
+ data = r.json()
358
+ now = int(time.time())
359
+ out = [{"id": m.get("name"), "object": "model", "created": now, "owned_by": "ollama"}
360
+ for m in data.get("models", [])]
361
+ return {"object": "list", "data": out}
362
+ except Exception as e:
363
+ log(f"[/v1/models] {e}")
364
+ return JSONResponse({"error": "Failed to fetch models"}, status_code=500)
 
 
 
 
 
 
 
 
 
 
 
365
 
366
 
367
  # =====================================================
368
+ # /v1/chat/completions (OpenAI-compatible, pipe through)
369
  # =====================================================
370
  @app.post("/v1/chat/completions")
371
  async def chat(req: Request):
 
384
  # -----------------------------------------
385
  if not is_stream:
386
  tried = set()
 
387
  for _ in range(len(OLLAMA_KEYS)):
388
+ key = await get_key(tried)
 
389
  if not key:
390
+ await asyncio.sleep(0.5)
391
+ continue
392
  tried.add(key)
 
393
  try:
394
  async with httpx.AsyncClient(timeout=180) as client:
395
  r = await client.post(
 
397
  json=body,
398
  headers={"Authorization": f"Bearer {key}"}
399
  )
400
+ if is_rate_limited(r.status_code, r.text if r.status_code != 200 else ""):
401
+ await mark_rate_limited(key)
 
 
 
 
402
  continue
 
403
  await mark_ok(key)
404
+ return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
 
 
 
 
 
405
  except Exception as e:
406
+ log(e)
407
  await mark_fail(key)
 
408
  finally:
409
  await release_key(key)
 
410
  return JSONResponse({"error": "All keys failed"}, status_code=500)
411
 
412
  # -----------------------------------------
413
+ # STREAM β€” infinite loop
414
  # -----------------------------------------
415
  async def gen():
416
+ exclude = set()
417
+ while True:
418
+ key, exclude = await get_key_infinite(exclude)
 
 
 
 
 
 
 
419
  try:
420
  async with httpx.AsyncClient(timeout=None) as client:
421
  async with client.stream(
422
+ "POST", f"{BASE_URL}/v1/chat/completions",
423
+ json=body, headers={"Authorization": f"Bearer {key}"}
 
 
424
  ) as r:
425
+ if is_rate_limited(r.status_code):
426
+ await mark_rate_limited(key)
 
 
427
  continue
428
 
429
+ hit_limit = False
 
430
  async for line in r.aiter_lines():
431
  if not line:
432
  continue
433
+ if line.strip() == "data: [DONE]":
 
 
 
 
434
  break
435
+ raw = line[6:] if line.startswith("data: ") else line
436
+ try:
437
+ j = json.loads(raw)
438
+ if "error" in j and "choices" not in j and is_rate_limited(0, json.dumps(j)):
439
+ hit_limit = True
440
+ break
441
+ except Exception:
442
+ pass
443
  yield line + "\n\n"
444
 
445
+ if hit_limit:
446
+ await mark_rate_limited(key)
447
  continue
448
 
449
+ yield "data: [DONE]\n\n"
450
  await mark_ok(key)
451
  return
452
 
453
  except Exception as e:
454
+ log(e)
455
  await mark_fail(key)
 
456
  finally:
457
  await release_key(key)
458
 
 
 
 
459
  return StreamingResponse(gen(), media_type="text/event-stream")
460
 
461
 
462
  # =====================================================
463
+ # /v1/messages (Anthropic-compatible)
464
+ # FIXED: Full tool calling support
465
  # =====================================================
466
  @app.post("/v1/messages")
467
  async def anthropic(req: Request):
 
476
  return JSONResponse({"error": "Bad JSON"}, status_code=400)
477
 
478
  stream = body.get("stream", False)
479
+ original_model = body.get("model", "claude-opus-4-7")
480
+ ollama_model = map_model(original_model)
481
 
482
+ # Build OpenAI-format messages
483
  messages = []
 
484
  if body.get("system"):
485
+ sys = body["system"]
486
+ if isinstance(sys, list):
487
+ sys = "".join(x.get("text", "") for x in sys if x.get("type") == "text")
488
+ messages.append({"role": "system", "content": sys})
 
 
 
 
 
 
 
489
 
490
+ # FIXED: Konversi penuh termasuk tool_use dan tool_result
491
+ messages.extend(convert_anthropic_messages(body.get("messages", [])))
492
 
493
  proxy_body = {
494
+ "model": ollama_model,
495
  "messages": messages,
496
+ "stream": stream,
497
  }
498
 
499
+ if "max_tokens" in body:
500
+ proxy_body["max_tokens"] = body["max_tokens"]
501
+ if "temperature" in body:
502
+ proxy_body["temperature"] = body["temperature"]
503
+ if "top_p" in body:
504
+ proxy_body["top_p"] = body["top_p"]
505
+
506
+ # FIXED: Forward tools β†’ OpenAI format
507
+ if body.get("tools"):
508
+ proxy_body["tools"] = anthropic_tools_to_openai(body["tools"])
509
+ if body.get("tool_choice"):
510
+ proxy_body["tool_choice"] = anthropic_tool_choice_to_openai(body["tool_choice"])
511
+
512
  # -----------------------------------------
513
  # NON STREAM
514
  # -----------------------------------------
515
  if not stream:
516
  tried = set()
 
517
  for _ in range(len(OLLAMA_KEYS)):
518
+ key = await get_key(tried)
 
519
  if not key:
520
+ await asyncio.sleep(0.5)
521
+ continue
522
  tried.add(key)
 
523
  try:
524
  async with httpx.AsyncClient(timeout=180) as client:
525
  r = await client.post(
 
527
  json=proxy_body,
528
  headers={"Authorization": f"Bearer {key}"}
529
  )
530
+ if is_rate_limited(r.status_code, r.text if r.status_code != 200 else ""):
531
+ await mark_rate_limited(key)
532
+ continue
533
+ if r.status_code != 200:
534
+ log(f"HTTP {r.status_code}: {r.text[:200]}")
535
  await mark_fail(key)
536
  continue
537
 
538
  data = r.json()
539
+ # FIXED: convert OpenAI response β†’ Anthropic (handles tool_calls too)
540
+ out = openai_to_anthropic_response(data, original_model)
 
 
 
 
 
 
 
 
 
 
 
541
  await mark_ok(key)
542
  return JSONResponse(out)
543
 
544
  except Exception as e:
545
+ log(e)
546
  await mark_fail(key)
 
547
  finally:
548
  await release_key(key)
549
 
550
  return JSONResponse({"error": "All keys failed"}, status_code=500)
551
 
552
  # -----------------------------------------
553
+ # STREAM β€” infinite loop, Anthropic SSE format
554
+ # FIXED: Handle tool_calls delta dari streaming
555
  # -----------------------------------------
556
  async def agen():
557
+ exclude = set()
558
  msg_id = "msg_" + uuid.uuid4().hex[:10]
559
+ sent_header = False
 
 
 
 
 
 
 
 
 
 
 
 
 
560
 
561
+ while True:
562
+ key, exclude = await get_key_infinite(exclude)
563
  try:
564
  async with httpx.AsyncClient(timeout=None) as client:
565
  async with client.stream(
566
+ "POST", f"{BASE_URL}/v1/chat/completions",
567
+ json=proxy_body, headers={"Authorization": f"Bearer {key}"}
 
 
568
  ) as r:
569
 
570
+ if is_rate_limited(r.status_code):
571
+ await mark_rate_limited(key)
572
+ continue
573
+
574
+ if r.status_code != 200:
575
+ log(f"STREAM HTTP {r.status_code}")
576
  await mark_fail(key)
577
  continue
578
 
579
+ # Kirim Anthropic header sekali
580
+ if not sent_header:
581
+ sent_header = True
582
  yield sse({
583
  "type": "message_start",
584
  "message": {
585
+ "id": msg_id, "type": "message", "role": "assistant",
586
+ "model": original_model, "content": [],
587
+ "stop_reason": None, "stop_sequence": None,
 
 
 
 
588
  "usage": {"input_tokens": 0, "output_tokens": 0}
589
  }
590
  })
591
  yield sse({
592
+ "type": "content_block_start", "index": 0,
593
+ "content_block": {"type": "text", "text": ""}
 
594
  })
595
 
596
+ hit_limit = False
597
+ finish_reason = None
598
+ # track tool call blocks: openai index β†’ {block_index, id, name}
599
+ tool_blocks = {} # tc_idx β†’ anthropic block_index
600
+ next_block = 1 # 0 = text block
601
 
602
  async for line in r.aiter_lines():
603
+ if not line:
604
  continue
605
+ if line.strip() == "data: [DONE]":
 
 
 
606
  break
607
 
608
+ raw = line[6:] if line.startswith("data: ") else line
 
 
 
 
609
 
610
  try:
611
  j = json.loads(raw)
612
  except Exception:
613
  continue
614
 
615
+ # Cek error API (bukan model output)
616
+ if "error" in j and "choices" not in j:
617
+ if is_rate_limited(0, json.dumps(j)):
618
+ hit_limit = True
619
+ break
620
+
621
+ choices = j.get("choices", [])
622
+ if not choices:
623
+ continue
624
+
625
+ choice = choices[0]
626
+ delta = choice.get("delta", {})
627
+ finish_reason = choice.get("finish_reason") or finish_reason
628
 
629
+ # ── TEXT content ──
630
+ txt = delta.get("content") or ""
631
  if txt:
 
632
  yield sse({
633
+ "type": "content_block_delta", "index": 0,
 
634
  "delta": {"type": "text_delta", "text": txt}
635
  })
636
 
637
+ # ── TOOL CALLS ── FIXED
638
+ for tc in (delta.get("tool_calls") or []):
639
+ tc_idx = tc.get("index", 0)
640
+
641
+ # Tool call baru β†’ buka block
642
+ if tc.get("id") or tc.get("function", {}).get("name"):
643
+ if tc_idx not in tool_blocks:
644
+ block_idx = next_block
645
+ next_block += 1
646
+ tool_blocks[tc_idx] = block_idx
647
+ yield sse({
648
+ "type": "content_block_start",
649
+ "index": block_idx,
650
+ "content_block": {
651
+ "type": "tool_use",
652
+ "id": tc.get("id", "toolu_" + uuid.uuid4().hex[:10]),
653
+ "name": tc.get("function", {}).get("name", ""),
654
+ "input": {}
655
+ }
656
+ })
657
+
658
+ # Stream argument chunks
659
+ args_chunk = tc.get("function", {}).get("arguments", "")
660
+ if args_chunk and tc_idx in tool_blocks:
661
+ yield sse({
662
+ "type": "content_block_delta",
663
+ "index": tool_blocks[tc_idx],
664
+ "delta": {"type": "input_json_delta", "partial_json": args_chunk}
665
+ })
666
+
667
+ if hit_limit:
668
+ await mark_rate_limited(key)
669
  continue
670
 
671
+ # Tutup semua blocks
672
+ yield sse({"type": "content_block_stop", "index": 0})
673
+ for block_idx in tool_blocks.values():
674
+ yield sse({"type": "content_block_stop", "index": block_idx})
675
+
676
+ # Stop reason
677
+ if finish_reason == "tool_calls" or tool_blocks:
678
+ stop_reason = "tool_use"
679
+ elif finish_reason == "length":
680
+ stop_reason = "max_tokens"
681
+ else:
682
+ stop_reason = "end_turn"
683
+
684
+ yield sse({
685
+ "type": "message_delta",
686
+ "delta": {"stop_reason": stop_reason, "stop_sequence": None},
687
+ "usage": {"output_tokens": 0}
688
+ })
689
+ yield sse({"type": "message_stop"})
690
  await mark_ok(key)
691
+ return # sukses
692
 
693
  except Exception as e:
694
+ log(e)
695
  await mark_fail(key)
 
696
  finally:
697
  await release_key(key)
698
 
699
+ # Fallback kalau belum kirim header
700
+ if not sent_header:
701
+ yield sse({
702
+ "type": "message_start",
703
+ "message": {
704
+ "id": msg_id, "type": "message", "role": "assistant",
705
+ "model": original_model, "content": [], "stop_reason": None,
706
+ "stop_sequence": None, "usage": {"input_tokens": 0, "output_tokens": 0}
707
+ }
708
+ })
709
+ yield sse({"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}})
710
  yield sse({"type": "content_block_stop", "index": 0})
711
+ yield sse({"type": "message_delta", "delta": {"stop_reason": "end_turn", "stop_sequence": None}, "usage": {"output_tokens": 0}})
 
 
 
 
712
  yield sse({"type": "message_stop"})
713
 
714
+ return StreamingResponse(agen(), media_type="text/event-stream")