Spaces:

Elysiadev11
/

proxyollma

Sleeping

App Files Files Community

proxyollma / proxy_cerebras.py

Elysiadev11

Update proxy_cerebras.py

bff9611 verified 7 days ago

raw

history blame contribute delete

27.1 kB

	"""
	proxy_cerebras.py — Proxy ke Ollama backend dengan Anthropic + OpenAI compatible API
	FIXED:
	- Tool calling support penuh (Anthropic <-> OpenAI)
	- Non-stream tidak crash saat finish_reason=tool_calls
	- Stream handle delta.tool_calls
	- Model tidak hardcoded, diteruskan dari request
	- Infinite loop dengan smart cooldown (tidak hammering)
	"""

	import os
	import json
	import time
	import uuid
	import asyncio
	import httpx

	from fastapi import FastAPI, Request
	from fastapi.responses import JSONResponse, Response, StreamingResponse
	from starlette.requests import ClientDisconnect

	# ASGI application object; the route handlers in this module register on it via @app.get / @app.post.
	app = FastAPI()

	# =====================================================
	# CONFIG
	# =====================================================
	# Upstream endpoint, proxy credential and defaults; each value can be overridden
	# through an environment variable of the same name.
	BASE_URL = os.environ.get("BASE_URL", "https://ollama.com")
	# NOTE(review): the fallback token below is short and guessable — set
	# MASTER_API_KEY explicitly in any real deployment.
	MASTER_API_KEY = os.environ.get("MASTER_API_KEY", "olla")
	DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "minimax-m2.7:cloud")
	# Seconds a key is benched after the upstream reports a rate limit.
	RATE_LIMIT_COOLDOWN = int(os.environ.get("RATE_LIMIT_COOLDOWN", "62"))

	# Claude/GPT model names a client may request -> upstream model actually used.
	# Every alias currently resolves to DEFAULT_MODEL.
	MODEL_MAP = {
	    alias: DEFAULT_MODEL
	    for alias in (
	        "claude-opus-4-7",
	        "claude-opus-4-6",
	        "claude-opus-4-5",
	        "claude-opus-4-1",
	        "claude-opus-4-20250514",
	        "claude-sonnet-4-6",
	        "claude-sonnet-4-5",
	        "claude-sonnet-4-20250514",
	        "claude-haiku-4-5",
	        "claude-haiku-4-5-20251001",
	        "gpt-4",
	        "gpt-4o",
	        "gpt-4o-mini",
	        "gpt-4-turbo",
	        "gpt-3.5-turbo",
	    )
	}

	def map_model(name: str) -> str:
	    """Translate a client-facing model name to the upstream model name.

	    Names listed in MODEL_MAP are replaced by their mapped value; any other
	    name is returned unchanged so it is forwarded to the backend as-is.
	    """
	    try:
	        return MODEL_MAP[name]
	    except KeyError:
	        return name

	# =====================================================
	# LOAD KEYS
	# =====================================================
	# Collect OLLAMA_KEY_1 .. OLLAMA_KEY_100 from the environment, skipping unset or
	# empty slots; fall back to a single placeholder so the list is never empty.
	OLLAMA_KEYS = [
	    value
	    for value in (os.getenv(f"OLLAMA_KEY_{slot}") for slot in range(1, 101))
	    if value
	] or ["dummy"]

	# Per-key bookkeeping, keyed by the full key string:
	#   index              - 1-based position in OLLAMA_KEYS (used in log lines)
	#   prefix             - shortened form reported by the status endpoint
	#   busy               - True while a request is holding the key
	#   success / fail     - counters updated by mark_ok / mark_fail / mark_rate_limited
	#   rate_limited_until - epoch seconds until which the key is benched
	key_status = {
	    key: {
	        "index": position,
	        "prefix": (key[:6] + "...") if len(key) > 6 else key,
	        "busy": False,
	        "success": 0,
	        "fail": 0,
	        "rate_limited_until": 0.0,
	    }
	    for position, key in enumerate(OLLAMA_KEYS, start=1)
	}

	# Round-robin cursor into OLLAMA_KEYS and the lock guarding key_status updates.
	rr_index = 0
	_lock = asyncio.Lock()


	# =====================================================
	# HELPERS
	# =====================================================
	def log(x):
	    """Print *x* to stdout prefixed with the local wall-clock time, flushing immediately."""
	    stamp = time.strftime("%H:%M:%S")
	    print(f"[{stamp}] {x}", flush=True)

	def sse(obj):
	    """Serialize *obj* as one Server-Sent Events frame.

	    The JSON payload always goes on a ``data:`` line. When *obj* is a dict
	    whose ``"type"`` is a non-empty single-line string, an ``event: <type>``
	    line is emitted first: Anthropic's streaming protocol uses named events
	    and the official SDKs dispatch on the event name, so frames without it
	    are ignored by those clients.

	    Returns the frame as a string terminated by a blank line.
	    """
	    payload = "data: " + json.dumps(obj, ensure_ascii=False) + "\n\n"
	    event = obj.get("type") if isinstance(obj, dict) else None
	    # An event name containing a line break would corrupt the frame; skip it then.
	    if isinstance(event, str) and event and "\n" not in event and "\r" not in event:
	        return f"event: {event}\n{payload}"
	    return payload

	def auth_ok(req: Request):
	    """Return True when the request presents the proxy's master API key.

	    Accepted forms:
	      * ``Authorization: Bearer <key>`` (scheme matched case-insensitively),
	      * ``Authorization: <key>`` (bare token, kept for backward compatibility),
	      * ``x-api-key: <key>`` when no Authorization header is present, which is
	        how Anthropic-style clients send their key.

	    Only a single leading ``Bearer`` scheme is stripped; the previous
	    ``str.replace`` removed every occurrence of ``"Bearer "`` anywhere in the
	    header, which mangled keys containing that text and accepted
	    ``Bearer Bearer <key>``. The comparison is constant-time so response
	    timing does not reveal how much of a guessed key matched.
	    """
	    import hmac  # function-scope import keeps this change self-contained

	    header = req.headers.get("Authorization", "")
	    if header:
	        scheme, sep, credentials = header.partition(" ")
	        if sep and scheme.lower() == "bearer":
	            token = credentials.strip()
	        else:
	            token = header
	    else:
	        token = req.headers.get("x-api-key", "")

	    # Compare as bytes: compare_digest rejects non-ASCII str arguments.
	    return hmac.compare_digest(token.encode("utf-8"), MASTER_API_KEY.encode("utf-8"))

	def is_rate_limited(status: int, body_text: str = "") -> bool:
	    """Tell whether an upstream reply signals rate limiting.

	    True for HTTP 429, or when *body_text* contains one of the known
	    rate-limit phrases (matched case-insensitively).
	    """
	    if status == 429:
	        return True
	    lowered = body_text.lower()
	    markers = ("weekly usage limit", "rate limit", "too many requests")
	    return any(marker in lowered for marker in markers)


	# =====================================================
	# KEY MANAGEMENT
	# =====================================================
	def _pick_key(exclude: set):
	    """Reserve and return the next usable key in round-robin order, or None.

	    Synchronous; callers invoke it while holding ``_lock``. A key is usable
	    when it is not busy, its cooldown has expired and it is not in *exclude*.
	    The chosen key is flagged busy before being returned. The round-robin
	    cursor advances for every key inspected, including rejected ones.
	    """
	    global rr_index
	    now = time.time()
	    total = len(OLLAMA_KEYS)
	    for _ in range(total):
	        rr_index = (rr_index + 1) % total
	        candidate = OLLAMA_KEYS[rr_index]
	        state = key_status[candidate]
	        if state["busy"] or now < state["rate_limited_until"] or candidate in exclude:
	            continue
	        state["busy"] = True
	        return candidate
	    return None

	def _next_ready_time() -> float:
	    """Return the epoch time at which the earliest cooling key becomes usable.

	    When no key is currently in cooldown the present time is returned.
	    """
	    now = time.time()
	    deadlines = (state["rate_limited_until"] for state in key_status.values())
	    return min((deadline for deadline in deadlines if deadline > now), default=now)

	async def get_key(exclude=None):
	    """Reserve one usable key under the lock; returns None when none is available.

	    *exclude* is an optional set of keys that must not be returned.
	    """
	    skip = exclude or set()
	    async with _lock:
	        return _pick_key(skip)

	async def release_key(k):
	    """Clear the busy flag of key *k*; unknown keys are ignored."""
	    async with _lock:
	        entry = key_status.get(k)
	        if entry is not None:
	            entry["busy"] = False

	async def mark_rate_limited(k):
	    """Bench key *k* for RATE_LIMIT_COOLDOWN seconds and count the failure.

	    Unknown keys are ignored. The new ready time is written to the log.
	    """
	    async with _lock:
	        entry = key_status.get(k)
	        if entry is None:
	            return
	        ready_at = time.time() + RATE_LIMIT_COOLDOWN
	        entry["rate_limited_until"] = ready_at
	        entry["fail"] += 1
	        ready_clock = time.strftime('%H:%M:%S', time.localtime(ready_at))
	        log(f"⏳ key#{entry['index']} cooldown {RATE_LIMIT_COOLDOWN}s (ready {ready_clock})")

	async def mark_fail(k):
	    """Increment the failure counter of key *k*; unknown keys are ignored."""
	    async with _lock:
	        entry = key_status.get(k)
	        if entry is not None:
	            entry["fail"] = entry["fail"] + 1

	async def mark_ok(k):
	    """Record a successful call on key *k*.

	    Bumps the success counter, resets the failure counter and lifts any
	    cooldown. Unknown keys are ignored.
	    """
	    async with _lock:
	        entry = key_status.get(k)
	        if entry is None:
	            return
	        entry.update(
	            success=entry["success"] + 1,
	            fail=0,
	            rate_limited_until=0.0,
	        )

	async def get_key_infinite(exclude=None):
	    """Wait, without a retry limit, until a key can be reserved.

	    Args:
	        exclude: optional iterable of keys to skip; it is copied, never mutated.

	    Returns:
	        ``(key, exclude_set)`` — the reserved key (already flagged busy) and
	        the working copy of the exclusion set, which may have been cleared.

	    Waiting strategy when no key can be picked:
	      * Every key is busy or cooling: clear the exclusions, then sleep. If at
	        least one key is merely busy it may free up at any moment, so poll
	        every 0.5s; only when every key is cooling, sleep until the earliest
	        cooldown ends. (Previously a single cooling key made the caller sleep
	        for its whole cooldown even though a busy key was about to be released.)
	      * Some key is idle but excluded, and the exclusions cover every key:
	        nothing could ever be picked, so clear the exclusions and retry at
	        once instead of spinning forever.
	      * Otherwise a non-excluded key is busy or cooling: poll every 0.3s.
	    """
	    local_exclude = set(exclude) if exclude else set()
	    cycle = 0

	    while True:
	        async with _lock:
	            k = _pick_key(local_exclude)
	        if k:
	            return k, local_exclude

	        now = time.time()
	        states = list(key_status.values())
	        all_unavailable = all(
	            st["rate_limited_until"] > now or st["busy"] for st in states
	        )

	        if all_unavailable:
	            busy_but_not_cooling = any(
	                st["busy"] and st["rate_limited_until"] <= now for st in states
	            )
	            if busy_but_not_cooling:
	                wait_sec = 0.5
	            else:
	                wait_sec = max(0.5, _next_ready_time() - now)
	            cycle += 1
	            log(f"⏳ Semua key cooldown. Tunggu {wait_sec:.1f}s... (cycle #{cycle})")
	            local_exclude.clear()
	            await asyncio.sleep(wait_sec)
	        elif local_exclude.issuperset(OLLAMA_KEYS):
	            # Idle keys exist but all of them are excluded; waiting cannot help.
	            local_exclude.clear()
	        else:
	            await asyncio.sleep(0.3)


	# =====================================================
	# TOOL CONVERSION: Anthropic ↔ OpenAI
	# =====================================================
	def anthropic_tools_to_openai(tools: list) -> list:
	    """Convert Anthropic tool definitions to OpenAI function-tool definitions.

	    Anthropic shape: ``{"name", "description", "input_schema"}``
	    OpenAI shape:    ``{"type": "function", "function": {"name", "description", "parameters"}}``

	    Missing name/description default to ``""``; a missing schema defaults to
	    an empty object schema.
	    """
	    converted = []
	    for tool in tools:
	        function_spec = {
	            "name": tool.get("name", ""),
	            "description": tool.get("description", ""),
	            "parameters": tool.get("input_schema", {"type": "object", "properties": {}}),
	        }
	        converted.append({"type": "function", "function": function_spec})
	    return converted

	def anthropic_tool_choice_to_openai(tc):
	    """Convert an Anthropic ``tool_choice`` value to its OpenAI counterpart.

	    ``None`` stays ``None``. Mode names map auto->auto, any->required,
	    none->none, whether given as a plain string or as ``{"type": <mode>}``.
	    ``{"type": "tool", "name": N}`` becomes a forced function choice.
	    Anything unrecognised falls back to ``"auto"``.
	    """
	    mode_map = {"auto": "auto", "any": "required", "none": "none"}

	    if tc is None:
	        return None
	    if isinstance(tc, str):
	        return mode_map.get(tc, "auto")
	    if not isinstance(tc, dict):
	        return "auto"

	    kind = tc.get("type", "")
	    if kind == "tool":
	        return {"type": "function", "function": {"name": tc.get("name", "")}}
	    return mode_map.get(kind, "auto")

	def convert_anthropic_messages(messages: list) -> list:
	    """Translate an Anthropic message list into an OpenAI message list.

	    * String content is passed through; non-list, non-string content is
	      stringified.
	    * An assistant message containing ``tool_use`` blocks becomes one
	      assistant message with ``tool_calls`` (text is joined, or None if empty).
	    * A user message containing ``tool_result`` blocks becomes one
	      ``role="tool"`` message per result, followed by a user message holding
	      any non-empty accompanying text.
	    * Any other block list collapses to the concatenation of its text blocks;
	      blocks of other types are dropped.
	    """
	    converted = []

	    for message in messages:
	        role = message.get("role", "user")
	        content = message.get("content", "")

	        # Anything that is not a block list is forwarded as plain text.
	        if not isinstance(content, list):
	            plain = content if isinstance(content, str) else str(content)
	            converted.append({"role": role, "content": plain})
	            continue

	        # Bucket the blocks by the three types this converter understands.
	        uses, results, texts = [], [], []
	        for block in content:
	            kind = block.get("type")
	            if kind == "tool_use":
	                uses.append(block)
	            elif kind == "tool_result":
	                results.append(block)
	            elif kind == "text":
	                texts.append(block)

	        if uses and role == "assistant":
	            joined = "".join(block.get("text", "") for block in texts)
	            calls = []
	            for block in uses:
	                calls.append({
	                    "id": block.get("id", "call_" + uuid.uuid4().hex[:8]),
	                    "type": "function",
	                    "function": {
	                        "name": block.get("name", ""),
	                        "arguments": json.dumps(block.get("input", {})),
	                    },
	                })
	            converted.append({"role": "assistant", "content": joined or None, "tool_calls": calls})
	            continue

	        if results and role == "user":
	            for block in results:
	                payload = block.get("content", "")
	                if isinstance(payload, list):
	                    # Flatten a list of parts: text of dict parts, str() of the rest.
	                    payload = "".join(
	                        part.get("text", "") if isinstance(part, dict) else str(part)
	                        for part in payload
	                    )
	                converted.append({
	                    "role": "tool",
	                    "tool_call_id": block.get("tool_use_id", ""),
	                    "content": str(payload),
	                })
	            if texts:
	                trailing = "".join(block.get("text", "") for block in texts)
	                if trailing:
	                    converted.append({"role": "user", "content": trailing})
	            continue

	        # Fallback: keep only the text.
	        converted.append({
	            "role": role,
	            "content": "".join(block.get("text", "") for block in texts),
	        })

	    return converted

	def _parse_tool_arguments(raw):
	    """Normalise an OpenAI tool-call ``arguments`` value into a dict.

	    Accepts an already-decoded dict, a JSON string, or an empty/missing value
	    (treated as no arguments). Text that is not valid JSON, or that decodes
	    to something other than an object, is preserved under ``"_raw"``.
	    """
	    if isinstance(raw, dict):
	        return raw
	    if raw is None or raw == "":
	        return {}
	    try:
	        parsed = json.loads(raw)
	    except (TypeError, ValueError):
	        return {"_raw": raw}
	    return parsed if isinstance(parsed, dict) else {"_raw": raw}


	def openai_to_anthropic_response(data: dict, original_model: str) -> dict:
	    """Convert a non-streaming OpenAI chat completion into an Anthropic message.

	    Args:
	        data: decoded OpenAI response; only ``choices[0]`` and ``usage`` are read.
	        original_model: model name to echo back to the client.

	    Returns:
	        An Anthropic ``message`` dict whose content holds an optional text
	        block followed by one ``tool_use`` block per tool call.

	    Raises:
	        KeyError / IndexError: when *data* has no ``choices[0]``.

	    ``stop_reason`` is reported as ``"tool_use"`` whenever tool calls are
	    present and the upstream finish reason would otherwise map to
	    ``"end_turn"`` — some backends answer ``finish_reason="stop"`` together
	    with tool calls, and the streaming path already treats that as a tool
	    turn. Null ``message``/``usage``/``id`` fields are tolerated.
	    """
	    choice = data["choices"][0]
	    message = choice.get("message") or {}
	    finish_reason = choice.get("finish_reason") or "stop"
	    usage = data.get("usage") or {}

	    content_blocks = []

	    text = message.get("content") or ""
	    if text:
	        content_blocks.append({"type": "text", "text": text})

	    has_tool_use = False
	    for tc in message.get("tool_calls") or []:
	        fn = tc.get("function") or {}
	        content_blocks.append({
	            "type": "tool_use",
	            # `or` also covers an explicit null/empty id, not just a missing key.
	            "id": tc.get("id") or "toolu_" + uuid.uuid4().hex[:10],
	            "name": fn.get("name") or "",
	            "input": _parse_tool_arguments(fn.get("arguments")),
	        })
	        has_tool_use = True

	    stop_map = {"stop": "end_turn", "length": "max_tokens", "eos": "end_turn", "tool_calls": "tool_use"}
	    stop_reason = stop_map.get(finish_reason, "end_turn")
	    if has_tool_use and stop_reason == "end_turn":
	        stop_reason = "tool_use"

	    return {
	        "id": "msg_" + uuid.uuid4().hex[:10],
	        "type": "message",
	        "role": "assistant",
	        "model": original_model,
	        "content": content_blocks,
	        "stop_reason": stop_reason,
	        "stop_sequence": None,
	        "usage": {
	            "input_tokens": usage.get("prompt_tokens") or 0,
	            "output_tokens": usage.get("completion_tokens") or 0,
	        },
	    }


	# =====================================================
	# ROOT
	# =====================================================
	@app.get("/")
	async def root():
	    """Status endpoint: report configuration and a per-key health summary.

	    Each key is listed under its shortened prefix with its state (BUSY,
	    COOLDOWN or IDLE), remaining cooldown in seconds and success/fail counters.
	    """
	    # NOTE(review): no auth check here, and keys of six characters or fewer are
	    # reported in full because their prefix is the whole key — confirm intended.
	    now = time.time()
	    async with _lock:
	        safe = {}
	        for state in key_status.values():
	            remaining = max(0, state["rate_limited_until"] - now)
	            cooling = remaining > 0
	            if state["busy"]:
	                label = "BUSY"
	            elif cooling:
	                label = "COOLDOWN"
	            else:
	                label = "IDLE"
	            safe[state["prefix"]] = {
	                "index": state["index"],
	                "status": label,
	                "cooldown_sec": round(remaining, 1) if cooling else 0,
	                "success": state["success"],
	                "fail": state["fail"],
	            }
	    return {"status": "ok", "base_url": BASE_URL, "default_model": DEFAULT_MODEL, "keys": safe}


	# =====================================================
	# /v1/models
	# =====================================================
	@app.get("/v1/models")
	async def models(req: Request):
	    """List upstream models in OpenAI ``/v1/models`` format.

	    Requires the master key. Queries ``{BASE_URL}/api/tags`` with the first
	    configured key; any non-200 reply or exception yields a 500 error body.
	    """
	    if not auth_ok(req):
	        return JSONResponse({"error": "Unauthorized"}, status_code=401)

	    key = OLLAMA_KEYS[0]
	    auth_header = {"Authorization": f"Bearer {key}"}
	    try:
	        async with httpx.AsyncClient(timeout=60) as client:
	            r = await client.get(f"{BASE_URL}/api/tags", headers=auth_header)
	            if r.status_code == 200:
	                data = r.json()
	                created = int(time.time())
	                listing = []
	                for m in data.get("models", []):
	                    listing.append({
	                        "id": m.get("name"),
	                        "object": "model",
	                        "created": created,
	                        "owned_by": "ollama",
	                    })
	                return {"object": "list", "data": listing}
	    except Exception as e:
	        log(f"[/v1/models] {e}")
	    return JSONResponse({"error": "Failed to fetch models"}, status_code=500)


	# =====================================================
	# /v1/chat/completions (OpenAI-compatible, pipe through)
	# =====================================================
	@app.post("/v1/chat/completions")
	async def chat(req: Request):
	    """OpenAI-compatible chat endpoint that forwards the request body upstream.

	    Non-stream: tries each key at most once. A rate-limited key is benched and
	    the next one tried; any other upstream reply is returned to the client
	    with the upstream status code (previously error replies were relayed as
	    HTTP 200 and counted as a success for the key).

	    Stream: retries without limit. Rate-limited keys are rotated via
	    get_key_infinite; other failures back off exponentially (capped at 30s)
	    instead of retrying immediately. Once any output has reached the client a
	    failure ends the stream with an error chunk rather than restarting the
	    generation, which would replay already-delivered text.
	    """
	    if not auth_ok(req):
	        return JSONResponse({"error": "Unauthorized"}, status_code=401)

	    try:
	        body = await req.json()
	    except ClientDisconnect:
	        # Same handling as the /v1/messages endpoint.
	        return Response(status_code=499)
	    except Exception:
	        return JSONResponse({"error": "Bad JSON"}, status_code=400)
	    if not isinstance(body, dict):
	        # Valid JSON that is not an object (list, number, ...) has no .get().
	        return JSONResponse({"error": "Bad JSON"}, status_code=400)

	    is_stream = body.get("stream", False)
	    upstream_url = f"{BASE_URL}/v1/chat/completions"

	    # -----------------------------------------
	    # NON STREAM
	    # -----------------------------------------
	    if not is_stream:
	        tried = set()
	        for _ in range(len(OLLAMA_KEYS)):
	            key = await get_key(tried)
	            if not key:
	                await asyncio.sleep(0.5)
	                continue
	            tried.add(key)
	            try:
	                async with httpx.AsyncClient(timeout=180) as client:
	                    r = await client.post(
	                        upstream_url,
	                        json=body,
	                        headers={"Authorization": f"Bearer {key}"},
	                    )
	                    if is_rate_limited(r.status_code, r.text if r.status_code != 200 else ""):
	                        await mark_rate_limited(key)
	                        continue
	                    if r.status_code < 400:
	                        await mark_ok(key)
	                    else:
	                        log(f"HTTP {r.status_code}: {r.text[:200]}")
	                        await mark_fail(key)
	                    return Response(
	                        content=r.content,
	                        status_code=r.status_code,
	                        media_type=r.headers.get("content-type", "application/json"),
	                    )
	            except Exception as e:
	                log(e)
	                await mark_fail(key)
	            finally:
	                await release_key(key)
	        return JSONResponse({"error": "All keys failed"}, status_code=500)

	    # -----------------------------------------
	    # STREAM — unbounded retry loop
	    # -----------------------------------------
	    async def gen():
	        exclude = set()
	        sent_any = False   # True once at least one upstream line reached the client
	        failures = 0       # consecutive non-rate-limit failures, drives the backoff

	        while True:
	            key, exclude = await get_key_infinite(exclude)
	            try:
	                async with httpx.AsyncClient(timeout=None) as client:
	                    async with client.stream(
	                        "POST", upstream_url,
	                        json=body, headers={"Authorization": f"Bearer {key}"}
	                    ) as r:
	                        if is_rate_limited(r.status_code):
	                            await mark_rate_limited(key)
	                            continue

	                        hit_limit = False
	                        async for line in r.aiter_lines():
	                            if not line:
	                                continue
	                            if line.strip() == "data: [DONE]":
	                                break
	                            raw = line[6:] if line.startswith("data: ") else line
	                            try:
	                                j = json.loads(raw)
	                                if "error" in j and "choices" not in j and is_rate_limited(0, json.dumps(j)):
	                                    hit_limit = True
	                                    break
	                            except Exception:
	                                # Not JSON (or not a dict): forward the line untouched.
	                                pass
	                            yield line + "\n\n"
	                            sent_any = True

	                        if hit_limit:
	                            await mark_rate_limited(key)
	                            if not sent_any:
	                                continue
	                            # Partial output already delivered: fall through to the
	                            # abort path below instead of replaying the stream.
	                        else:
	                            yield "data: [DONE]\n\n"
	                            if r.status_code < 400:
	                                await mark_ok(key)
	                            else:
	                                await mark_fail(key)
	                            return

	            except Exception as e:
	                log(e)
	                await mark_fail(key)
	            finally:
	                await release_key(key)

	            # Reached only after a failed attempt (success returns, clean
	            # rate-limit rotation continues above).
	            if sent_any:
	                yield "data: " + json.dumps(
	                    {"error": {"message": "Upstream stream interrupted", "type": "upstream_error"}}
	                ) + "\n\n"
	                yield "data: [DONE]\n\n"
	                return

	            failures += 1
	            await asyncio.sleep(min(30.0, 0.5 * 2 ** min(failures, 6)))

	    return StreamingResponse(gen(), media_type="text/event-stream")


	# =====================================================
	# /v1/messages (Anthropic-compatible)
	# FIXED: Full tool calling support
	# =====================================================
	@app.post("/v1/messages")
	async def anthropic(req: Request):
	    """Anthropic Messages-compatible endpoint backed by the upstream OpenAI-style API.

	    The Anthropic request (system prompt, messages, tools, tool_choice and
	    sampling fields) is converted to an OpenAI chat-completions request, sent
	    to ``{BASE_URL}/v1/chat/completions`` and the reply converted back.

	    Non-stream: tries each key at most once and returns the converted message,
	    or a 500 error body when every attempt failed.

	    Stream: emits Anthropic SSE events (message_start, content_block_*,
	    message_delta, message_stop). Text always uses block index 0; each upstream
	    tool call gets the next block index. Retries are unbounded: rate-limited
	    keys are rotated by get_key_infinite, other failures back off
	    exponentially (capped at 30s) instead of retrying immediately. Once any
	    content has reached the client, a failure ends the stream with an
	    ``error`` event rather than restarting, because a restart would duplicate
	    text and reuse block indices.
	    """
	    if not auth_ok(req):
	        return JSONResponse({"error": "Unauthorized"}, status_code=401)

	    try:
	        body = await req.json()
	    except ClientDisconnect:
	        return Response(status_code=499)
	    except Exception:
	        return JSONResponse({"error": "Bad JSON"}, status_code=400)
	    if not isinstance(body, dict):
	        # Valid JSON that is not an object (list, number, ...) has no .get().
	        return JSONResponse({"error": "Bad JSON"}, status_code=400)

	    stream = body.get("stream", False)
	    original_model = body.get("model", "claude-opus-4-7")
	    ollama_model = map_model(original_model)

	    # Build OpenAI-format messages; the Anthropic system prompt becomes a system message.
	    messages = []
	    if body.get("system"):
	        system_prompt = body["system"]
	        if isinstance(system_prompt, list):
	            system_prompt = "".join(
	                x.get("text", "")
	                for x in system_prompt
	                if isinstance(x, dict) and x.get("type") == "text"
	            )
	        messages.append({"role": "system", "content": system_prompt})

	    # Full conversion, including tool_use and tool_result blocks.
	    messages.extend(convert_anthropic_messages(body.get("messages", [])))

	    proxy_body = {
	        "model": ollama_model,
	        "messages": messages,
	        "stream": stream,
	    }
	    for field in ("max_tokens", "temperature", "top_p"):
	        if field in body:
	            proxy_body[field] = body[field]

	    # Forward tools in OpenAI format.
	    if body.get("tools"):
	        proxy_body["tools"] = anthropic_tools_to_openai(body["tools"])
	    if body.get("tool_choice"):
	        proxy_body["tool_choice"] = anthropic_tool_choice_to_openai(body["tool_choice"])

	    upstream_url = f"{BASE_URL}/v1/chat/completions"

	    # -----------------------------------------
	    # NON STREAM
	    # -----------------------------------------
	    if not stream:
	        tried = set()
	        for _ in range(len(OLLAMA_KEYS)):
	            key = await get_key(tried)
	            if not key:
	                await asyncio.sleep(0.5)
	                continue
	            tried.add(key)
	            try:
	                async with httpx.AsyncClient(timeout=180) as client:
	                    r = await client.post(
	                        upstream_url,
	                        json=proxy_body,
	                        headers={"Authorization": f"Bearer {key}"},
	                    )
	                    if is_rate_limited(r.status_code, r.text if r.status_code != 200 else ""):
	                        await mark_rate_limited(key)
	                        continue
	                    if r.status_code != 200:
	                        log(f"HTTP {r.status_code}: {r.text[:200]}")
	                        await mark_fail(key)
	                        continue

	                    data = r.json()
	                    # Convert the OpenAI response (including tool_calls) to Anthropic format.
	                    out = openai_to_anthropic_response(data, original_model)
	                    await mark_ok(key)
	                    return JSONResponse(out)

	            except Exception as e:
	                log(e)
	                await mark_fail(key)
	            finally:
	                await release_key(key)

	        return JSONResponse({"error": "All keys failed"}, status_code=500)

	    # -----------------------------------------
	    # STREAM — unbounded retry loop, Anthropic SSE format
	    # -----------------------------------------
	    def backoff_delay(attempt):
	        """Seconds to wait before retry number *attempt*: 1s, 2s, 4s ... capped at 30s."""
	        return min(30.0, 0.5 * 2 ** min(attempt, 6))

	    async def agen():
	        exclude = set()
	        msg_id = "msg_" + uuid.uuid4().hex[:10]
	        sent_header = False   # message_start + text block start already emitted
	        emitted = False       # some content (text delta / tool block) reached the client
	        failures = 0          # consecutive non-rate-limit failures
	        retry_delay = 0.0     # pause to apply before the next attempt

	        while True:
	            if retry_delay:
	                # Sleep here, after the previous key was released and its connection closed.
	                await asyncio.sleep(retry_delay)
	                retry_delay = 0.0

	            key, exclude = await get_key_infinite(exclude)
	            try:
	                async with httpx.AsyncClient(timeout=None) as client:
	                    async with client.stream(
	                        "POST", upstream_url,
	                        json=proxy_body, headers={"Authorization": f"Bearer {key}"}
	                    ) as r:

	                        if is_rate_limited(r.status_code):
	                            await mark_rate_limited(key)
	                            continue

	                        if r.status_code != 200:
	                            log(f"STREAM HTTP {r.status_code}")
	                            await mark_fail(key)
	                            failures += 1
	                            retry_delay = backoff_delay(failures)
	                            continue

	                        # Send the Anthropic preamble exactly once, even across retries.
	                        if not sent_header:
	                            sent_header = True
	                            yield sse({
	                                "type": "message_start",
	                                "message": {
	                                    "id": msg_id, "type": "message", "role": "assistant",
	                                    "model": original_model, "content": [],
	                                    "stop_reason": None, "stop_sequence": None,
	                                    "usage": {"input_tokens": 0, "output_tokens": 0}
	                                }
	                            })
	                            yield sse({
	                                "type": "content_block_start", "index": 0,
	                                "content_block": {"type": "text", "text": ""}
	                            })

	                        hit_limit = False
	                        finish_reason = None
	                        tool_blocks = {}   # upstream tool_call index -> Anthropic block index
	                        next_block = 1     # index 0 is the text block

	                        async for line in r.aiter_lines():
	                            if not line:
	                                continue
	                            if line.strip() == "data: [DONE]":
	                                break

	                            raw = line[6:] if line.startswith("data: ") else line

	                            try:
	                                j = json.loads(raw)
	                            except Exception:
	                                continue
	                            if not isinstance(j, dict):
	                                continue

	                            # API-level error object (not model output).
	                            if "error" in j and "choices" not in j:
	                                if is_rate_limited(0, json.dumps(j)):
	                                    hit_limit = True
	                                    break
	                                log(f"STREAM upstream error: {json.dumps(j)[:200]}")

	                            choices = j.get("choices") or []
	                            if not choices:
	                                continue

	                            choice = choices[0]
	                            delta = choice.get("delta") or {}
	                            finish_reason = choice.get("finish_reason") or finish_reason

	                            # Text content
	                            txt = delta.get("content") or ""
	                            if txt:
	                                emitted = True
	                                yield sse({
	                                    "type": "content_block_delta", "index": 0,
	                                    "delta": {"type": "text_delta", "text": txt}
	                                })

	                            # Tool calls
	                            for tc in (delta.get("tool_calls") or []):
	                                tc_idx = tc.get("index", 0)
	                                fn = tc.get("function") or {}

	                                # First chunk of a tool call (carries id and/or name): open its block.
	                                if (tc.get("id") or fn.get("name")) and tc_idx not in tool_blocks:
	                                    block_idx = next_block
	                                    next_block += 1
	                                    tool_blocks[tc_idx] = block_idx
	                                    emitted = True
	                                    yield sse({
	                                        "type": "content_block_start",
	                                        "index": block_idx,
	                                        "content_block": {
	                                            "type": "tool_use",
	                                            # `or` also covers an explicit null/empty id.
	                                            "id": tc.get("id") or "toolu_" + uuid.uuid4().hex[:10],
	                                            "name": fn.get("name") or "",
	                                            "input": {}
	                                        }
	                                    })

	                                # Forward argument fragments as partial JSON.
	                                args_chunk = fn.get("arguments") or ""
	                                if args_chunk and tc_idx in tool_blocks:
	                                    emitted = True
	                                    yield sse({
	                                        "type": "content_block_delta",
	                                        "index": tool_blocks[tc_idx],
	                                        "delta": {"type": "input_json_delta", "partial_json": args_chunk}
	                                    })

	                        if hit_limit:
	                            await mark_rate_limited(key)
	                            if not emitted:
	                                continue
	                            # Content already delivered: fall through to the abort path below.
	                        else:
	                            # Close every open block.
	                            yield sse({"type": "content_block_stop", "index": 0})
	                            for block_idx in tool_blocks.values():
	                                yield sse({"type": "content_block_stop", "index": block_idx})

	                            if finish_reason == "tool_calls" or tool_blocks:
	                                stop_reason = "tool_use"
	                            elif finish_reason == "length":
	                                stop_reason = "max_tokens"
	                            else:
	                                stop_reason = "end_turn"

	                            yield sse({
	                                "type": "message_delta",
	                                "delta": {"stop_reason": stop_reason, "stop_sequence": None},
	                                "usage": {"output_tokens": 0}
	                            })
	                            yield sse({"type": "message_stop"})
	                            await mark_ok(key)
	                            return

	            except Exception as e:
	                log(e)
	                await mark_fail(key)
	                failures += 1
	                retry_delay = backoff_delay(failures)
	            finally:
	                await release_key(key)

	            # Reached only after a failed attempt. If the client already holds
	            # partial content, a retry would append a second generation to it,
	            # so report the failure and stop.
	            if emitted:
	                yield sse({
	                    "type": "error",
	                    "error": {"type": "api_error", "message": "Upstream stream interrupted"}
	                })
	                return

	    return StreamingResponse(agen(), media_type="text/event-stream")