Elysiadev11 committed on
Commit 1334a00 · verified · 1 Parent(s): 0d1e167

Upload app.py with huggingface_hub

Files changed (1):
  1. app.py +474 -1
app.py CHANGED
@@ -6,6 +6,7 @@ from starlette.requests import ClientDisconnect
 import time
 import json
 import asyncio
+import uuid
 
 app = FastAPI()
 
 
@@ -15,6 +16,59 @@ app = FastAPI()
 BASE_URL = os.getenv("BASE_URL", "https://elysiadev11-proxyollma.hf.space")
 MASTER_API_KEY = os.getenv("MASTER_API_KEY", "olla")
 
+# Default model mapping (Claude → MiniMax)
+DEFAULT_MODEL_MAPPING = {
+    # Opus models
+    "claude-opus-4-7": "minimax-m2.7:cloud",
+    "claude-opus-4-6": "minimax-m2.7:cloud",
+    "claude-opus-4-5": "minimax-m2.7:cloud",
+    "claude-opus-4-1": "minimax-m2.7:cloud",
+    "claude-opus-4-20250514": "minimax-m2.7:cloud",
+
+    # Sonnet models
+    "claude-sonnet-4-6": "minimax-m2.7:cloud",
+    "claude-sonnet-4-5": "minimax-m2.7:cloud",
+    "claude-sonnet-4-20250514": "minimax-m2.7:cloud",
+
+    # Haiku models
+    "claude-haiku-4-5": "minimax-m2.7:cloud",
+    "claude-haiku-4-5-20251001": "minimax-m2.7:cloud",
+}
+
+# Load the model mapping, with optional overrides from the environment
+def load_model_mapping():
+    mapping = DEFAULT_MODEL_MAPPING.copy()
+
+    env_map = os.getenv("CLAUDE_MODEL_MAP")
+    if env_map:
+        for pair in env_map.split(","):
+            if ":" in pair:
+                parts = pair.split(":", 1)
+                if len(parts) == 2:
+                    claude_model = parts[0].strip()
+                    ollama_model = parts[1].strip()
+                    mapping[claude_model] = ollama_model
+
+    return mapping
+
+def map_model(claude_model: str) -> str:
+    """Map a Claude model name to an Ollama model."""
+    model_mapping = load_model_mapping()
+
+    # Try an exact match first
+    if claude_model in model_mapping:
+        return model_mapping[claude_model]
+
+    # Fall back based on model family
+    if "opus" in claude_model.lower():
+        return os.getenv("DEFAULT_OPUS_MODEL", "minimax-m2.7:cloud")
+
+    if "haiku" in claude_model.lower():
+        return os.getenv("DEFAULT_HAIKU_MODEL", "minimax-m2.7:cloud")
+
+    # Default to the Sonnet model
+    return os.getenv("DEFAULT_SONNET_MODEL", "minimax-m2.7:cloud")
+
 OLLAMA_KEYS = []
 # Supports up to 100 API keys (OLLAMA_KEY_1 through OLLAMA_KEY_100)
 for i in range(1, 101):
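
For reference, the mapping above can be overridden at deploy time through the CLAUDE_MODEL_MAP environment variable, given as comma-separated claude:ollama pairs. A minimal sketch of the parsing behavior; the model names are invented, not from this commit, and load_model_mapping() from the hunk above is assumed to be in scope:

    # Hypothetical override; split(":", 1) keeps everything after the first
    # colon, so Ollama tags like "llama3.1:70b" survive intact.
    import os
    os.environ["CLAUDE_MODEL_MAP"] = "claude-sonnet-4-6:llama3.1:70b"
    mapping = load_model_mapping()
    assert mapping["claude-sonnet-4-6"] == "llama3.1:70b"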
 
@@ -73,6 +127,198 @@ def get_and_lock_key(exclude_keys=None):
 
     return None
 
+def anthropic_error(error_type: str, message: str, status_code: int = 400):
+    """Format an error response in Anthropic style."""
+    return JSONResponse(
+        {
+            "type": "error",
+            "error": {
+                "type": error_type,
+                "message": message
+            }
+        },
+        status_code=status_code
+    )
+
+def anthropic_to_ollama(body: dict) -> dict:
+    """Convert an Anthropic request to Ollama format."""
+
+    # Build the messages array
+    messages = []
+
+    # Add the system message if present
+    if body.get("system"):
+        messages.append({
+            "role": "system",
+            "content": body["system"]
+        })
+
+    # Add the conversation messages
+    for msg in body.get("messages", []):
+        # Handle content blocks (Anthropic allows a string or an array of blocks)
+        content = msg["content"]
+        if isinstance(content, list):
+            # Extract the text from the content blocks
+            text_content = ""
+            for block in content:
+                if block.get("type") == "text":
+                    text_content += block.get("text", "")
+            content = text_content
+
+        messages.append({
+            "role": msg["role"],
+            "content": content
+        })
+
+    # Map the model
+    ollama_model = map_model(body.get("model", "claude-sonnet-4-6"))
+
+    # Build the Ollama request
+    ollama_body = {
+        "model": ollama_model,
+        "messages": messages,
+        "stream": body.get("stream", False),
+        "options": {}
+    }
+
+    # Add optional parameters
+    if "max_tokens" in body:
+        ollama_body["options"]["num_predict"] = body["max_tokens"]
+
+    if "temperature" in body:
+        ollama_body["options"]["temperature"] = body["temperature"]
+
+    if "top_p" in body:
+        ollama_body["options"]["top_p"] = body["top_p"]
+
+    if "top_k" in body:
+        ollama_body["options"]["top_k"] = body["top_k"]
+
+    return ollama_body
+
+def ollama_to_anthropic(ollama_response: dict, original_model: str) -> dict:
+    """Convert an Ollama response to Anthropic format."""
+
+    message = ollama_response.get("message", {})
+
+    # Map stop reasons
+    stop_reason_map = {
+        "stop": "end_turn",
+        "length": "max_tokens",
+        "eos": "end_turn",
+        "load": "end_turn",
+        "unload": "end_turn",
+    }
+
+    done_reason = ollama_response.get("done_reason", "stop")
+
+    return {
+        "id": f"msg_{uuid.uuid4().hex[:10]}",
+        "type": "message",
+        "role": "assistant",
+        "content": [
+            {
+                "type": "text",
+                "text": message.get("content", "")
+            }
+        ],
+        "model": original_model,
+        "stop_reason": stop_reason_map.get(done_reason, "end_turn"),
+        "stop_sequence": None,
+        "usage": {
+            "input_tokens": ollama_response.get("prompt_eval_count", 0),
+            "output_tokens": ollama_response.get("eval_count", 0)
+        }
+    }
+
+async def stream_anthropic(ollama_stream, original_model: str):
+    """Convert an Ollama stream to Anthropic SSE format."""
+
+    message_id = f"msg_{uuid.uuid4().hex[:10]}"
+
+    # Send message_start
+    event = {
+        'type': 'message_start',
+        'message': {
+            'id': message_id,
+            'type': 'message',
+            'role': 'assistant',
+            'model': original_model,
+            'content': [],
+            'stop_reason': None,
+            'stop_sequence': None,
+            'usage': {'input_tokens': 0, 'output_tokens': 0}
+        }
+    }
+    yield f"data: {json.dumps(event)}\n\n"
+
+    # Send content_block_start (the empty text field is part of the event shape)
+    event = {
+        'type': 'content_block_start',
+        'index': 0,
+        'content_block': {'type': 'text', 'text': ''}
+    }
+    yield f"data: {json.dumps(event)}\n\n"
+
+    input_tokens = 0
+    output_tokens = 0
+    stop_reason = "end_turn"
+
+    # Stream the content
+    async for line in ollama_stream:
+        if line.startswith("data: "):
+            data_str = line[6:]
+            try:
+                data = json.loads(data_str)
+
+                if data.get("done", False):
+                    input_tokens = data.get("prompt_eval_count", 0)
+                    output_tokens = data.get("eval_count", 0)
+                    stop_reason = data.get("done_reason", "stop")
+                    continue
+
+                message = data.get("message", {})
+                content = message.get("content", "")
+
+                if content:
+                    # Send a text_delta
+                    event = {
+                        'type': 'content_block_delta',
+                        'index': 0,
+                        'delta': {
+                            'type': 'text_delta',
+                            'text': content
+                        }
+                    }
+                    yield f"data: {json.dumps(event)}\n\n"
+            except json.JSONDecodeError:
+                continue
+
+    # Send content_block_stop
+    yield f"data: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
+
+    # Map the stop reason
+    stop_reason_map = {
+        "stop": "end_turn",
+        "length": "max_tokens",
+        "eos": "end_turn",
+    }
+
+    # Send message_delta
+    event = {
+        'type': 'message_delta',
+        'delta': {
+            'stop_reason': stop_reason_map.get(stop_reason, "end_turn"),
+            'stop_sequence': None
+        },
+        'usage': {'output_tokens': output_tokens}
+    }
+    yield f"data: {json.dumps(event)}\n\n"
+
+    # Send message_stop
+    yield f"data: {json.dumps({'type': 'message_stop'})}\n\n"
+
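
To make the round trip concrete, here is an illustrative (not committed) exercise of the two converters above; the request values are invented and the helpers are assumed to be in scope:

    # Hypothetical round trip: Anthropic request -> Ollama body -> Anthropic reply.
    req = {
        "model": "claude-sonnet-4-6",
        "system": "Be terse.",
        "max_tokens": 64,
        "messages": [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}],
    }
    ollama_body = anthropic_to_ollama(req)
    # ollama_body == {"model": "minimax-m2.7:cloud",
    #                 "messages": [{"role": "system", "content": "Be terse."},
    #                              {"role": "user", "content": "Hi"}],
    #                 "stream": False, "options": {"num_predict": 64}}

    fake_reply = {"message": {"content": "Hello."}, "done_reason": "stop",
                  "prompt_eval_count": 12, "eval_count": 3}
    out = ollama_to_anthropic(fake_reply, req["model"])
    # out["content"] == [{"type": "text", "text": "Hello."}]
    # out["stop_reason"] == "end_turn"
    # out["usage"] == {"input_tokens": 12, "output_tokens": 3}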
 # ==========================================
 # ENDPOINTS
 # ==========================================
 
@@ -91,6 +337,234 @@ def root():
         }
     }
 
+@app.get("/v1/models")
+async def list_models(request: Request):
+    # Validate auth
+    auth_key = request.headers.get("Authorization", "").replace("Bearer ", "")
+    if auth_key != MASTER_API_KEY:
+        return JSONResponse(
+            {"error": {"type": "authentication_error", "message": "Unauthorized"}},
+            status_code=401
+        )
+
+    # Proxy to Ollama /api/tags
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        try:
+            resp = await client.get(
+                f"{BASE_URL}/api/tags",
+                headers={"Authorization": f"Bearer {OLLAMA_KEYS[0]}"}
+            )
+
+            if resp.status_code != 200:
+                return JSONResponse(
+                    {"error": {"type": "api_error", "message": "Failed to fetch models"}},
+                    status_code=resp.status_code
+                )
+
+            ollama_data = resp.json()
+
+            # Convert to OpenAI format
+            models = []
+            created_time = int(time.time())
+
+            for model in ollama_data.get("models", []):
+                models.append({
+                    "id": model.get("name", model.get("model", "")),
+                    "object": "model",
+                    "created": created_time,
+                    "owned_by": "ollama"
+                })
+
+            return {"object": "list", "data": models}
+
+        except Exception as e:
+            log(f"Error fetching models: {e}")
+            return JSONResponse(
+                {"error": {"type": "api_error", "message": str(e)}},
+                status_code=500
+            )
+
+@app.post("/v1/messages")
+async def anthropic_chat(request: Request):
+    # Validate auth
+    auth_key = request.headers.get("Authorization", "").replace("Bearer ", "")
+    if auth_key != MASTER_API_KEY:
+        return anthropic_error("authentication_error", "Unauthorized", 401)
+
+    try:
+        body = await request.json()
+    except ClientDisconnect:
+        log("Client disconnected before the proxy finished reading the request body.")
+        return Response(status_code=499)
+    except json.JSONDecodeError:
+        return anthropic_error("invalid_request_error", "Invalid JSON", 400)
+
+    is_stream = body.get("stream", False)
+    original_model = body.get("model", "claude-sonnet-4-6")
+
+    # Convert to Ollama format
+    ollama_body = anthropic_to_ollama(body)
+
+    # ==========================================
+    # NON-STREAM LOGIC
+    # ==========================================
+    if not is_stream:
+        tried_keys = set()
+        for attempt in range(len(OLLAMA_KEYS)):
+            if len(tried_keys) >= len(OLLAMA_KEYS):
+                tried_keys.clear()
+
+            key = None
+            log("Waiting for an idle API key (non-stream queue)...")
+
+            # Queue with no time limit
+            while True:
+                if await request.is_disconnected():
+                    log("Client cancelled the request while queued (non-stream).")
+                    return Response(status_code=499)
+
+                # Use the atomic lock helper
+                key = get_and_lock_key(exclude_keys=tried_keys)
+                if key:
+                    break  # Exit the loop immediately; the key is ALREADY LOCKED
+
+                await asyncio.sleep(0.5)  # Poll every half second
+
+            ki = key_status[key]
+            tried_keys.add(key)
+            log(f"LOCK ACQUIRED: key#{ki['index']} (Non-Stream)")
+
+            try:
+                async with httpx.AsyncClient(timeout=120.0) as client:
+                    resp = await client.post(
+                        f"{BASE_URL}/v1/chat/completions",
+                        json=ollama_body,
+                        headers={"Authorization": f"Bearer {key}"}
+                    )
+                    if resp.status_code == 200:
+                        ki["success"] += 1
+                        ki["failures"] = 0
+
+                        # Convert the response to Anthropic format
+                        ollama_response = resp.json()
+                        anthropic_response = ollama_to_anthropic(ollama_response, original_model)
+
+                        ki["in_use"] = False
+                        log(f"RELEASE: key#{ki['index']} (Non-Stream)")
+                        return JSONResponse(anthropic_response)
+                    elif resp.status_code == 429:
+                        ki["failures"] += 1
+                        ki["healthy"] = False
+                        log(f"RATE LIMIT: key#{ki['index']} - skipping to the next key.")
+                        continue
+                    else:
+                        ki["failures"] += 1
+                        continue
+            except Exception as e:
+                ki["failures"] += 1
+                log(f"Non-stream error: {e}")
+                continue
+            finally:
+                ki["in_use"] = False  # ALWAYS release the lock
+                log(f"RELEASE: key#{ki['index']} (Non-Stream)")
+
+        return JSONResponse({"error": "All keys failed after multiple attempts"}, status_code=500)
+
+    # ==========================================
+    # STREAMING LOGIC
+    # ==========================================
+    async def stream_generator():
+        current_body = ollama_body.copy()
+        generated_text_buffer = ""
+        tried_keys = set()
+
+        for attempt in range(len(OLLAMA_KEYS)):
+            if len(tried_keys) >= len(OLLAMA_KEYS):
+                tried_keys.clear()
+
+            key = None
+            if attempt == 0:
+                log("Waiting for an idle API key (new stream queue)...")
+            else:
+                log(f"Waiting for an idle API key (fallback queue, attempt {attempt})...")
+
+            # Queue with no time limit
+            while True:
+                if await request.is_disconnected():
+                    log("Client cancelled the request while queued for a stream.")
+                    return
+
+                # Use the atomic lock helper
+                key = get_and_lock_key(exclude_keys=tried_keys)
+                if key:
+                    break  # Exit the loop immediately; the key is ALREADY LOCKED
+
+                await asyncio.sleep(0.5)
+
+            ki = key_status[key]
+            tried_keys.add(key)
+            log(f"STREAM LOCK ACQUIRED: key#{ki['index']}")
+
+            if generated_text_buffer:
+                log(f"Resuming stream. Injecting {len(generated_text_buffer)} chars.")
+                messages = current_body.get("messages", [])
+                if messages and messages[-1].get("role") == "assistant":
+                    messages[-1]["content"] = generated_text_buffer
+                else:
+                    messages.append({"role": "assistant", "content": generated_text_buffer})
+                current_body["messages"] = messages
+
+            try:
+                custom_timeout = httpx.Timeout(connect=15.0, read=None, write=15.0, pool=10.0)
+                async with httpx.AsyncClient(timeout=custom_timeout) as client:
+                    async with client.stream(
+                        "POST", f"{BASE_URL}/v1/chat/completions",
+                        json=current_body, headers={"Authorization": f"Bearer {key}"}
+                    ) as response:
+
+                        if response.status_code == 429:
+                            ki["failures"] += 1
+                            ki["healthy"] = False
+                            log(f"STREAM 429: key#{ki['index']} - switching key...")
+                            continue
+
+                        if response.status_code != 200:
+                            ki["failures"] += 1
+                            log(f"STREAM ERR {response.status_code}: key#{ki['index']} - switching key...")
+                            continue
+
+                        stream_interrupted = False
+                        try:
+                            # Convert the Ollama stream to Anthropic SSE
+                            async for chunk in stream_anthropic(response.aiter_lines(), original_model):
+                                yield chunk
+
+                            ki["success"] += 1
+                            ki["failures"] = 0
+                            return
+
+                        except (httpx.ReadTimeout, httpx.ReadError, httpx.RemoteProtocolError) as e:
+                            log(f"STREAM INTERRUPTED: key#{ki['index']}. Buffering...")
+                            ki["failures"] += 1
+                            stream_interrupted = True
+
+                        if not stream_interrupted:
+                            return
+
+            except Exception as e:
+                ki["failures"] += 1
+                log(f"STREAM EXCEPTION: key#{ki['index']} - {e}")
+                continue
+
+            finally:
+                # ALWAYS release the lock
+                ki["in_use"] = False
+                log(f"STREAM RELEASE: key#{ki['index']}")
+
+        yield f"data: {json.dumps({'error': 'Stream failed completely'})}\n\ndata: [DONE]\n\n"
+
+    return StreamingResponse(stream_generator(), media_type="text/event-stream")
+
 @app.post("/v1/chat/completions")
 async def chat(req: Request):
     auth_key = req.headers.get("Authorization", "").replace("Bearer ", "")
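
The streaming path added above emits the standard Anthropic event sequence (message_start, content_block_start, content_block_delta*, content_block_stop, message_delta, message_stop). A minimal client sketch for it; the local URL and key value are assumptions, not part of the commit:

    # Hypothetical consumer of the new /v1/messages SSE stream.
    import httpx, json

    with httpx.stream(
        "POST", "http://localhost:7860/v1/messages",
        headers={"Authorization": "Bearer olla"},
        json={"model": "claude-sonnet-4-6", "stream": True, "max_tokens": 64,
              "messages": [{"role": "user", "content": "Hi"}]},
        timeout=None,
    ) as resp:
        for line in resp.iter_lines():
            if not line.startswith("data: "):
                continue
            payload = line[6:]
            if payload == "[DONE]":  # emitted on total stream failure
                break
            event = json.loads(payload)
            if event.get("type") == "content_block_delta":
                print(event["delta"]["text"], end="", flush=True)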
 
@@ -274,4 +748,3 @@ async def chat(req: Request):
         yield f"data: {json.dumps({'error': 'Stream failed completely'})}\n\ndata: [DONE]\n\n"
 
     return StreamingResponse(stream_generator(), media_type="text/event-stream")
-
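
As a smoke test for the non-streaming path and the new model listing, something like the following should work once the Space is running; the URL and key are again assumptions for a local run:

    # Hypothetical smoke test; not part of the commit.
    import httpx

    base = "http://localhost:7860"
    headers = {"Authorization": "Bearer olla"}

    # List models proxied from Ollama's /api/tags, reshaped to OpenAI format.
    models = httpx.get(f"{base}/v1/models", headers=headers).json()
    print([m["id"] for m in models["data"]])

    # Non-streaming Anthropic-style request.
    reply = httpx.post(
        f"{base}/v1/messages",
        headers=headers,
        json={"model": "claude-opus-4-1", "max_tokens": 32,
              "messages": [{"role": "user", "content": "Say hi"}]},
        timeout=120.0,
    ).json()
    print(reply["content"][0]["text"])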