owlninjam committed on
Commit
657d146
·
verified ·
1 Parent(s): 969b25e

Update api.py

Browse files
Files changed (1) hide show
  1. api.py +115 -150
api.py CHANGED
@@ -1,35 +1,41 @@
1
- from fastapi import FastAPI, HTTPException, Depends, status
2
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
3
- from fastapi.middleware.cors import CORSMiddleware
4
- from pydantic import BaseModel
5
- from llama_cpp import Llama
6
  import os
7
  import uvicorn
8
- from typing import Optional, List, Dict, Union, Literal
9
  import time
10
  import json
11
- import uuid
12
  from datetime import datetime
 
13
 
14
- # Configuration
 
 
 
 
 
 
 
15
  VALID_API_KEYS = {
16
  "sk-adminkey02",
17
- "sk-testkey123",
18
  "sk-userkey456",
19
  "sk-demokey789"
20
  }
 
 
21
 
22
- # Global model variable
23
  llm = None
24
  security = HTTPBearer()
25
 
26
- # OpenAI-compatible request/response models
 
27
  class Message(BaseModel):
28
  role: Literal["system", "user", "assistant"]
29
  content: str
30
 
31
  class ChatCompletionRequest(BaseModel):
32
- model: str = "capybarahermes-2.5-mistral-7b"
33
  messages: List[Message]
34
  max_tokens: Optional[int] = 512
35
  temperature: Optional[float] = 0.7
@@ -41,7 +47,7 @@ class ChatCompletionRequest(BaseModel):
41
  class ChatCompletionChoice(BaseModel):
42
  index: int
43
  message: Message
44
- finish_reason: Literal["stop", "length", "content_filter"]
45
 
46
  class Usage(BaseModel):
47
  prompt_tokens: int
@@ -49,33 +55,33 @@ class Usage(BaseModel):
49
  total_tokens: int
50
 
51
  class ChatCompletionResponse(BaseModel):
52
- id: str
53
  object: str = "chat.completion"
54
- created: int
55
- model: str
56
  choices: List[ChatCompletionChoice]
57
  usage: Usage
58
 
59
- class Model(BaseModel):
60
  id: str
61
  object: str = "model"
62
- created: int
63
- owned_by: str
64
 
65
  class ModelsResponse(BaseModel):
66
  object: str = "list"
67
- data: List[Model]
 
 
68
 
69
- # Initialize FastAPI
70
  app = FastAPI(
71
- title="CapybaraHermes OpenAI API",
72
- description="OpenAI-compatible API for CapybaraHermes-2.5-Mistral-7B",
73
  version="1.0.0",
74
  docs_url="/v1/docs",
75
  redoc_url="/v1/redoc"
76
  )
77
 
78
- # Add CORS middleware
79
  app.add_middleware(
80
  CORSMiddleware,
81
  allow_origins=["*"],
@@ -84,92 +90,116 @@ app.add_middleware(
84
  allow_headers=["*"],
85
  )
86
 
 
 
87
  def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
88
- """Verify API key"""
89
  if credentials.credentials not in VALID_API_KEYS:
90
  raise HTTPException(
91
  status_code=status.HTTP_401_UNAUTHORIZED,
92
- detail="Invalid API key"
93
  )
94
  return credentials.credentials
95
 
 
 
 
96
  def load_model():
97
- """Load the GGUF model"""
98
  global llm
99
- model_path = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"
100
-
101
- if not os.path.exists(model_path):
102
- raise Exception(f"Model file {model_path} not found!")
103
 
104
- try:
105
- llm = Llama(
106
- model_path=model_path,
107
- n_ctx=4096,
108
- n_threads=2,
109
- n_batch=512,
110
- verbose=False,
111
- use_mlock=True,
112
- n_gpu_layers=0,
113
- )
114
- print("✅ Model loaded successfully!")
115
- except Exception as e:
116
- raise Exception(f"❌ Error loading model: {str(e)}")
117
 
118
  def format_messages(messages: List[Message]) -> str:
119
- """Format messages for ChatML format"""
120
  formatted = ""
121
-
122
  for message in messages:
123
- formatted += f"<|im_start|>{message.role}\n{message.content}\n<|im_end|>\n"
124
-
125
  formatted += "<|im_start|>assistant\n"
126
  return formatted
127
 
128
  def count_tokens_rough(text: str) -> int:
129
- """Rough token counting"""
130
  return len(text.split())
131
 
132
- @app.on_event("startup")
133
- async def startup_event():
134
- """Load model on startup"""
135
- try:
136
- print("🚀 Starting CapybaraHermes API server...")
137
- load_model()
138
- except Exception as e:
139
- print(f"💥 Failed to load model: {e}")
140
- raise e
141
 
142
- # API endpoints with authentication
143
  @app.get("/v1/models", response_model=ModelsResponse)
144
  async def list_models(api_key: str = Depends(verify_api_key)):
145
- """List available models"""
146
- return ModelsResponse(
147
- data=[
148
- Model(
149
- id="capybarahermes-2.5-mistral-7b",
150
- created=int(datetime.now().timestamp()),
151
- owned_by="local"
152
- )
153
- ]
154
- )
155
 
156
- @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
157
  async def create_chat_completion(
158
- request: ChatCompletionRequest,
159
  api_key: str = Depends(verify_api_key)
160
  ):
161
- """Create chat completion"""
162
  if llm is None:
163
- raise HTTPException(status_code=503, detail="Model not loaded")
 
 
164
 
165
- try:
166
- # Format messages
167
- prompt = format_messages(request.messages)
168
- prompt_tokens = count_tokens_rough(prompt)
169
-
170
- start_time = time.time()
171
-
172
- # Generate response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  response = llm(
174
  prompt,
175
  max_tokens=request.max_tokens,
@@ -179,20 +209,13 @@ async def create_chat_completion(
179
  echo=False
180
  )
181
 
182
- end_time = time.time()
183
- generation_time = end_time - start_time
184
-
185
- # Extract response
186
  response_text = response['choices'][0]['text'].strip()
187
- completion_tokens = count_tokens_rough(response_text)
188
- tokens_per_second = completion_tokens / generation_time if generation_time > 0 else 0
189
 
190
- print(f"⚡ Generated {completion_tokens} tokens in {generation_time:.2f}s ({tokens_per_second:.2f} tok/s)")
 
191
 
192
  return ChatCompletionResponse(
193
- id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
194
- created=int(time.time()),
195
- model=request.model,
196
  choices=[
197
  ChatCompletionChoice(
198
  index=0,
@@ -206,64 +229,6 @@ async def create_chat_completion(
206
  total_tokens=prompt_tokens + completion_tokens
207
  )
208
  )
209
-
210
- except Exception as e:
211
- raise HTTPException(status_code=500, detail=f"Error generating response: {str(e)}")
212
-
213
- @app.get("/v1/health")
214
- async def health_check():
215
- """Health check (no auth required)"""
216
- if llm is None:
217
- raise HTTPException(status_code=503, detail="Model not loaded")
218
- return {
219
- "status": "healthy",
220
- "model_loaded": True,
221
- "timestamp": datetime.now().isoformat(),
222
- "model": "capybarahermes-2.5-mistral-7b"
223
- }
224
-
225
- @app.get("/v1")
226
- async def api_info():
227
- """API information"""
228
- return {
229
- "message": "🦙 CapybaraHermes OpenAI Compatible API",
230
- "model": "CapybaraHermes-2.5-Mistral-7B (Q5_K_M quantized)",
231
- "endpoints": {
232
- "chat_completions": "/v1/chat/completions",
233
- "models": "/v1/models",
234
- "health": "/v1/health",
235
- "docs": "/v1/docs"
236
- },
237
- "authentication": {
238
- "required": True,
239
- "type": "Bearer token",
240
- "header": "Authorization: Bearer sk-your-api-key",
241
- "valid_keys": ["sk-adminkey02", "sk-testkey123", "sk-userkey456", "sk-demokey789"]
242
- },
243
- "usage": {
244
- "sdk": "pip install openai",
245
- "base_url": "https://your-username-your-space.hf.space/v1",
246
- "example": "client = OpenAI(base_url='https://your-space.hf.space/v1', api_key='sk-adminkey02')"
247
- },
248
- "performance": {
249
- "expected_speed": "2-8 tokens/second (CPU)",
250
- "context_length": 4096,
251
- "quantization": "Q5_K_M"
252
- }
253
- }
254
-
255
- # Public endpoint for basic info (no auth)
256
- @app.get("/api")
257
- async def public_api_info():
258
- """Public API information"""
259
- return {
260
- "service": "CapybaraHermes API",
261
- "status": "running",
262
- "endpoints": "/v1/",
263
- "docs": "/v1/docs",
264
- "chat_ui": "/",
265
- "authentication": "API key required for /v1/* endpoints"
266
- }
267
 
268
  if __name__ == "__main__":
269
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
+ # api.py
 
 
 
 
2
  import os
3
  import uvicorn
4
+ import uuid
5
  import time
6
  import json
 
7
  from datetime import datetime
8
+ from typing import Optional, List, Union, Literal
9
 
10
+ from fastapi import FastAPI, HTTPException, Depends, status
11
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+ from fastapi.responses import StreamingResponse
14
+ from pydantic import BaseModel, Field
15
+ from llama_cpp import Llama
16
+
17
# --- Configuration ---
# NOTE(security): these API keys are hard-coded in source (and an earlier
# revision echoed them from a public endpoint) — treat them as compromised
# and rotate. Prefer supplying keys via the API_KEYS environment variable.
VALID_API_KEYS = {
    "sk-adminkey02",
    "sk-testkey123",
    "sk-userkey456",
    "sk-demokey789",
}
# Merge any comma-separated keys provided through the environment
# (purely additive, so existing clients keep working).
VALID_API_KEYS |= {
    key.strip()
    for key in os.environ.get("API_KEYS", "").split(",")
    if key.strip()
}

# Path to the quantized GGUF weights, expected alongside this script.
MODEL_PATH = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"
# Model id reported through the OpenAI-compatible endpoints.
MODEL_NAME = "capybarahermes-2.5-mistral-7b"
26
 
27
# --- Global state ---
# Populated by the startup hook; stays None until the model finishes loading,
# which request handlers use as a readiness signal.
llm = None
# Parses the "Authorization: Bearer <key>" header on protected routes.
security = HTTPBearer()
30
 
31
+ # --- Pydantic Models for OpenAI Compatibility ---
32
+
33
  class Message(BaseModel):
34
  role: Literal["system", "user", "assistant"]
35
  content: str
36
 
37
  class ChatCompletionRequest(BaseModel):
38
+ model: str = MODEL_NAME
39
  messages: List[Message]
40
  max_tokens: Optional[int] = 512
41
  temperature: Optional[float] = 0.7
 
47
class ChatCompletionChoice(BaseModel):
    # A single candidate completion inside a response.
    index: int
    message: Message
    # None while streaming; "stop"/"length" once generation finishes.
    finish_reason: Optional[Literal["stop", "length"]] = None
51
 
52
  class Usage(BaseModel):
53
  prompt_tokens: int
 
55
  total_tokens: int
56
 
57
class ChatCompletionResponse(BaseModel):
    # Top-level payload mirroring OpenAI's /chat/completions schema.
    id: str = Field(default_factory=lambda: f"chatcmpl-{uuid.uuid4().hex}")
    object: str = "chat.completion"
    # Unix timestamp captured when the response object is built.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str = MODEL_NAME
    choices: List[ChatCompletionChoice]
    usage: Usage
64
 
65
class ModelData(BaseModel):
    # Metadata record returned by the /v1/models listing.
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "user"
70
 
71
class ModelsResponse(BaseModel):
    # OpenAI-style list envelope for /v1/models.
    object: str = "list"
    data: List[ModelData]
74
+
75
# --- FastAPI App Initialization ---

# Docs are served under /v1/ so the whole API surface shares one prefix.
app = FastAPI(
    title="CapybaraHermes OpenAI-Compatible API",
    description=f"An OpenAI-compatible API for the {MODEL_NAME} model.",
    version="1.0.0",
    docs_url="/v1/docs",
    redoc_url="/v1/redoc",
)
84
 
 
85
  app.add_middleware(
86
  CORSMiddleware,
87
  allow_origins=["*"],
 
90
  allow_headers=["*"],
91
  )
92
 
93
# --- Dependency for API Key Verification ---

def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """FastAPI dependency: validate the Bearer token against VALID_API_KEYS.

    Returns the presented key on success so handlers can log/attribute usage.
    Raises HTTP 401 for unknown keys.
    """
    if credentials.credentials not in VALID_API_KEYS:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid or missing API key",
            # RFC 6750: advertise the expected auth scheme on 401 responses.
            headers={"WWW-Authenticate": "Bearer"},
        )
    return credentials.credentials
102
 
103
# --- Model Loading ---

@app.on_event("startup")
def load_model():
    """Load the GGUF weights into the global `llm` when the server starts.

    Raises FileNotFoundError if the weights file is absent, which aborts
    startup rather than serving 503s forever.
    """
    global llm
    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")

    print("🚀 Loading GGUF model...")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,       # context window size
        n_threads=2,      # CPU threads used for inference
        n_batch=512,      # prompt-processing batch size
        verbose=False,
        use_mlock=True,   # keep weights resident in RAM
        n_gpu_layers=0,   # CPU-only deployment
    )
    print("✅ Model loaded successfully!")
122
+
123
# --- Helper Functions ---

def format_messages(messages: List[Message]) -> str:
    """Render a conversation into the ChatML prompt string the model expects."""
    parts = [
        f"<|im_start|>{turn.role}\n{turn.content}<|im_end|>\n"
        for turn in messages
    ]
    # Leave the prompt open on an assistant turn so the model continues it.
    parts.append("<|im_start|>assistant\n")
    return "".join(parts)
132
 
133
def count_tokens_rough(text: str) -> int:
    """Cheap token-count estimate: the whitespace-delimited word count.

    NOTE: real tokenizers usually emit more tokens than words; this value is
    only used for approximate usage reporting, not for billing or truncation.
    """
    words = text.split()
    return len(words)
136
 
137
# --- API Endpoints ---

@app.get("/v1/health")
async def health_check():
    """Liveness/readiness probe (no auth required).

    Previously this always reported "healthy"; the status now reflects
    whether the model has finished loading so orchestrators don't route
    traffic to an instance that cannot serve completions yet.
    """
    ready = llm is not None
    return {"status": "healthy" if ready else "loading", "model_loaded": ready}
 
 
 
143
 
 
144
@app.get("/v1/models", response_model=ModelsResponse)
async def list_models(api_key: str = Depends(verify_api_key)):
    """Return the single locally-served model in OpenAI list format."""
    local_model = ModelData(id=MODEL_NAME)
    return ModelsResponse(data=[local_model])
 
 
 
 
 
 
 
 
148
 
149
+ @app.post("/v1/chat/completions")
150
  async def create_chat_completion(
151
+ request: ChatCompletionRequest,
152
  api_key: str = Depends(verify_api_key)
153
  ):
154
+ """Creates a model response for the given chat conversation."""
155
  if llm is None:
156
+ raise HTTPException(status_code=503, detail="Model is not loaded yet")
157
+
158
+ prompt = format_messages(request.messages)
159
 
160
+ # Streaming response
161
+ if request.stream:
162
+ async def stream_generator():
163
+ completion_id = f"chatcmpl-{uuid.uuid4().hex}"
164
+ created_time = int(time.time())
165
+
166
+ stream = llm(
167
+ prompt,
168
+ max_tokens=request.max_tokens,
169
+ temperature=request.temperature,
170
+ top_p=request.top_p,
171
+ stop=["<|im_end|>", "<|im_start|>"] + (request.stop or []),
172
+ stream=True,
173
+ echo=False
174
+ )
175
+
176
+ for output in stream:
177
+ if 'choices' in output and len(output['choices']) > 0:
178
+ delta_content = output['choices'][0].get('text', '')
179
+ chunk = {
180
+ "id": completion_id,
181
+ "object": "chat.completion.chunk",
182
+ "created": created_time,
183
+ "model": MODEL_NAME,
184
+ "choices": [{"index": 0, "delta": {"content": delta_content}, "finish_reason": None}]
185
+ }
186
+ yield f"data: {json.dumps(chunk)}\n\n"
187
+
188
+ # Send the final chunk
189
+ final_chunk = {
190
+ "id": completion_id,
191
+ "object": "chat.completion.chunk",
192
+ "created": created_time,
193
+ "model": MODEL_NAME,
194
+ "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
195
+ }
196
+ yield f"data: {json.dumps(final_chunk)}\n\n"
197
+ yield "data: [DONE]\n\n"
198
+
199
+ return StreamingResponse(stream_generator(), media_type="text/event-stream")
200
+
201
+ # Non-streaming response
202
+ else:
203
  response = llm(
204
  prompt,
205
  max_tokens=request.max_tokens,
 
209
  echo=False
210
  )
211
 
 
 
 
 
212
  response_text = response['choices'][0]['text'].strip()
 
 
213
 
214
+ prompt_tokens = count_tokens_rough(prompt)
215
+ completion_tokens = count_tokens_rough(response_text)
216
 
217
  return ChatCompletionResponse(
218
+ model=MODEL_NAME,
 
 
219
  choices=[
220
  ChatCompletionChoice(
221
  index=0,
 
229
  total_tokens=prompt_tokens + completion_tokens
230
  )
231
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
if __name__ == "__main__":
    # Run a standalone server when this module is executed directly.
    uvicorn.run(app, host="0.0.0.0", port=8000)