Spaces:

owlninjam
/

spacecx

Paused

App Files Community

owlninjam committed on Aug 7, 2025

Commit

657d146

verified ·

1 Parent(s): 969b25e

Update api.py

Browse files

Files changed (1) hide show

api.py +115 -150

api.py CHANGED Viewed

@@ -1,35 +1,41 @@
-from fastapi import FastAPI, HTTPException, Depends, status
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from llama_cpp import Llama
 import os
 import uvicorn
-from typing import Optional, List, Dict, Union, Literal
 import time
 import json
-import uuid
 from datetime import datetime
-# Configuration
 VALID_API_KEYS = {
     "sk-adminkey02",
-    "sk-testkey123",
     "sk-userkey456",
     "sk-demokey789"
 }
-# Global model variable
 llm = None
 security = HTTPBearer()
-# OpenAI-compatible request/response models
 class Message(BaseModel):
     role: Literal["system", "user", "assistant"]
     content: str
 class ChatCompletionRequest(BaseModel):
-    model: str = "capybarahermes-2.5-mistral-7b"
     messages: List[Message]
     max_tokens: Optional[int] = 512
     temperature: Optional[float] = 0.7
@@ -41,7 +47,7 @@ class ChatCompletionRequest(BaseModel):
 class ChatCompletionChoice(BaseModel):
     index: int
     message: Message
-    finish_reason: Literal["stop", "length", "content_filter"]
 class Usage(BaseModel):
     prompt_tokens: int
@@ -49,33 +55,33 @@ class Usage(BaseModel):
     total_tokens: int
 class ChatCompletionResponse(BaseModel):
-    id: str
     object: str = "chat.completion"
-    created: int
-    model: str
     choices: List[ChatCompletionChoice]
     usage: Usage
-class Model(BaseModel):
     id: str
     object: str = "model"
-    created: int
-    owned_by: str
 class ModelsResponse(BaseModel):
     object: str = "list"
-    data: List[Model]
-# Initialize FastAPI
 app = FastAPI(
-    title="CapybaraHermes OpenAI API",
-    description="OpenAI-compatible API for CapybaraHermes-2.5-Mistral-7B",
     version="1.0.0",
     docs_url="/v1/docs",
     redoc_url="/v1/redoc"
 )
-# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -84,92 +90,116 @@ app.add_middleware(
     allow_headers=["*"],
 )
 def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
-    """Verify API key"""
     if credentials.credentials not in VALID_API_KEYS:
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="Invalid API key"
         )
     return credentials.credentials
 def load_model():
-    """Load the GGUF model"""
     global llm
-    model_path = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"
-    if not os.path.exists(model_path):
-        raise Exception(f"Model file {model_path} not found!")
-    try:
-        llm = Llama(
-            model_path=model_path,
-            n_ctx=4096,
-            n_threads=2,
-            n_batch=512,
-            verbose=False,
-            use_mlock=True,
-            n_gpu_layers=0,
-        )
-        print("✅ Model loaded successfully!")
-    except Exception as e:
-        raise Exception(f"❌ Error loading model: {str(e)}")
 def format_messages(messages: List[Message]) -> str:
-    """Format messages for ChatML format"""
     formatted = ""
     for message in messages:
-        formatted += f"<|im_start|>{message.role}\n{message.content}\n<|im_end|>\n"
     formatted += "<|im_start|>assistant\n"
     return formatted
 def count_tokens_rough(text: str) -> int:
-    """Rough token counting"""
     return len(text.split())
-@app.on_event("startup")
-async def startup_event():
-    """Load model on startup"""
-    try:
-        print("🚀 Starting CapybaraHermes API server...")
-        load_model()
-    except Exception as e:
-        print(f"💥 Failed to load model: {e}")
-        raise e
-# API endpoints with authentication
 @app.get("/v1/models", response_model=ModelsResponse)
 async def list_models(api_key: str = Depends(verify_api_key)):
-    """List available models"""
-    return ModelsResponse(
-        data=[
-            Model(
-                id="capybarahermes-2.5-mistral-7b",
-                created=int(datetime.now().timestamp()),
-                owned_by="local"
-            )
-        ]
-    )
-@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
 async def create_chat_completion(
-    request: ChatCompletionRequest,
     api_key: str = Depends(verify_api_key)
 ):
-    """Create chat completion"""
     if llm is None:
-        raise HTTPException(status_code=503, detail="Model not loaded")
-    try:
-        # Format messages
-        prompt = format_messages(request.messages)
-        prompt_tokens = count_tokens_rough(prompt)
-        start_time = time.time()
-        # Generate response
         response = llm(
             prompt,
             max_tokens=request.max_tokens,
@@ -179,20 +209,13 @@ async def create_chat_completion(
             echo=False
         )
-        end_time = time.time()
-        generation_time = end_time - start_time
-        # Extract response
         response_text = response['choices'][0]['text'].strip()
-        completion_tokens = count_tokens_rough(response_text)
-        tokens_per_second = completion_tokens / generation_time if generation_time > 0 else 0
-        print(f"⚡ Generated {completion_tokens} tokens in {generation_time:.2f}s ({tokens_per_second:.2f} tok/s)")
         return ChatCompletionResponse(
-            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
-            created=int(time.time()),
-            model=request.model,
             choices=[
                 ChatCompletionChoice(
                     index=0,
@@ -206,64 +229,6 @@ async def create_chat_completion(
                 total_tokens=prompt_tokens + completion_tokens
             )
         )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error generating response: {str(e)}")
-@app.get("/v1/health")
-async def health_check():
-    """Health check (no auth required)"""
-    if llm is None:
-        raise HTTPException(status_code=503, detail="Model not loaded")
-    return {
-        "status": "healthy",
-        "model_loaded": True,
-        "timestamp": datetime.now().isoformat(),
-        "model": "capybarahermes-2.5-mistral-7b"
-    }
-@app.get("/v1")
-async def api_info():
-    """API information"""
-    return {
-        "message": "🦙 CapybaraHermes OpenAI Compatible API",
-        "model": "CapybaraHermes-2.5-Mistral-7B (Q5_K_M quantized)",
-        "endpoints": {
-            "chat_completions": "/v1/chat/completions",
-            "models": "/v1/models",
-            "health": "/v1/health",
-            "docs": "/v1/docs"
-        },
-        "authentication": {
-            "required": True,
-            "type": "Bearer token",
-            "header": "Authorization: Bearer sk-your-api-key",
-            "valid_keys": ["sk-adminkey02", "sk-testkey123", "sk-userkey456", "sk-demokey789"]
-        },
-        "usage": {
-            "sdk": "pip install openai",
-            "base_url": "https://your-username-your-space.hf.space/v1",
-            "example": "client = OpenAI(base_url='https://your-space.hf.space/v1', api_key='sk-adminkey02')"
-        },
-        "performance": {
-            "expected_speed": "2-8 tokens/second (CPU)",
-            "context_length": 4096,
-            "quantization": "Q5_K_M"
-        }
-    }
-# Public endpoint for basic info (no auth)
-@app.get("/api")
-async def public_api_info():
-    """Public API information"""
-    return {
-        "service": "CapybaraHermes API",
-        "status": "running",
-        "endpoints": "/v1/",
-        "docs": "/v1/docs",
-        "chat_ui": "/",
-        "authentication": "API key required for /v1/* endpoints"
-    }
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+# api.py
 import os
 import uvicorn
+import uuid
 import time
 import json
 from datetime import datetime
+from typing import Optional, List, Union, Literal
+from fastapi import FastAPI, HTTPException, Depends, status
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field
+from llama_cpp import Llama
+# --- Configuration ---
 VALID_API_KEYS = {
     "sk-adminkey02",
+    "sk-testkey123",
     "sk-userkey456",
     "sk-demokey789"
 }
+MODEL_PATH = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"
+MODEL_NAME = "capybarahermes-2.5-mistral-7b"
+# --- Global Model Variable ---
 llm = None
 security = HTTPBearer()
+# --- Pydantic Models for OpenAI Compatibility ---
 class Message(BaseModel):
     role: Literal["system", "user", "assistant"]
     content: str
 class ChatCompletionRequest(BaseModel):
+    model: str = MODEL_NAME
     messages: List[Message]
     max_tokens: Optional[int] = 512
     temperature: Optional[float] = 0.7
 class ChatCompletionChoice(BaseModel):
     index: int
     message: Message
+    finish_reason: Optional[Literal["stop", "length"]] = None
 class Usage(BaseModel):
     prompt_tokens: int
     total_tokens: int
 class ChatCompletionResponse(BaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{uuid.uuid4().hex}")
     object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str = MODEL_NAME
     choices: List[ChatCompletionChoice]
     usage: Usage
+class ModelData(BaseModel):
     id: str
     object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "user"
 class ModelsResponse(BaseModel):
     object: str = "list"
+    data: List[ModelData]
+# --- FastAPI App Initialization ---
 app = FastAPI(
+    title="CapybaraHermes OpenAI-Compatible API",
+    description=f"An OpenAI-compatible API for the {MODEL_NAME} model.",
     version="1.0.0",
     docs_url="/v1/docs",
     redoc_url="/v1/redoc"
 )
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# --- Dependency for API Key Verification ---
 def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
     if credentials.credentials not in VALID_API_KEYS:
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Invalid or missing API key"
         )
     return credentials.credentials
+# --- Model Loading ---
+@app.on_event("startup")
 def load_model():
     global llm
+    if not os.path.exists(MODEL_PATH):
+        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")
+    print("🚀 Loading GGUF model...")
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=4096,
+        n_threads=2,
+        n_batch=512,
+        verbose=False,
+        use_mlock=True,
+        n_gpu_layers=0,
+    )
+    print("✅ Model loaded successfully!")
+# --- Helper Functions ---
 def format_messages(messages: List[Message]) -> str:
+    """Formats messages for the ChatML format expected by the model."""
     formatted = ""
     for message in messages:
+        formatted += f"<|im_start|>{message.role}\n{message.content}<|im_end|>\n"
     formatted += "<|im_start|>assistant\n"
     return formatted
 def count_tokens_rough(text: str) -> int:
+    """A rough approximation of token counting."""
     return len(text.split())
+# --- API Endpoints ---
+@app.get("/v1/health")
+async def health_check():
+    """Health check endpoint."""
+    return {"status": "healthy", "model_loaded": llm is not None}
 @app.get("/v1/models", response_model=ModelsResponse)
 async def list_models(api_key: str = Depends(verify_api_key)):
+    """Lists the available models."""
+    return ModelsResponse(data=[ModelData(id=MODEL_NAME)])
+@app.post("/v1/chat/completions")
 async def create_chat_completion(
+    request: ChatCompletionRequest,
     api_key: str = Depends(verify_api_key)
 ):
+    """Creates a model response for the given chat conversation."""
     if llm is None:
+        raise HTTPException(status_code=503, detail="Model is not loaded yet")
+    prompt = format_messages(request.messages)
+    # Streaming response
+    if request.stream:
+        async def stream_generator():
+            completion_id = f"chatcmpl-{uuid.uuid4().hex}"
+            created_time = int(time.time())
+            stream = llm(
+                prompt,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                top_p=request.top_p,
+                stop=["<|im_end|>", "<|im_start|>"] + (request.stop or []),
+                stream=True,
+                echo=False
+            )
+            for output in stream:
+                if 'choices' in output and len(output['choices']) > 0:
+                    delta_content = output['choices'][0].get('text', '')
+                    chunk = {
+                        "id": completion_id,
+                        "object": "chat.completion.chunk",
+                        "created": created_time,
+                        "model": MODEL_NAME,
+                        "choices": [{"index": 0, "delta": {"content": delta_content}, "finish_reason": None}]
+                    }
+                    yield f"data: {json.dumps(chunk)}\n\n"
+            # Send the final chunk
+            final_chunk = {
+                "id": completion_id,
+                "object": "chat.completion.chunk",
+                "created": created_time,
+                "model": MODEL_NAME,
+                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
+            }
+            yield f"data: {json.dumps(final_chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+        return StreamingResponse(stream_generator(), media_type="text/event-stream")
+    # Non-streaming response
+    else:
         response = llm(
             prompt,
             max_tokens=request.max_tokens,
             echo=False
         )
         response_text = response['choices'][0]['text'].strip()
+        prompt_tokens = count_tokens_rough(prompt)
+        completion_tokens = count_tokens_rough(response_text)
         return ChatCompletionResponse(
+            model=MODEL_NAME,
             choices=[
                 ChatCompletionChoice(
                     index=0,
                 total_tokens=prompt_tokens + completion_tokens
             )
         )
 if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)