import os
import threading

from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel
|
|
app = FastAPI(title="Llama 3.2 1B API")


# Quantized (Q4_K_M) GGUF build of Llama 3.2 1B Instruct, hosted on the
# Hugging Face Hub by bartowski.
REPO_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
# The model file is downloaded to / looked up in the current working directory.
MODEL_PATH = os.path.join(os.getcwd(), FILENAME)
|
|
def ensure_model_exists():
    """Download the GGUF model into the working directory if it is absent.

    No-op (besides a log line) when MODEL_PATH already exists.

    Raises:
        RuntimeError: if the download from the Hugging Face Hub fails,
            with the original exception chained as the cause.
    """
    # Guard clause: nothing to do when the file is already on disk.
    if os.path.exists(MODEL_PATH):
        print(f"Model found at {MODEL_PATH}")
        return

    print(f"Downloading model {FILENAME} from {REPO_ID}...")
    try:
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            local_dir=os.getcwd(),
            # NOTE(review): deprecated/ignored in recent huggingface_hub
            # releases (local_dir downloads no longer use symlinks); kept
            # for compatibility with older pinned versions — confirm the
            # project's huggingface_hub version before removing.
            local_dir_use_symlinks=False
        )
    except Exception as e:
        # Chain the cause so the underlying HTTP/auth error stays debuggable.
        raise RuntimeError(f"Failed to download model: {e}") from e
    print("Download complete.")
|
|
| |
# NOTE(review): importing this module triggers a potentially large download
# and a full model load; consider moving this into a FastAPI lifespan/startup
# hook so import stays side-effect free.
ensure_model_exists()


# Load the model once at import time and share the instance across requests.
llm = Llama(
    model_path=MODEL_PATH,
    n_threads=4,   # CPU threads used for inference
    n_ctx=2048,    # context window size in tokens
    verbose=False  # suppress llama.cpp's stderr logging
)
|
|
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions."""

    # Raw user message; the endpoint wraps it in the Llama 3 chat template.
    prompt: str
    # Generation cap and sampling knobs, passed straight through to llama_cpp.
    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.9
|
|
@app.get("/")
async def root():
    """Health-check endpoint: confirms the server is up and names the model file."""
    status = {
        "message": "Llama 3.2 1B FastAPI server is running",
        "model": FILENAME,
    }
    return status
|
|
# llama_cpp's Llama instance is not thread-safe; serialize inference calls.
_llm_lock = threading.Lock()


@app.post("/v1/chat/completions")
def chat_completion(request: ChatRequest):
    """Run a single-turn chat completion against the local Llama model.

    Declared as a plain ``def`` on purpose: llama_cpp inference is blocking
    CPU-bound work, and FastAPI executes sync endpoints in a worker
    threadpool instead of stalling the event loop (the original ``async def``
    froze every other request for the duration of generation).

    Args:
        request: prompt plus sampling parameters (see ChatRequest).

    Returns:
        The raw llama_cpp completion dict (OpenAI-style schema).

    Raises:
        HTTPException: 500 with the underlying error message if inference fails.
    """
    # Llama 3 instruct chat template for a single user turn.
    formatted_prompt = (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{request.prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    try:
        # The lock serializes threadpool workers around the shared model.
        with _llm_lock:
            output = llm(
                formatted_prompt,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
                top_p=request.top_p,
                stop=["<|eot_id|>"]
            )
    except Exception as e:
        # Boundary handler: surface inference failures as HTTP 500,
        # chaining the cause for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return output
|
|
if __name__ == "__main__":
    # Dev entry point: serve on all interfaces, port 8000.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|