from fastapi import FastAPI
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# Fetch the quantized TinyLlama chat model (GGUF, Q4_K_M) from the Hugging Face Hub;
# hf_hub_download caches locally, so repeat startups skip the download.
model_path = hf_hub_download(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)

# Load the model into memory; n_threads=2 matches the 2-vCPU host,
# n_ctx=2048 is the context window.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)
class PromptRequest(BaseModel):
    """Request body for the generation endpoint: a single prompt string."""

    prompt: str
@app.post("/generate")
def generate_text(request: PromptRequest):
    """Generate a chat completion for ``request.prompt``.

    Bug fix: the original function was never registered with the FastAPI
    app (no route decorator), so the endpoint did not exist; ``@app.post``
    exposes it at ``/generate``.

    Args:
        request: Parsed JSON body containing the user prompt.

    Returns:
        dict: ``{"response": <generated assistant text>}``.
    """
    # Wrap the prompt in the TinyLlama chat template the model was tuned on.
    output = llm(
        f"<|system|>\nYou are a helpful API.\n<|user|>\n{request.prompt}\n<|assistant|>\n",
        max_tokens=256,
        stop=["<|user|>"],  # halt before the model invents a new user turn
        echo=False,  # return only the completion, not the prompt
    )
    return {"response": output["choices"][0]["text"]}