from fastapi import FastAPI
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# ASGI application instance; routes are registered against it below.
app = FastAPI()

# Download the quantized model from the Hugging Face Hub at import time.
# NOTE(review): this runs on every cold start and needs network access —
# the file is cached locally by hf_hub_download after the first run.
model_path = hf_hub_download(
repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
)

# Initialize the llama.cpp model for a small 2-vCPU host:
# n_ctx=2048 caps the context window; n_threads=2 matches the core count.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)
# Define the request body format
class PromptRequest(BaseModel):
    """JSON body accepted by POST /generate."""

    # Raw user text; it is interpolated into the chat template by the handler.
    prompt: str
@app.post("/generate")
def generate_text(request: PromptRequest):
    """Run the chat model on the submitted prompt.

    Wraps the prompt in the TinyLlama chat template, generates up to 256
    tokens (stopping if the model begins a new user turn), and returns the
    completion as ``{"response": <text>}``.
    """
    # Build the chat-template prompt; parts concatenate to the exact
    # template string the model was fine-tuned on.
    chat_prompt = (
        "<|system|>\nYou are a helpful API.\n"
        f"<|user|>\n{request.prompt}\n<|assistant|>\n"
    )

    result = llm(
        chat_prompt,
        max_tokens=256,
        stop=["<|user|>"],  # cut generation if the model starts a new user turn
        echo=False,         # return only the completion, not the prompt
    )

    # llama-cpp returns an OpenAI-style dict; the generated text lives in
    # the first (and only) choice.
    first_choice = result["choices"][0]
    return {"response": first_choice["text"]}