from fastapi import FastAPI
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

app = FastAPI()

# Download the quantized GGUF model at import time.
# hf_hub_download caches locally, so repeat startups skip the network fetch.
model_path = hf_hub_download(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)

# Initialize the LLM sized for a 2-vCPU host (n_threads=2);
# n_ctx=2048 matches TinyLlama-Chat's context window.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)


class PromptRequest(BaseModel):
    # Request body for POST /generate.
    prompt: str
    # Generalized from a hard-coded constant: callers may tune output length.
    # Default of 256 preserves the original behavior for existing clients.
    max_tokens: int = 256


@app.post("/generate")
def generate_text(request: PromptRequest):
    """Run the chat model on the user's prompt and return the generated text.

    Declared as a plain ``def`` (not ``async def``) on purpose: FastAPI runs
    sync endpoints in a threadpool, so the blocking llama.cpp call does not
    stall the event loop.

    Returns:
        dict: ``{"response": <generated text>}``.
    """
    # TinyLlama-Chat uses the Zephyr-style <|system|>/<|user|>/<|assistant|>
    # template; stopping on "<|user|>" prevents the model from hallucinating
    # a follow-up user turn.
    output = llm(
        f"<|system|>\nYou are a helpful API.\n<|user|>\n{request.prompt}\n<|assistant|>\n",
        max_tokens=request.max_tokens,
        stop=["<|user|>"],
        echo=False,
    )
    return {"response": output['choices'][0]['text']}