from fastapi import FastAPI
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

app = FastAPI()

# Download the quantized GGUF model at import time.
# hf_hub_download caches locally, so repeat startups skip the network fetch.
model_path = hf_hub_download(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)

# Initialize the LLM sized for a 2-vCPU host (n_threads=2);
# n_ctx=2048 matches TinyLlama-Chat's context window.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)


class PromptRequest(BaseModel):
    # Request body for POST /generate.
    prompt: str
    # Generalized from a hard-coded constant: callers may tune output length.
    # Default of 256 preserves the original behavior for existing clients.
    max_tokens: int = 256


@app.post("/generate")
def generate_text(request: PromptRequest):
    """Run the chat model on the user's prompt and return the generated text.

    Declared as a plain ``def`` (not ``async def``) on purpose: FastAPI runs
    sync endpoints in a threadpool, so the blocking llama.cpp call does not
    stall the event loop.

    Returns:
        dict: ``{"response": <generated text>}``.
    """
    # TinyLlama-Chat uses the Zephyr-style <|system|>/<|user|>/<|assistant|>
    # template; stopping on "<|user|>" prevents the model from hallucinating
    # a follow-up user turn.
    output = llm(
        f"<|system|>\nYou are a helpful API.\n<|user|>\n{request.prompt}\n<|assistant|>\n",
        max_tokens=request.max_tokens,
        stop=["<|user|>"],
        echo=False,
    )
    return {"response": output['choices'][0]['text']}