from fastapi import FastAPI, Request
from llama_cpp import Llama
import uvicorn

app = FastAPI()

# Load the model (this can take a few minutes)
llm = Llama(
    model_path="model.gguf",
    n_ctx=2048,
    n_threads=4  # number of CPU cores to use
)

@app.post("/v1/chat/completions")
async def chat(request: Request):
    data = await request.json()
    messages = data.get("messages", [])

    # Build the prompt from the message history
    full_prompt = ""
    for m in messages:
        full_prompt += f"{m['role']}: {m['content']}\n"
    full_prompt += "assistant: "

    # Get the reply from the model; stopping on "user:" keeps it from
    # continuing the conversation by itself (stopping on "\n" would
    # truncate every answer to a single line)
    output = llm(
        full_prompt,
        max_tokens=512,
        stop=["user:"],
        echo=False
    )
    response_text = output["choices"][0]["text"]

    return {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": response_text.strip()
            }
        }]
    }

@app.get("/")
def home():
    return {"status": "MINZO-CORE v1.0 is Live"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
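
To try the endpoint once the server is up, here is a minimal client sketch. It assumes the server is listening on http://localhost:8000 (the default in the run call above) and that the requests package is installed; both are assumptions, not part of the original code.

# Minimal client sketch; assumes the server above is running on
# http://localhost:8000 and that `requests` is installed (pip install requests).
import requests

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello! Who are you?"}
    ]
}

resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
resp.raise_for_status()

# The server mirrors the OpenAI-style response shape it builds above
print(resp.json()["choices"][0]["message"]["content"])

Because the response JSON follows the same choices/message layout as the OpenAI Chat Completions API, existing OpenAI-compatible clients can usually be pointed at this endpoint by changing only the base URL.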