import os

import requests
from fastapi import FastAPI
from llama_cpp import Llama

# 🔱 CPU Core Management: cap the thread count at the number of cores on the server
threads = int(os.cpu_count() or 2)

# 🔱 Load Model: a CPU-friendly Gemma 3 GGUF build
# On an HF Space, either provide a valid local path or use the repo ID as here
llm = Llama.from_pretrained(
    repo_id="google/gemma-3-1b-it-GGUF",
    filename="*q4_k_m.gguf",  # 4-bit quantized version for best CPU performance
    n_ctx=2048,
    n_threads=threads,
    verbose=False,
)

main = FastAPI()


def web_search(query: str) -> str:
    """Fetch a short abstract from the DuckDuckGo Instant Answer API."""
    try:
        response = requests.get(
            "https://api.duckduckgo.com/",
            params={"q": query, "format": "json"},  # let requests URL-encode the query
            timeout=5,
        ).json()
        # AbstractText can be present but empty, so fall back explicitly
        return response.get("AbstractText") or "No data."
    except requests.RequestException:
        return "Search failed."


@main.post("/v1/chat")
async def chat(data: dict):
    user_query = data.get("message", "")

    # 🔱 Inachi AI Identity
    system_instr = (
        "You are Inachi AI, developed by the Inachi Team. "
        "You are an expert system architect."
    )

    # Simple search-context logic: only hit the web search when the user asks for it
    search_context = ""
    if "search" in user_query.lower():
        search_context = f"\nContext: {web_search(user_query)}"

    # Prompt template in Gemma chat format. Gemma has no separate system role,
    # so the system instructions are folded into the user turn; the
    # <start_of_turn>/<end_of_turn> tags are required by the model.
    prompt = (
        f"<start_of_turn>user\n{system_instr}{search_context}\n\n"
        f"{user_query}<end_of_turn>\n<start_of_turn>model\n"
    )

    # Generation
    output = llm(
        prompt,
        max_tokens=512,
        stop=["<end_of_turn>"],
        echo=False,
    )
    reply = output["choices"][0]["text"].strip()
    return {"reply": reply}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(main, host="0.0.0.0", port=7860)
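
# --- Usage sketch ---
# A minimal way to exercise the /v1/chat endpoint above; this assumes the
# server is running locally on port 7860 (the port configured in __main__).
# The message text is a hypothetical example.
#
#   curl -X POST http://localhost:7860/v1/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "search the latest FastAPI release"}'
#
# Including the word "search" in the message triggers the DuckDuckGo lookup
# and injects its abstract into the prompt as context.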