File size: 1,877 Bytes
b0ae281 d76d7d1 4ce55c5 83957eb 7d813f4 4ce55c5 d76d7d1 7d813f4 4ce55c5 9154c39 249aa04 4ce55c5 249aa04 7d813f4 5e1de17 6219a3a 7d813f4 7ae0e9a 7d813f4 550f38c 7d813f4 4ce55c5 deadea5 6219a3a 4ce55c5 6219a3a deadea5 6219a3a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | import os
from fastapi import FastAPI
from llama_cpp import Llama
import requests
# 🔱 CPU Core Management: සර්වර් එකේ තියෙන Cores ගණනට Threads සීමා කිරීම
threads = int(os.cpu_count() or 2)
# 🔱 Load Model: CPU එකට ගැලපෙන Gemma 3 GGUF මොඩල් එක
# HF Space එකේදී හරි path එකක් ලබා දීම හෝ Repo ID එක පාවිච්චි කරන්න
llm = Llama.from_pretrained(
repo_id="google/gemma-3-1b-it-GGUF",
filename="*q4_k_m.gguf", # 4-bit Quantized version for best performance
n_ctx=2048,
n_threads=threads,
verbose=False
)
main = FastAPI()
def web_search(query):
try:
url = f"https://api.duckduckgo.com/?q={query}&format=json"
response = requests.get(url, timeout=5).json()
return response.get("AbstractText", "No data.")
except:
return "Search failed."
@main.post("/v1/chat")
async def chat(data: dict):
user_query = data.get("message", "")
# 🔱 Inachi AI Identity
system_instr = (
"You are Inachi AI, developed by the Inachi Team. "
"You are an expert system architect."
)
# Simple search context logic
search_context = ""
if "search" in user_query.lower():
search_context = f"\nContext: {web_search(user_query)}"
# Prompt Template
prompt = f"<bos><start_of_turn>system\n{system_instr}{search_context}<end_of_turn>\n<start_of_turn>user\n{user_query}<end_of_turn>\n<start_of_turn>model\n"
# Generation
output = llm(
prompt,
max_tokens=512,
stop=["<end_of_turn>"],
echo=False
)
reply = output['choices'][0]['text'].strip()
return {"reply": reply}
if __name__ == "__main__":
import uvicorn
uvicorn.run(main, host="0.0.0.0", port=7860) |