import os
from fastapi import FastAPI
from llama_cpp import Llama
import requests

# CPU core management: cap llama.cpp worker threads at the number of cores
# the host actually has (fall back to 2 when cpu_count() returns None).
threads = int(os.cpu_count() or 2)

# Load model: a CPU-friendly Gemma 3 GGUF build, fetched from the Hugging
# Face Hub at import time (downloads on first run — this blocks startup).
# On an HF Space, either point at a local path or keep using the repo ID.
llm = Llama.from_pretrained(
    repo_id="google/gemma-3-1b-it-GGUF", 
    filename="*q4_k_m.gguf", # 4-bit quantized version for best performance
    n_ctx=2048,
    n_threads=threads,
    verbose=False
)

# ASGI application object (referenced by the uvicorn.run call at the bottom).
main = FastAPI()

def web_search(query):
    """Best-effort lookup of *query* on the DuckDuckGo Instant Answer API.

    Args:
        query: Free-text search string, sent as the ``q`` parameter.

    Returns:
        The ``AbstractText`` field of the JSON response, ``"No data."``
        when that field is missing or empty, or ``"Search failed."`` on
        any network/HTTP/JSON error. Never raises.
    """
    try:
        # Let requests URL-encode the query via params= instead of f-string
        # splicing, which breaks on spaces, '&', '#', and non-ASCII text.
        response = requests.get(
            "https://api.duckduckgo.com/",
            params={"q": query, "format": "json"},
            timeout=5,
        )
        # Don't try to JSON-decode an HTML error page.
        response.raise_for_status()
        # `or` also covers an empty-string AbstractText, not just a missing key.
        return response.json().get("AbstractText") or "No data."
    except (requests.RequestException, ValueError):
        # RequestException: timeouts, DNS, connection and HTTP errors.
        # ValueError: JSON decode failure. Deliberately best-effort —
        # a failed search degrades to a stub string, never a crash.
        return "Search failed."

@main.post("/v1/chat")
async def chat(data: dict):
    """Generate a single chat reply from the local Gemma model.

    Expects a JSON body like ``{"message": "..."}``; other keys are
    ignored. Returns ``{"reply": "<model text>"}``.

    If the message contains the word "search", a DuckDuckGo snippet is
    appended to the system context before generation.
    """
    # `or ""` also guards an explicit {"message": null} body, which
    # dict.get's default would pass through as None and crash .lower().
    user_query = data.get("message") or ""
    if not user_query.strip():
        # Nothing to generate from — skip the (expensive) model call.
        return {"reply": ""}

    # Fixed assistant identity, injected as the system turn.
    system_instr = (
        "You are Inachi AI, developed by the Inachi Team. "
        "You are an expert system architect."
    )

    # Naive trigger: only hit the web when the user says "search".
    search_context = ""
    if "search" in user_query.lower():
        search_context = f"\nContext: {web_search(user_query)}"

    # Gemma chat template: system + user turns, then an open model turn.
    prompt = f"<bos><start_of_turn>system\n{system_instr}{search_context}<end_of_turn>\n<start_of_turn>user\n{user_query}<end_of_turn>\n<start_of_turn>model\n"

    # Blocking llama.cpp call; stop at the turn delimiter so the model
    # doesn't run on into a fabricated next turn.
    output = llm(
        prompt,
        max_tokens=512,
        stop=["<end_of_turn>"],
        echo=False
    )

    reply = output['choices'][0]['text'].strip()
    return {"reply": reply}

if __name__ == "__main__":
    # Direct-run entry point; 7860 is the Hugging Face Spaces default port.
    import uvicorn
    uvicorn.run(main, host="0.0.0.0", port=7860)