File size: 2,475 Bytes
7a03f1b
 
 
 
 
 
 
c38e36c
7a03f1b
 
 
 
 
 
 
 
 
 
 
c38e36c
7a03f1b
c38e36c
 
7a03f1b
 
 
c38e36c
 
 
 
 
 
 
 
 
7a03f1b
 
c38e36c
 
 
 
 
7a03f1b
 
 
c38e36c
7a03f1b
 
 
 
c38e36c
7a03f1b
 
c38e36c
 
7a03f1b
c38e36c
 
 
7a03f1b
 
c38e36c
7a03f1b
 
 
c38e36c
7a03f1b
c38e36c
7a03f1b
 
 
 
c38e36c
7a03f1b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from fastapi import FastAPI, Header, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
import os
import re
import secrets
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from duckduckgo_search import DDGS

app = FastAPI()

# Open CORS policy so any browser front-end can reach the API.
# NOTE(review): "*" origins is acceptable for a public demo; tighten for production.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)

# --- Specialist API-key store (in-memory; resets on restart) ---
# Each key maps to a quota record:
#   "limit"  - maximum number of chat calls allowed for the key
#   "used"   - calls consumed so far
#   "status" - "active" means usable; any other value disables the key
API_KEYS_DB = {
    "ELE-PRIME-ADMIN-SYS": {"limit": 100000, "used": 0, "status": "active"},
    "ELE-PRIME-VOID-X": {"limit": 50000, "used": 0, "status": "active"},
}
# Admin password. Read from the environment when available so the secret does
# not have to live in source control; falls back to the original literal for
# backward compatibility.
ADMIN_SECRET = os.environ.get("ADMIN_SECRET", "MINZO-SECRET-2026")

# --- MiMo-Audio 7B Optimization ---
# Hugging Face model id for the chat engine loaded at import time below.
model_id = "XiaomiMiMo/MiMo-Audio-7B-Instruct"
print(f"🔱 INACHI-CORE: Deploying Multimodal Engine {model_id}...")

# 4-bit quantization to reduce CPU/RAM usage while loading the 7B model.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# NOTE: both calls download weights on first run and block startup until done.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto" # lets HF Accelerate place layers and manage RAM automatically
)

class AdminRequest(BaseModel):
    """Request payload for admin operations (password plus an optional quota)."""
    admin_pass: str  # expected to match ADMIN_SECRET — verify against caller
    limit: int = 5000  # default per-key usage quota

@app.post("/v1/chat")
async def chat(message: dict, x_api_key: str = Header(None)):
    """Generate a chat reply for a quota-limited API key.

    Expects a JSON body like ``{"query": "<user text>"}`` and an
    ``X-API-Key`` header matching an entry in ``API_KEYS_DB``.

    Raises:
        HTTPException 403: missing, unknown, or inactive key.
        HTTPException 429: the key's usage quota is exhausted.
        HTTPException 400: empty/missing query.
    """
    key_record = API_KEYS_DB.get(x_api_key) if x_api_key else None
    if key_record is None or key_record.get("status") != "active":
        raise HTTPException(status_code=403, detail="Invalid Key")
    # Enforce the per-key quota stored in API_KEYS_DB (previously recorded
    # in "used" but never checked against "limit").
    if key_record["used"] >= key_record["limit"]:
        raise HTTPException(status_code=429, detail="Usage limit exceeded")

    query = (message.get("query") or "").strip()
    if not query:
        raise HTTPException(status_code=400, detail="Empty query")

    # System prompt establishing the assistant persona.
    system_instruction = (
        "You are Inachi-Prime, an Any-to-Any multimodal AI assistant. "
        "You are designed by Specialist MINZO-PRIME. "
        "Respond accurately in Sinhala or English based on user input."
    )

    # device_map="auto" may place layers on GPU; move inputs to the model's
    # own device instead of hard-coding "cpu" (fixes a device mismatch).
    inputs = tokenizer(
        f"{system_instruction}\n\nUser: {query}\nAssistant:",
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Keep only the text after the final "Assistant:" marker.
    ans = tokenizer.decode(outputs[0], skip_special_tokens=True).split("Assistant:")[-1].strip()

    key_record["used"] += 1
    return {"reply": ans, "usage": key_record["used"]}

# Alias the ASGI app as "main" — presumably for a hosting config that imports
# this name (e.g. `main:main`); verify against the deployment setup.
main = app