MINZO4546 committed on
Commit
7ae0e9a
·
verified ·
1 Parent(s): c8bfde7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -42
app.py CHANGED
@@ -2,12 +2,10 @@ from fastapi import FastAPI, Header, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  import torch
5
- import re
6
- import secrets
7
  import requests
8
- from transformers import AutoModelForCausalLM, AutoTokenizer
9
 
10
- # Hugging Face server needs 'main'
11
  main = FastAPI()
12
 
13
  main.add_middleware(
@@ -17,43 +15,47 @@ main.add_middleware(
17
  allow_headers=["*"],
18
  )
19
 
20
- # ── API Keys & Config ──
21
  API_KEYS_DB = {
22
  "ELE-PRIME-ADMIN-SYS": {"limit": 10000, "used": 0, "status": "active"},
23
  "ELE-PRIME-YG5EPZFQ": {"limit": 5000, "used": 0, "status": "active"},
24
  }
25
- ADMIN_SECRET = "MINZO-SECRET-2026"
26
-
27
- # ── Google Search Config ──
28
  GOOGLE_API_KEY = "YOUR_GOOGLE_API_KEY"
29
  GOOGLE_CX = "YOUR_CUSTOM_SEARCH_ENGINE_ID"
30
 
31
- # ── Load AI Model for CPU ──
32
- model_id = "google/gemma-2-9b-it"
33
- print(f"Loading {model_id} on CPU...")
 
 
 
 
 
 
34
 
35
  tokenizer = AutoTokenizer.from_pretrained(model_id)
36
  model = AutoModelForCausalLM.from_pretrained(
37
  model_id,
38
- dtype=torch.bfloat16,
39
- device_map="cpu",
40
  trust_remote_code=True
41
  )
42
- print("Model loaded successfully.")
43
 
 
44
  class ChatRequest(BaseModel):
45
- query: str
 
 
46
  search: bool = True
47
  max_results: int = 3
48
 
49
- # ──────────────────────────────────────
50
- # SEARCH HELPER
51
- # ──────────────────────────────────────
52
  def google_search(query: str, max_results: int = 3) -> str:
53
  url = "https://www.googleapis.com/customsearch/v1"
54
  params = {"q": query, "key": GOOGLE_API_KEY, "cx": GOOGLE_CX, "num": max_results}
55
  try:
56
- response = requests.get(url, params=params)
57
  results = response.json().get("items", [])
58
  if not results: return ""
59
  lines = ["[WEB SEARCH RESULTS]"]
@@ -62,56 +64,60 @@ def google_search(query: str, max_results: int = 3) -> str:
62
  return "\n".join(lines)
63
  except: return ""
64
 
65
- # ──────────────────────────────────────
66
- # CHAT ENDPOINT (FIXED)
67
- # ──────────────────────────────────────
68
  @main.post("/v1/chat")
69
- async def chat(message: ChatRequest, x_api_key: str = Header(None)):
 
70
  if not x_api_key or x_api_key not in API_KEYS_DB:
71
  raise HTTPException(status_code=403, detail="Access Denied")
72
 
73
- query = message.query.strip()
74
  context = ""
75
  search_used = False
76
 
77
- if message.search:
78
- context = google_search(query, max_results=message.max_results)
79
  if context: search_used = True
80
 
81
- # SYSTEM INSTRUCTION RE-FORMATTED FOR GEMMA
82
  today = __import__("datetime").datetime.utcnow().strftime("%A, %d %B %Y")
83
 
84
- # Gemma doesn't support 'system' role, so we merge it into the user prompt
85
- combined_prompt = (
86
  f"Instruction: You are Elephant AI (Inachi-Core), an expert assistant for MINZO-PRIME. "
87
  f"Respond in the same language the user uses. Current date: {today}.\n"
88
  )
 
 
89
  if search_used:
90
- combined_prompt += f"\nUse these web results to answer: {context}\n"
91
 
92
- combined_prompt += f"\nUser Query: {query}"
93
-
94
- msgs = [
95
- {"role": "user", "content": combined_prompt},
96
- ]
97
 
98
- # Apply template (Now only with 'user' role)
99
- text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
100
- inputs = tokenizer([text], return_tensors="pt").to("cpu")
 
101
 
102
  with torch.no_grad():
103
  outputs = model.generate(
104
  **inputs,
105
- max_new_tokens=512,
106
  temperature=0.6,
107
  do_sample=True,
108
  pad_token_id=tokenizer.eos_token_id,
109
  )
110
 
111
- ans = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
112
 
113
  API_KEYS_DB[x_api_key]["used"] += 1
114
- return {"reply": ans, "search_used": search_used}
 
 
 
 
 
 
115
 
116
  @main.get("/")
117
- def home(): return {"status": "Inachi-Core Online"}
 
 
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  import torch
 
 
5
  import requests
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
7
 
8
+ # ── API INITIALIZATION ──
9
  main = FastAPI()
10
 
11
  main.add_middleware(
 
15
  allow_headers=["*"],
16
  )
17
 
18
# ── CONFIGURATION ──
# NOTE(review): API keys and Google credentials are hardcoded in source.
# Before any public deployment these should come from environment
# variables or a secrets manager, not the repository.
API_KEYS_DB = {
    "ELE-PRIME-ADMIN-SYS": {"limit": 10000, "used": 0, "status": "active"},
    "ELE-PRIME-YG5EPZFQ": {"limit": 5000, "used": 0, "status": "active"},
}

GOOGLE_API_KEY = "YOUR_GOOGLE_API_KEY"
GOOGLE_CX = "YOUR_CUSTOM_SEARCH_ENGINE_ID"
25
 
26
# ── MODEL LOADING (OPTIMIZED FOR 16GB RAM) ──
model_id = "google/gemma-2-9b-it"
print(f"🔱 Specialist, Loading {model_id} with 4-bit Quantization...")

# Quantization config that shrinks the model to fit ~16 GB of RAM:
# 4-bit weights with bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # uses the GPU when one is available, otherwise falls back to CPU
    trust_remote_code=True
)
print("🔱 Model loaded successfully. Inachi-Core is Online.")
44
 
45
# ── DATA MODELS (FIXED FOR FRONTEND MATCH) ──
class ChatRequest(BaseModel):
    """Request payload for POST /v1/chat.

    Field names must match what the frontend sends:

    - message: the user's query; the HTML client posts it as 'message',
      so the field is named 'message' here as well.
    - history: prior conversation turns; Gradio sends the history as an
      array. NOTE(review): currently unused by the endpoint.
    - think_level: client-side hint. NOTE(review): currently unused by
      the endpoint.
    - search: whether to augment the answer with Google search results.
    - max_results: how many search results to fetch.
    """
    message: str
    # Mutable default is safe here: Pydantic deep-copies field defaults
    # per instance (unlike plain Python function defaults).
    history: list = []
    think_level: str = "high"
    search: bool = True
    max_results: int = 3
52
 
53
+ # ── SEARCH HELPER ──
 
 
54
  def google_search(query: str, max_results: int = 3) -> str:
55
  url = "https://www.googleapis.com/customsearch/v1"
56
  params = {"q": query, "key": GOOGLE_API_KEY, "cx": GOOGLE_CX, "num": max_results}
57
  try:
58
+ response = requests.get(url, params=params, timeout=5)
59
  results = response.json().get("items", [])
60
  if not results: return ""
61
  lines = ["[WEB SEARCH RESULTS]"]
 
64
  return "\n".join(lines)
65
  except: return ""
66
 
67
# ── CHAT ENDPOINT ──
@main.post("/v1/chat")
async def chat(request_data: ChatRequest, x_api_key: str = Header(None)):
    """Generate a chat reply, optionally grounded in Google search results.

    The API key record's 'status' and 'limit' fields are enforced here:
    they were stored (and 'used' incremented) but never checked before.

    Raises:
        HTTPException 403: missing, unknown, or inactive API key.
        HTTPException 429: the key's usage quota is exhausted.
    """
    from datetime import datetime, timezone

    # API key validation.
    key_record = API_KEYS_DB.get(x_api_key) if x_api_key else None
    if key_record is None or key_record["status"] != "active":
        raise HTTPException(status_code=403, detail="Access Denied")
    if key_record["used"] >= key_record["limit"]:
        raise HTTPException(status_code=429, detail="Quota exceeded")

    user_query = request_data.message.strip()
    context = ""
    search_used = False

    if request_data.search:
        context = google_search(user_query, max_results=request_data.max_results)
        if context:
            search_used = True

    # Current date for the prompt. datetime.utcnow() is deprecated; an
    # aware UTC datetime produces the identical strftime output.
    today = datetime.now(timezone.utc).strftime("%A, %d %B %Y")

    system_instr = (
        f"Instruction: You are Elephant AI (Inachi-Core), an expert assistant for MINZO-PRIME. "
        f"Respond in the same language the user uses. Current date: {today}.\n"
    )

    full_prompt = system_instr
    if search_used:
        full_prompt += f"\nUse these web results to answer:\n{context}\n"

    full_prompt += f"\nUser Query: {user_query}"

    # Gemma's chat template has no 'system' role, so the instruction is
    # merged into the single user turn.
    msgs = [{"role": "user", "content": full_prompt}]
    input_text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([input_text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.6,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()

    key_record["used"] += 1

    # Shape the output the way the HTML frontend expects it.
    return {
        "reply": response_text,
        "search_used": search_used,
        "status": "success"
    }
120
 
121
@main.get("/")
def home():
    """Health-check endpoint reporting service status and model identity."""
    payload = {"status": "Inachi-Core Online", "model": "Gemma-2-9b-it-4bit"}
    return payload