Spaces:

MINZO4546
/

minzo-api

Build error

App Files Files Community

MINZO4546 committed on 15 days ago

Commit

f0868c6

verified ·

1 Parent(s): 2bac2fe

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -103

app.py CHANGED Viewed

@@ -4,8 +4,8 @@ from pydantic import BaseModel
 import torch
 import re
 import secrets
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from duckduckgo_search import DDGS
 app = FastAPI()
@@ -16,27 +16,35 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# ── API Keys Database ──
 API_KEYS_DB = {
     "ELE-PRIME-ADMIN-SYS": {"limit": 10000, "used": 0, "status": "active"},
     "ELE-PRIME-YG5EPZFQ":  {"limit": 5000,  "used": 0, "status": "active"},
 }
 ADMIN_SECRET = "MINZO-SECRET-2026"
-# ── Load AI Model ──
-model_id = "google/gemma-4-E4B-it"
-print(f"Loading {model_id} ...")
-tokenizer = AutoTokenizer.from_pretrained(
-    model_id,
-    trust_remote_code=True,
 )
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    torch_dtype="auto",
-    device_map="cpu",
-    trust_remote_code=True,
 )
-print("Model loaded.")
 # ── Pydantic Models ──
 class AdminRequest(BaseModel):
@@ -45,61 +53,55 @@ class AdminRequest(BaseModel):
 class ChatRequest(BaseModel):
     query: str
-    search: bool = True        # client can disable search per-request
-    max_results: int = 3       # how many DDG results to inject
 # ──────────────────────────────────────
-# REAL-TIME WEB SEARCH HELPER
 # ──────────────────────────────────────
-def web_search(query: str, max_results: int = 3) -> str:
     """
-    Search DuckDuckGo and return formatted context string.
-    Returns empty string on failure so the model still responds.
     """
     try:
-        with DDGS() as ddgs:
-            results = list(
-                ddgs.text(
-                    query,
-                    max_results=max_results,
-                    safesearch="moderate",
-                    timelimit=None,   # no time limit → more results
-                )
-            )
         if not results:
             return ""
-        lines = ["[WEB SEARCH RESULTS — Real-time]"]
         for i, r in enumerate(results, 1):
-            title   = r.get("title", "").strip()
-            body    = r.get("body",  "").strip()
-            href    = r.get("href",  "").strip()
-            lines.append(f"\n{i}. {title}\n   {body}\n   Source: {href}")
         lines.append("\n[END OF SEARCH RESULTS]")
         return "\n".join(lines)
     except Exception as e:
-        print(f"[DDG search error] {e}")
         return ""
 # ── Decide whether to search ──
 def should_search(query: str) -> bool:
-    """
-    Always search unless the query is clearly a pure code/math task
-    with no factual component. This keeps it simple and reliable.
-    """
     no_search_patterns = [
         r"^\s*(write|create|generate|make|build)\s+(a\s+)?(code|function|script|program|class)",
         r"^\s*explain\s+(this\s+)?(code|function|snippet)",
-        r"^\s*(what is|define)\s+[a-z ]+\s*\??\s*$",   # simple definitions
     ]
     q = query.lower().strip()
     for pat in no_search_patterns:
         if re.match(pat, q, re.I):
             return False
-    return True   # search by default for everything else
 # ──────────────────────────────────────
 # ENDPOINTS
@@ -107,30 +109,21 @@ def should_search(query: str) -> bool:
 @app.get("/")
 def home():
     return {
-        "status": "Elephant Pro Active",
         "active_keys": len(API_KEYS_DB),
-        "search": "DuckDuckGo real-time",
     }
 @app.post("/v1/generate-key")
 async def generate_key(data: AdminRequest):
     if data.admin_pass != ADMIN_SECRET:
         raise HTTPException(status_code=401, detail="Unauthorized Specialist Access!")
     new_key = f"ELE-PRIME-{secrets.token_hex(4).upper()}"
     API_KEYS_DB[new_key] = {"limit": data.limit, "used": 0, "status": "active"}
-    return {
-        "message": "New Specialist Key Activated",
-        "api_key": new_key,
-        "limit": data.limit,
-    }
 @app.post("/v1/chat")
 async def chat(message: ChatRequest, x_api_key: str = Header(None)):
-    # ── Auth ──
     if not x_api_key or x_api_key not in API_KEYS_DB:
         raise HTTPException(status_code=403, detail="Access Denied")
@@ -139,78 +132,41 @@ async def chat(message: ChatRequest, x_api_key: str = Header(None)):
         raise HTTPException(status_code=429, detail="Limit Reached")
     query = message.query.strip()
-    if not query:
-        raise HTTPException(status_code=400, detail="Empty query")
-    # ── Real-time Web Search ──
     context = ""
     search_used = False
     if message.search and should_search(query):
-        print(f"[SEARCH] Querying DDG: {query[:80]}")
-        context = web_search(query, max_results=message.max_results)
         if context:
             search_used = True
-            print(f"[SEARCH] Got {message.max_results} results.")
-        else:
-            print("[SEARCH] No results returned.")
-    # ── System Prompt ──
     today = __import__("datetime").datetime.utcnow().strftime("%A, %d %B %Y, %H:%M UTC")
     system_instruction = (
         "You are Elephant AI (Inachi-Core), an expert assistant for Specialist MINZO-PRIME. "
-        "Respond in the same language the user uses (Sinhala or English). "
-        "Be concise, accurate, and helpful. "
-        f"Current UTC date/time: {today}. "
     )
     if search_used:
-        system_instruction += (
-            "\nYou have been given real-time web search results below. "
-            "Use them to answer accurately. Always cite the source URL when referencing search results.\n"
-            + context
-        )
-    # ── Build Messages ──
     msgs = [
-        {"role": "system",  "content": system_instruction},
-        {"role": "user",    "content": query},
     ]
-    # ── Tokenize & Generate ──
-    text = tokenizer.apply_chat_template(
-        msgs, tokenize=False, add_generation_prompt=True
-    )
-    inputs = tokenizer([text], return_tensors="pt").to("cpu")
     with torch.no_grad():
         outputs = model.generate(
-            inputs.input_ids,
             max_new_tokens=512,
             temperature=0.6,
-            top_p=0.9,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
         )
-    full_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
-    # ── Clean Output ──
-    ans = full_response.split("assistant")[-1].strip()
-    if "</think>" in ans:
-        ans = ans.split("</think>")[-1].strip()
-    ans = ans.replace("Ċ", "\n").replace("Ġ", " ")
-    ans = re.sub(r" +", " ", ans).strip()
-    # ── Update Usage ──
     API_KEYS_DB[x_api_key]["used"] += 1
-    return {
-        "reply":        ans,
-        "search_used":  search_used,
-        "usage":        API_KEYS_DB[x_api_key]["used"],
-        "limit":        key_info["limit"],
-    }
-# HuggingFace Spaces entrypoint
-main = app

 import torch
 import re
 import secrets
+import requests # Required for the Google Search API
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 app = FastAPI()
     allow_headers=["*"],
 )
+# ── API Keys & Config ──
 API_KEYS_DB = {
     "ELE-PRIME-ADMIN-SYS": {"limit": 10000, "used": 0, "status": "active"},
     "ELE-PRIME-YG5EPZFQ":  {"limit": 5000,  "used": 0, "status": "active"},
 }
 ADMIN_SECRET = "MINZO-SECRET-2026"
+# ── Google Search Config ──
+# Specialist: get these two values from your Google Cloud Console and put them here
+GOOGLE_API_KEY = "YOUR_GOOGLE_API_KEY"
+GOOGLE_CX = "YOUR_CUSTOM_SEARCH_ENGINE_ID"
+# ── Load AI Model with 4-bit Quantization ──
+model_id = "google/gemma-2-9b-it" # Use the Gemma-2-9B model
+print(f"Loading {model_id} with 4-bit quantization...")
+quant_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16
 )
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
+    quantization_config=quant_config,
+    device_map="auto", # Loads automatically onto the GPU of the 18GB Space
+    trust_remote_code=True
 )
+print("Model loaded and optimized.")
 # ── Pydantic Models ──
 class AdminRequest(BaseModel):
 class ChatRequest(BaseModel):
     query: str
+    search: bool = True
+    max_results: int = 3
 # ──────────────────────────────────────
+# GOOGLE REAL-TIME WEB SEARCH HELPER
 # ──────────────────────────────────────
def google_search(query: str, max_results: int = 3) -> str:
    """Query the Google Custom Search JSON API and format the hits as prompt context.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        max_results: Number of results to request. Clamped to the range
            1..10 because the API rejects ``num`` values outside it.

    Returns:
        A text block that starts with ``[GOOGLE SEARCH RESULTS — Real-time]``,
        lists one numbered entry per hit (title, snippet, source link) and
        ends with ``[END OF SEARCH RESULTS]``. Returns ``""`` when the
        request fails, the API answers with an error status, or there are
        no hits, so an empty string always means "no context available".
    """
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CX,
        "num": max(1, min(max_results, 10)),
    }

    try:
        # Without a timeout a stalled connection would block the caller indefinitely.
        response = requests.get(url, params=params, timeout=10)
        # Surface quota/credential errors in the log instead of treating
        # the error payload as an empty result list.
        response.raise_for_status()
        results = response.json().get("items", [])

        if not results:
            return ""

        lines = ["[GOOGLE SEARCH RESULTS — Real-time]"]
        for i, r in enumerate(results, 1):
            # `or ""` guards against fields that are present but null.
            title = (r.get("title") or "").strip()
            snippet = (r.get("snippet") or "").strip()
            link = (r.get("link") or "").strip()
            lines.append(f"\n{i}. {title}\n   {snippet}\n   Source: {link}")

        lines.append("\n[END OF SEARCH RESULTS]")
        return "\n".join(lines)

    except Exception as e:
        # Deliberate best-effort boundary: a failed search must not fail the
        # chat request, so log the problem and fall back to no context.
        print(f"[Google search error] {e}")
        return ""
 # ── Decide whether to search ──
 def should_search(query: str) -> bool:
     no_search_patterns = [
         r"^\s*(write|create|generate|make|build)\s+(a\s+)?(code|function|script|program|class)",
         r"^\s*explain\s+(this\s+)?(code|function|snippet)",
+        r"^\s*(what is|define)\s+[a-z ]+\s*\??\s*$",
     ]
     q = query.lower().strip()
     for pat in no_search_patterns:
         if re.match(pat, q, re.I):
             return False
+    return True
 # ──────────────────────────────────────
 # ENDPOINTS
 @app.get("/")
 def home():
     return {
+        "status": "Inachi-Core Active",
         "active_keys": len(API_KEYS_DB),
+        "search": "Google Real-time",
     }
 @app.post("/v1/generate-key")
 async def generate_key(data: AdminRequest):
     if data.admin_pass != ADMIN_SECRET:
         raise HTTPException(status_code=401, detail="Unauthorized Specialist Access!")
     new_key = f"ELE-PRIME-{secrets.token_hex(4).upper()}"
     API_KEYS_DB[new_key] = {"limit": data.limit, "used": 0, "status": "active"}
+    return {"api_key": new_key, "limit": data.limit}
 @app.post("/v1/chat")
 async def chat(message: ChatRequest, x_api_key: str = Header(None)):
     if not x_api_key or x_api_key not in API_KEYS_DB:
         raise HTTPException(status_code=403, detail="Access Denied")
         raise HTTPException(status_code=429, detail="Limit Reached")
     query = message.query.strip()
     context = ""
     search_used = False
     if message.search and should_search(query):
+        context = google_search(query, max_results=message.max_results)
         if context:
             search_used = True
     today = __import__("datetime").datetime.utcnow().strftime("%A, %d %B %Y, %H:%M UTC")
     system_instruction = (
         "You are Elephant AI (Inachi-Core), an expert assistant for Specialist MINZO-PRIME. "
+        "Respond in the same language the user uses. "
+        f"Current date: {today}. "
     )
     if search_used:
+        system_instruction += "\nUse these web results to answer accurately:\n" + context
     msgs = [
+        {"role": "system", "content": system_instruction},
+        {"role": "user", "content": query},
     ]
+    text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer([text], return_tensors="pt").to(model.device)
     with torch.no_grad():
         outputs = model.generate(
+            **inputs,
             max_new_tokens=512,
             temperature=0.6,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
         )
+    ans = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
     API_KEYS_DB[x_api_key]["used"] += 1
+    return {"reply": ans, "search_used": search_used, "usage": API_KEYS_DB[x_api_key]["used"]}