MINZO4546 committed on
Commit
8f635ff
·
verified ·
1 Parent(s): ebfdf66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -21
app.py CHANGED
@@ -5,7 +5,7 @@ import torch
5
  import re
6
  import secrets
7
  import requests # Google Search API එකට අවශ්‍යයි
8
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
9
 
10
  app = FastAPI()
11
 
@@ -24,27 +24,22 @@ API_KEYS_DB = {
24
  ADMIN_SECRET = "MINZO-SECRET-2026"
25
 
26
  # ── Google Search Config ──
27
- # Specialist, මේ දෙක ඔයාගේ Google Cloud Console එකෙන් අරන් මෙතනට දාන්න
28
  GOOGLE_API_KEY = "YOUR_GOOGLE_API_KEY"
29
  GOOGLE_CX = "YOUR_CUSTOM_SEARCH_ENGINE_ID"
30
 
31
- # ── Load AI Model with 4-bit Quantization ──
32
- model_id = "google/gemma-4-E4B-it" # Gemma-2-9B එක පාවිච්චි කරමු
33
- print(f"Loading {model_id} with 4-bit quantization...")
34
-
35
- quant_config = BitsAndBytesConfig(
36
- load_in_4bit=True,
37
- bnb_4bit_compute_dtype=torch.bfloat16
38
- )
39
 
40
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
41
  model = AutoModelForCausalLM.from_pretrained(
42
  model_id,
43
- quantization_config=quant_config,
44
- device_map="auto", # 18GB Space එකේ GPU එකට auto load වෙයි
45
  trust_remote_code=True
46
  )
47
- print("Model loaded and optimized.")
48
 
49
  # ── Pydantic Models ──
50
  class AdminRequest(BaseModel):
@@ -60,9 +55,6 @@ class ChatRequest(BaseModel):
60
  # GOOGLE REAL-TIME WEB SEARCH HELPER
61
  # ──────────────────────────────────────
62
  def google_search(query: str, max_results: int = 3) -> str:
63
- """
64
- Search Google and return formatted context string.
65
- """
66
  url = "https://www.googleapis.com/customsearch/v1"
67
  params = {
68
  "q": query,
@@ -85,12 +77,10 @@ def google_search(query: str, max_results: int = 3) -> str:
85
  lines.append(f"\n{i}. {title}\n {snippet}\n Source: {link}")
86
  lines.append("\n[END OF SEARCH RESULTS]")
87
  return "\n".join(lines)
88
-
89
  except Exception as e:
90
  print(f"[Google search error] {e}")
91
  return ""
92
 
93
- # ── Decide whether to search ──
94
  def should_search(query: str) -> bool:
95
  no_search_patterns = [
96
  r"^\s*(write|create|generate|make|build)\s+(a\s+)?(code|function|script|program|class)",
@@ -117,7 +107,7 @@ def home():
117
  @app.post("/v1/generate-key")
118
  async def generate_key(data: AdminRequest):
119
  if data.admin_pass != ADMIN_SECRET:
120
- raise HTTPException(status_code=401, detail="Unauthorized Specialist Access!")
121
  new_key = f"ELE-PRIME-{secrets.token_hex(4).upper()}"
122
  API_KEYS_DB[new_key] = {"limit": data.limit, "used": 0, "status": "active"}
123
  return {"api_key": new_key, "limit": data.limit}
@@ -142,7 +132,7 @@ async def chat(message: ChatRequest, x_api_key: str = Header(None)):
142
 
143
  today = __import__("datetime").datetime.utcnow().strftime("%A, %d %B %Y, %H:%M UTC")
144
  system_instruction = (
145
- "You are Elephant AI (Inachi-Core), an expert assistant for Specialist MINZO-PRIME. "
146
  "Respond in the same language the user uses. "
147
  f"Current date: {today}. "
148
  )
@@ -155,7 +145,7 @@ async def chat(message: ChatRequest, x_api_key: str = Header(None)):
155
  ]
156
 
157
  text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
158
- inputs = tokenizer([text], return_tensors="pt").to(model.device)
159
 
160
  with torch.no_grad():
161
  outputs = model.generate(
 
5
  import re
6
  import secrets
7
  import requests # Google Search API එකට අවශ්‍යයි
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer
9
 
10
  app = FastAPI()
11
 
 
24
  ADMIN_SECRET = "MINZO-SECRET-2026"
25
 
26
  # ── Google Search Config ──
 
27
  GOOGLE_API_KEY = "YOUR_GOOGLE_API_KEY"
28
  GOOGLE_CX = "YOUR_CUSTOM_SEARCH_ENGINE_ID"
29
 
30
+ # ── Load AI Model for CPU ──
31
+ model_id = "google/gemma-4-E4B-it"
32
+ print(f"Loading {model_id} on CPU (Optimized for 18GB RAM)...")
 
 
 
 
 
33
 
34
  tokenizer = AutoTokenizer.from_pretrained(model_id)
35
+ # CPU එකේ දුවන නිසා torch_dtype එක float16 හෝ bfloat16 දැමීමෙන් RAM එක ඉතිරි කරගත හැක
36
  model = AutoModelForCausalLM.from_pretrained(
37
  model_id,
38
+ torch_dtype=torch.bfloat16,
39
+ device_map="cpu",
40
  trust_remote_code=True
41
  )
42
+ print("Model loaded on CPU successfully.")
43
 
44
  # ── Pydantic Models ──
45
  class AdminRequest(BaseModel):
 
55
  # GOOGLE REAL-TIME WEB SEARCH HELPER
56
  # ──────────────────────────────────────
57
  def google_search(query: str, max_results: int = 3) -> str:
 
 
 
58
  url = "https://www.googleapis.com/customsearch/v1"
59
  params = {
60
  "q": query,
 
77
  lines.append(f"\n{i}. {title}\n {snippet}\n Source: {link}")
78
  lines.append("\n[END OF SEARCH RESULTS]")
79
  return "\n".join(lines)
 
80
  except Exception as e:
81
  print(f"[Google search error] {e}")
82
  return ""
83
 
 
84
  def should_search(query: str) -> bool:
85
  no_search_patterns = [
86
  r"^\s*(write|create|generate|make|build)\s+(a\s+)?(code|function|script|program|class)",
 
107
@app.post("/v1/generate-key")
async def generate_key(data: AdminRequest):
    """Issue a new API key after verifying the admin password.

    Registers the key in API_KEYS_DB with the requested usage limit and
    returns it to the caller.

    Raises:
        HTTPException: 401 when the admin password does not match.
    """
    # Use a constant-time comparison so the admin secret cannot be
    # recovered via a timing side channel (plain `!=` short-circuits).
    if not secrets.compare_digest(data.admin_pass, ADMIN_SECRET):
        raise HTTPException(status_code=401, detail="Unauthorized Access!")
    # token_hex(4) -> 8 hex chars from a CSPRNG; uppercased for the key format.
    new_key = f"ELE-PRIME-{secrets.token_hex(4).upper()}"
    API_KEYS_DB[new_key] = {"limit": data.limit, "used": 0, "status": "active"}
    return {"api_key": new_key, "limit": data.limit}
 
132
 
133
  today = __import__("datetime").datetime.utcnow().strftime("%A, %d %B %Y, %H:%M UTC")
134
  system_instruction = (
135
+ "You are Elephant AI (Inachi-Core), an expert assistant for MINZO-PRIME. "
136
  "Respond in the same language the user uses. "
137
  f"Current date: {today}. "
138
  )
 
145
  ]
146
 
147
  text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
148
+ inputs = tokenizer([text], return_tensors="pt").to("cpu")
149
 
150
  with torch.no_grad():
151
  outputs = model.generate(