# chatbot/app.py

import logging
import requests
from pathlib import Path

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
# ================= CONFIG =================
MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
MODEL_PATH = "model.gguf"
N_CTX = 16384       # context window size
N_THREADS = 4       # CPU threads for inference
N_BATCH = 256       # prompt-processing batch size
MAX_TOKENS = 16384  # cap on generated tokens; prompt + output must still fit in N_CTX
TEMPERATURE = 0.7
TOP_P = 0.9
# ==========================================
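
# Note: N_THREADS = 4 assumes a small CPU instance; a runtime alternative
# (an assumption, not part of the original config) would be:
#   import os
#   N_THREADS = max(1, (os.cpu_count() or 4) - 1)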

# ---------- Logging setup ----------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger("KushinaAPI")

app = FastAPI(title="Kushina API", version="1.0")
llm = None  # lazy-loaded Llama instance

# ---------- Download GGUF if not present ----------
if not Path(MODEL_PATH).exists():
    try:
        logger.info("Downloading model.gguf from Hugging Face...")
        r = requests.get(MODEL_URL, stream=True, timeout=60)
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        logger.info("Download complete ✅")
    except Exception as e:
        # Remove any partial file so the next startup retries the download
        Path(MODEL_PATH).unlink(missing_ok=True)
        logger.exception("Failed to download model.gguf")
        raise RuntimeError(f"Failed to download model.gguf: {e}")

# ---------- Lazy load llama.cpp ----------
def get_llm():
    global llm
    if llm is None:
        try:
            logger.info("Loading GGUF model into llama.cpp…")
            llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=N_CTX,
                n_threads=N_THREADS,
                n_batch=N_BATCH,
                f16_kv=True,
                use_mmap=True,
                verbose=False,
            )
            logger.info("Model loaded ✅")
        except Exception as e:
            logger.exception("Failed to load GGUF model")
            raise RuntimeError(f"Failed to load GGUF model: {e}")
    return llm
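
# Note: FastAPI runs sync endpoints in a thread pool, so two concurrent first
# requests could both see `llm is None` and load the model twice. A module-level
# threading.Lock around the check (an addition, not in the original) would make
# the lazy load race-free.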

# ---------- Request schema ----------
class PromptRequest(BaseModel):
    prompt: str

# ---------- System prompt ----------
SYSTEM_PROMPT = """You are Kushina.
Modes: CHAT or CODE
Rules:
- CHAT: mirror user tone, short responses, no explanations unless asked.
- CODE: output only code when user asks, no commentary.
Switch to CODE if user asks for code, script, function, program, website, api, algorithm, app.
Otherwise use CHAT.
"""

def build_prompt(user_text: str) -> str:
    return f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{user_text}\n<|assistant|>\n"

# ---------- API endpoints ----------
@app.get("/")
def root():
    return {"status": "ok"}

@app.post("/generate")
def generate(req: PromptRequest):
    try:
        llm_instance = get_llm()  # lazy load on first request
        full_prompt = build_prompt(req.prompt)
        output_text = ""
        for chunk in llm_instance(
            full_prompt,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            stream=True,
            stop=["<|user|>", "<|system|>"],
        ):
            if "choices" in chunk:
                output_text += chunk["choices"][0]["text"]
        return {"response": output_text}
    except Exception as e:
        # Return the error as JSON instead of raising a 500
        logger.exception("Error during generation")
        return {"error": str(e)}