import os
import requests
from pathlib import Path
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
import logging

# ================= CONFIG =================
MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
MODEL_PATH = "model.gguf"
N_CTX = 16384
N_THREADS = 4
N_BATCH = 256
MAX_TOKENS = 16384  # generation budget; prompt + output must still fit within N_CTX
TEMPERATURE = 0.7
TOP_P = 0.9
# ==========================================

# ---------- Logging setup ----------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger("KushinaAPI")

app = FastAPI(title="Kushina API", version="1.0")

llm = None  # lazy-loaded

# ---------- Download GGUF if not present ----------
if not Path(MODEL_PATH).exists():
    try:
        logger.info("Downloading model.gguf from Hugging Face...")
        r = requests.get(MODEL_URL, stream=True)
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        logger.info("Download complete ✅")
    except Exception as e:
        logger.exception("Failed to download model.gguf")
        raise RuntimeError(f"Failed to download model.gguf: {e}")

# ---------- Lazy load llama.cpp ----------
def get_llm():
    global llm
    if llm is None:
        try:
            logger.info("Loading GGUF model into llama.cpp…")
            llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=N_CTX,
                n_threads=N_THREADS,
                n_batch=N_BATCH,
                f16_kv=True,
                use_mmap=True,
                verbose=False,
            )
            logger.info("Model loaded ✅")
        except Exception as e:
            logger.exception("Failed to load GGUF model")
            raise RuntimeError(f"Failed to load GGUF model: {e}")
    return llm

# ---------- Request schema ----------
class PromptRequest(BaseModel):
    prompt: str

# ---------- System prompt ----------
SYSTEM_PROMPT = """You are Kushina.
Modes: CHAT or CODE
Rules:
- CHAT: mirror user tone, short responses, no explanations unless asked.
- CODE: output only code when user asks, no commentary.
Switch to CODE if user asks for code, script, function, program, website, api, algorithm, app.
Otherwise use CHAT.
"""

def build_prompt(user_text: str) -> str:
    return f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{user_text}\n<|assistant|>\n"

# ---------- API endpoints ----------
@app.get("/")
def root():
    return {"status": "ok"}

@app.post("/generate")
def generate(req: PromptRequest):
    try:
        llm_instance = get_llm()  # lazy load on first request
        full_prompt = build_prompt(req.prompt)

        # Stream completion chunks and accumulate the generated text
        output_text = ""
        for chunk in llm_instance(
            full_prompt,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            stream=True,
            stop=["<|user|>", "<|system|>"],
        ):
            if "choices" in chunk:
                output_text += chunk["choices"][0]["text"]

        return {"response": output_text}
    except Exception as e:
        # Return the error as JSON instead of raising a 500
        logger.exception("❌ Error during generation")
        return {"error": str(e)}
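
# ---------- Optional: run directly (sketch) ----------
# A minimal entry point for local testing, assuming uvicorn is installed.
# The host and port below are illustrative defaults, not values taken from
# the original configuration; in production you would typically launch the
# app via the uvicorn CLI instead.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is up (hypothetical prompt):
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a function that reverses a string."}'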