| import os |
| import requests |
| from pathlib import Path |
| from fastapi import FastAPI, HTTPException |
| from pydantic import BaseModel |
| from llama_cpp import Llama |
| import logging |
|
|
| |
# --- Model / inference configuration -------------------------------------
MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
MODEL_PATH = "model.gguf"  # local path the GGUF file is saved to / loaded from
N_CTX = 16384       # llama.cpp context window, in tokens
N_THREADS = 4       # CPU threads used by llama.cpp
N_BATCH = 256       # prompt-processing batch size
MAX_TOKENS = 16384  # per-request generation cap (same size as the context window)
TEMPERATURE = 0.7   # sampling temperature for generation
TOP_P = 0.9         # nucleus-sampling cutoff


# --- Logging --------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger("KushinaAPI")


app = FastAPI(title="Kushina API", version="1.0")
# Lazily-initialized llama.cpp handle; populated on first use by get_llm().
llm = None
|
|
| |
# Download the GGUF weights at import time if they are not already present.
# The download streams into a temporary ".part" file and is renamed into
# place only after it completes, so an interrupted download never leaves a
# partial model.gguf that would satisfy the exists() check on the next start.
if not Path(MODEL_PATH).exists():
    tmp_path = MODEL_PATH + ".part"
    try:
        logger.info("Downloading model.gguf from Hugging Face...")
        # (connect, read) timeout: without it a stalled connection would
        # hang the whole service forever during import.
        with requests.get(MODEL_URL, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        os.replace(tmp_path, MODEL_PATH)  # atomic rename on POSIX
        logger.info("Download complete ✅")
    except Exception as e:
        # Remove the partial file so a restart retries the download cleanly.
        Path(tmp_path).unlink(missing_ok=True)
        logger.exception("Failed to download model.gguf")
        raise RuntimeError(f"Failed to download model.gguf: {e}") from e
|
|
| |
def get_llm():
    """Return the process-wide Llama instance, loading it on first use.

    Loading is deferred until the first request so that importing the
    module stays cheap; every later call reuses the cached handle.

    Raises:
        RuntimeError: if llama.cpp fails to load the GGUF file.
    """
    global llm
    if llm is not None:
        return llm
    try:
        logger.info("Loading GGUF model into llama.cpp…")
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            n_batch=N_BATCH,
            f16_kv=True,
            use_mmap=True,
            verbose=False,
        )
    except Exception as exc:
        logger.exception("Failed to load GGUF model")
        raise RuntimeError(f"Failed to load GGUF model: {exc}")
    logger.info("Model loaded ✅")
    return llm
|
|
| |
class PromptRequest(BaseModel):
    """Request body for POST /generate."""
    # Raw user text; wrapped into the chat template by build_prompt().
    prompt: str
|
|
| |
# System prompt prepended to every request (see build_prompt). This text is
# sent to the model verbatim — do not reflow or reformat it.
SYSTEM_PROMPT = """You are Kushina.
Modes: CHAT or CODE
Rules:
- CHAT: mirror user tone, short responses, no explanations unless asked.
- CODE: output only code when user asks, no commentary.
Switch to CODE if user asks for code, script, function, program, website, api, algorithm, app.
Otherwise use CHAT.
"""
|
|
def build_prompt(user_text: str) -> str:
    """Wrap *user_text* in the chat template the model expects.

    Emits the system prompt, then the user turn, then an open
    assistant turn for the model to complete.
    """
    segments = (
        "<|system|>\n",
        SYSTEM_PROMPT,
        "\n<|user|>\n",
        user_text,
        "\n<|assistant|>\n",
    )
    return "".join(segments)
|
|
| |
@app.get("/")
def root():
    """Liveness probe: confirm the service is up."""
    payload = {"status": "ok"}
    return payload
|
|
@app.post("/generate")
def generate(req: PromptRequest):
    """Generate a completion for the prompt in *req*.

    Streams tokens from llama.cpp, concatenates them, and returns the
    full text as ``{"response": ...}``.

    Raises:
        HTTPException: 500 with the error detail on any failure, so
            clients see a proper error status instead of the original
            behavior of a 200 response carrying an "error" key.
    """
    try:
        llm_instance = get_llm()
        full_prompt = build_prompt(req.prompt)

        pieces = []
        # stream=True yields incremental chunks; the stop sequences keep
        # the model from running past the assistant turn.
        for chunk in llm_instance(
            full_prompt,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            stream=True,
            stop=["<|user|>", "<|system|>"],
        ):
            if "choices" in chunk:
                pieces.append(chunk["choices"][0]["text"])

        return {"response": "".join(pieces)}

    except Exception as e:
        # Log with traceback via the module logger (the original used a
        # bare print) and surface the failure as HTTP 500. HTTPException
        # was imported at the top of the file but never used before.
        logger.exception("Error during generation")
        raise HTTPException(status_code=500, detail=str(e)) from e