#!/usr/bin/env python3
"""
Ultra-light entrypoint for HF Spaces (Docker SDK)
✅ Downloads the model → starts llama_cpp.server
✅ OpenAI format | ✅ No auth | ✅ Resilient to dropped connections
"""
import os
import sys
import signal
import logging
import subprocess

from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Model and server settings, overridable via environment variables
MODEL_REPO = "unsloth/granite-4.1-3b-GGUF"
MODEL_FILE = os.environ.get("MODEL_FILE", "granite-4.1-3b-UD-IQ2_M.gguf")
PORT = os.environ.get("PORT", "7860")
N_CTX = os.environ.get("N_CTX", "2048")
N_THREADS = os.environ.get("N_THREADS", "2")
N_BATCH = os.environ.get("N_BATCH", "512")


def graceful_shutdown(signum, frame):
    logger.info("📡 Received termination signal. Shutting down...")
    sys.exit(0)


signal.signal(signal.SIGTERM, graceful_shutdown)
signal.signal(signal.SIGINT, graceful_shutdown)

if __name__ == "__main__":
    try:
        logger.info(f"⬇️ Downloading/verifying: {MODEL_REPO}/{MODEL_FILE}")
        model_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            resume_download=True,
        )
        logger.info(f"✅ Model ready: {model_path}")

        # Command to start llama_cpp.server (OpenAI-compatible API).
        # Boolean options of llama_cpp.server take explicit values,
        # so mmap is passed as "True" rather than as a bare flag;
        # flash attention is left at its default (off) for this CPU-only setup.
        cmd = [
            sys.executable, "-m", "llama_cpp.server",
            "--model", model_path,
            "--host", "0.0.0.0",
            "--port", PORT,
            "--n_ctx", N_CTX,
            "--n_threads", N_THREADS,
            "--n_batch", N_BATCH,
            "--n_gpu_layers", "0",      # CPU-only
            "--use_mmap", "True",
            "--chat_format", "chatml",  # default chat format for Granite/Llama
        ]

        logger.info(f"🚀 Starting server: {' '.join(cmd)}")

        # Launch the child process and wait for it to exit
        process = subprocess.Popen(cmd)
        process.wait()

    except Exception as e:
        logger.error(f"❌ Critical error: {e}", exc_info=True)
        sys.exit(1)
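
# ---------------------------------------------------------------------------
# Minimal usage sketch (not executed by this entrypoint): once the server is
# up, it exposes an OpenAI-compatible endpoint with no auth. The snippet below
# is an assumption-laden example, not part of this script: it assumes the
# default PORT=7860, a local client, and that the `openai` client package is
# installed. The model name is a placeholder; a single-model llama_cpp.server
# instance typically serves the loaded model regardless of the name sent.
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")
#   resp = client.chat.completions.create(
#       model="granite-local",  # hypothetical placeholder name
#       messages=[{"role": "user", "content": "Hello!"}],
#   )
#   print(resp.choices[0].message.content)
# ---------------------------------------------------------------------------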