File size: 1,880 Bytes
ce11c1e
f91f3f9
ce11c1e
 
7c8eb9e
 
 
 
 
 
 
 
 
 
 
b12d4b1
7c8eb9e
 
47e4282
7c8eb9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
---
sdk: docker
colorTo: indigo
---
#!/usr/bin/env python3
"""
Ultralekki entrypoint dla HF Spaces (Docker SDK)
✅ Pobiera model z cache HF → uruchamia llama_cpp.server
✅ OpenAI format | ✅ Brak auth | ✅ Odporny na zerwania | ✅ Optymalizacja CPU/RAM
"""
import os
import sys
import signal
import logging
from huggingface_hub import hf_hub_download

# Module-wide logger with a compact "LEVEL: message" format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Runtime configuration; every value except the repo id can be overridden
# through an environment variable. All values are kept as strings because
# they are passed straight to the server's command line.
_env = os.environ.get
MODEL_REPO = "unsloth/granite-4.1-3b-GGUF"
MODEL_FILE = _env("MODEL_FILE", "granite-4.1-3b-UD-IQ2_M.gguf")  # GGUF file inside the repo
PORT = _env("PORT", "7860")            # server listen port
N_CTX = _env("N_CTX", "2048")          # context window size
N_THREADS = _env("N_THREADS", "2")     # CPU threads for inference
N_BATCH = _env("N_BATCH", "512")       # prompt-processing batch size

def graceful_shutdown(signum, frame):
    """Signal handler: log the shutdown request, then exit with status 0."""
    logger.info("📡 Otrzymano sygnał zakończenia. Zamykanie...")
    sys.exit(0)

# Exit cleanly on both docker-stop (SIGTERM) and Ctrl-C (SIGINT).
for _sig in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_sig, graceful_shutdown)

if __name__ == "__main__":
    # Download the GGUF model (or reuse the HF cache copy if already present).
    # NOTE: `resume_download` and `local_dir_use_symlinks` are deprecated in
    # recent huggingface_hub releases (downloads always resume, and the
    # symlink flag only matters when `local_dir` is passed), so they are
    # intentionally no longer supplied — avoids FutureWarning noise.
    logger.info(f"⬇️ Pobieranie/weryfikacja: {MODEL_REPO}/{MODEL_FILE}")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
    )
    logger.info(f"✅ Model gotowy: {model_path}")

    # llama_cpp.server command line (built-in OpenAI-compatible server).
    # Its argparse options are generated from pydantic settings, so boolean
    # options need an explicit value ("--use_mmap True", not bare
    # "--use_mmap"). The previous "--no_flash_attn" is not a recognized
    # option (the setting is `flash_attn`, default False), so it is dropped.
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", model_path,
        "--host", "0.0.0.0",
        "--port", PORT,
        "--n_ctx", N_CTX,
        "--n_threads", N_THREADS,
        "--n_batch", N_BATCH,
        "--n_gpu_layers", "0",   # CPU-only inference
        "--use_mmap", "True",    # mmap model file to keep resident RAM low
    ]

    logger.info(f"🚀 Start serwera: {' '.join(cmd)}")
    # execvp replaces this Python process with the server, so Docker/HF
    # lifecycle signals (e.g. SIGTERM on stop) reach the server directly.
    os.execvp(sys.executable, cmd)