Spaces:
Sleeping
Sleeping
# ============================================================================
# Sakhi container entrypoint — starts Ollama, ensures model is present,
# then hands off to uvicorn serving the FastAPI app on $PORT.
# ============================================================================

# Abort the boot on the first unhandled command failure.
set -e

# Model caches. HF Space persistent storage (paid tier) mounts at /data, so the
# Ollama model store and the faster-whisper / HF hub cache default to paths
# under it; that is what lets the ~7GB of model weights survive container
# restarts. A value already present in the environment wins over the default.
# NOTE(review): without a persistent volume these paths are expected to land on
# ephemeral disk (weights re-download on each restart). That only works if
# /data is creatable/writable by the container user — confirm the image
# guarantees it, because a failing mkdir aborts the boot under `set -e`.
: "${OLLAMA_MODELS:=/data/.ollama/models}"
: "${HF_HOME:=/data/.cache/huggingface}"
export OLLAMA_MODELS HF_HOME
mkdir -p "$OLLAMA_MODELS" "$HF_HOME"

# Port uvicorn binds to, and the Ollama model tag to serve. The tag is read
# from OLLAMA_MODEL; MODEL is the script-local name used from here on.
PORT="${PORT:-7860}"
MODEL="${OLLAMA_MODEL:-gemma4:e4b-it-q4_K_M}"

# Log the effective configuration, one setting per line.
for setting in \
  "OLLAMA_MODELS=$OLLAMA_MODELS" \
  "HF_HOME=$HF_HOME" \
  "PORT=$PORT" \
  "MODEL=$MODEL"
do
  echo "[entrypoint] $setting"
done
# GPU diagnostics — surface whether the container actually sees the T4. Without
# this, a misconfigured GPU passthrough silently falls back to CPU inference and
# extraction times balloon from ~10s to ~3min.
#
# print_gpu_diagnostics
#   Outputs: an informational GPU report on stdout.
#   Returns: always 0. Every probe is guarded, so a missing or broken driver
#            can never abort the boot under `set -e`.
print_gpu_diagnostics() {
  echo "[entrypoint] === GPU visibility check ==="
  if command -v nvidia-smi >/dev/null 2>&1; then
    # Both queries are best-effort: a failure is reported (or ignored) and the
    # boot carries on.
    nvidia-smi -L || echo "[entrypoint] nvidia-smi -L failed"
    nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader || true
  else
    echo "[entrypoint] WARN: nvidia-smi not on PATH — GPU drivers probably missing in image"
  fi
  # Device-selection variables, printed as <unset> when absent or empty.
  echo "[entrypoint] NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-<unset>}"
  echo "[entrypoint] CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-<unset>}"
  echo "[entrypoint] ============================"
}

print_gpu_diagnostics
# Launch the Ollama server as a background job. Its combined stdout/stderr is
# piped through tee so that Ollama's GPU detection lines ("inference compute" /
# "no compatible GPUs were discovered") reach this script's stdout (the HF Logs
# tab) in real time, while /tmp/ollama.log keeps a copy for the failure tail
# below.
echo "[entrypoint] Starting Ollama daemon..."
ollama serve 2>&1 | tee /tmp/ollama.log &

# Readiness poll: probe the local API once per second, at most 60 times. When
# the 60th probe also fails, print the last 40 log lines and abort the boot.
attempt=1
until curl -fsS http://127.0.0.1:11434/api/tags >/dev/null 2>&1; do
  if [ "$attempt" -ge 60 ]; then
    echo "[entrypoint] ERROR: Ollama daemon failed to start within 60s"
    tail -n 40 /tmp/ollama.log
    exit 1
  fi
  attempt=$((attempt + 1))
  sleep 1
done
# The reported figure is the 1-based number of the probe that succeeded.
echo "[entrypoint] Ollama daemon ready after ${attempt}s"
# Pull the model if it isn't already cached on the persistent volume.
#
# model_is_cached NAME
#   Arguments: $1 - model name as it would be passed to `ollama pull`.
#   Returns:   0 when the first column of `ollama list` contains NAME exactly,
#              non-zero otherwise.
#   The name is compared as a fixed string (grep -F): a regex match would let
#   the "." in a tag such as "llama3.1:8b" match any character. A name without
#   a ":tag" suffix is looked up as "NAME:latest", the form `ollama list`
#   prints for untagged pulls; a miss only costs a redundant pull.
model_is_cached() {
  case "$1" in
    *:*) wanted_model=$1 ;;
    *) wanted_model="$1:latest" ;;
  esac
  # `--` keeps a name starting with "-" from being parsed as a grep option.
  ollama list | awk '{print $1}' | grep -Fqx -- "$wanted_model"
}

if model_is_cached "$MODEL"; then
  echo "[entrypoint] Model $MODEL already present, skipping pull"
else
  echo "[entrypoint] Pulling $MODEL (first boot only — ~9GB, takes 5-15 min)..."
  ollama pull "$MODEL"
fi
# Pre-warm the model into VRAM so the first user request doesn't pay the
# ~150s mmap cost (T4 + 9GB weights on HF persistent disk). Combined with
# OLLAMA_KEEP_ALIVE=24h in Dockerfile ENV (not visible from this script), the
# model is meant to stay resident for the life of the container.
#
# prewarm_model NAME
#   Arguments: $1 - model name to load.
#   Outputs:   one status line on stdout; curl's response body and error text
#              are written to /tmp/warmup.json.
#   Returns:   0 in both outcomes. Pre-warming is best-effort and a failure
#              must not stop the boot.
#   Sends a single non-streaming generate request with keep_alive=24h. An
#   explicit if/else replaces the `cmd && ok || warn` chain, which would also
#   run the warning branch if the success branch itself failed.
prewarm_model() {
  prewarm_started=$(date +%s)
  if curl -fsS http://127.0.0.1:11434/api/generate \
    -H "Content-Type: application/json" \
    -d "{\"model\":\"$1\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\"}" \
    >/tmp/warmup.json 2>&1; then
    echo "[entrypoint] Pre-warm done in $(($(date +%s) - prewarm_started))s"
  else
    echo "[entrypoint] WARN: pre-warm failed — first user request will pay cold-load cost. See /tmp/warmup.json"
  fi
}

echo "[entrypoint] Pre-warming $MODEL into VRAM (one-shot generate, ~2-3 min on cold persistent disk)..."
prewarm_model "$MODEL"
# Hand off to FastAPI. uvicorn imports api:app, which imports app.py (loads
# schemas eagerly via the FastAPI startup hook). Whisper model is loaded
# lazily on the first audio request — keeps boot fast.
# `exec` replaces this shell with uvicorn, so nothing after it would run.
bind_host=0.0.0.0
echo "[entrypoint] Starting uvicorn on ${bind_host}:${PORT}"
set -- uvicorn api:app --host "$bind_host" --port "$PORT"
exec "$@"