sakhi / entrypoint.sh
Tushar9802's picture
perf(deploy): keep model resident + pre-warm to eliminate cold reload
5575d97
#!/usr/bin/env bash
# ============================================================================
# Sakhi container entrypoint — starts Ollama, ensures the model is present,
# pre-warms it, then hands off to uvicorn serving the FastAPI app on $PORT.
# ============================================================================
set -e

# Storage layout. HF Space persistent storage (paid tier) mounts at /data, so
# the Ollama model store and the Hugging Face hub cache (also used by
# faster-whisper) default to paths under it; that lets the ~7GB of model
# weights survive container restarts. On a boot without persistent storage the
# same paths live on ephemeral disk and are re-downloaded on each restart.
# Values already set (and non-empty) in the environment take precedence.
: "${OLLAMA_MODELS:=/data/.ollama/models}"
: "${HF_HOME:=/data/.cache/huggingface}"
export OLLAMA_MODELS HF_HOME
mkdir -p "$OLLAMA_MODELS" "$HF_HOME"

# Listening port for uvicorn and the Ollama model tag; both are overridable
# from the environment (PORT and OLLAMA_MODEL respectively).
: "${PORT:=7860}"
MODEL="${OLLAMA_MODEL:-gemma4:e4b-it-q4_K_M}"

# Record the effective configuration in the boot log, one setting per line.
for setting in OLLAMA_MODELS HF_HOME PORT MODEL; do
  printf '[entrypoint] %s=%s\n' "$setting" "${!setting}"
done
# GPU diagnostics — surface whether the container actually sees the T4. Without
# this, a misconfigured GPU passthrough silently falls back to CPU inference and
# extraction times balloon from ~10s to ~3min.
#
# Outputs: a banner, the nvidia-smi device list and name/memory/driver summary
#          (or a warning when nvidia-smi is not on PATH), and the values of
#          NVIDIA_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES ("<unset>" when empty
#          or unset). Everything goes to stdout.
# Returns: always 0 — this is purely informational, so nvidia-smi failures are
#          logged or ignored rather than aborting the boot under `set -e`.
report_gpu_visibility() {
  echo "[entrypoint] === GPU visibility check ==="
  if command -v nvidia-smi >/dev/null 2>&1; then
    nvidia-smi -L || echo "[entrypoint] nvidia-smi -L failed"
    # Best-effort detail line; intentionally never fatal.
    nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader || true
  else
    echo "[entrypoint] WARN: nvidia-smi not on PATH — GPU drivers probably missing in image"
  fi
  echo "[entrypoint] NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-<unset>}"
  echo "[entrypoint] CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-<unset>}"
  echo "[entrypoint] ============================"
}

report_gpu_visibility
# Start the Ollama daemon in the background. Its output is tee'd to stdout so
# Ollama's GPU detection lines ("inference compute" / "no compatible GPUs were
# discovered") show up in the HF Logs tab in real time, while a copy is kept in
# /tmp/ollama.log for the boot-failure tail below.
echo "[entrypoint] Starting Ollama daemon..."
ollama serve 2>&1 | tee /tmp/ollama.log &
# $! is the PID of the pipeline's last stage (tee). tee reaches EOF and exits
# once `ollama serve` has exited, so its PID doubles as a liveness signal for
# the daemon without restructuring the pipeline.
ollama_pipe_pid=$!

# Poll the daemon's HTTP API once per second, for at most $startup_attempts
# tries. Each probe is capped with --max-time so a connection that is accepted
# but never answered cannot stall the boot indefinitely.
startup_attempts=60
for ((attempt = 1; attempt <= startup_attempts; attempt++)); do
  if curl -fsS --max-time 2 http://127.0.0.1:11434/api/tags >/dev/null 2>&1; then
    echo "[entrypoint] Ollama daemon ready after ${attempt}s"
    break
  fi
  # Fail fast when the daemon has already died (missing binary, port in use,
  # crash on launch) instead of polling a dead process for the full window.
  if ! kill -0 "$ollama_pipe_pid" 2>/dev/null; then
    echo "[entrypoint] ERROR: Ollama daemon exited during startup"
    tail -n 40 /tmp/ollama.log || true
    exit 1
  fi
  if ((attempt == startup_attempts)); then
    echo "[entrypoint] ERROR: Ollama daemon failed to start within 60s"
    tail -n 40 /tmp/ollama.log || true
    exit 1
  fi
  sleep 1
done
# Report whether a model is already in the local Ollama store.
#
# Arguments: $1 - model name as passed to `ollama pull` (tag optional)
# Returns:   0 if the first column of `ollama list` contains the name, non-zero
#            otherwise (including when the name is empty).
#
# The name is compared as a literal string, not as a regular expression, so a
# "." in a tag such as "llama3.1:8b" cannot match an unrelated model. A name
# given without a tag also matches its "<name>:latest" row, which is how
# `ollama list` reports untagged pulls.
model_is_cached() {
  local wanted=$1
  [[ -n "$wanted" ]] || return 1
  # Appending "" forces awk to compare as strings even for numeric-looking
  # names. awk consumes the whole listing before exiting, so the producer
  # never sees a closed pipe.
  ollama list | awk -v want="$wanted" '
    ($1 "") == (want "") || ($1 "") == (want ":latest") { found = 1 }
    END { exit !found }
  '
}

# Pull the model only if it isn't already cached on the persistent volume.
if model_is_cached "$MODEL"; then
  echo "[entrypoint] Model $MODEL already present, skipping pull"
else
  echo "[entrypoint] Pulling $MODEL (first boot only — ~9GB, takes 5-15 min)..."
  ollama pull "$MODEL"
fi
# Build the JSON body for the one-shot warm-up request.
#
# Arguments: $1 - model name (inserted verbatim; assumed to contain no
#                 characters that need JSON escaping)
# Outputs:   the request body on stdout, without a trailing newline
warmup_request_body() {
  printf '{"model":"%s","prompt":"ok","stream":false,"keep_alive":"24h"}' "$1"
}

# Pre-warm the model into VRAM so the first user request doesn't pay the
# ~150s mmap cost (T4 + 9GB weights on HF persistent disk). Combined with
# OLLAMA_KEEP_ALIVE=24h in the Dockerfile ENV, the model stays resident for the
# life of the container.
#
# Pre-warming is best-effort: a failure is logged and the boot continues. An
# explicit if/else is used so the warning branch runs only when curl itself
# fails, and so the failure never trips `set -e`. curl's output (response body
# or error text) is captured in /tmp/warmup.json for later inspection.
echo "[entrypoint] Pre-warming $MODEL into VRAM (one-shot generate, ~2-3 min on cold persistent disk)..."
WARM_START=$(date +%s)
if curl -fsS http://127.0.0.1:11434/api/generate \
  -H "Content-Type: application/json" \
  -d "$(warmup_request_body "$MODEL")" \
  >/tmp/warmup.json 2>&1; then
  echo "[entrypoint] Pre-warm done in $(($(date +%s) - WARM_START))s"
else
  echo "[entrypoint] WARN: pre-warm failed — first user request will pay cold-load cost. See /tmp/warmup.json"
fi
# Final hand-off to the FastAPI app. `exec` replaces this shell with uvicorn,
# so nothing after this line runs. Per the original author's notes: uvicorn
# imports api:app, which imports app.py (schemas are loaded eagerly via the
# FastAPI startup hook), and the Whisper model is loaded lazily on the first
# audio request — which keeps boot fast.
uvicorn_args=(api:app --host 0.0.0.0 --port "$PORT")
printf '[entrypoint] Starting uvicorn on 0.0.0.0:%s\n' "$PORT"
exec uvicorn "${uvicorn_args[@]}"