Spaces:

Tushar9802
/

sakhi

Sleeping

App Files Files Community

sakhi / entrypoint.sh

Tushar9802

perf(deploy): keep model resident + pre-warm to eliminate cold reload

5575d97 6 days ago

raw

history blame contribute delete

3.87 kB

	#!/usr/bin/env bash
	# ============================================================================
	# Container entrypoint for Sakhi.
	#
	# Boot order: launch the Ollama daemon, make sure the configured model is
	# available, then replace this shell with uvicorn serving the FastAPI app
	# on $PORT.
	# ============================================================================
	set -e

	# Cache locations default to /data, where HF Space persistent storage (paid
	# tier) is mounted, so the ~7GB of Ollama / faster-whisper / HF hub weights
	# survive container restarts. Without persistent storage these paths live on
	# ephemeral disk and everything is re-downloaded on each restart.
	# Values already present in the environment win over the defaults.
	: "${OLLAMA_MODELS:=/data/.ollama/models}"
	: "${HF_HOME:=/data/.cache/huggingface}"
	export OLLAMA_MODELS HF_HOME
	mkdir -p "$OLLAMA_MODELS" "$HF_HOME"

	# Listening port and model tag, both overridable from the environment.
	: "${PORT:=7860}"
	MODEL="${OLLAMA_MODEL:-gemma4:e4b-it-q4_K_M}"

	# Log the effective configuration, one NAME=value line per setting.
	for setting in OLLAMA_MODELS HF_HOME PORT MODEL; do
	  printf '[entrypoint] %s=%s\n' "$setting" "${!setting}"
	done

	# GPU diagnostics — surface whether the container actually sees the T4. Without
	# this, a misconfigured GPU passthrough silently falls back to CPU inference and
	# extraction times balloon from ~10s to ~3min.
	#######################################
	# Print GPU visibility diagnostics.
	# Every nvidia-smi probe is best-effort (`|| echo` / `|| true`) so a failing
	# probe is reported instead of aborting the boot under `set -e`.
	# Globals:   NVIDIA_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES (read)
	# Arguments: none
	# Outputs:   diagnostic lines on stdout
	#######################################
	log_gpu_visibility() {
	  echo "[entrypoint] === GPU visibility check ==="
	  if command -v nvidia-smi >/dev/null 2>&1; then
	    nvidia-smi -L || echo "[entrypoint] nvidia-smi -L failed"
	    # Purely informational; ignore a non-zero status on purpose.
	    nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader || true
	  else
	    echo "[entrypoint] WARN: nvidia-smi not on PATH — GPU drivers probably missing in image"
	  fi
	  echo "[entrypoint] NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-<unset>}"
	  echo "[entrypoint] CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-<unset>}"
	  echo "[entrypoint] ============================"
	}

	log_gpu_visibility

	# Start the Ollama daemon in the background. Its combined stdout/stderr is
	# piped through tee so the GPU detection lines ("inference compute" /
	# "no compatible GPUs were discovered") show up in the HF Logs tab in real
	# time, while a copy is kept in /tmp/ollama.log for the boot-failure tail
	# printed when the daemon does not come up.
	# NOTE: the pipe must be a real `|` operator; an escaped `\|` would be handed
	# to `ollama serve` as a literal argument and nothing would reach the log.
	echo "[entrypoint] Starting Ollama daemon..."
	ollama serve 2>&1 | tee /tmp/ollama.log &

	# Poll the daemon's /api/tags endpoint once per second, for at most 60
	# attempts. On the 60th failed probe, print the tail of the daemon log and
	# abort the boot with exit status 1.
	attempt=1
	until curl -fsS http://127.0.0.1:11434/api/tags >/dev/null 2>&1; do
	  if (( attempt >= 60 )); then
	    echo "[entrypoint] ERROR: Ollama daemon failed to start within 60s"
	    tail -n 40 /tmp/ollama.log
	    exit 1
	  fi
	  sleep 1
	  attempt=$((attempt + 1))
	done
	echo "[entrypoint] Ollama daemon ready after ${attempt}s"

	#######################################
	# Check whether a model is already listed by `ollama list`.
	# The name is compared as a fixed string against the first column of the
	# listing, so regex metacharacters in a model name (e.g. the '.' in
	# "llama3.1") cannot cause a false match. A name without a tag is looked up
	# as "<name>:latest", which is how Ollama lists untagged pulls.
	# Arguments: $1 - model name, optionally with a ":tag" suffix
	# Returns:   0 if the model is listed, non-zero otherwise (including when
	#            `ollama list` itself produces no output)
	#######################################
	model_is_cached() {
	  local wanted=$1
	  # Only the last path component can carry a tag; a registry host earlier in
	  # the name may contain ':' for its port.
	  case "${wanted##*/}" in
	    *:*) ;;
	    *) wanted="${wanted}:latest" ;;
	  esac
	  ollama list | awk '{print $1}' | grep -qxF -- "$wanted"
	}

	# Pull the model if it isn't already cached on the persistent volume
	if model_is_cached "$MODEL"; then
	  echo "[entrypoint] Model $MODEL already present, skipping pull"
	else
	  echo "[entrypoint] Pulling $MODEL (first boot only — ~9GB, takes 5-15 min)..."
	  ollama pull "$MODEL"
	fi

	# Pre-warm the model into VRAM so the first user request doesn't pay the
	# ~150s mmap cost (T4 + 9GB weights on HF persistent disk). Combined with
	# OLLAMA_KEEP_ALIVE=24h in Dockerfile ENV, the model stays resident for the
	# life of the container.
	#######################################
	# Issue a one-shot, non-streaming generate request to load the model.
	# Failure is deliberately non-fatal: a warning is logged and boot continues.
	# An explicit if/else is used so exactly one of the two messages is printed
	# (an `a && b || c` chain would also run c if b failed).
	# Arguments: $1 - model name to load
	# Outputs:   one status line on stdout; curl's response and error text are
	#            written to /tmp/warmup.json
	# Returns:   0 in both the success and the warning case
	#######################################
	prewarm_model() {
	  local model=$1
	  local started elapsed
	  started=$(date +%s)
	  if curl -fsS http://127.0.0.1:11434/api/generate \
	    -H "Content-Type: application/json" \
	    -d "{\"model\":\"$model\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\"}" \
	    >/tmp/warmup.json 2>&1; then
	    elapsed=$(( $(date +%s) - started ))
	    echo "[entrypoint] Pre-warm done in ${elapsed}s"
	  else
	    echo "[entrypoint] WARN: pre-warm failed — first user request will pay cold-load cost. See /tmp/warmup.json"
	  fi
	}

	echo "[entrypoint] Pre-warming $MODEL into VRAM (one-shot generate, ~2-3 min on cold persistent disk)..."
	prewarm_model "$MODEL"

	# Final step: replace this shell with uvicorn so the server becomes the
	# container's main process. uvicorn imports api:app, which imports app.py
	# (schemas are loaded eagerly via the FastAPI startup hook). The Whisper
	# model is loaded lazily on the first audio request, which keeps boot fast.
	uvicorn_args=(api:app --host 0.0.0.0 --port "$PORT")
	printf '[entrypoint] Starting uvicorn on 0.0.0.0:%s\n' "${PORT}"
	exec uvicorn "${uvicorn_args[@]}"