#!/usr/bin/env bash
# ============================================================================
# Sakhi container entrypoint.
#
# Boot order: bring up the Ollama daemon, make sure the configured model is
# available, then hand the process over to uvicorn, which serves the FastAPI
# app on $PORT.
# ============================================================================
set -o errexit

# Persistent storage on an HF Space (paid tier) is mounted at /data. Ollama's
# model store and the faster-whisper / HF hub caches are directed there so the
# ~7GB of model weights outlive container restarts. Without persistent storage
# a fresh boot falls back to ephemeral disk and downloads again on every
# restart. Values already present in the environment take precedence.
: "${OLLAMA_MODELS:=/data/.ollama/models}"
: "${HF_HOME:=/data/.cache/huggingface}"
export OLLAMA_MODELS HF_HOME
mkdir -p "$OLLAMA_MODELS" "$HF_HOME"

# Listening port and model tag, each overridable from the environment
# (PORT and OLLAMA_MODEL respectively).
: "${PORT:=7860}"
MODEL=${OLLAMA_MODEL:-gemma4:e4b-it-q4_K_M}

# Log the effective configuration, one NAME=value line per setting.
for setting in OLLAMA_MODELS HF_HOME PORT MODEL; do
  echo "[entrypoint] ${setting}=${!setting}"
done

# GPU diagnostics: report whether the container can actually see the T4. If
# GPU passthrough is misconfigured, inference silently drops to CPU and
# extraction time grows from ~10s to ~3min, so make the situation visible in
# the logs. Every probe here is best-effort and never aborts the boot.
echo "[entrypoint] === GPU visibility check ==="
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "[entrypoint] WARN: nvidia-smi not on PATH — GPU drivers probably missing in image"
else
  nvidia-smi -L || echo "[entrypoint] nvidia-smi -L failed"
  nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader || true
fi
echo "[entrypoint] NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-}"
echo "[entrypoint] CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
echo "[entrypoint] ============================"

# The Ollama daemon is launched in the background next. Its output is teed to
# stdout so the GPU detection lines ("inference compute" / "no compatible GPUs
# were discovered") appear in the HF Logs tab in real time, while a copy is
# kept on disk for the boot-failure tail further down.
echo "[entrypoint] Starting Ollama daemon..."
# Daemon output goes both to stdout and to /tmp/ollama.log; the log file is
# what gets tailed if the daemon never becomes ready.
ollama serve 2>&1 | tee /tmp/ollama.log &

# Wait up to 60s for the daemon to accept requests, probing once per second.
for ((attempt = 1; attempt <= 60; attempt++)); do
  if curl -fsS http://127.0.0.1:11434/api/tags >/dev/null 2>&1; then
    echo "[entrypoint] Ollama daemon ready after ${attempt}s"
    break
  fi
  if ((attempt == 60)); then
    echo "[entrypoint] ERROR: Ollama daemon failed to start within 60s"
    # Best-effort context for the failure; a missing log file must not mask
    # the intended exit status below.
    tail -n 40 /tmp/ollama.log || true
    exit 1
  fi
  sleep 1
done

# Pull the model if it isn't already cached on the persistent volume.
# -F makes the comparison a literal whole-line match: model names may contain
# regex metacharacters such as '.', which would otherwise let a different
# cached model satisfy the check. '--' guards against a name starting with '-'.
if ollama list | awk '{print $1}' | grep -Fqx -- "$MODEL"; then
  echo "[entrypoint] Model $MODEL already present, skipping pull"
else
  echo "[entrypoint] Pulling $MODEL (first boot only — ~9GB, takes 5-15 min)..."
  # The app cannot serve without the model, so a failed pull aborts the boot
  # explicitly (with pull's own exit status) instead of relying on errexit.
  ollama pull "$MODEL" || {
    pull_status=$?
    echo "[entrypoint] ERROR: ollama pull $MODEL failed (exit ${pull_status})"
    exit "$pull_status"
  }
fi

# Pre-warm the model into VRAM so the first user request doesn't pay the
# ~150s mmap cost (T4 + 9GB weights on HF persistent disk). Combined with
# OLLAMA_KEEP_ALIVE=24h in Dockerfile ENV, the model stays resident for the
# life of the container. A failed pre-warm is only a warning: the boot
# continues and the first real request loads the model instead.
echo "[entrypoint] Pre-warming $MODEL into VRAM (one-shot generate, ~2-3 min on cold persistent disk)..."
WARM_START=$(date +%s)
warm_payload=$(printf '{"model":"%s","prompt":"ok","stream":false,"keep_alive":"24h"}' "$MODEL")
# Explicit if/else rather than `cmd && ok || warn`, so the warning branch can
# only ever run when curl itself failed.
if curl -fsS http://127.0.0.1:11434/api/generate \
  -H "Content-Type: application/json" \
  -d "$warm_payload" \
  >/tmp/warmup.json 2>&1; then
  echo "[entrypoint] Pre-warm done in $(($(date +%s) - WARM_START))s"
else
  echo "[entrypoint] WARN: pre-warm failed — first user request will pay cold-load cost. See /tmp/warmup.json"
fi

# Hand off to FastAPI. uvicorn imports api:app, which imports app.py (loads
# schemas eagerly via the FastAPI startup hook). Whisper model is loaded
# lazily on the first audio request — keeps boot fast.
# exec replaces this shell, so uvicorn becomes the container's main process.
echo "[entrypoint] Starting uvicorn on 0.0.0.0:${PORT}"
exec uvicorn api:app --host 0.0.0.0 --port "$PORT"