#!/usr/bin/env bash
# ============================================================================
# Sakhi container entrypoint — starts Ollama, ensures model is present,
# then hands off to uvicorn serving the FastAPI app on $PORT.
# ============================================================================
# Abort the entrypoint as soon as any unguarded command exits non-zero.
set -o errexit

# Cache locations. HF Space persistent storage (paid tier) mounts at /data, so
# the Ollama model store and the faster-whisper / HF hub cache default to paths
# under it; that way the ~7GB of model weights survive container restarts. On
# a fresh boot without persistent storage these fall back to ephemeral disk
# and re-download on each restart. Values already present in the environment
# (and non-empty) take precedence over the defaults.
# NOTE(review): the pull/pre-warm messages further down quote ~9GB for the
# weights — confirm which figure is current.
: "${OLLAMA_MODELS:=/data/.ollama/models}"
: "${HF_HOME:=/data/.cache/huggingface}"
export OLLAMA_MODELS HF_HOME
mkdir -p "$OLLAMA_MODELS" "$HF_HOME"

# Runtime settings: the model comes from OLLAMA_MODEL, the listen port from
# PORT; both have built-in defaults.
MODEL="${OLLAMA_MODEL:-gemma4:e4b-it-q4_K_M}"
PORT="${PORT:-7860}"

# Log the effective configuration, one NAME=value line per setting.
for setting in OLLAMA_MODELS HF_HOME PORT MODEL; do
  echo "[entrypoint] ${setting}=${!setting}"
done

# GPU diagnostics — report whether the container actually sees the T4. Without
# this, a misconfigured GPU passthrough silently falls back to CPU inference
# and extraction times balloon from ~10s to ~3min. Everything here is
# best-effort: a failing nvidia-smi call is logged or ignored, never fatal.
echo "[entrypoint] === GPU visibility check ==="
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "[entrypoint] WARN: nvidia-smi not on PATH — GPU drivers probably missing in image"
else
  # List the GPUs; note the failure but keep booting if the listing fails.
  if ! nvidia-smi -L; then
    echo "[entrypoint] nvidia-smi -L failed"
  fi
  # Name / memory / driver summary; errors are deliberately swallowed.
  nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader || true
fi
# Show the device-selection variables, printing <unset> for any that are
# unset or empty.
for gpu_var in NVIDIA_VISIBLE_DEVICES CUDA_VISIBLE_DEVICES; do
  echo "[entrypoint] ${gpu_var}=${!gpu_var:-<unset>}"
done
echo "[entrypoint] ============================"

# Launch the Ollama daemon as a background job. Its combined stdout/stderr is
# piped through tee so Ollama's GPU detection lines ("inference compute" /
# "no compatible GPUs were discovered") show up in the HF Logs tab in real
# time, while a copy is kept in /tmp/ollama.log for the boot-failure tail
# below.
echo "[entrypoint] Starting Ollama daemon..."
ollama serve 2>&1 | tee /tmp/ollama.log &

# Probe the daemon's /api/tags endpoint, sleeping 1s between attempts. After
# the 60th failed probe, dump the last 40 log lines and abort the boot.
failed_probes=0
until curl -fsS http://127.0.0.1:11434/api/tags >/dev/null 2>&1; do
  failed_probes=$((failed_probes + 1))
  if ((failed_probes >= 60)); then
    echo "[entrypoint] ERROR: Ollama daemon failed to start within 60s"
    tail -n 40 /tmp/ollama.log
    exit 1
  fi
  sleep 1
done
# The successful probe is attempt number (failed probes + 1).
echo "[entrypoint] Ollama daemon ready after $((failed_probes + 1))s"

# Pull the model unless it is already cached on the persistent volume.
# The first column of `ollama list` is the model reference (name:tag). Ollama
# resolves a reference without a tag to the default tag "latest", so normalise
# before comparing; otherwise an untagged OLLAMA_MODEL would never match a
# listed row and the pull would be retried on every boot. Only the part after
# the last "/" is inspected so a registry host:port prefix is not mistaken
# for a tag.
case "${MODEL##*/}" in
  *:*) model_ref="$MODEL" ;;
  *) model_ref="${MODEL}:latest" ;;
esac

# grep flags: -F compares literally (a "." in e.g. "llama3.1:8b" must not act
# as a regex wildcard), -x requires the whole field to match, -q suppresses
# output, and -- guards against a value that starts with "-".
if ollama list | awk '{print $1}' | grep -qxF -- "$model_ref"; then
  echo "[entrypoint] Model $MODEL already present, skipping pull"
else
  echo "[entrypoint] Pulling $MODEL (first boot only — ~9GB, takes 5-15 min)..."
  # A failed pull aborts the boot when errexit is active.
  ollama pull "$MODEL"
fi

# Pre-warm the model into VRAM so the first user request doesn't pay the
# ~150s mmap cost (T4 + 9GB weights on HF persistent disk). Combined with
# OLLAMA_KEEP_ALIVE=24h in Dockerfile ENV, the model stays resident for the
# life of the container.
#
# This step is best-effort: a failed warm-up only logs a warning and the boot
# continues. An explicit if/else is used instead of `cmd && ok || warn`,
# because in that form the warning branch would also run if the success
# message itself failed to print.
echo "[entrypoint] Pre-warming $MODEL into VRAM (one-shot generate, ~2-3 min on cold persistent disk)..."
WARM_START=$(date +%s)
# Response body and curl's own error output both land in /tmp/warmup.json so
# a failure can be diagnosed from that single file.
if curl -fsS http://127.0.0.1:11434/api/generate \
  -H "Content-Type: application/json" \
  -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\"}" \
  >/tmp/warmup.json 2>&1; then
  echo "[entrypoint] Pre-warm done in $(($(date +%s) - WARM_START))s"
else
  echo "[entrypoint] WARN: pre-warm failed — first user request will pay cold-load cost. See /tmp/warmup.json"
fi

# Hand off to FastAPI. uvicorn imports api:app, which imports app.py (loads
# schemas eagerly via the FastAPI startup hook). Whisper model is loaded
# lazily on the first audio request — keeps boot fast.
# `exec` replaces this shell with the uvicorn process, so nothing after this
# point in the script would run.
server_args=(api:app --host 0.0.0.0 --port "$PORT")
printf '[entrypoint] Starting uvicorn on 0.0.0.0:%s\n' "$PORT"
exec uvicorn "${server_args[@]}"