agentqq / startup.sh
madDegen's picture
fix: remove litellm[proxy], Ollama direct on :7860
a8bb2ab verified
#!/bin/bash
# ─────────────────────────────────────────────────────────────
# Agent Q-Q (QLAWED-Q) — HF Space Startup v3.1
# MODE: Ollama direct on :7860
#
# VRAM budget (T4-small, 16GB):
# qwen3:1.7b 1.10GB triage + chat
# nomic-embed-text 0.27GB embeddings / pgvector
# ──────────────────────────────────────────────
# Total resident: ~1.37GB / 16GB
# ─────────────────────────────────────────────────────────────
# Abort on the first unhandled command failure (long-form spelling of `set -e`).
set -o errexit

# Startup banner: four box lines followed by one blank line.
printf '%s\n' \
  '╔═════════════════════════════════════════════╗' \
  '║ Agent Q-Q — Ollama Direct Mode ║' \
  '║ Serving on :7860 (OpenAI-compatible) ║' \
  '╚═════════════════════════════════════════════╝' \
  ''
# ── Persistent disk (HF Pro) ──────────────────────────────────
# Point OLLAMA_MODELS at /data/ollama-models only when that directory can be
# created and is writable. The checks live inside the `if` condition so that a
# failing mkdir on an unusable /data does not abort startup under `set -e`;
# the script falls back to ephemeral storage instead.
if [[ -d /data ]] \
  && mkdir -p /data/ollama-models 2> /dev/null \
  && [[ -w /data/ollama-models ]]; then
  echo "✅ Persistent disk at /data"
  export OLLAMA_MODELS="/data/ollama-models"
elif [[ -d /data ]]; then
  # /data is mounted but unusable: say so explicitly, then carry on.
  echo "⚠️ /data exists but is not writable — using ephemeral storage (models re-download on restart)"
else
  echo "⚠️ Ephemeral storage (models re-download on restart)"
fi
# ── Start Ollama on port 7860 ─────────────────────────────────
# Settings are exported before launch so the background `ollama serve`
# process inherits them.
export OLLAMA_HOST=0.0.0.0:7860
export OLLAMA_KEEP_ALIVE=10m
export OLLAMA_NUM_PARALLEL=2
ollama serve &
OLLAMA_PID=$!

# Poll /api/tags up to 40 times, 2 s apart, until the server answers.
# The loop stops early when the background process has already exited, so a
# crash at launch is reported at once instead of after the full timeout.
echo "⏳ Waiting for Ollama on :7860..."
ollama_ready=0
for ((attempt = 1; attempt <= 40; attempt++)); do
  if curl -sf http://localhost:7860/api/tags > /dev/null 2>&1; then
    ollama_ready=1
    break
  fi
  # kill -0 sends no signal; it only tests whether the PID still exists.
  if ! kill -0 "$OLLAMA_PID" 2> /dev/null; then
    break
  fi
  # No sleep after the final attempt: fail right away.
  if ((attempt < 40)); then
    sleep 2
  fi
done

if ((ollama_ready == 0)); then
  echo "❌ Ollama failed to start"
  # Do not leave a half-started server behind; ignore "no such process".
  kill "$OLLAMA_PID" 2> /dev/null || true
  exit 1
fi
echo "✅ Ollama ready"
# ── Pull models ───────────────────────────────────────────────
echo ""
echo "📦 Pulling models..."

#######################################
# Make sure an Ollama model is available, pulling it when `ollama show`
# does not find it. A failed pull is reported but is deliberately
# non-fatal, so the script keeps going.
# Arguments: $1 - model name/tag passed to `ollama show` / `ollama pull`
# Outputs:   status lines on stdout (cached / pulling / success / failed)
# Returns:   0 whether or not the pull succeeded
#######################################
pull_if_missing() {
  local model="$1"

  if ollama show "$model" > /dev/null 2>&1; then
    echo " ✓ cached: $model"
    return 0
  fi

  echo " ↓ pulling: $model"
  # Explicit if/else rather than `pull && echo ok || echo fail`: in the
  # chained form the failure message also runs if the success echo fails.
  if ollama pull "$model"; then
    echo " ✅ $model"
  else
    echo " ⚠️ failed: $model (non-fatal)"
  fi
  return 0
}
# Fetch each required model in turn, in the original order.
for required_model in "qwen3:1.7b" "nomic-embed-text"; do
  pull_if_missing "$required_model"
done

# Show what `ollama list` reports, framed by blank lines.
printf '%s\n' "" "📊 Loaded models:"
ollama list

# Final status banner, emitted in one printf call (one argument per line).
printf '%s\n' \
  "" \
  "═══════════════════════════════════════════════════════════" \
  "✅ Agent Q-Q LIVE" \
  "" \
  " Public API: http://0.0.0.0:7860" \
  "" \
  " Endpoints:" \
  " GET /api/tags → list models" \
  " GET /v1/models → OpenAI model list" \
  " POST /v1/chat/completions → OpenAI chat" \
  " POST /api/generate → Ollama native" \
  " POST /api/embeddings → embeddings" \
  "" \
  " Models resident:" \
  " qwen3:1.7b ~1.1GB chat / triage" \
  " nomic-embed-text ~0.3GB embeddings" \
  "═══════════════════════════════════════════════════════════"

# Block on the background `ollama serve` process started earlier; the script
# ends when that process does, with its exit status.
wait "$OLLAMA_PID"