#!/bin/bash
# ─────────────────────────────────────────────────────────────
# Agent Q-Q (QLAWED-Q) — HF Space Startup v3.1
# MODE: Ollama direct on :7860
#
# VRAM budget (T4-small, 16GB):
#   qwen3:1.7b         1.10GB   triage + chat
#   nomic-embed-text   0.27GB   embeddings / pgvector
#   ──────────────────────────────────────────────
#   Total resident: ~1.37GB / 16GB
# ─────────────────────────────────────────────────────────────
# Strict mode: abort on unhandled errors, unset variables and failed
# pipeline stages.
set -euo pipefail

echo "╔═════════════════════════════════════════════╗"
echo "║ Agent Q-Q — Ollama Direct Mode ║"
echo "║ Serving on :7860 (OpenAI-compatible) ║"
echo "╚═════════════════════════════════════════════╝"
echo ""

# ── Persistent disk (HF Pro) ──────────────────────────────────
# When /data exists, keep the model store there so it is reused across
# restarts; otherwise Ollama falls back to its default location.
if [[ -d "/data" ]]; then
  echo "✅ Persistent disk at /data"
  mkdir -p /data/ollama-models
  export OLLAMA_MODELS="/data/ollama-models"
else
  echo "⚠️ Ephemeral storage (models re-download on restart)"
fi

# ── Start Ollama on port 7860 ─────────────────────────────────
export OLLAMA_HOST=0.0.0.0:7860
export OLLAMA_KEEP_ALIVE=10m
export OLLAMA_NUM_PARALLEL=2

ollama serve &
OLLAMA_PID=$!

# Stop the background server and reap it. Safe to call when the process
# has already exited.
stop_ollama() {
  kill "$OLLAMA_PID" 2>/dev/null || true
  wait "$OLLAMA_PID" 2>/dev/null || true
}

# Bash does not pass TERM/INT on to background children, so without these
# traps the server would never be asked to shut down when the script is
# signalled. Exit codes follow the usual 128+signal convention.
trap 'stop_ollama; exit 143' TERM
trap 'stop_ollama; exit 130' INT

# Poll the tags endpoint until the server answers. Give up after
# MAX_ATTEMPTS probes, or immediately if the server process has died
# (no point waiting out the full timeout for a crashed server).
readonly MAX_ATTEMPTS=40
readonly POLL_INTERVAL=2

echo "⏳ Waiting for Ollama on :7860..."
for (( attempt = 1; attempt <= MAX_ATTEMPTS; attempt++ )); do
  if curl -sf http://localhost:7860/api/tags > /dev/null 2>&1; then
    echo "✅ Ollama ready"
    break
  fi
  if (( attempt == MAX_ATTEMPTS )) || ! kill -0 "$OLLAMA_PID" 2>/dev/null; then
    echo "❌ Ollama failed to start" >&2
    stop_ollama
    exit 1
  fi
  sleep "$POLL_INTERVAL"
done

# ── Pull models ───────────────────────────────────────────────
echo ""
echo "📦 Pulling models..."
#######################################
# Make sure an Ollama model is available locally, pulling it if needed.
# Arguments: $1 - model name (e.g. "qwen3:1.7b")
# Outputs:   progress lines on stdout; a warning on stderr if the pull fails
# Returns:   0 always — a failed pull is reported but deliberately
#            non-fatal so the server keeps running with whatever is cached
#######################################
pull_if_missing() {
  local model="$1"

  # `ollama show` succeeds only when the model is already in the store.
  if ollama show "$model" > /dev/null 2>&1; then
    echo " ✓ cached: $model"
    return 0
  fi

  echo " ↓ pulling: $model"
  # Explicit if/else instead of `a && b || c`, so the failure branch can
  # only run when the pull itself failed.
  if ollama pull "$model"; then
    echo " ✅ $model"
  else
    echo " ⚠️ failed: $model (non-fatal)" >&2
  fi
  return 0
}

pull_if_missing "qwen3:1.7b"
pull_if_missing "nomic-embed-text"

echo ""
echo "📊 Loaded models:"
# Purely informational: a listing failure must not abort startup when the
# script runs with errexit enabled.
ollama list || echo " ⚠️ could not list models (non-fatal)" >&2

echo ""
echo "═══════════════════════════════════════════════════════════"
echo "✅ Agent Q-Q LIVE"
echo ""
echo " Public API: http://0.0.0.0:7860"
echo ""
echo " Endpoints:"
echo " GET /api/tags → list models"
echo " GET /v1/models → OpenAI model list"
echo " POST /v1/chat/completions → OpenAI chat"
echo " POST /api/generate → Ollama native"
echo " POST /api/embeddings → embeddings"
echo ""
echo " Models resident:"
echo " qwen3:1.7b ~1.1GB chat / triage"
echo " nomic-embed-text ~0.3GB embeddings"
echo "═══════════════════════════════════════════════════════════"

# Block until the server exits; as the last command, its exit status
# becomes the script's. OLLAMA_PID is expected to be set earlier in the
# script — if it is not, fall back to waiting on all background children,
# which is what the unquoted original expanded to in that case.
if [[ -n "${OLLAMA_PID:-}" ]]; then
  wait "$OLLAMA_PID"
else
  wait
fi