madDegen committed on
Commit
a8bb2ab
·
verified ·
1 Parent(s): 8a35e8e

fix: remove litellm[proxy], Ollama direct on :7860

Browse files
Files changed (2) hide show
  1. Dockerfile +13 -26
  2. startup.sh +32 -71
Dockerfile CHANGED
@@ -1,15 +1,14 @@
1
  # ─────────────────────────────────────────────────────────────
2
- # Agent Q-Q (Agent Q-QAJAQS) — QLAWED-Q HF Space v2.0
3
- # SDK: Docker | Public Port: 7860 (LiteLLM proxy)
4
  #
5
  # Architecture:
6
- # Ollama internal :11434 (model inference)
7
- # LiteLLM → public :7860 (Claude-compatible API)
8
  #
9
- # Endpoints exposed at :7860:
10
  # POST /v1/chat/completions (OpenAI SDK compatible)
11
- # POST /v1/messages (Anthropic SDK compatible)
12
- # GET /health
13
  # GET /v1/models
14
  # ─────────────────────────────────────────────────────────────
15
 
@@ -21,43 +20,31 @@ ENV DEBIAN_FRONTEND=noninteractive
21
  RUN apt-get update && apt-get install -y \
22
  curl \
23
  ca-certificates \
24
- python3 \
25
- python3-pip \
26
- git \
27
- git-lfs \
28
  && rm -rf /var/lib/apt/lists/*
29
 
30
  # ── Ollama ────────────────────────────────────────────────────
31
  RUN curl -fsSL https://ollama.com/install.sh | sh
32
 
33
- # ── Python packages (system-wide, available to all users) ─────
34
- RUN pip3 install --no-cache-dir \
35
- "litellm[proxy]>=1.40.0" \
36
- "huggingface_hub[cli]>=0.23.0" \
37
- PyYAML \
38
- uvicorn
39
-
40
  # ── HF Spaces: non-root user uid 1000 ─────────────────────────
41
  RUN useradd -m -u 1000 user
42
  USER user
43
 
44
  # ── Environment ───────────────────────────────────────────────
 
45
  ENV OLLAMA_MODELS=/home/user/.ollama/models
46
- ENV OLLAMA_HOST=0.0.0.0:11434
47
- ENV HF_HOME=/home/user/.cache/huggingface
 
48
  ENV HOME=/home/user
49
  ENV PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
50
 
51
  WORKDIR /home/user
52
 
53
- # ── Copy config files ─────────────────────────────────────────
54
- COPY --chown=user startup.sh /home/user/startup.sh
55
- COPY --chown=user litellm_config.yaml /home/user/litellm_config.yaml
56
- COPY --chown=user Modelfile.qlawed-frontend /home/user/Modelfile.qlawed-frontend
57
-
58
  RUN chmod +x /home/user/startup.sh
59
 
60
- # ── Public port: LiteLLM proxy ────────────────────────────────
61
  EXPOSE 7860
62
 
63
  CMD ["/home/user/startup.sh"]
 
1
  # ─────────────────────────────────────────────────────────────
2
+ # Agent Q-Q (Agent Q-QAJAQS) — QLAWED-Q HF Space v3.1
3
+ # SDK: Docker | Public Port: 7860 (Ollama direct)
4
  #
5
  # Architecture:
6
+ # Ollama public :7860 (native OpenAI-compatible API)
 
7
  #
8
+ # Endpoints at :7860:
9
  # POST /v1/chat/completions (OpenAI SDK compatible)
10
+ # POST /api/generate (Ollama native)
11
+ # GET /api/tags (model list)
12
  # GET /v1/models
13
  # ─────────────────────────────────────────────────────────────
14
 
 
20
  RUN apt-get update && apt-get install -y \
21
  curl \
22
  ca-certificates \
 
 
 
 
23
  && rm -rf /var/lib/apt/lists/*
24
 
25
  # ── Ollama ────────────────────────────────────────────────────
26
  RUN curl -fsSL https://ollama.com/install.sh | sh
27
 
 
 
 
 
 
 
 
28
  # ── HF Spaces: non-root user uid 1000 ─────────────────────────
29
  RUN useradd -m -u 1000 user
30
  USER user
31
 
32
  # ── Environment ───────────────────────────────────────────────
33
+ # Serve directly on the HF public port
34
  ENV OLLAMA_MODELS=/home/user/.ollama/models
35
+ ENV OLLAMA_HOST=0.0.0.0:7860
36
+ ENV OLLAMA_KEEP_ALIVE=10m
37
+ ENV OLLAMA_NUM_PARALLEL=2
38
  ENV HOME=/home/user
39
  ENV PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
40
 
41
  WORKDIR /home/user
42
 
43
+ # ── Copy startup script ───────────────────────────────────────
44
+ COPY --chown=user startup.sh /home/user/startup.sh
 
 
 
45
  RUN chmod +x /home/user/startup.sh
46
 
47
+ # ── Public port: Ollama API ───────────────────────────────────
48
  EXPOSE 7860
49
 
50
  CMD ["/home/user/startup.sh"]
startup.sh CHANGED
@@ -1,57 +1,43 @@
1
  #!/bin/bash
2
  # ─────────────────────────────────────────────────────────────
3
- # Agent Q-Q (QLAWED-Q) — HF Space Startup v3.0
4
- # MODE: Passthrough / Triage only
5
- #
6
- # When laptop is online: laptop handles all generative load.
7
- # This Space runs only when laptop is offline — triage + embeddings.
8
  #
9
  # VRAM budget (T4-small, 16GB):
10
- # qwen3:1.7b 1.10GB triage, non-generative
11
  # nomic-embed-text 0.27GB embeddings / pgvector
12
  # ──────────────────────────────────────────────
13
- # Total resident: ~1.37GB / 16GB (minimal cost)
14
  # ─────────────────────────────────────────────────────────────
15
 
16
  set -e
17
 
18
  echo "╔═════════════════════════════════════════════╗"
19
- echo "║ Agent Q-Q — Passthrough Mode ║"
20
- echo "║ qwen3:1.7b :11434 (triage) ║"
21
- echo "║ LiteLLM :7860 (Claude-compat proxy) ║"
22
  echo "╚═════════════════════════════════════════════╝"
23
  echo ""
24
 
25
  # ── Persistent disk (HF Pro) ──────────────────────────────────
26
- # HF Pro persistent storage mounts at /data
27
- # GGUF models cached here survive Space restarts (no re-download)
28
- PERSISTENT_DISK="/data"
29
- GGUF_DIR="$PERSISTENT_DISK/gguf"
30
- OLLAMA_MODELS_DIR="$PERSISTENT_DISK/ollama-models"
31
-
32
- if [ -d "$PERSISTENT_DISK" ]; then
33
- echo "✅ Persistent disk detected at /data"
34
- mkdir -p "$GGUF_DIR" "$OLLAMA_MODELS_DIR"
35
- export OLLAMA_MODELS="$OLLAMA_MODELS_DIR"
36
  else
37
- echo "⚠️ No persistent disk — using ephemeral storage"
38
- echo " Enable in: Space Settings → Persistent Storage → mount /data"
39
- GGUF_DIR="$HOME/.ollama/gguf"
40
- mkdir -p "$GGUF_DIR"
41
  fi
42
 
43
- # ── 1. Start Ollama ───────────────────────────────────────────
44
- export OLLAMA_HOST=0.0.0.0:11434
45
- export OLLAMA_KEEP_ALIVE=5m
46
  export OLLAMA_NUM_PARALLEL=2
47
- export OLLAMA_MAX_LOADED_MODELS=2
48
 
49
  ollama serve &
50
  OLLAMA_PID=$!
51
 
52
- echo "⏳ Waiting for Ollama on :11434..."
53
  for i in {1..40}; do
54
- if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
55
  echo "✅ Ollama ready"
56
  break
57
  fi
@@ -62,10 +48,9 @@ for i in {1..40}; do
62
  sleep 2
63
  done
64
 
65
- # ── 2. Pull triage models ─────────────────────────────────────
66
- echo ""
67
- echo "📦 Setting up passthrough models..."
68
  echo ""
 
69
 
70
  pull_if_missing() {
71
  local model="$1"
@@ -73,57 +58,33 @@ pull_if_missing() {
73
  echo " ✓ cached: $model"
74
  else
75
  echo " ↓ pulling: $model"
76
- ollama pull "$model" && echo " ✅ ready: $model" || echo " ⚠️ failed: $model"
77
  fi
78
  }
79
 
80
- # Triage agent — always resident, ~1.1GB
81
  pull_if_missing "qwen3:1.7b"
82
-
83
- # Embeddings — always resident, ~0.27GB
84
  pull_if_missing "nomic-embed-text"
85
 
86
  echo ""
87
- echo "📊 Registered models:"
88
  ollama list
89
  echo ""
90
 
91
- # ── 3. Start LiteLLM proxy ───────────────────────────────────
92
- echo "🔀 Starting LiteLLM proxy on :7860..."
93
-
94
- litellm \
95
- --config "$HOME/litellm_config.yaml" \
96
- --port 7860 \
97
- --host 0.0.0.0 \
98
- --telemetry False &
99
- LITELLM_PID=$!
100
-
101
- echo "⏳ Waiting for LiteLLM..."
102
- for i in {1..30}; do
103
- if curl -sf http://localhost:7860/health > /dev/null 2>&1; then
104
- echo "✅ LiteLLM ready"
105
- break
106
- fi
107
- if [ $i -eq 30 ]; then
108
- echo "⚠️ LiteLLM slow to start — may still be initializing"
109
- break
110
- fi
111
- sleep 2
112
- done
113
-
114
- echo ""
115
  echo "═══════════════════════════════════════════════════════════"
116
- echo "✅ Agent Q-Q HF Space — PASSTHROUGH MODE LIVE"
117
  echo ""
118
- echo " Internal Ollama: http://localhost:11434"
119
- echo " Public LiteLLM: http://0.0.0.0:7860"
120
  echo ""
121
- echo " Routing (passthrough mode):"
122
- echo " claude-haiku-4-5 → qwen3:1.7b (triage/non-generative)"
123
- echo " text-embedding-* → nomic-embed-text (embeddings)"
 
 
 
124
  echo ""
125
- echo " NOTE: Generative tasks (claude-sonnet-4-6) handled by laptop."
126
- echo " Start laptop stack to enable full generative capability."
 
127
  echo "═══════════════════════════════════════════════════════════"
128
 
129
- wait $LITELLM_PID $OLLAMA_PID
 
1
  #!/bin/bash
2
  # ─────────────────────────────────────────────────────────────
3
+ # Agent Q-Q (QLAWED-Q) — HF Space Startup v3.1
4
+ # MODE: Ollama direct on :7860
 
 
 
5
  #
6
  # VRAM budget (T4-small, 16GB):
7
+ # qwen3:1.7b 1.10GB triage + chat
8
  # nomic-embed-text 0.27GB embeddings / pgvector
9
  # ──────────────────────────────────────────────
10
+ # Total resident: ~1.37GB / 16GB
11
  # ─────────────────────────────────────────────────────────────
12
 
13
  set -e
14
 
15
  echo "╔═════════════════════════════════════════════╗"
16
+ echo "║ Agent Q-Q — Ollama Direct Mode ║"
17
+ echo "║ Serving on :7860 (OpenAI-compatible) ║"
 
18
  echo "╚═════════════════════════════════════════════╝"
19
  echo ""
20
 
21
  # ── Persistent disk (HF Pro) ──────────────────────────────────
22
+ if [ -d "/data" ]; then
23
+ echo "✅ Persistent disk at /data"
24
+ mkdir -p /data/ollama-models
25
+ export OLLAMA_MODELS="/data/ollama-models"
 
 
 
 
 
 
26
  else
27
+ echo "⚠️ Ephemeral storage (models re-download on restart)"
 
 
 
28
  fi
29
 
30
+ # ── Start Ollama on port 7860 ─────────────────────────────────
31
+ export OLLAMA_HOST=0.0.0.0:7860
32
+ export OLLAMA_KEEP_ALIVE=10m
33
  export OLLAMA_NUM_PARALLEL=2
 
34
 
35
  ollama serve &
36
  OLLAMA_PID=$!
37
 
38
+ echo "⏳ Waiting for Ollama on :7860..."
39
  for i in {1..40}; do
40
+ if curl -sf http://localhost:7860/api/tags > /dev/null 2>&1; then
41
  echo "✅ Ollama ready"
42
  break
43
  fi
 
48
  sleep 2
49
  done
50
 
51
+ # ── Pull models ───────────────────────────────────────────────
 
 
52
  echo ""
53
+ echo "📦 Pulling models..."
54
 
55
  pull_if_missing() {
56
  local model="$1"
 
58
  echo " ✓ cached: $model"
59
  else
60
  echo " ↓ pulling: $model"
61
+ ollama pull "$model" && echo " ✅ $model" || echo " ⚠️ failed: $model (non-fatal)"
62
  fi
63
  }
64
 
 
65
  pull_if_missing "qwen3:1.7b"
 
 
66
  pull_if_missing "nomic-embed-text"
67
 
68
  echo ""
69
+ echo "📊 Loaded models:"
70
  ollama list
71
  echo ""
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  echo "═══════════════════════════════════════════════════════════"
74
+ echo "✅ Agent Q-Q LIVE"
75
  echo ""
76
+ echo " Public API: http://0.0.0.0:7860"
 
77
  echo ""
78
+ echo " Endpoints:"
79
+ echo " GET /api/tags list models"
80
+ echo " GET /v1/models OpenAI model list"
81
+ echo " POST /v1/chat/completions → OpenAI chat"
82
+ echo " POST /api/generate → Ollama native"
83
+ echo " POST /api/embeddings → embeddings"
84
  echo ""
85
+ echo " Models resident:"
86
+ echo " qwen3:1.7b ~1.1GB chat / triage"
87
+ echo " nomic-embed-text ~0.3GB embeddings"
88
  echo "═══════════════════════════════════════════════════════════"
89
 
90
+ wait $OLLAMA_PID