fix(start): LOW_MEM mode for cpu-basic — Space dead 7h on OOM
ROOT CAUSE of 'Memory limit exceeded (16Gi)' since 2026-04-29 ~18:00 UTC:
ollama loaded qwen3-coder:30b-a3b (~17 GB Q4) + qwen2.5-coder:14b (~9 GB)
+ granite-code:8b (~5 GB) on a 16 GB cpu-basic Space. The single 30B model
alone already exceeded the limit, and 4 bulk workers + scrape parallel=8 +
agentic-crawler parallel=6 + auto-orchestrate-continuous (4 workers)
compounded the OOM.
Fix (LOW_MEM=1 default; set =0 once Space upgrades to cpu-upgrade ≥32 GB):
- DISABLE ollama serve + heavy model pulls (free LLM ladder serves all v2
inference faster than CPU x86 anyway — wafer-scale Cerebras > local CPU)
- scrape-daemon parallel: 8 → 2
- agentic-crawler parallel: 6 → 2
- auto-orchestrate-continuous: SKIP (cron slot M%20==0 already covers it)
- bulk-mirror workers: 4 → 1 (still drains 100+ dataset queue, just slower)
Memory after fix: ~6-8 GB peak (the failing boot attempted ~25-30 GB peak).
Datasets stopped growing because the Space was crash-looping for 7h, NOT
because of Round 5/6 code; this fix unblocks dataset growth + the cron loop.
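A follow-up worth considering (explicitly not in this commit): derive the LOW_MEM default from the container's actual cgroup memory cap, so upgrading the Space tier flips the mode without touching variables. A minimal sketch, assuming cgroup v2 (memory.max) with a cgroup v1 fallback; detect_low_mem is a hypothetical helper:

# Hypothetical helper, NOT in this commit: default LOW_MEM=1 only when the
# container's cgroup memory cap is at or below the 16 GiB cpu-basic tier.
detect_low_mem() {
  local limit=""
  if [[ -r /sys/fs/cgroup/memory.max ]]; then
    limit=$(cat /sys/fs/cgroup/memory.max)                    # cgroup v2
  elif [[ -r /sys/fs/cgroup/memory/memory.limit_in_bytes ]]; then
    limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)  # cgroup v1
  fi
  # "max" or unreadable means no visible cap: treat as a big host.
  if [[ -z "$limit" || "$limit" == "max" ]]; then echo 0
  elif (( limit <= 17179869184 )); then echo 1                # <= 16 GiB
  else echo 0
  fi
}
LOW_MEM="${LOW_MEM:-$(detect_low_mem)}"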
@@ -197,40 +197,38 @@ redis-server --daemonize yes --port 6379 --bind 127.0.0.1 \
 sleep 1
 redis-cli -h 127.0.0.1 -p 6379 ping >> "$LOG_DIR/redis.log" 2>&1

-# ── 5. Ollama (
-sleep 6
-# Pull models only on first boot (cache lives in /data/.ollama/models).
-# Primary coding brain: qwen3-coder MoE (newest official Qwen coder; ~16GB Q4, 3B active = fast on CPU).
-# Fallback: qwen2.5-coder:14b (proven). Light: gemma4:e4b (kept for quick triage).
+# ── 5. Ollama — DISABLED on cpu-basic (16 GB limit) ───────────────────────
+# Root cause of 7-hr Runtime Error 2026-04-29: ollama loading qwen3-coder:30b
+# (~17 GB Q4) + qwen2.5-coder:14b (~9 GB) + granite (~5 GB) = ~31 GB of model
+# weights against a 16 GB cap → instant OOM on any inference.
 #
-#
-#
-#
-#
+# On cpu-basic the FREE LLM LADDER (cerebras/groq/openrouter/gemini/chutes)
+# is faster anyway — wafer-scale inference beats CPU x86 by 50-200×.
+# Ollama only worth running once Space upgrades to ≥cpu-upgrade (32 GB) OR
+# moves to OCI A1.Flex anchor (24 GB ARM, native ollama support).
+#
+# Set LOW_MEM=0 to re-enable on bigger Space tier.
+LOW_MEM="${LOW_MEM:-1}"
+if [[ "$LOW_MEM" == "1" ]]; then
+  echo "[$(date +%H:%M:%S)] ⚠ ollama SKIPPED (LOW_MEM=1, cpu-basic 16 GB)" \
+    >> "$LOG_DIR/boot.log"
+  echo "[$(date +%H:%M:%S)] → free LLM ladder serves all v2 inference" \
+    >> "$LOG_DIR/boot.log"
+else
+  OLLAMA_MODELS="${HOME}/.ollama/models" \
+  OLLAMA_HOST=127.0.0.1:11434 \
+  nohup ollama serve > "$LOG_DIR/ollama.log" 2>&1 &
+  sleep 6
+  (
+    if ! ollama list 2>/dev/null | grep -q "nomic-embed-text"; then
+      ollama pull nomic-embed-text > "$LOG_DIR/ollama-pull-embed.log" 2>&1
+    fi
+    if ! ollama list 2>/dev/null | grep -q "qwen2.5-coder:3b"; then
+      # Smallest coder that's actually useful — fits any tier
+      ollama pull qwen2.5-coder:3b > "$LOG_DIR/ollama-pull-3b.log" 2>&1
+    fi
+  ) &
+fi

 # ── 6. Discord bot (only if egress to discord.com is reachable) ────────────
 # HF Spaces free tier may block egress to discord.com — bot would crash-loop.
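The ~6-8 GB post-fix figure is easy to spot-check from inside the running container; an illustrative probe (assumes cgroup v2 paths, not part of the commit):

# Illustrative check, not in the commit: current vs. max cgroup memory.
CUR=$(cat /sys/fs/cgroup/memory.current 2>/dev/null || echo 0)
MAX=$(cat /sys/fs/cgroup/memory.max 2>/dev/null || echo max)
echo "mem: $(( CUR / 1024 / 1024 )) MiB used, limit: $MAX"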
@@ -245,43 +243,47 @@ if [[ -n "${DISCORD_BOT_TOKEN:-}" ]]; then
 fi
 fi

-# ── 7a. Continuous scrape daemon
+# ── 7a. Continuous scrape daemon — concurrency tuned to LOW_MEM ────────────
+SCRAPE_PARALLEL="${SCRAPE_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 8)}"
+cat > /tmp/scrape-daemon.sh <<SCRAPESH
 #!/bin/bash
-# 8 concurrent scrape workers, near-zero idle time.
 set -a; source ~/.hermes/.env 2>/dev/null; set +a
-LOG="${HOME}/.surrogate/logs/scrape-continuous.log"
-mkdir -p "$(dirname "$LOG")"
+LOG="\${HOME}/.surrogate/logs/scrape-continuous.log"
+mkdir -p "\$(dirname "\$LOG")"
 while true; do
-  START=$(date +%s)
-  bash ~/.surrogate/bin/domain-scrape-loop.sh 1500
-  DUR=$(( $(date +%s) - START ))
-  elif [[ $DUR -lt 120 ]]; then sleep 15
+  START=\$(date +%s)
+  bash ~/.surrogate/bin/domain-scrape-loop.sh 1500 ${SCRAPE_PARALLEL} >> "\$LOG" 2>&1
+  DUR=\$(( \$(date +%s) - START ))
+  if [[ \$DUR -lt 30 ]]; then sleep 30
+  elif [[ \$DUR -lt 120 ]]; then sleep 15
   else sleep 5
   fi
 done
 SCRAPESH
 chmod +x /tmp/scrape-daemon.sh
 nohup /tmp/scrape-daemon.sh > "$LOG_DIR/scrape-daemon.log" 2>&1 &
-echo "[$(date +%H:%M:%S)]
+echo "[$(date +%H:%M:%S)] scrape daemon parallel=${SCRAPE_PARALLEL} (LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"

-# ── 7b. Agentic crawler
+# ── 7b. Agentic crawler ────────────────────────────────────────────────────
+CRAWLER_PARALLEL="${CRAWLER_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 6)}"
+nohup bash ~/.surrogate/bin/agentic-crawler.sh "$CRAWLER_PARALLEL" > "$LOG_DIR/agentic-crawler.log" 2>&1 &
+echo "[$(date +%H:%M:%S)] agentic crawler parallel=$CRAWLER_PARALLEL" >> "$LOG_DIR/boot.log"

-# ── 7b2. GitHub-specific agentic crawler (
+# ── 7b2. GitHub-specific agentic crawler (lightweight — keep on) ───────────
 nohup bash ~/.surrogate/bin/github-agentic-crawler.sh > "$LOG_DIR/github-agentic-crawler.log" 2>&1 &
-echo "[$(date +%H:%M:%S)] github-agentic-crawler started
+echo "[$(date +%H:%M:%S)] github-agentic-crawler started" >> "$LOG_DIR/boot.log"

-# ── 7b3. HF Dataset Discoverer
+# ── 7b3. HF Dataset Discoverer ─────────────────────────────────────────────
 nohup bash ~/.surrogate/bin/hf-dataset-discoverer.sh > "$LOG_DIR/hf-dataset-discoverer.log" 2>&1 &
-echo "[$(date +%H:%M:%S)] hf-dataset-discoverer started
+echo "[$(date +%H:%M:%S)] hf-dataset-discoverer started" >> "$LOG_DIR/boot.log"

-# ── 7e.
+# ── 7e. auto-orchestrate-continuous — DISABLED on LOW_MEM (cron handles it) ─
+if [[ "$LOW_MEM" != "1" ]]; then
+  nohup bash ~/.surrogate/bin/auto-orchestrate-continuous.sh > "$LOG_DIR/auto-orchestrate-continuous.log" 2>&1 &
+  echo "[$(date +%H:%M:%S)] auto-orchestrate-continuous started (4 parallel workers)" >> "$LOG_DIR/boot.log"
+else
+  echo "[$(date +%H:%M:%S)] ⚠ auto-orchestrate-continuous SKIPPED (LOW_MEM); cron slot at M%20==0 covers it" >> "$LOG_DIR/boot.log"
+fi

 # ── 7e1. SELF-HEAL WATCHDOG — must start BEFORE memory-hungry workers ───────
 # Monitors RAM usage every 60s; preempts youngest dataset-enrich shard if
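Note the mixed expansion inside the SCRAPESH heredoc above: the delimiter is unquoted, so ${SCRAPE_PARALLEL} is baked into /tmp/scrape-daemon.sh at write time, while the escaped \$LOG, \$START and \$DUR stay literal and expand only when the daemon runs. A toy reproduction of the same pattern (names are illustrative):

# Write-time vs. run-time expansion in an unquoted heredoc (demo only).
N=2
cat > /tmp/demo.sh <<EOF
#!/bin/bash
# N was expanded when this file was written; \$HOME resolves at run time.
echo "parallel=$N home=\$HOME"
EOF
bash /tmp/demo.sh   # prints: parallel=2 home=<whatever $HOME is then>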
@@ -332,11 +334,12 @@ echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
 # 100+ massive datasets in bin/v2/bulk-datasets-massive.txt (code/security/SDLC/agent/etc).
 # Lease-based claims (15 min) — crashes auto-expire so other workers pick up.
 python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mirror-seed.log" 2>&1 || true
+BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
+for i in $(seq 1 "$BULK_WORKERS"); do
   nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
     > "$LOG_DIR/bulk-worker-$i.log" 2>&1 &
 done
-echo "[$(date +%H:%M:%S)] bulk-mirror coordinator +
+echo "[$(date +%H:%M:%S)] bulk-mirror coordinator + $BULK_WORKERS workers started (100+ datasets queued, LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"

 # ── 7d. Train-ready pusher — disabled at boot for now. Caused Space
 # RUNTIME_ERROR on first deployment (2026-04-29). Script kept at
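The "lease-based claims (15 min)" in the comment map naturally onto one Redis primitive, and redis-server is already up earlier in start.sh. A hedged sketch of the idea only; the key layout and claim() helper are hypothetical, the real scheme lives in bulk-mirror-coordinator.py:

# Hypothetical lease via Redis: SET ... NX EX 900 is an atomic 15-min claim,
# so exactly one worker wins and a crashed worker's lease simply expires.
claim() {  # usage: claim <dataset-id> <worker-id>
  [[ "$(redis-cli -h 127.0.0.1 -p 6379 set "lease:$1" "$2" NX EX 900)" == "OK" ]]
}
if claim "some/dataset" "bulk-w1"; then
  echo "claimed; mirror it, then release with: redis-cli del lease:some/dataset"
fi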
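Rollback once the Space leaves cpu-basic is a single variable flip; the knobs introduced above can also be overridden individually (values shown are examples, not new defaults):

# Example Space variables for a >=32 GB tier:
LOW_MEM=0            # re-enables the ollama branch at boot
SCRAPE_PARALLEL=8    # optional: LOW_MEM=0 already restores 8/6/4
CRAWLER_PARALLEL=6
BULK_WORKERS=4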