fix(oom): stagger cron offsets + STREAM=1 — minute=0 burst was OOM trigger
User: 'I see it's broken too'; the Space was stuck in a RUNTIME_ERROR cycle.
Root cause: at every hour boundary (epoch-minute M % 60 == 0) FIVE cron
entries with M%X==0 offsets fired simultaneously:
M%2==0 surrogate-dev-loop
M%3==0 push-training-to-hf ← biggest, loads large shards
M%5==0 work-queue-producer
M%15==0 surrogate-self-ingest
M%60==0 scrape-keyword-tuner
5 Python processes × 200-500 MB peak each, plus the base stack (~3-4 GB) and
2 streaming workers (~1.6 GB), is a >7 GB burst, enough to push past the
16 GB cpu-basic ceiling when continuous-discoverer (50 MB), redis (1 GB),
the status server (200 MB), github-crawler (200 MB), and the master cron
loop (50 MB) are also active.
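Quick repro of the alignment (a sketch only, not repo code; it just applies
the five modulo rules above to a 2-hour sweep of the epoch-minute counter):

  for M in $(seq 0 119); do
    n=0
    for X in 2 3 5 15 60; do [[ $((M % X)) -eq 0 ]] && n=$((n+1)); done
    [[ $n -eq 5 ]] && echo "M=$M: all 5 entries fire"   # prints M=0 and M=60
  done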
Result: the Space crashed every 60 minutes; restart took 3-5 min; the
cycle repeated.
Datasets DID grow (latest commit: shard45 +866K pairs ×3 in a 90s burst),
but only during ALIVE windows; most of the last 4h was DEAD.
Fix:
- M%2==0 → M%2==1
- M%3==0 → M%3==1
- M%5==0 → M%5==2
- M%15==0 → M%15==3
- M%60==0 → M%60==4
Each fires on a different minute residue → the all-five hour-boundary
burst can't recur.
- STREAM_WORKERS LOW_MEM: 2 → 1 (anchor's 4 workers compensate when up)
Expected: the Space stays up continuously and dataset growth becomes steady
(~2.6M pairs/min sustained) instead of arriving in bursts during alive windows.
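Sanity check on the new residues (sketch; lcm(2,3,5,15,60) = 60, so one
60-minute scan covers a full cycle):

  worst=0
  for M in $(seq 0 59); do
    n=0
    [[ $((M % 2)) -eq 1 ]] && n=$((n+1))    # surrogate-dev-loop
    [[ $((M % 3)) -eq 1 ]] && n=$((n+1))    # push-training-to-hf
    [[ $((M % 5)) -eq 2 ]] && n=$((n+1))    # work-queue-producer
    [[ $((M % 15)) -eq 3 ]] && n=$((n+1))   # surrogate-self-ingest
    [[ $((M % 60)) -eq 4 ]] && n=$((n+1))   # scrape-keyword-tuner
    [[ $n -gt $worst ]] && worst=$n
  done
  echo "worst overlap: $worst"   # 3 (dev-loop+push+producer at M%30==7)

Worst case drops from 5 concurrent fires to 3, and nothing fires on the
hour boundary itself anymore.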
@@ -352,8 +352,10 @@ python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mi
 # LOW_MEM tuned for cpu-basic 16GB after Round 9+10 OOM:
 # 0 bulk (full-download too heavy) + 2 streaming (lighter) on LOW_MEM
 # Anchor handles bulk via 24GB ARM headroom.
+# Tighter further after Round 11+12 OOM:
+# 1 streaming worker on LOW_MEM (was 2). Anchor takes the slack via 6 workers.
 BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 0 || echo 4)}"
-STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 4)}"
+STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
 
 for i in $(seq 1 "$BULK_WORKERS"); do
 nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
@@ -388,26 +390,25 @@ LOG="${HOME}/.surrogate/logs/cron.log"
 mkdir -p "$(dirname "$LOG")"
 while true; do
 M=$(($(date +%s) / 60))
-# …
-[[ $((M % 2)) -eq 0 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
-# …
-[[ $((M % 5)) -eq 0 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
-# …
-[[ $((M % 3)) -eq 0 ]] && bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &
+# Cron offsets STAGGERED — minute=0 burst was OOM trigger.
+# Each major task picks a unique M%X==N offset so no two fire together.
+[[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
+[[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
+[[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &
 # auto-orchestrate now runs CONTINUOUSLY (4 parallel workers) — see step 7e below.
 # Cron entry retained for legacy single-fire boost (no harm if continuous already up):
 [[ $((M % 20)) -eq 0 ]] && pgrep -f "auto-orchestrate-continuous" >/dev/null || bash ~/.surrogate/bin/auto-orchestrate-loop.sh >> "$LOG" 2>&1 &
 # Every 30 min: research-apply (pop queue → orchestrate → ship feature)
 [[ $((M % 30)) -eq 15 ]] && bash ~/.surrogate/bin/surrogate-research-apply.sh >> "$LOG" 2>&1 &
 # Every 60 min: keyword tuner (adapts scrape queue based on yields)
-[[ $((M % 60)) -eq 0 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &
+[[ $((M % 60)) -eq 4 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &
 # Every 6 hours: research-loop (discover new features from competitors/papers)
 [[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/surrogate-research-loop.sh >> "$LOG" 2>&1 &
 # Every 60 min: dataset enrich (pulls fresh public datasets, dedups, uploads to HF)
 # (was 4h — accelerated to drain 96-dataset queue ASAP per user request)
 [[ $((M % 60)) -eq 5 ]] && bash ~/.surrogate/bin/dataset-enrich.sh >> "$LOG" 2>&1 &
 # Every 15 min: self-ingest training-pairs into FTS index (closes self-improvement)
-[[ $((M % 15)) -eq 0 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &
+[[ $((M % 15)) -eq 3 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &
 # Every 30 min: build vector embeddings index (RAG semantic search)
 [[ $((M % 30)) -eq 12 ]] && bash ~/.surrogate/bin/rag-vector-builder.sh >> "$LOG" 2>&1 &
 # Every 30 min: synthetic data generation (REWORK→APPROVE DPO + distilabel rewrite)