#!/usr/bin/env bash
# Hermes start orchestrator for HF Space.
# Boots: persistent /data mount → Redis → Ollama → axentx repos → daemons → status server.
set -uo pipefail
LOG_DIR="${HOME}/.surrogate/logs"
mkdir -p "$LOG_DIR"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start" >> "$LOG_DIR/boot.log"
# Trace mode for early steps only (no secrets here yet): find the hang point but stay safe.
PS4='[trace ${LINENO}] '
set -x
# Echo stdout so HF run-logs see progress (safe steps before .env is loaded)
exec > >(tee -a "$LOG_DIR/boot.log") 2>&1
# ── Memory mode (must be set BEFORE any reference; we use `set -u`) ─────────
# CPU-Basic Space = 16 GB cap. With LOW_MEM=1 we skip the heavy harvest
# launchers (dataset-enrich, dataset-mirror, kaggle-trainer, lightning-trainer,
# dedup-bootstrap); those run on GCP daemons instead. Set LOW_MEM=0 only
# on a paid Space tier (cpu-upgrade, ≥32 GB).
LOW_MEM="${LOW_MEM:-1}"
# ── 1. Persistent data: symlink state subdirs to /data (HF persistent mount) ──
# bin/ is NOT persisted (baked into image, refreshed on every push).
# Persisted: state (DBs), logs, memory, skills, sessions, training pairs,
# workspace (hermes runtime), projects (axentx clones), ollama (model cache).
DATA="/data"
if [[ -d "$DATA" ]] && [[ -w "$DATA" ]]; then
mkdir -p "$DATA"/{state,logs,memory,skills,sessions,workspace,projects,ollama,training,reflexion,index}
# Migrate from any older layout (one-time): if /data/surrogate/state exists, move up one level
if [[ -d "$DATA/surrogate/state" ]] && [[ ! -L "$DATA/state" ]]; then
mv "$DATA/surrogate"/* "$DATA/" 2>/dev/null || true
rmdir "$DATA/surrogate" 2>/dev/null || true
fi
for spec in \
"${HOME}/.surrogate/state:${DATA}/state" \
"${HOME}/.surrogate/logs:${DATA}/logs" \
"${HOME}/.surrogate/memory:${DATA}/memory" \
"${HOME}/.surrogate/skills:${DATA}/skills" \
"${HOME}/.surrogate/sessions:${DATA}/sessions" \
"${HOME}/.hermes/workspace:${DATA}/workspace" \
"${HOME}/.ollama:${DATA}/ollama"; do
target="${spec%%:*}"
link="${spec##*:}"
mkdir -p "$(dirname "$target")"
# Always ensure backing directory exists + writable. If the persistent
# /data mount becomes unavailable mid-run, daemon writes to symlinked
# path fail with Errno 5 I/O error (audit 2026-04-29). Recreating the
# link defensively each boot fixes stale-symlink cases.
mkdir -p "$link" 2>/dev/null || true
if [[ ! -L "$target" ]] || [[ ! -d "$target/" ]]; then
# Either not-a-symlink OR broken symlink (target unreachable)
rm -rf "$target" 2>/dev/null
ln -sfn "$link" "$target"
fi
# Final sanity probe β write a marker; if it fails, the persistent
# mount is broken regardless of the symlink, so log loudly.
if ! touch "$target/.boot-marker" 2>/dev/null; then
echo "[$(date +%H:%M:%S)] β FATAL: $target/ not writable β daemon log writes will Errno 5"
fi
done
# training-pairs.jsonl: single-file persistence
if [[ ! -L "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
rm -f "${HOME}/.surrogate/training-pairs.jsonl" 2>/dev/null
touch "${DATA}/training-pairs.jsonl"
ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
fi
# ── One-time offset reset: skip polluted agentic-crawler placeholder backlog ──
if [[ ! -f "${HOME}/.surrogate/.offset-reset-done" ]] && [[ -f "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
CUR=$(wc -l < "${HOME}/.surrogate/training-pairs.jsonl" | tr -d ' ')
echo "$CUR" > "${HOME}/.surrogate/.training-push-offset"
echo "$CUR" > "${HOME}/.surrogate/.self-ingest-offset"
touch "${HOME}/.surrogate/.offset-reset-done"
echo "[$(date +%H:%M:%S)] one-time offset reset β $CUR (skip placeholder backlog)" >> "$LOG_DIR/boot.log"
fi
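# How a consumer is expected to use these offset files (illustrative sketch,
# not executed here; push-training-to-hf.sh / surrogate-self-ingest.sh own
# the real logic):
#   OFF=$(cat "${HOME}/.surrogate/.training-push-offset" 2>/dev/null || echo 0)
#   tail -n +"$((OFF + 1))" "${HOME}/.surrogate/training-pairs.jsonl" > /tmp/new-pairs.jsonl
#   wc -l < "${HOME}/.surrogate/training-pairs.jsonl" > "${HOME}/.surrogate/.training-push-offset"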
# ── Boot-time dedup.db corruption check ──────────────────────────────
# 16 parallel shards previously corrupted the SQLite WAL. If the DB is
# unreadable on boot, back it up and force re-bootstrap from scratch.
DEDUP_DB="${HOME}/.surrogate/state/dedup.db"
if [[ -f "$DEDUP_DB" ]]; then
if ! sqlite3 "$DEDUP_DB" "SELECT 1 FROM seen_hashes LIMIT 1" >/dev/null 2>&1; then
TS=$(date +%s)
mv "$DEDUP_DB" "${DEDUP_DB}.corrupt-${TS}.bak" 2>/dev/null
rm -f "${DEDUP_DB}-wal" "${DEDUP_DB}-shm"
rm -f "${HOME}/.surrogate/.dedup-bootstrap-done"
echo "[$(date +%H:%M:%S)] WIPED corrupt dedup.db β ${DEDUP_DB}.corrupt-${TS}.bak (forcing re-bootstrap)" >> "$LOG_DIR/boot.log"
fi
fi
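# The SELECT probe above only proves the seen_hashes table is readable. A
# stricter (slower) probe, should partial WAL corruption ever slip past it,
# would be SQLite's built-in checker; sketch only, not currently wired in:
#   if [[ "$(sqlite3 "$DEDUP_DB" "PRAGMA integrity_check;" 2>/dev/null)" != "ok" ]]; then
#     echo "dedup.db failed integrity_check" >> "$LOG_DIR/boot.log"
#   fi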
# ── Heavy harvest launchers: only on HIGH_MEM (LOW_MEM=0) ────────────
# On CPU-Basic (16 GB cap) launching 5 background bash + uvicorn + 5 harvest
# workers blew through the cap and HF auto-killed the container ~5 min after
# boot. These launchers are now scheduled on GCP via hermes-scheduler-daemon
# (entries in data/hermes-jobs.json) so harvest still runs, just not from
# inside the Space's RAM. Re-enable in-Space by setting LOW_MEM=0 once we
# upgrade to a ≥32 GB tier.
if [[ "$LOW_MEM" != "1" ]]; then
# ── One-time central dedup bootstrap from existing data ──────────
if [[ ! -f "${HOME}/.surrogate/.dedup-bootstrap-done" ]]; then
echo "[$(date +%H:%M:%S)] running central dedup bootstrap (one-time)" >> "$LOG_DIR/boot.log"
nohup bash "${HOME}/.surrogate/bin/dedup-bootstrap.sh" > "$LOG_DIR/dedup-bootstrap.log" 2>&1 &
fi
# ── BOOT-TIME enrich kickoff (trigger an immediate pull; don't wait for cron)
nohup bash "${HOME}/.surrogate/bin/dataset-enrich.sh" >> "$LOG_DIR/dataset-enrich.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time dataset-enrich kicked off" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME kaggle-trainer kickoff (don't wait for the 90-min cron) ─
nohup bash "${HOME}/.surrogate/bin/kaggle-trainer.sh" >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time kaggle-trainer kicked off" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME lightning-trainer kickoff: H200 4 hr free for the big model
nohup bash "${HOME}/.surrogate/bin/lightning-trainer.sh" >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time lightning-trainer kicked off (H200 4hr quota)" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME dataset-mirror: bulk-clone top community SFT mixes ─
nohup bash "${HOME}/.surrogate/bin/dataset-mirror.sh" >> "$LOG_DIR/dataset-mirror.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time dataset-mirror kicked off (30 community sources)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] LOW_MEM=1 β skipped 5 heavy harvest launchers (delegated to GCP daemons)" >> "$LOG_DIR/boot.log"
fi
echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] WARN: /data not writable β running ephemeral!" >> "$LOG_DIR/boot.log"
fi
# ── 2. Bind HF Space secrets → ~/.hermes/.env ───────────────────────────────
# DISABLE shell trace before touching secret values.
set +x
echo "[$(date +%H:%M:%S)] writing ~/.hermes/.env from secret env vars (trace OFF)"
mkdir -p ~/.hermes
{
echo "# Auto-generated from HF Space secrets at boot"
for k in OPENROUTER_API_KEY GEMINI_API_KEY GEMINI_API_KEY_2 \
GITHUB_TOKEN GITHUB_TOKEN_POOL DISCORD_BOT_TOKEN DISCORD_WEBHOOK \
CEREBRAS_API_KEY GROQ_API_KEY SAMBANOVA_API_KEY \
CLOUDFLARE_API_KEY NVIDIA_API_KEY CHUTES_API_KEY ANTHROPIC_API_KEY \
HF_TOKEN HUGGING_FACE_HUB_TOKEN; do
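# ${!k} is bash indirect expansion: the value of the variable whose NAME is
# stored in $k; the :- guard keeps set -u from aborting on unset secrets.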
v="${!k:-}"
[[ -n "$v" ]] && echo "${k}=${v}"
done
} > ~/.hermes/.env
chmod 600 ~/.hermes/.env
echo "[$(date +%H:%M:%S)] .env written ($(wc -l < ~/.hermes/.env) keys, perms 600)"
# Trace stays OFF for the rest of boot; we already have line numbers above and won't need them post-secrets.
# ── LOW_MEM short-circuit: skip ALL background daemons, exec status server ──
# CPU-Basic Space cap is 16 GB. Even after gating the 5 boot-time harvest
# launchers, the Space kept hitting the 16 GB cap and hanging at the HTTP
# layer every ~30-40 min. Investigation found 15+ MORE nohup'd background
# daemons below this point (scrape, agentic-crawler, github-crawler,
# self-heal, cron loop, bulk-mirror workers, streaming-mirror workers,
# parquet-ingest, etc.) that collectively grow into the cap within an hour.
#
# In LOW_MEM=1 mode the Space's only job is the FastAPI status server on
# :7860 that serves harvest cursor advance to remote workers. Everything
# else (harvest, mirroring, agent pipeline, training pushes, dataset enrich)
# now runs on the GCP daemon fleet; see hermes-jobs.json (171 jobs scheduled
# via hermes-scheduler-daemon as of 2026-05-02).
#
# Set LOW_MEM=0 to re-enable in-Space launchers when on a paid tier (≥32 GB).
if [[ "$LOW_MEM" == "1" ]]; then
echo "[$(date +%H:%M:%S)] LOW_MEM=1 β skipping all bg daemons + cron, going straight to :7860 status server" | tee -a "$LOG_DIR/boot.log"
set +x # silence trace
# Verify deps before exec β print what's missing rather than silent crash
if python3 -c "import fastapi, uvicorn" 2>/dev/null; then
echo "[$(date +%H:%M:%S)] starting uvicorn :7860 (LOW_MEM fast-path)" | tee -a "$LOG_DIR/boot.log"
exec python3 ~/.surrogate/bin/hermes-status-server.py
else
echo "β fastapi/uvicorn not importable β falling back to plain http.server"
exec python3 -m http.server 7860 --bind 0.0.0.0
fi
fi
# ── 3. Git config + clone axentx repos for auto-orchestrate auto-commit ────
# Disable interactive prompts globally so failed-auth git ops fail fast.
export GIT_TERMINAL_PROMPT=0
export GIT_ASKPASS=/bin/true
GH_TOKEN=$(echo "${GITHUB_TOKEN_POOL:-}" | cut -d',' -f1)
if [[ -n "$GH_TOKEN" ]]; then
git config --global user.email "hermes@axentx.ai"
git config --global user.name "Hermes (Surrogate-1)"
git config --global init.defaultBranch main
git config --global pull.rebase true
git config --global push.default current
PROJECTS_DIR="${DATA}/projects"
mkdir -p "$PROJECTS_DIR"
rm -rf ~/axentx 2>/dev/null
ln -sfn "$PROJECTS_DIR" ~/axentx
# Clone axentx repos in background with a hard timeout; never blocks boot.
# Verified 2026-05-02 via gh api: 5 of 6 entries had the wrong org/name and
# were silently 404'ing (arkashira/*: only surrogate-1-harvest is there;
# the rest are private under the axentx org). The agent pipeline's dev/qa/
# reviewer/commit daemons sat idle for a full day because no repo cloned
# for them to work on. Real paths confirmed via /repos/<owner>/<name>:
# axentx/{Costinel,vanguard,airship,workio,axiomops,surrogate-1} → 200
# arkashira/{Costinel,vanguard,arkship,surrogate,workio,hermes-toolbelt} → 404
# Note: 'arkship' was a typo for 'airship' (axentx/airship).
for repo_spec in \
"Costinel:axentx/Costinel" \
"vanguard:axentx/vanguard" \
"airship:axentx/airship" \
"workio:axentx/workio" \
"axiomops:axentx/axiomops" \
"surrogate-1:axentx/surrogate-1"; do
local_name="${repo_spec%%:*}"
gh_path="${repo_spec##*:}"
target="${PROJECTS_DIR}/${local_name}"
(
if [[ ! -d "$target/.git" ]]; then
echo "[$(date +%H:%M:%S)] cloning $gh_path..." >> "$LOG_DIR/boot.log"
timeout 30 git clone --depth 50 \
"https://x-access-token:${GH_TOKEN}@github.com/${gh_path}.git" "$target" \
>> "$LOG_DIR/git-clone.log" 2>&1 || \
echo "[$(date +%H:%M:%S)] WARN: clone $gh_path failed/timeout" >> "$LOG_DIR/boot.log"
else
cd "$target" && timeout 20 git pull --rebase >> "$LOG_DIR/git-pull.log" 2>&1 || true
fi
) &
done
# Don't wait; let clones finish in the background while boot continues.
# Persist token for any push from auto-orchestrate
git config --global credential.helper "store --file=$HOME/.git-credentials"
echo "https://x-access-token:${GH_TOKEN}@github.com" > ~/.git-credentials
chmod 600 ~/.git-credentials
echo "[$(date +%H:%M:%S)] git auth configured + clone jobs spawned" >> "$LOG_DIR/boot.log"
fi
# ── 4. Redis (TCP only) ─────────────────────────────────────────────────────
# Redis cap tightened on LOW_MEM (was 1gb → 256mb). The coordinator uses
# SQLite directly; redis is only a soft cache for work-queue priorities.
REDIS_MAX="${REDIS_MAX:-$([[ "$LOW_MEM" == "1" ]] && echo "256mb" || echo "1gb")}"
redis-server --daemonize yes --port 6379 --bind 127.0.0.1 \
--maxmemory "$REDIS_MAX" --maxmemory-policy allkeys-lru
sleep 1
redis-cli -h 127.0.0.1 -p 6379 ping >> "$LOG_DIR/redis.log" 2>&1
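# Sketch of the soft-cache pattern described above (illustrative only; the
# key "wq:prio" and the work-queue.db path/schema are hypothetical, not the
# daemons' real names):
#   PRIO=$(redis-cli -h 127.0.0.1 GET "wq:prio" 2>/dev/null)
#   if [[ -z "$PRIO" ]]; then   # cache miss or redis down → fall back to SQLite
#     PRIO=$(sqlite3 "${HOME}/.surrogate/state/work-queue.db" \
#       "SELECT priority FROM queue ORDER BY priority DESC LIMIT 1" 2>/dev/null)
#     redis-cli -h 127.0.0.1 SET "wq:prio" "$PRIO" EX 300 >/dev/null 2>&1
#   fi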
# ── 5. Ollama: DISABLED on cpu-basic (16 GB limit) ──────────────────────────
# Root cause of the 7-hr Runtime Error 2026-04-29: ollama loading qwen3-coder:30b
# (~17 GB Q4) + qwen2.5-coder:14b (~9 GB) + granite (~5 GB) = ~31 GB of model
# weights against a 16 GB cap → instant OOM on any inference.
#
# On cpu-basic the FREE LLM LADDER (cerebras/groq/openrouter/gemini/chutes)
# is faster anyway; wafer-scale inference beats CPU x86 by 50-200×.
# Ollama is only worth running once the Space upgrades to ≥cpu-upgrade (32 GB)
# OR moves to the OCI A1.Flex anchor (24 GB ARM, native ollama support).
#
# Set LOW_MEM=0 to re-enable on a bigger Space tier.
LOW_MEM="${LOW_MEM:-1}"
if [[ "$LOW_MEM" == "1" ]]; then
echo "[$(date +%H:%M:%S)] β ollama SKIPPED (LOW_MEM=1, cpu-basic 16 GB)" \
>> "$LOG_DIR/boot.log"
echo "[$(date +%H:%M:%S)] β free LLM ladder serves all v2 inference" \
>> "$LOG_DIR/boot.log"
else
OLLAMA_MODELS="${HOME}/.ollama/models" \
OLLAMA_HOST=127.0.0.1:11434 \
nohup ollama serve > "$LOG_DIR/ollama.log" 2>&1 &
sleep 6
(
if ! ollama list 2>/dev/null | grep -q "nomic-embed-text"; then
ollama pull nomic-embed-text > "$LOG_DIR/ollama-pull-embed.log" 2>&1
fi
if ! ollama list 2>/dev/null | grep -q "qwen2.5-coder:3b"; then
# Smallest coder that's actually useful; fits any tier
ollama pull qwen2.5-coder:3b > "$LOG_DIR/ollama-pull-3b.log" 2>&1
fi
) &
fi
# ── 6. Discord bot (only if egress to discord.com is reachable) ────────────
# The HF Spaces free tier may block egress to discord.com; the bot would
# crash-loop. Pre-flight check: if discord.com is unreachable, skip the bot
# and use webhook-only.
if [[ -n "${DISCORD_BOT_TOKEN:-}" ]]; then
if curl -sS -o /dev/null -w "%{http_code}" --max-time 6 https://discord.com 2>/dev/null | grep -qE "^(200|301|302|307|308)$"; then
set -a; source ~/.hermes/.env 2>/dev/null; set +a
nohup python ~/.surrogate/bin/hermes-discord-bot.py >> "$LOG_DIR/discord-bot.log" 2>&1 &
echo "[$(date +%H:%M:%S)] discord bot started (gateway reachable)"
else
echo "[$(date +%H:%M:%S)] discord.com unreachable β skipping bot, using webhook-only" >> "$LOG_DIR/boot.log"
fi
fi
# ── 7a. Continuous scrape daemon: concurrency tuned to LOW_MEM ─────────────
SCRAPE_PARALLEL="${SCRAPE_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 8)}"
cat > /tmp/scrape-daemon.sh <<SCRAPESH
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="\${HOME}/.surrogate/logs/scrape-continuous.log"
mkdir -p "\$(dirname "\$LOG")"
while true; do
START=\$(date +%s)
bash ~/.surrogate/bin/domain-scrape-loop.sh 1500 ${SCRAPE_PARALLEL} >> "\$LOG" 2>&1
DUR=\$(( \$(date +%s) - START ))
if [[ \$DUR -lt 30 ]]; then sleep 30
elif [[ \$DUR -lt 120 ]]; then sleep 15
else sleep 5
fi
done
SCRAPESH
chmod +x /tmp/scrape-daemon.sh
nohup /tmp/scrape-daemon.sh > "$LOG_DIR/scrape-daemon.log" 2>&1 &
echo "[$(date +%H:%M:%S)] scrape daemon parallel=${SCRAPE_PARALLEL} (LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"
# ── 7b. Agentic crawler: DISABLED on LOW_MEM (the anchor takes this load) ──
if [[ "$LOW_MEM" != "1" ]]; then
CRAWLER_PARALLEL="${CRAWLER_PARALLEL:-6}"
nohup bash ~/.surrogate/bin/agentic-crawler.sh "$CRAWLER_PARALLEL" \
> "$LOG_DIR/agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] agentic crawler parallel=$CRAWLER_PARALLEL" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β agentic-crawler SKIPPED (LOW_MEM); anchor handles" >> "$LOG_DIR/boot.log"
fi
# ── 7b2. GitHub-specific agentic crawler (lightweight; keep on always) ─────
nohup bash ~/.surrogate/bin/github-agentic-crawler.sh > "$LOG_DIR/github-agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] github-agentic-crawler started" >> "$LOG_DIR/boot.log"
# ── 7b3. HF Dataset Discoverer: DISABLED (replaced by continuous-discoverer) ─
# Round 10 (a27499d): bin/v2/continuous-discoverer.sh covers HF + arxiv +
# Stack Exchange + GH trending in one daemon. The old hf-dataset-discoverer.sh
# is now redundant and pure memory pressure on cpu-basic.
echo "[$(date +%H:%M:%S)] hf-dataset-discoverer SKIPPED (replaced by continuous-discoverer)" >> "$LOG_DIR/boot.log"
# ── 7e. auto-orchestrate-continuous: DISABLED on LOW_MEM (cron handles it) ─
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/auto-orchestrate-continuous.sh > "$LOG_DIR/auto-orchestrate-continuous.log" 2>&1 &
echo "[$(date +%H:%M:%S)] auto-orchestrate-continuous started (4 parallel workers)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β auto-orchestrate-continuous SKIPPED (LOW_MEM); cron slot at M%20==0 covers it" >> "$LOG_DIR/boot.log"
fi
# ── 7e1. SELF-HEAL WATCHDOG: must start BEFORE memory-hungry workers ────────
# Monitors RAM usage every 60s; preempts youngest dataset-enrich shard if
# usage >= 85% to dodge the cpu-basic 16Gi OOM kill that would otherwise
# crash the entire container. Also restarts stuck ingest / kicks stale uploader.
nohup bash ~/.surrogate/bin/self-heal-watchdog.sh > "$LOG_DIR/self-heal-watchdog.log" 2>&1 &
echo "[$(date +%H:%M:%S)] self-heal-watchdog started (mem<85%, ingest<20m, push<10m)" >> "$LOG_DIR/boot.log"
# ── 7e2. GH-ACTIONS TICKER: burst-dispatch external runners every 60s ───────
# Fires workflow_dispatch on arkashira/ashiradevops-alt runner repos every
# 60s, bypassing GitHub's */5 cron minimum. Combined with 8-min runner
# timeouts, the 20-concurrent free-tier slot cap stays saturated.
# Skips silently if GH_TOKEN_ARKASHIRA / GH_TOKEN_DEVOPS aren't set as
# Space secrets; the operator can add them later without a restart.
nohup bash ~/.surrogate/bin/gh-actions-ticker.sh > "$LOG_DIR/gh-actions-ticker.log" 2>&1 &
echo "[$(date +%H:%M:%S)] gh-actions-ticker started (60s tick, dispatches arkashira+ashiradevops-alt)" >> "$LOG_DIR/boot.log"
# ── 7e3. LLM BURST GENERATOR: synthetic training pairs from 8 free LLMs ─────
# Cerebras + Groq + OpenRouter + Gemini + Chutes + NV NIM + Samba + Kimi.
# Each cycle fires 3 prompts at every active provider in parallel, writes
# {prompt, response} pairs to training-pairs.jsonl. Combined free-tier
# budget: ~7000+ pairs/day. Skips any provider whose key env is not set.
if [[ "$LOW_MEM" != "1" ]]; then
nohup python3 ~/.surrogate/bin/llm-burst-generator.py > "$LOG_DIR/llm-burst-generator.log" 2>&1 &
echo "[$(date +%H:%M:%S)] llm-burst-generator started (8 LLM APIs in parallel, ~7K synthetic pairs/day)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β llm-burst-generator SKIPPED (LOW_MEM); ZeroGPU synth-puller covers" >> "$LOG_DIR/boot.log"
fi
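# Shape of one burst request, sketched in shell (the real generator is the
# python daemon above; the endpoint is Groq's OpenAI-compatible API and the
# model name is only illustrative):
#   curl -sS https://api.groq.com/openai/v1/chat/completions \
#     -H "Authorization: Bearer ${GROQ_API_KEY:-}" -H "Content-Type: application/json" \
#     -d '{"model":"llama-3.1-8b-instant","messages":[{"role":"user","content":"<prompt>"}]}'
# The JSON answer is then reduced to a {"prompt": ..., "response": ...} line
# appended to training-pairs.jsonl.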
sleep 3 # Stagger spawns; avoids a memory burst at boot
# ── 7f. PARALLEL BULK INGEST (slug-hash sharded; 6 shards on cpu-basic) ─────
# Was 16 shards, but that caused a 'Memory limit exceeded (16Gi)' OOM. Each
# shard peaks at ~1 GB while streaming via the 'datasets' lib. The watchdog
# above provides a second safety net if a peak still spikes.
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/bulk-ingest-parallel.sh > "$LOG_DIR/bulk-ingest-parallel.log" 2>&1 &
echo "[$(date +%H:%M:%S)] bulk-ingest-parallel started (6 shards, 293M total cap)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β bulk-ingest-parallel SKIPPED (LOW_MEM); streaming-mirror-worker covers" >> "$LOG_DIR/boot.log"
fi
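# "Slug-hash sharded" means each worker owns the datasets whose slug hashes
# to its shard id, so no two shards ever touch the same dataset. A sketch of
# the assignment (6 shards; the real worker may hash differently):
#   SHARDS=6
#   slug_shard() {  # prints 0..SHARDS-1 for a dataset slug
#     local h; h=$(printf '%s' "$1" | md5sum | cut -c1-8)
#     echo $(( 0x$h % SHARDS ))
#   }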
sleep 3
# ── 7g. PARQUET-DIRECT INGEST (skips 'datasets' library overhead; 5-10× faster) ──
# Downloads parquet shards directly via the HF datasets-server API + a pyarrow
# filter. Targets only trillion-scale corpora where streaming is too slow.
# DLs reduced to 2 parallel; combined with the 6 ingest shards this stays under 16Gi.
PARQUET_PARALLEL=2 nohup bash ~/.surrogate/bin/parquet-direct-ingest.sh > "$LOG_DIR/parquet-direct-ingest.log" 2>&1 &
echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$LOG_DIR/boot.log"
# ── 7c. Skill-synthesis daemon: DISABLED on LOW_MEM (heavy LLM calls) ─────
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β skill-synthesis SKIPPED (LOW_MEM); anchor's voyager-skills.py covers" >> "$LOG_DIR/boot.log"
fi
# ── 7d. Bulk mirror coordinator + parallel workers ──────────────────────────
# User feedback 2026-04-29 (Thai, paraphrased): every agent should work
# together in parallel, with no duplicated work.
# Coordinator = SQLite claim queue (~/.surrogate/state/bulk-mirror-claims.db).
# Workers each pull the next pending dataset, mirror+sanitize+dedup, mark done.
# 100+ massive datasets in bin/v2/bulk-datasets-massive.txt (code/security/SDLC/agent/etc).
# Lease-based claims (15 min): a crashed worker's lease auto-expires so other
# workers pick it up; a sketch of the claim transaction follows the seed call below.
python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mirror-seed.log" 2>&1 || true
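# Sketch of a lease-based claim as one atomic SQLite update (illustrative;
# the real schema in bulk-mirror-claims.db may use different table/column
# names, and $WORKER here is a hypothetical worker id):
#   sqlite3 "${HOME}/.surrogate/state/bulk-mirror-claims.db" "
#     UPDATE claims SET worker='$WORKER', lease_until=strftime('%s','now')+900
#     WHERE id=(SELECT id FROM claims
#               WHERE status='pending' OR lease_until<strftime('%s','now')
#               ORDER BY id LIMIT 1);"
# SQLite serializes writers, so two workers can never win the same row.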
# Two worker types share the same coordinator queue:
# bulk-mirror-worker.sh      → full download; suits small/medium datasets
# streaming-mirror-worker.sh → HF datasets streaming; suits trillion-token corpora
# LOW_MEM tuning for the cpu-basic 16 GB Space (history):
# v1:     0 bulk + 2 stream (Round 9-10 OOM tightened to 0+2)
# v2:     0 bulk + 1 stream (Round 11-12 OOM tightened further)
# v3 NOW: 1 bulk + 3 stream (post Civo-pivot + 4-Space fan-out; the anchor
#                            never came up so we can't rely on it for bulk,
#                            and 16 GB had ~8 GB unused under the v2
#                            setting; reclaim it)
#
# Memory budget per Space (16 GB cpu-basic):
#   ~6 GB reserved: OS + redis 256mb + continuous-discoverer +
#                   dataset-enrich + auto-startup-loop + push bursts
#   ~10 GB available for harvest workers
#   3 stream × 500 MB + 1 bulk × 600 MB = 2.1 GB used
#   ~8 GB headroom; memory-guard.sh kicks in at <3 GB free, so this is safe
#
# Throughput delta: 4 workers/Space × 4 Spaces = 16 workers total (vs the
# previous 1 × 4 = 4, a 4× jump). Combined with the enrich cron at M%30==5
# (was M%60), expect 3-5× the commit rate before the HF soft-cap kicks in.
BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 3 || echo 6)}"
for i in $(seq 1 "$BULK_WORKERS"); do
nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
> "$LOG_DIR/bulk-worker-$i.log" 2>&1 &
done
for i in $(seq 1 "$STREAM_WORKERS"); do
nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-w$i" \
> "$LOG_DIR/stream-worker-$i.log" 2>&1 &
done
TOTAL_WORKERS=$((BULK_WORKERS + STREAM_WORKERS))
echo "[$(date +%H:%M:%S)] bulk-mirror coordinator + $BULK_WORKERS bulk + $STREAM_WORKERS streaming = $TOTAL_WORKERS workers (200+ datasets queued, LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"
# ── 7d2. Continuous multi-source dataset discoverer (boot daemon, never exits) ─
# Replaces the aggressive-harvester cron: runs always, sweeps HF + arxiv + SE + GH.
if ! pgrep -f "continuous-discoverer.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/continuous-discoverer.sh \
> "$LOG_DIR/continuous-discoverer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] continuous-discoverer started (HF + arxiv + SE + GH, ~5min cycle)" >> "$LOG_DIR/boot.log"
fi
# ── Auto-startup-loop: 45 personae × 9 LoRA clusters × auto-commit + auto-push ─
# CEO/CTO/CMO/CFO/COO/PM/UX/Designer/SRE/DevOps/Marketing/SDR/AE/Growth/CS/Legal/HR/etc.
# 1 role per 15-min cycle; chained roles fire downstream automatically.
if ! pgrep -f "auto-startup-loop.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/auto-startup-loop.sh \
> "$LOG_DIR/auto-startup-loop.log" 2>&1 &
echo "[$(date +%H:%M:%S)] auto-startup-loop started (45 personae cycle 15min, chains, auto-commit)" >> "$LOG_DIR/boot.log"
fi
# ── 7h. Train-ready pusher: disabled at boot for now. Caused a Space
# RUNTIME_ERROR on first deployment (2026-04-29). The script is kept at
# bin/train-ready-pusher.sh; launch manually after the Space proves stable:
# nohup bash ~/.surrogate/bin/train-ready-pusher.sh > /tmp/trp.log 2>&1 &
# nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &
# ── 7i. Cron loop: non-scrape daemons (scrape now runs continuously above) ──
cat > /tmp/hermes-cron.sh <<'CRONSH'
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="${HOME}/.surrogate/logs/cron.log"
mkdir -p "$(dirname "$LOG")"
while true; do
M=$(($(date +%s) / 60))
# Cron offsets are STAGGERED: the minute=0 burst was an OOM trigger.
# Each major task picks a unique M%X==N offset so no two fire together.
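# Example: with M = floor(epoch_seconds/60), and because 60 is a multiple
# of 5, "M%5==2" fires at wall-clock minutes 02,07,12,... while "M%5==4"
# fires at 04,09,14,... so two tasks on the same modulus with different
# offsets can never coincide.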
[[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
[[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
# Auto-scaler β spawn/kill workers based on free memory tier (burst-but-don't-die)
[[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &
# synth-puller: hit the surrogate1 ZeroGPU /api/synth_batch every 5 min.
# Drains the free PRO 25K min/mo into Magpie-style training pairs (16 domains rotate).
[[ $((M % 5)) -eq 3 ]] && bash ~/.surrogate/bin/v2/synth-puller.sh >> "$LOG" 2>&1 &
# push-training-to-hf is gated by memory (it loads a big shard into RAM).
# The anchor (24 GB) takes over when capacity arrives; see the anchor cron-loop.
[[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &
# auto-orchestrate now runs CONTINUOUSLY (4 parallel workers); see step 7e above.
# Cron entry retained as a legacy single-fire boost, skipped when continuous is up.
# (The old "cond && pgrep || run" chain fired every minute whenever cond was
# false; rewritten as a pure AND chain.)
[[ $((M % 20)) -eq 0 ]] && ! pgrep -f "auto-orchestrate-continuous" >/dev/null \
&& bash ~/.surrogate/bin/auto-orchestrate-loop.sh >> "$LOG" 2>&1 &
# Every 30 min: research-apply (pop queue β orchestrate β ship feature)
[[ $((M % 30)) -eq 15 ]] && bash ~/.surrogate/bin/surrogate-research-apply.sh >> "$LOG" 2>&1 &
# Every 60 min: keyword tuner (adapts scrape queue based on yields)
[[ $((M % 60)) -eq 4 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &
# Every 6 hours: research-loop (discover new features from competitors/papers)
[[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/surrogate-research-loop.sh >> "$LOG" 2>&1 &
# Every 30 min: dataset enrich (was 60 min; bumped 2× now that we have
# 4 Spaces × (3 stream + 1 bulk) = 16 workers harvesting in parallel,
# producing more chunks per hour than the old 60-min push could drain).
# Memory-guarded: a full HF Hub iteration is heavy.
[[ $((M % 30)) -eq 5 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/dataset-enrich.sh >> "$LOG" 2>&1 &
# Every 15 min: self-ingest training-pairs into the FTS index (closes the self-improvement loop)
[[ $((M % 15)) -eq 3 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &
# Every 30 min: build vector embeddings index (RAG semantic search)
[[ $((M % 30)) -eq 12 ]] && bash ~/.surrogate/bin/rag-vector-builder.sh >> "$LOG" 2>&1 &
# Every 30 min: synthetic data generation (REWORK→APPROVE DPO + distilabel rewrite)
[[ $((M % 30)) -eq 7 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/synthetic-data-from-rework.sh >> "$LOG" 2>&1 &
# Daily 04:00 UTC: refresh CVE feed (NVD + CISA KEV) → security-knowledge dataset
[[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &
# Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems)
[[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
# Daily 06:00 UTC: LLM-expand role keywords (sends each role's skills to
# Cerebras/Groq → +80 specific job-description-style search terms each).
# The discoverer auto-uses the expanded list on its next cycle.
[[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &
# Every 90 min: kick a Kaggle T4 LoRA training run on the latest dataset
# slice. Free Kaggle quota = 30 hr/week per account; one full run = 4-6 hr,
# so we DO want to keep submitting: Kaggle queues if one is already running
# and auto-cancels older submissions if 5+ are pending. The shorter interval
# keeps the GPU pipeline saturated.
[[ $((M % 90)) -eq 5 ]] && bash ~/.surrogate/bin/kaggle-trainer.sh >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
# Every 6 hr: Lightning AI H200 training run (free 4 hr H200 quota, ~13 runs/mo).
# H200 141 GB VRAM fits Qwen3-Coder-480B-A35B QLoRA → the biggest free training run.
[[ $((M % 360)) -eq 45 ]] && bash ~/.surrogate/bin/lightning-trainer.sh >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
# ── Round 5 (2026-04) sustainability loops ──────────────────────────
# Every 6 hr (offset 90): self-improve loop → generate problems, judge;
# winners → training data, losers → reflexion-store.
[[ $((M % 360)) -eq 90 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/v2/self-improve-loop.sh >> "$LOG_DIR/self-improve.log" 2>&1 &
# Every 30 min (offset 22): mine new tool-call traces from logs into
# SFT + DPO data, plus voyager skill candidates.
[[ $((M % 30)) -eq 22 ]] && python3 ~/.surrogate/bin/v2/tool-trace-collector.py >> "$LOG_DIR/tool-trace.log" 2>&1 &
# Every 60 min (offset 17): export promoted voyager skills to JSONL
# (training-data slice + inference-time retrieval source).
[[ $((M % 60)) -eq 17 ]] && python3 ~/.surrogate/bin/v2/voyager-skills.py export >> "$LOG_DIR/voyager.log" 2>&1 &
# Daily 07:00 UTC: active-learning batch from one bulk-mirror file.
# Skips silently if no pool yet.
[[ $((M % 1440)) -eq 420 ]] && {
POOL=$(ls -t "$DATA"/bulk-mirror/*.jsonl 2>/dev/null | head -1)
[[ -n "$POOL" ]] && python3 ~/.surrogate/bin/v2/active-learning.py \
--pool "$POOL" --n 200 --scan 1500 \
>> "$LOG_DIR/active-learning.log" 2>&1 &
}
# Daily 08:00 UTC: constitutional self-critique on yesterday's
# winners (pulls latest self-improve winners file).
[[ $((M % 1440)) -eq 480 ]] && {
WIN=$(ls -t "$DATA"/v2/self-improve/winners-*.jsonl 2>/dev/null | head -1)
[[ -n "$WIN" ]] && python3 ~/.surrogate/bin/v2/constitutional-loop.py \
--input "$WIN" --n 200 \
>> "$LOG_DIR/constitutional.log" 2>&1 &
}
# ── Round 7+8 (2026-04-30): trillion-scale + harvester + enrich ──────
# Every 30 min (offset 9): aggressive HF dataset discoverer (70-keyword sweep)
[[ $((M % 30)) -eq 9 ]] && bash ~/.surrogate/bin/v2/aggressive-harvester.sh \
>> "$LOG_DIR/aggressive-harvester.log" 2>&1 &
# Every 60 min (offset 35): enrich newly-mirrored bulk files
[[ $((M % 60)) -eq 35 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/v2/enrich-pipeline.sh \
>> "$LOG_DIR/enrich-pipeline.log" 2>&1 &
# Every 30 min (offset 25): spawn extra streaming worker if pool empty
[[ $((M % 30)) -eq 25 ]] && {
if ! pgrep -f "streaming-mirror-worker.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-cron-$(date +%s)" \
> "$LOG_DIR/stream-worker-cron.log" 2>&1 &
fi
}
# Daily 09:00 UTC: teachable-prompt filter on harvested data
[[ $((M % 1440)) -eq 540 ]] && {
LATEST=$(ls -t "$DATA"/v2/enriched/*.jsonl 2>/dev/null | head -1)
[[ -n "$LATEST" ]] && python3 ~/.surrogate/bin/v2/teachable-prompt-filter.py \
--input "$LATEST" --out "$DATA"/v2/teachable-$(date +%Y%m%d).jsonl \
--n 1000 --keep-target 200 \
>> "$LOG_DIR/teachable.log" 2>&1 &
}
# Daily 11:00 UTC: regression test suite (catches breakage post-push)
[[ $((M % 1440)) -eq 660 ]] && bash ~/.surrogate/bin/v2/regression-test.sh --quick \
>> "$LOG_DIR/regression.log" 2>&1 &
# Weekly Sun 10:00 UTC: abstract-cot compress reasoning data
[[ $((M % 10080)) -eq 600 ]] && {
for f in "$DATA"/v2/verify-traces.jsonl "$DATA"/v2/self-improve/winners-*.jsonl; do
[[ -f "$f" ]] || continue
python3 ~/.surrogate/bin/v2/abstract-cot-compressor.py \
--input "$f" --out "${f%.jsonl}-compressed.jsonl" \
>> "$LOG_DIR/abstract-cot.log" 2>&1
done
}
sleep 60
done
CRONSH
chmod +x /tmp/hermes-cron.sh
nohup /tmp/hermes-cron.sh > "$LOG_DIR/cron-master.log" 2>&1 &
echo "[$(date +%H:%M:%S)] cron loop started" >> "$LOG_DIR/boot.log"
# ── 8. Status HTTP server on :7860 (FastAPI/uvicorn → robust binding) ──────
set +x # silence trace for clean uvicorn logs
echo "[$(date +%H:%M:%S)] starting status server :7860" | tee -a "$LOG_DIR/boot.log"
# Verify deps before exec β print what's missing rather than silent crash
python3 -c "import fastapi, uvicorn; print(f' fastapi {fastapi.__version__} + uvicorn {uvicorn.__version__} ok')" || {
echo "β fastapi/uvicorn not importable β falling back to plain http.server"
exec python3 -m http.server 7860 --bind 0.0.0.0
}
# Run as PID 1; uvicorn handles signals + auto-restart on crash
exec python3 ~/.surrogate/bin/hermes-status-server.py