#!/usr/bin/env bash
# Hermes start orchestrator for HF Space.
# Boots: persistent /data mount → Redis → Ollama → axentx repos → daemons → status server.
set -uo pipefail
LOG_DIR="${HOME}/.surrogate/logs"
mkdir -p "$LOG_DIR"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start" >> "$LOG_DIR/boot.log"
# Trace mode for early steps only (no secrets here yet) - find hang point but stay safe
PS4='[trace ${LINENO}] '
set -x
# Echo stdout so HF run-logs see progress (safe steps before .env is loaded)
exec > >(tee -a "$LOG_DIR/boot.log") 2>&1
# ── Memory mode (must be set BEFORE any reference; we use `set -u`) ───────
# CPU-Basic Space = 16 GB cap. With LOW_MEM=1 we skip the heavy harvest
# launchers (dataset-enrich, dataset-mirror, kaggle-trainer, lightning-trainer,
# dedup-bootstrap) - those run on GCP daemons instead. Set LOW_MEM=0 only
# on a paid Space tier (cpu-upgrade ≥32 GB).
LOW_MEM="${LOW_MEM:-1}"
# ── 1. Persistent data - symlink state subdirs to /data (HF persistent mount) ──
# bin/ is NOT persisted (baked into image, refreshed on every push).
# Persisted: state (DBs), logs, memory, skills, sessions, training pairs,
# workspace (hermes runtime), projects (axentx clones), ollama (model cache).
DATA="/data"
if [[ -d "$DATA" ]] && [[ -w "$DATA" ]]; then
mkdir -p "$DATA"/{state,logs,memory,skills,sessions,workspace,projects,ollama,training,reflexion,index}
# Migrate from any older layout (one-time): if /data/surrogate/state exists, move up one level
if [[ -d "$DATA/surrogate/state" ]] && [[ ! -L "$DATA/state" ]]; then
mv "$DATA/surrogate"/* "$DATA/" 2>/dev/null || true
rmdir "$DATA/surrogate" 2>/dev/null || true
fi
for spec in \
"${HOME}/.surrogate/state:${DATA}/state" \
"${HOME}/.surrogate/logs:${DATA}/logs" \
"${HOME}/.surrogate/memory:${DATA}/memory" \
"${HOME}/.surrogate/skills:${DATA}/skills" \
"${HOME}/.surrogate/sessions:${DATA}/sessions" \
"${HOME}/.hermes/workspace:${DATA}/workspace" \
"${HOME}/.ollama:${DATA}/ollama"; do
target="${spec%%:*}"
link="${spec##*:}"
mkdir -p "$(dirname "$target")"
# Always ensure backing directory exists + writable. If the persistent
# /data mount becomes unavailable mid-run, daemon writes to symlinked
# path fail with Errno 5 I/O error (audit 2026-04-29). Recreating the
# link defensively each boot fixes stale-symlink cases.
mkdir -p "$link" 2>/dev/null || true
if [[ ! -L "$target" ]] || [[ ! -d "$target/" ]]; then
# Either not-a-symlink OR broken symlink (target unreachable)
rm -rf "$target" 2>/dev/null
ln -sfn "$link" "$target"
fi
# Final sanity probe - write a marker; if it fails, the persistent
# mount is broken regardless of the symlink, so log loudly.
if ! touch "$target/.boot-marker" 2>/dev/null; then
echo "[$(date +%H:%M:%S)] ⚠ FATAL: $target/ not writable - daemon log writes will fail with Errno 5"
fi
done
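# Read-only audit of the wiring above, handy from a shell inside the Space:
#   for d in state logs memory skills sessions; do
#     readlink -f "${HOME}/.surrogate/$d"   # each should resolve under /data
#   done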
# training-pairs.jsonl - single-file persistence
if [[ ! -L "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
rm -f "${HOME}/.surrogate/training-pairs.jsonl" 2>/dev/null
touch "${DATA}/training-pairs.jsonl"
ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
fi
# ── One-time offset reset: skip polluted agentic-crawler placeholder backlog ──
if [[ ! -f "${HOME}/.surrogate/.offset-reset-done" ]] && [[ -f "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
CUR=$(wc -l < "${HOME}/.surrogate/training-pairs.jsonl" | tr -d ' ')
echo "$CUR" > "${HOME}/.surrogate/.training-push-offset"
echo "$CUR" > "${HOME}/.surrogate/.self-ingest-offset"
touch "${HOME}/.surrogate/.offset-reset-done"
echo "[$(date +%H:%M:%S)] one-time offset reset β†’ $CUR (skip placeholder backlog)" >> "$LOG_DIR/boot.log"
fi
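# How a downstream consumer honors these offsets (sketch - the real logic
# lives in the push/self-ingest scripts): skip the first OFF lines, read on.
#   OFF=$(cat "${HOME}/.surrogate/.training-push-offset" 2>/dev/null || echo 0)
#   tail -n +"$((OFF + 1))" "${HOME}/.surrogate/training-pairs.jsonl"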
# ── Boot-time dedup.db corruption check ──────────────────────────────
# 16 parallel shards previously corrupted the SQLite WAL. If the DB is
# unreadable on boot, back it up and force re-bootstrap from scratch.
DEDUP_DB="${HOME}/.surrogate/state/dedup.db"
if [[ -f "$DEDUP_DB" ]]; then
if ! sqlite3 "$DEDUP_DB" "SELECT 1 FROM seen_hashes LIMIT 1" >/dev/null 2>&1; then
TS=$(date +%s)
mv "$DEDUP_DB" "${DEDUP_DB}.corrupt-${TS}.bak" 2>/dev/null
rm -f "${DEDUP_DB}-wal" "${DEDUP_DB}-shm"
rm -f "${HOME}/.surrogate/.dedup-bootstrap-done"
echo "[$(date +%H:%M:%S)] WIPED corrupt dedup.db β†’ ${DEDUP_DB}.corrupt-${TS}.bak (forcing re-bootstrap)" >> "$LOG_DIR/boot.log"
fi
fi
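# Stricter (slower) probe if the cheap SELECT ever misses corruption - a
# sketch, not enabled; PRAGMA integrity_check walks every page:
#   sqlite3 "$DEDUP_DB" "PRAGMA integrity_check;" | grep -qx ok \
#     || echo "dedup.db failed integrity_check"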
# ── Heavy harvest launchers - only on HIGH_MEM (LOW_MEM=0) ───────────
# On CPU-Basic (16 GB cap) launching 5 background bash + uvicorn + 5 harvest
# workers blew through the cap and HF auto-killed the container ~5 min after
# boot. These launchers are now scheduled on GCP via hermes-scheduler-daemon
# (entries in data/hermes-jobs.json) so harvest still runs - just not from
# inside the Space's RAM. Re-enable in-Space by setting LOW_MEM=0 once we
# upgrade to a ≥32 GB tier.
if [[ "$LOW_MEM" != "1" ]]; then
# ── One-time central dedup bootstrap from existing data ──────────
if [[ ! -f "${HOME}/.surrogate/.dedup-bootstrap-done" ]]; then
echo "[$(date +%H:%M:%S)] running central dedup bootstrap (one-time)" >> "$LOG_DIR/boot.log"
nohup bash "${HOME}/.surrogate/bin/dedup-bootstrap.sh" > "$LOG_DIR/dedup-bootstrap.log" 2>&1 &
fi
# ── BOOT-TIME enrich kickoff (trigger immediate pull, don't wait for cron)
nohup bash "${HOME}/.surrogate/bin/dataset-enrich.sh" >> "$LOG_DIR/dataset-enrich.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time dataset-enrich kicked off" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME kaggle-trainer kickoff (don't wait for 90-min cron) ─
nohup bash "${HOME}/.surrogate/bin/kaggle-trainer.sh" >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time kaggle-trainer kicked off" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME lightning-trainer kickoff - H200 4 hr free for big model
nohup bash "${HOME}/.surrogate/bin/lightning-trainer.sh" >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time lightning-trainer kicked off (H200 4hr quota)" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME dataset-mirror - bulk-clone top community SFT mixes ─
nohup bash "${HOME}/.surrogate/bin/dataset-mirror.sh" >> "$LOG_DIR/dataset-mirror.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time dataset-mirror kicked off (30 community sources)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] LOW_MEM=1 → skipped 5 heavy harvest launchers (delegated to GCP daemons)" >> "$LOG_DIR/boot.log"
fi
echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] WARN: /data not writable β€” running ephemeral!" >> "$LOG_DIR/boot.log"
fi
# ── 2. Bind HF Space secrets → ~/.hermes/.env ───────────────────────────────
# 🔒 DISABLE shell trace before touching secret values.
set +x
echo "[$(date +%H:%M:%S)] writing ~/.hermes/.env from secret env vars (trace OFF)"
mkdir -p ~/.hermes
{
echo "# Auto-generated from HF Space secrets at boot"
for k in OPENROUTER_API_KEY GEMINI_API_KEY GEMINI_API_KEY_2 \
GITHUB_TOKEN GITHUB_TOKEN_POOL DISCORD_BOT_TOKEN DISCORD_WEBHOOK \
CEREBRAS_API_KEY GROQ_API_KEY SAMBANOVA_API_KEY \
CLOUDFLARE_API_KEY NVIDIA_API_KEY CHUTES_API_KEY ANTHROPIC_API_KEY \
HF_TOKEN HUGGING_FACE_HUB_TOKEN; do
v="${!k:-}"
[[ -n "$v" ]] && echo "${k}=${v}"
done
} > ~/.hermes/.env
chmod 600 ~/.hermes/.env
echo "[$(date +%H:%M:%S)] .env written ($(wc -l < ~/.hermes/.env) keys, perms 600)"
# Trace OFF for the rest of boot - we already have line numbers above and won't need them post-secrets.
# ── LOW_MEM short-circuit - skip ALL background daemons, exec status server ──
# CPU-Basic Space cap is 16 GB. Even after gating the 5 boot-time harvest
# launchers, the Space kept hitting the 16 GB cap and hanging at the HTTP layer
# every ~30-40 min. Investigation found 15+ MORE nohup'd background daemons
# below this point (scrape, agentic-crawler, github-crawler, self-heal, cron
# loop, bulk-mirror workers, streaming-mirror workers, parquet-ingest, etc.)
# that collectively grow into the cap within an hour.
#
# In LOW_MEM=1 mode the Space's only job is the FastAPI status server on
# :7860 that serves harvest-cursor advances to remote workers. Everything
# else (harvest, mirroring, agent pipeline, training pushes, dataset enrich)
# now runs on the GCP daemon fleet - see hermes-jobs.json (171 jobs scheduled
# via hermes-scheduler-daemon as of 2026-05-02).
#
# Set LOW_MEM=0 to re-enable in-Space launchers when on a paid tier (≥32 GB).
if [[ "$LOW_MEM" == "1" ]]; then
echo "[$(date +%H:%M:%S)] LOW_MEM=1 → skipping all bg daemons + cron, going straight to :7860 status server" | tee -a "$LOG_DIR/boot.log"
set +x # silence trace
# Verify deps before exec - print what's missing rather than silent crash
if python3 -c "import fastapi, uvicorn" 2>/dev/null; then
echo "[$(date +%H:%M:%S)] starting uvicorn :7860 (LOW_MEM fast-path)" | tee -a "$LOG_DIR/boot.log"
exec python3 ~/.surrogate/bin/hermes-status-server.py
else
echo "❌ fastapi/uvicorn not importable β€” falling back to plain http.server"
exec python3 -m http.server 7860 --bind 0.0.0.0
fi
fi
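# Example liveness probe once the server is up (root path is an assumption;
# any route returning 2xx works):
#   curl -sf --max-time 5 http://127.0.0.1:7860/ >/dev/null && echo "status server up"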
# ── 3. Git config + clone axentx repos for auto-orchestrate auto-commit ────
# Disable interactive prompts globally so failed-auth git ops fail fast.
export GIT_TERMINAL_PROMPT=0
export GIT_ASKPASS=/bin/true
GH_TOKEN=$(echo "${GITHUB_TOKEN_POOL:-}" | cut -d',' -f1)
if [[ -n "$GH_TOKEN" ]]; then
git config --global user.email "hermes@axentx.ai"
git config --global user.name "Hermes (Surrogate-1)"
git config --global init.defaultBranch main
git config --global pull.rebase true
git config --global push.default current
PROJECTS_DIR="${DATA}/projects"
mkdir -p "$PROJECTS_DIR"
rm -rf ~/axentx 2>/dev/null
ln -sfn "$PROJECTS_DIR" ~/axentx
# Clone axentx repos in background with hard timeout - never blocks boot.
# Verified 2026-05-02 via gh api: 5 of 6 entries had wrong org/name and
# were silently 404'ing (arkashira/* - only surrogate-1-harvest is there;
# the rest are private under the axentx org). The agent pipeline's dev/qa/
# reviewer/commit daemons sat idle for a full day because no repo had been
# cloned for them to work on. Real paths confirmed via /repos/<owner>/<name>:
# axentx/{Costinel,vanguard,airship,workio,axiomops,surrogate-1} → 200
# arkashira/{Costinel,vanguard,arkship,surrogate,workio,hermes-toolbelt} → 404
# Note: 'arkship' was a typo for 'airship' (axentx/airship).
for repo_spec in \
"Costinel:axentx/Costinel" \
"vanguard:axentx/vanguard" \
"airship:axentx/airship" \
"workio:axentx/workio" \
"axiomops:axentx/axiomops" \
"surrogate-1:axentx/surrogate-1"; do
local_name="${repo_spec%%:*}"
gh_path="${repo_spec##*:}"
target="${PROJECTS_DIR}/${local_name}"
(
if [[ ! -d "$target/.git" ]]; then
echo "[$(date +%H:%M:%S)] cloning $gh_path..." >> "$LOG_DIR/boot.log"
timeout 30 git clone --depth 50 \
"https://x-access-token:${GH_TOKEN}@github.com/${gh_path}.git" "$target" \
>> "$LOG_DIR/git-clone.log" 2>&1 || \
echo "[$(date +%H:%M:%S)] WARN: clone $gh_path failed/timeout" >> "$LOG_DIR/boot.log"
else
cd "$target" && timeout 20 git pull --rebase >> "$LOG_DIR/git-pull.log" 2>&1 || true
fi
) &
done
# Don't wait - let clones finish in background while boot continues
# Persist token for any push from auto-orchestrate
git config --global credential.helper "store --file=$HOME/.git-credentials"
echo "https://x-access-token:${GH_TOKEN}@github.com" > ~/.git-credentials
chmod 600 ~/.git-credentials
echo "[$(date +%H:%M:%S)] git auth configured + clone jobs spawned" >> "$LOG_DIR/boot.log"
fi
# ── 4. Redis (TCP only) ─────────────────────────────────────────────────────
# redis cap tightened on LOW_MEM (was 1gb → 256mb). Coordinator uses
# SQLite directly; redis is only a soft cache for work-queue priorities.
REDIS_MAX="${REDIS_MAX:-$([[ "$LOW_MEM" == "1" ]] && echo "256mb" || echo "1gb")}"
redis-server --daemonize yes --port 6379 --bind 127.0.0.1 \
--maxmemory "$REDIS_MAX" --maxmemory-policy allkeys-lru
sleep 1
redis-cli -h 127.0.0.1 -p 6379 ping >> "$LOG_DIR/redis.log" 2>&1
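# Runtime check of the cap (read-only):
#   redis-cli -h 127.0.0.1 info memory | grep -E '^(used_memory_human|maxmemory_human)'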
# ── 5. Ollama - DISABLED on cpu-basic (16 GB limit) ───────────────────────
# Root cause of 7-hr Runtime Error 2026-04-29: ollama loading qwen3-coder:30b
# (~17 GB Q4) + qwen2.5-coder:14b (~9 GB) + granite (~5 GB) = ~31 GB of model
# weights against a 16 GB cap → instant OOM on any inference.
#
# On cpu-basic the FREE LLM LADDER (cerebras/groq/openrouter/gemini/chutes)
# is faster anyway - wafer-scale inference beats CPU x86 by 50-200×.
# Ollama only worth running once Space upgrades to ≥cpu-upgrade (32 GB) OR
# moves to OCI A1.Flex anchor (24 GB ARM, native ollama support).
#
# Set LOW_MEM=0 to re-enable on bigger Space tier.
LOW_MEM="${LOW_MEM:-1}"
if [[ "$LOW_MEM" == "1" ]]; then
echo "[$(date +%H:%M:%S)] ⚠ ollama SKIPPED (LOW_MEM=1, cpu-basic 16 GB)" \
>> "$LOG_DIR/boot.log"
echo "[$(date +%H:%M:%S)] β†’ free LLM ladder serves all v2 inference" \
>> "$LOG_DIR/boot.log"
else
OLLAMA_MODELS="${HOME}/.ollama/models" \
OLLAMA_HOST=127.0.0.1:11434 \
nohup ollama serve > "$LOG_DIR/ollama.log" 2>&1 &
sleep 6
(
if ! ollama list 2>/dev/null | grep -q "nomic-embed-text"; then
ollama pull nomic-embed-text > "$LOG_DIR/ollama-pull-embed.log" 2>&1
fi
if ! ollama list 2>/dev/null | grep -q "qwen2.5-coder:3b"; then
# Smallest coder that's actually useful - fits any tier
ollama pull qwen2.5-coder:3b > "$LOG_DIR/ollama-pull-3b.log" 2>&1
fi
) &
fi
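# When ollama IS enabled, a quick smoke test against the local API (no auth):
#   curl -s http://127.0.0.1:11434/api/tags   # JSON list of pulled models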
# ── 6. Discord bot (only if egress to discord.com is reachable) ────────────
# HF Spaces free tier may block egress to discord.com - bot would crash-loop.
# Pre-flight check: if discord.com unreachable, skip bot, use webhook-only.
if [[ -n "${DISCORD_BOT_TOKEN:-}" ]]; then
if curl -sS -o /dev/null -w "%{http_code}" --max-time 6 https://discord.com 2>/dev/null | grep -qE "^(200|301|302|307|308)$"; then
set -a; source ~/.hermes/.env 2>/dev/null; set +a
nohup python ~/.surrogate/bin/hermes-discord-bot.py >> "$LOG_DIR/discord-bot.log" 2>&1 &
echo "[$(date +%H:%M:%S)] discord bot started (gateway reachable)"
else
echo "[$(date +%H:%M:%S)] discord.com unreachable β€” skipping bot, using webhook-only" >> "$LOG_DIR/boot.log"
fi
fi
# ── 7a. Continuous scrape daemon - concurrency tuned to LOW_MEM ────────────
SCRAPE_PARALLEL="${SCRAPE_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 8)}"
cat > /tmp/scrape-daemon.sh <<SCRAPESH
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="\${HOME}/.surrogate/logs/scrape-continuous.log"
mkdir -p "\$(dirname "\$LOG")"
while true; do
START=\$(date +%s)
bash ~/.surrogate/bin/domain-scrape-loop.sh 1500 ${SCRAPE_PARALLEL} >> "\$LOG" 2>&1
DUR=\$(( \$(date +%s) - START ))
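# Adaptive pacing: a fast exit usually means an empty queue (back off 30s);
# a long productive run restarts almost immediately.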
if [[ \$DUR -lt 30 ]]; then sleep 30
elif [[ \$DUR -lt 120 ]]; then sleep 15
else sleep 5
fi
done
SCRAPESH
chmod +x /tmp/scrape-daemon.sh
nohup /tmp/scrape-daemon.sh > "$LOG_DIR/scrape-daemon.log" 2>&1 &
echo "[$(date +%H:%M:%S)] scrape daemon parallel=${SCRAPE_PARALLEL} (LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"
# ── 7b. Agentic crawler - DISABLED on LOW_MEM (anchor takes this load) ─────
if [[ "$LOW_MEM" != "1" ]]; then
CRAWLER_PARALLEL="${CRAWLER_PARALLEL:-6}"
nohup bash ~/.surrogate/bin/agentic-crawler.sh "$CRAWLER_PARALLEL" \
> "$LOG_DIR/agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] agentic crawler parallel=$CRAWLER_PARALLEL" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] ⚠ agentic-crawler SKIPPED (LOW_MEM); anchor handles" >> "$LOG_DIR/boot.log"
fi
# ── 7b2. GitHub-specific agentic crawler (lightweight, keep on always) ─────
nohup bash ~/.surrogate/bin/github-agentic-crawler.sh > "$LOG_DIR/github-agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] github-agentic-crawler started" >> "$LOG_DIR/boot.log"
# ── 7b3. HF Dataset Discoverer - DISABLED (replaced by continuous-discoverer) ─
# Round 10 (a27499d): bin/v2/continuous-discoverer.sh covers HF + arxiv +
# Stack Exchange + GH trending in one daemon. Old hf-dataset-discoverer.sh
# is now redundant and adds memory pressure on cpu-basic.
echo "[$(date +%H:%M:%S)] ⚠ hf-dataset-discoverer SKIPPED (replaced by continuous-discoverer)" >> "$LOG_DIR/boot.log"
# ── 7e. auto-orchestrate-continuous - DISABLED on LOW_MEM (cron handles it) ─
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/auto-orchestrate-continuous.sh > "$LOG_DIR/auto-orchestrate-continuous.log" 2>&1 &
echo "[$(date +%H:%M:%S)] auto-orchestrate-continuous started (4 parallel workers)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] ⚠ auto-orchestrate-continuous SKIPPED (LOW_MEM); cron slot at M%20==0 covers it" >> "$LOG_DIR/boot.log"
fi
# ── 7e1. SELF-HEAL WATCHDOG - must start BEFORE memory-hungry workers ───────
# Monitors RAM usage every 60s; preempts youngest dataset-enrich shard if
# usage >= 85% to dodge the cpu-basic 16Gi OOM kill that would otherwise
# crash the entire container. Also restarts stuck ingest / kicks stale uploader.
nohup bash ~/.surrogate/bin/self-heal-watchdog.sh > "$LOG_DIR/self-heal-watchdog.log" 2>&1 &
echo "[$(date +%H:%M:%S)] self-heal-watchdog started (mem<85%, ingest<20m, push<10m)" >> "$LOG_DIR/boot.log"
# ── 7e2. GH-ACTIONS TICKER - burst-dispatch external runners every 60s ──────
# Fires workflow_dispatch on arkashira/ashiradevops-alt runner repos every
# 60s, bypassing GitHub's */5 cron minimum. Combined with 8-min runner
# timeouts, the 20-concurrent free-tier slot cap stays saturated.
# Skips silently if GH_TOKEN_ARKASHIRA / GH_TOKEN_DEVOPS aren't set as
# Space secrets - the operator can add them later without a restart.
nohup bash ~/.surrogate/bin/gh-actions-ticker.sh > "$LOG_DIR/gh-actions-ticker.log" 2>&1 &
echo "[$(date +%H:%M:%S)] gh-actions-ticker started (60s tick, dispatches arkashira+ashiradevops-alt)" >> "$LOG_DIR/boot.log"
# ── 7e3. LLM BURST GENERATOR - synthetic training pairs from 8 free LLMs ────
# Cerebras + Groq + OpenRouter + Gemini + Chutes + NV NIM + Samba + Kimi.
# Each cycle fires 3 prompts at every active provider in parallel, writes
# {prompt, response} pairs to training-pairs.jsonl. Combined free-tier
# budget: ~7000+ pairs/day. Skips any provider whose key env is not set.
if [[ "$LOW_MEM" != "1" ]]; then
nohup python3 ~/.surrogate/bin/llm-burst-generator.py > "$LOG_DIR/llm-burst-generator.log" 2>&1 &
echo "[$(date +%H:%M:%S)] llm-burst-generator started (8 LLM APIs in parallel, ~7K synthetic pairs/day)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] ⚠ llm-burst-generator SKIPPED (LOW_MEM); ZeroGPU synth-puller covers" >> "$LOG_DIR/boot.log"
fi
sleep 3 # Stagger spawns β€” avoid memory burst at boot
# ── 7f. PARALLEL BULK INGEST (slug-hash sharded; 6 shards on cpu-basic) ─────
# Was 16 shards but caused 'Memory limit exceeded (16Gi)' OOM. Each shard
# peaks ~1 GB while streaming via 'datasets' lib. Watchdog above provides
# a second safety net if peak still spikes.
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/bulk-ingest-parallel.sh > "$LOG_DIR/bulk-ingest-parallel.log" 2>&1 &
echo "[$(date +%H:%M:%S)] bulk-ingest-parallel started (6 shards, 293M total cap)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] ⚠ bulk-ingest-parallel SKIPPED (LOW_MEM); streaming-mirror-worker covers" >> "$LOG_DIR/boot.log"
fi
sleep 3
# ── 7g. PARQUET-DIRECT INGEST (skip 'datasets' library overhead, 5-10× faster) ──
# Downloads parquet shards directly via HF datasets-server API + pyarrow filter.
# Targets only trillion-scale corpora where streaming is too slow.
# DLs reduced to 2 parallel - combined with 6 ingest shards stays under 16Gi.
PARQUET_PARALLEL=2 nohup bash ~/.surrogate/bin/parquet-direct-ingest.sh > "$LOG_DIR/parquet-direct-ingest.log" 2>&1 &
echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$LOG_DIR/boot.log"
# ── 7c. Skill-synthesis daemon - DISABLED on LOW_MEM (heavy LLM calls) ────
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] ⚠ skill-synthesis SKIPPED (LOW_MEM); anchor's voyager-skills.py covers" >> "$LOG_DIR/boot.log"
fi
# ── 7d. Bulk mirror coordinator + 4 parallel workers ────────────────────────
# User feedback 2026-04-29 (translated from Thai): "every agent works
# together, and none re-visits the same places".
# Coordinator = SQLite claim queue (~/.surrogate/state/bulk-mirror-claims.db).
# Workers each pull next pending dataset, mirror+sanitize+dedup, mark done.
# 100+ massive datasets in bin/v2/bulk-datasets-massive.txt (code/security/SDLC/agent/etc).
# Lease-based claims (15 min) - crashes auto-expire so other workers pick up.
python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mirror-seed.log" 2>&1 || true
# Two worker types share the same coordinator queue:
# bulk-mirror-worker.sh - full-download, suits small/medium datasets
# streaming-mirror-worker.sh - HF datasets streaming, suits trillion-token
# LOW_MEM tuning for cpu-basic 16 GB Space (history):
# v1: 0 bulk + 2 stream (Round 9-10 OOM tightened to 0+2)
# v2: 0 bulk + 1 stream (Round 11-12 OOM further tightened)
# v3 NOW: 1 bulk + 3 stream (post Civo-pivot + 4-Space fan-out;
# anchor never came up so we can't rely on
# it for bulk, and 16 GB has ~8 GB unused
# under the v2 setting → reclaim it)
#
# Memory budget per Space (16 GB cpu-basic):
# ~6 GB reserved: OS + redis 256mb + continuous-discoverer +
# dataset-enrich + auto-startup-loop + push bursts
# ~10 GB available for harvest workers
# 3 stream × 500 MB + 1 bulk × 600 MB = 2.1 GB used
# ~8 GB headroom → memory-guard.sh kicks in at <3 GB free, safe
#
# Throughput delta: 4× workers/Space × 4 Spaces = 16× total worker count
# (vs previous 1×4 = 4). Combined with enrich cron M%30==5 (was M%60),
# expect 3-5× commit rate before HF soft-cap kicks in.
BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 3 || echo 6)}"
for i in $(seq 1 "$BULK_WORKERS"); do
nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
> "$LOG_DIR/bulk-worker-$i.log" 2>&1 &
done
for i in $(seq 1 "$STREAM_WORKERS"); do
nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-w$i" \
> "$LOG_DIR/stream-worker-$i.log" 2>&1 &
done
TOTAL_WORKERS=$((BULK_WORKERS + STREAM_WORKERS))
echo "[$(date +%H:%M:%S)] bulk-mirror coordinator + $BULK_WORKERS bulk + $STREAM_WORKERS streaming = $TOTAL_WORKERS workers (200+ datasets queued, LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"
# ── 7d2. Continuous multi-source dataset discoverer (boot daemon, never exits) ─
# Replaces the aggressive-harvester cron - always running, sweeps HF + arxiv + SE + GH.
if ! pgrep -f "continuous-discoverer.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/continuous-discoverer.sh \
> "$LOG_DIR/continuous-discoverer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] continuous-discoverer started (HF + arxiv + SE + GH, ~5min cycle)" >> "$LOG_DIR/boot.log"
fi
# ── Auto-startup-loop: 45 personae × 9 LoRA clusters × auto-commit + auto-push ─
# CEO/CTO/CMO/CFO/COO/PM/UX/Designer/SRE/DevOps/Marketing/SDR/AE/Growth/CS/Legal/HR/etc.
# 1 role per 15-min cycle; chained roles fire downstream automatically.
if ! pgrep -f "auto-startup-loop.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/auto-startup-loop.sh \
> "$LOG_DIR/auto-startup-loop.log" 2>&1 &
echo "[$(date +%H:%M:%S)] auto-startup-loop started (45 personae cycle 15min, chains, auto-commit)" >> "$LOG_DIR/boot.log"
fi
# ── 7h. Train-ready pusher - disabled at boot for now. Caused Space
# RUNTIME_ERROR on first deployment (2026-04-29). Script kept at
# bin/train-ready-pusher.sh; launch manually after the Space proves stable:
# nohup bash ~/.surrogate/bin/train-ready-pusher.sh > /tmp/trp.log 2>&1 &
# nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &
# ── 7i. Cron loop - non-scrape daemons (scrape now runs continuously above) ─
cat > /tmp/hermes-cron.sh <<'CRONSH'
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="${HOME}/.surrogate/logs/cron.log"
mkdir -p "$(dirname "$LOG")"
while true; do
M=$(($(date +%s) / 60))
# Cron offsets STAGGERED - minute=0 burst was OOM trigger.
# Each major task picks a unique M%X==N offset so no two fire together.
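# M counts whole minutes since the Unix epoch: M % 1440 == 240 is 04:00 UTC
# daily, M % 60 == 17 is :17 past each hour, and M % 10080 slots anchor to
# Thursday 00:00 UTC (the epoch began on a Thursday).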
[[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
[[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
# Auto-scaler β€” spawn/kill workers based on free memory tier (burst-but-don't-die)
[[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &
# synth-puller - hit surrogate1 ZeroGPU /api/synth_batch every 5 min
# Drains free PRO 25K min/mo into Magpie-style training pairs (16 domains rotate).
[[ $((M % 5)) -eq 3 ]] && bash ~/.surrogate/bin/v2/synth-puller.sh >> "$LOG" 2>&1 &
# push-training-to-hf gated by memory (loads big shard into RAM).
# Anchor (24 GB) takes over when capacity arrives - see anchor cron-loop.
[[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &
# auto-orchestrate now runs CONTINUOUSLY (4 parallel workers) - see step 7e above.
# Cron entry retained as a legacy single-fire boost; gated so it only fires
# when the continuous daemon is NOT already up:
[[ $((M % 20)) -eq 0 ]] && ! pgrep -f "auto-orchestrate-continuous" >/dev/null \
&& bash ~/.surrogate/bin/auto-orchestrate-loop.sh >> "$LOG" 2>&1 &
# Every 30 min: research-apply (pop queue β†’ orchestrate β†’ ship feature)
[[ $((M % 30)) -eq 15 ]] && bash ~/.surrogate/bin/surrogate-research-apply.sh >> "$LOG" 2>&1 &
# Every 60 min: keyword tuner (adapts scrape queue based on yields)
[[ $((M % 60)) -eq 4 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &
# Every 6 hours: research-loop (discover new features from competitors/papers)
[[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/surrogate-research-loop.sh >> "$LOG" 2>&1 &
# Every 30 min: dataset enrich (was 60 min - bumped 2× now that we have
# 4 Spaces × (3 stream + 1 bulk) = 16 workers harvesting in parallel,
# producing more chunks per hour than the old 60-min push could drain).
# Memory-guarded - full HF Hub iter is heavy.
[[ $((M % 30)) -eq 5 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/dataset-enrich.sh >> "$LOG" 2>&1 &
# Every 15 min: self-ingest training-pairs into the FTS index (closes the self-improvement loop)
[[ $((M % 15)) -eq 3 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &
# Every 30 min: build vector embeddings index (RAG semantic search)
[[ $((M % 30)) -eq 12 ]] && bash ~/.surrogate/bin/rag-vector-builder.sh >> "$LOG" 2>&1 &
# Every 30 min: synthetic data generation (REWORK→APPROVE DPO + distilabel rewrite)
[[ $((M % 30)) -eq 7 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/synthetic-data-from-rework.sh >> "$LOG" 2>&1 &
# Daily 04:00 UTC: refresh CVE feed (NVD + CISA KEV) β†’ security-knowledge dataset
[[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &
# Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems)
[[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
# Daily 06:00 UTC: LLM-expand role keywords (sends each role's skills to
# Cerebras/Groq → +80 specific job-description-style search terms each).
# Discoverer auto-uses the expanded list on its next cycle.
[[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &
# Every 90 min: kick a Kaggle T4 LoRA training run on the latest dataset
# slice. Free Kaggle quota = 30 hr/week per account; one full run = 4-6 hr,
# so we DO want to keep submitting - Kaggle queues if 1 already running,
# auto-cancels older if 5+ pending. With shorter interval we keep the
# GPU pipeline saturated.
[[ $((M % 90)) -eq 5 ]] && bash ~/.surrogate/bin/kaggle-trainer.sh >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
# Every 6 hr: Lightning AI H200 training run (free 4hr H200 quota = ~13/mo).
# H200 141 GB VRAM fits Qwen3-Coder-480B-A35B QLoRA - biggest free training.
[[ $((M % 360)) -eq 45 ]] && bash ~/.surrogate/bin/lightning-trainer.sh >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
# ── Round 5 (2026-04) sustainability loops ──────────────────────────
# Every 6 hr (offset 90): self-improve loop - gen problems, judge,
# winners → training data, losers → reflexion-store.
[[ $((M % 360)) -eq 90 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/v2/self-improve-loop.sh >> "$LOG_DIR/self-improve.log" 2>&1 &
# Every 30 min (offset 22): mine new tool-call traces from logs into
# SFT + DPO data, plus voyager skill candidates.
[[ $((M % 30)) -eq 22 ]] && python3 ~/.surrogate/bin/v2/tool-trace-collector.py >> "$LOG_DIR/tool-trace.log" 2>&1 &
# Every 60 min (offset 17): export promoted voyager skills to JSONL
# (training-data slice + inference-time retrieval source).
[[ $((M % 60)) -eq 17 ]] && python3 ~/.surrogate/bin/v2/voyager-skills.py export >> "$LOG_DIR/voyager.log" 2>&1 &
# Daily 07:00 UTC: active-learning batch from one bulk-mirror file.
# Skips silently if no pool yet.
[[ $((M % 1440)) -eq 420 ]] && {
POOL=$(ls -t "$DATA"/bulk-mirror/*.jsonl 2>/dev/null | head -1)
[[ -n "$POOL" ]] && python3 ~/.surrogate/bin/v2/active-learning.py \
--pool "$POOL" --n 200 --scan 1500 \
>> "$LOG_DIR/active-learning.log" 2>&1 &
}
# Daily 08:00 UTC: constitutional self-critique on yesterday's
# winners (pulls latest self-improve winners file).
[[ $((M % 1440)) -eq 480 ]] && {
WIN=$(ls -t "$DATA"/v2/self-improve/winners-*.jsonl 2>/dev/null | head -1)
[[ -n "$WIN" ]] && python3 ~/.surrogate/bin/v2/constitutional-loop.py \
--input "$WIN" --n 200 \
>> "$LOG_DIR/constitutional.log" 2>&1 &
}
# ── Round 7+8 (2026-04-30) - trillion-scale + harvester + enrich ──────
# Every 30 min (offset 9): aggressive HF dataset discoverer (70-keyword sweep)
[[ $((M % 30)) -eq 9 ]] && bash ~/.surrogate/bin/v2/aggressive-harvester.sh \
>> "$LOG_DIR/aggressive-harvester.log" 2>&1 &
# Every 60 min (offset 35): enrich newly-mirrored bulk files
[[ $((M % 60)) -eq 35 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/v2/enrich-pipeline.sh \
>> "$LOG_DIR/enrich-pipeline.log" 2>&1 &
# Every 30 min (offset 25): spawn extra streaming worker if pool empty
[[ $((M % 30)) -eq 25 ]] && {
if ! pgrep -f "streaming-mirror-worker.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-cron-$(date +%s)" \
> "$LOG_DIR/stream-worker-cron.log" 2>&1 &
fi
}
# Daily 09:00 UTC: teachable-prompt filter on harvested data
[[ $((M % 1440)) -eq 540 ]] && {
LATEST=$(ls -t "$DATA"/v2/enriched/*.jsonl 2>/dev/null | head -1)
[[ -n "$LATEST" ]] && python3 ~/.surrogate/bin/v2/teachable-prompt-filter.py \
--input "$LATEST" --out "$DATA"/v2/teachable-$(date +%Y%m%d).jsonl \
--n 1000 --keep-target 200 \
>> "$LOG_DIR/teachable.log" 2>&1 &
}
# Daily 11:00 UTC: regression test suite (catches breakage post-push)
[[ $((M % 1440)) -eq 660 ]] && bash ~/.surrogate/bin/v2/regression-test.sh --quick \
>> "$LOG_DIR/regression.log" 2>&1 &
# Weekly Thu 10:00 UTC (M%10080 anchors to the epoch's Thursday): abstract-cot compress reasoning data
[[ $((M % 10080)) -eq 600 ]] && {
for f in "$DATA"/v2/verify-traces.jsonl "$DATA"/v2/self-improve/winners-*.jsonl; do
[[ -f "$f" ]] || continue
python3 ~/.surrogate/bin/v2/abstract-cot-compressor.py \
--input "$f" --out "${f%.jsonl}-compressed.jsonl" \
>> "$LOG_DIR/abstract-cot.log" 2>&1
done
}
sleep 60
done
CRONSH
chmod +x /tmp/hermes-cron.sh
nohup /tmp/hermes-cron.sh > "$LOG_DIR/cron-master.log" 2>&1 &
echo "[$(date +%H:%M:%S)] cron loop started" >> "$LOG_DIR/boot.log"
# ── 8. Status HTTP server on :7860 (FastAPI/uvicorn - robust binding) ──────
set +x # silence trace for clean uvicorn logs
echo "[$(date +%H:%M:%S)] starting status server :7860" | tee -a "$LOG_DIR/boot.log"
# Verify deps before exec - print what's missing rather than silent crash
python3 -c "import fastapi, uvicorn; print(f' fastapi {fastapi.__version__} + uvicorn {uvicorn.__version__} ok')" || {
echo "❌ fastapi/uvicorn not importable - falling back to plain http.server"
exec python3 -m http.server 7860 --bind 0.0.0.0
}
# Run as PID 1 - uvicorn handles signals + auto-restart on crash
exec python3 ~/.surrogate/bin/hermes-status-server.py