#!/usr/bin/env bash
# Hermes start orchestrator for HF Space.
# Boots: persistent /data mount → Redis → Ollama → axentx repos → daemons → status server.
set -uo pipefail

LOG_DIR="${HOME}/.surrogate/logs"
mkdir -p "$LOG_DIR"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start" >> "$LOG_DIR/boot.log"

# Trace mode for the early steps only (no secrets loaded yet) — locates hang points safely.
PS4='[trace ${LINENO}] '
set -x

# Mirror stdout/stderr into boot.log so HF run-logs show progress (safe steps before .env is loaded).
exec > >(tee -a "$LOG_DIR/boot.log") 2>&1

# ── Memory mode (must be set BEFORE any reference; we run under `set -u`) ───────
# CPU-Basic Space = 16 GB cap. With LOW_MEM=1 we skip the heavy harvest
# launchers (dataset-enrich, dataset-mirror, kaggle-trainer, lightning-trainer,
# dedup-bootstrap) — those run on GCP daemons instead. Set LOW_MEM=0 only
# on a paid Space tier (cpu-upgrade, ≥32 GB).
LOW_MEM="${LOW_MEM:-1}"

# ── 1. Persistent data — symlink state subdirs to /data (HF persistent mount) ──
# bin/ is NOT persisted (baked into the image, refreshed on every push).
# Persisted: state (DBs), logs, memory, skills, sessions, training pairs,
# workspace (hermes runtime), projects (axentx clones), ollama (model cache).
DATA="/data"
if [[ -d "$DATA" ]] && [[ -w "$DATA" ]]; then
  mkdir -p "$DATA"/{state,logs,memory,skills,sessions,workspace,projects,ollama,training,reflexion,index}

  # Migrate from any older layout (one-time): if /data/surrogate/state exists, move it up one level.
  if [[ -d "$DATA/surrogate/state" ]] && [[ ! -L "$DATA/state" ]]; then
    mv "$DATA/surrogate"/* "$DATA/" 2>/dev/null || true
    rmdir "$DATA/surrogate" 2>/dev/null || true
  fi

  # Each spec is "symlink_path:backing_dir" — the symlink lives under $HOME,
  # the backing directory on the persistent /data mount.
  for spec in \
    "${HOME}/.surrogate/state:${DATA}/state" \
    "${HOME}/.surrogate/logs:${DATA}/logs" \
    "${HOME}/.surrogate/memory:${DATA}/memory" \
    "${HOME}/.surrogate/skills:${DATA}/skills" \
    "${HOME}/.surrogate/sessions:${DATA}/sessions" \
    "${HOME}/.hermes/workspace:${DATA}/workspace" \
    "${HOME}/.ollama:${DATA}/ollama"; do
    link_path="${spec%%:*}"
    backing="${spec##*:}"
    mkdir -p "$(dirname "$link_path")"
    # Always ensure the backing directory exists and is writable. If the
    # persistent /data mount becomes unavailable mid-run, daemon writes to the
    # symlinked path fail with Errno 5 I/O errors (audit 2026-04-29).
    # Recreating the link defensively on every boot fixes stale-symlink cases.
    mkdir -p "$backing" 2>/dev/null || true
    if [[ ! -L "$link_path" ]] || [[ ! -d "$link_path" ]]; then
      # Either not a symlink, or a broken symlink (backing dir unreachable).
      rm -rf "$link_path" 2>/dev/null
      ln -sfn "$backing" "$link_path"
    fi
    # Final sanity probe — write a marker file; if that fails, the persistent
    # mount is broken regardless of the symlink, so log loudly.
    if ! touch "$link_path/.boot-marker" 2>/dev/null; then
      echo "[$(date +%H:%M:%S)] ⚠ FATAL: $link_path/ not writable — daemon log writes will hit Errno 5"
    fi
  done

  # training-pairs.jsonl — single-file persistence.
  if [[ ! -L "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
    rm -f "${HOME}/.surrogate/training-pairs.jsonl" 2>/dev/null
    touch "${DATA}/training-pairs.jsonl"
    ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
  fi
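  # Quick manual probe of the persistence wiring (comments only — run these by
  # hand in a Space shell if /data trouble recurs; not executed at boot):
  #   ls -ld ~/.surrogate/state           # expect: ... state -> /data/state
  #   touch /data/state/.probe && rm /data/state/.probe   # backing dir writable?
  #   readlink ~/.surrogate/training-pairs.jsonl          # expect: /data/training-pairs.jsonl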
  # ── One-time offset reset: skip the polluted agentic-crawler placeholder backlog ──
  if [[ ! -f "${HOME}/.surrogate/.offset-reset-done" ]] && [[ -f "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
    CUR=$(wc -l < "${HOME}/.surrogate/training-pairs.jsonl" | tr -d ' ')
    echo "$CUR" > "${HOME}/.surrogate/.training-push-offset"
    echo "$CUR" > "${HOME}/.surrogate/.self-ingest-offset"
    touch "${HOME}/.surrogate/.offset-reset-done"
    echo "[$(date +%H:%M:%S)] one-time offset reset → $CUR (skip placeholder backlog)" >> "$LOG_DIR/boot.log"
  fi

  # ── Boot-time dedup.db corruption check ──────────────────────────────
  # 16 parallel shards previously corrupted the SQLite WAL. If the DB is
  # unreadable on boot, back it up and force a re-bootstrap from scratch.
  DEDUP_DB="${HOME}/.surrogate/state/dedup.db"
  if [[ -f "$DEDUP_DB" ]]; then
    if ! sqlite3 "$DEDUP_DB" "SELECT 1 FROM seen_hashes LIMIT 1" >/dev/null 2>&1; then
      TS=$(date +%s)
      mv "$DEDUP_DB" "${DEDUP_DB}.corrupt-${TS}.bak" 2>/dev/null
      rm -f "${DEDUP_DB}-wal" "${DEDUP_DB}-shm"
      rm -f "${HOME}/.surrogate/.dedup-bootstrap-done"
      echo "[$(date +%H:%M:%S)] WIPED corrupt dedup.db → ${DEDUP_DB}.corrupt-${TS}.bak (forcing re-bootstrap)" >> "$LOG_DIR/boot.log"
    fi
  fi

  # ── Heavy harvest launchers — only on HIGH_MEM (LOW_MEM=0) ───────────
  # On CPU-Basic (16 GB cap), launching 5 background bash jobs + uvicorn +
  # 5 harvest workers blew through the cap and HF auto-killed the container
  # ~5 min after boot. These launchers are now scheduled on GCP via
  # hermes-scheduler-daemon (entries in data/hermes-jobs.json), so harvest
  # still runs — just not inside the Space's RAM. Re-enable in-Space by
  # setting LOW_MEM=0 once we upgrade to a ≥32 GB tier.
  if [[ "$LOW_MEM" != "1" ]]; then
    # ── One-time central dedup bootstrap from existing data ──────────
    if [[ ! -f "${HOME}/.surrogate/.dedup-bootstrap-done" ]]; then
      echo "[$(date +%H:%M:%S)] running central dedup bootstrap (one-time)" >> "$LOG_DIR/boot.log"
      nohup bash "${HOME}/.surrogate/bin/dedup-bootstrap.sh" > "$LOG_DIR/dedup-bootstrap.log" 2>&1 &
    fi

    # ── BOOT-TIME enrich kickoff (trigger an immediate pull; don't wait for cron)
    nohup bash "${HOME}/.surrogate/bin/dataset-enrich.sh" >> "$LOG_DIR/dataset-enrich.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time dataset-enrich kicked off" >> "$LOG_DIR/boot.log"

    # ── BOOT-TIME kaggle-trainer kickoff (don't wait for the 90-min cron) ─
    nohup bash "${HOME}/.surrogate/bin/kaggle-trainer.sh" >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time kaggle-trainer kicked off" >> "$LOG_DIR/boot.log"

    # ── BOOT-TIME lightning-trainer kickoff — 4 free H200 hours for the big model
    nohup bash "${HOME}/.surrogate/bin/lightning-trainer.sh" >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time lightning-trainer kicked off (H200 4hr quota)" >> "$LOG_DIR/boot.log"

    # ── BOOT-TIME dataset-mirror — bulk-clone top community SFT mixes ─
    nohup bash "${HOME}/.surrogate/bin/dataset-mirror.sh" >> "$LOG_DIR/dataset-mirror.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time dataset-mirror kicked off (30 community sources)" >> "$LOG_DIR/boot.log"
  else
    echo "[$(date +%H:%M:%S)] LOW_MEM=1 → skipped 5 heavy harvest launchers (delegated to GCP daemons)" >> "$LOG_DIR/boot.log"
  fi

  echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] WARN: /data not writable — running ephemeral!" >> "$LOG_DIR/boot.log"
fi
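# If the dedup.db corruption check above keeps firing, a stricter probe than
# the one-row SELECT is SQLite's own page walk (comments only, not executed at
# boot; quick_check skips index-order checks, so it is much faster than
# a full integrity_check):
#   sqlite3 "$DEDUP_DB" "PRAGMA quick_check;" | grep -qx ok || echo "dedup.db corrupt"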
# ── 2. Bind HF Space secrets → ~/.hermes/.env ───────────────────────────────
# 🔒 DISABLE the shell trace before touching secret values.
set +x
echo "[$(date +%H:%M:%S)] writing ~/.hermes/.env from secret env vars (trace OFF)"
mkdir -p ~/.hermes
{
  echo "# Auto-generated from HF Space secrets at boot"
  for k in OPENROUTER_API_KEY GEMINI_API_KEY GEMINI_API_KEY_2 \
           GITHUB_TOKEN GITHUB_TOKEN_POOL DISCORD_BOT_TOKEN DISCORD_WEBHOOK \
           CEREBRAS_API_KEY GROQ_API_KEY SAMBANOVA_API_KEY \
           CLOUDFLARE_API_KEY NVIDIA_API_KEY CHUTES_API_KEY ANTHROPIC_API_KEY \
           HF_TOKEN HUGGING_FACE_HUB_TOKEN; do
    v="${!k:-}"                      # indirect expansion: value of the var named by $k
    [[ -n "$v" ]] && echo "${k}=${v}"
  done
} > ~/.hermes/.env
chmod 600 ~/.hermes/.env
# grep -c '=' counts only KEY=VALUE lines, excluding the header comment.
echo "[$(date +%H:%M:%S)] .env written ($(grep -c '=' ~/.hermes/.env) keys, perms 600)"
# Trace stays OFF for the rest of boot — the traced line numbers above already
# bracket any early hang, and post-secrets tracing would leak key values.

# ── LOW_MEM short-circuit — skip ALL background daemons, exec the status server ──
# The CPU-Basic Space cap is 16 GB. Even after gating the 5 boot-time harvest
# launchers, the Space kept hitting the 16 GB cap and hanging at the HTTP
# layer every ~30-40 min. Investigation found 15+ MORE nohup'd background
# daemons below this point (scrape, agentic-crawler, github-crawler,
# self-heal, cron loop, bulk-mirror workers, streaming-mirror workers,
# parquet-ingest, etc.) that collectively grow into the cap within an hour.
#
# In LOW_MEM=1 mode the Space's only job is the FastAPI status server on
# :7860 that serves harvest-cursor advances to remote workers. Everything
# else (harvest, mirroring, agent pipeline, training pushes, dataset enrich)
# now runs on the GCP daemon fleet — see hermes-jobs.json (171 jobs scheduled
# via hermes-scheduler-daemon as of 2026-05-02).
#
# Set LOW_MEM=0 to re-enable in-Space launchers on a paid tier (≥32 GB).
if [[ "$LOW_MEM" == "1" ]]; then
  echo "[$(date +%H:%M:%S)] LOW_MEM=1 → skipping all bg daemons + cron, going straight to :7860 status server" | tee -a "$LOG_DIR/boot.log"
  # Verify deps before exec — print what's missing rather than crashing silently.
  if python3 -c "import fastapi, uvicorn" 2>/dev/null; then
    echo "[$(date +%H:%M:%S)] starting uvicorn :7860 (LOW_MEM fast-path)" | tee -a "$LOG_DIR/boot.log"
    exec python3 ~/.surrogate/bin/hermes-status-server.py
  else
    echo "❌ fastapi/uvicorn not importable — falling back to plain http.server"
    exec python3 -m http.server 7860 --bind 0.0.0.0
  fi
fi

# ── 3. Git config + clone axentx repos for auto-orchestrate auto-commit ────
# Disable interactive prompts globally so failed-auth git ops fail fast.
export GIT_TERMINAL_PROMPT=0
export GIT_ASKPASS=/bin/true
GH_TOKEN=$(echo "${GITHUB_TOKEN_POOL:-}" | cut -d',' -f1)
if [[ -n "$GH_TOKEN" ]]; then
  git config --global user.email "hermes@axentx.ai"
  git config --global user.name "Hermes (Surrogate-1)"
  git config --global init.defaultBranch main
  git config --global pull.rebase true
  git config --global push.default current

  PROJECTS_DIR="${DATA}/projects"
  mkdir -p "$PROJECTS_DIR"
  rm -rf ~/axentx 2>/dev/null
  ln -sfn "$PROJECTS_DIR" ~/axentx
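  # GH_TOKEN above pins the FIRST entry of the comma-separated pool. An
  # illustrative rotation variant if one token keeps rate-limiting
  # (hypothetical — comments only, not enabled here):
  #   IFS=',' read -ra POOL <<< "${GITHUB_TOKEN_POOL:-}"
  #   (( ${#POOL[@]} )) && GH_TOKEN="${POOL[$(( 10#$(date +%j) % ${#POOL[@]} ))]}"   # rotate daily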
  # Clone axentx repos in the background with a hard timeout — never block boot.
  # Verified 2026-05-02 via gh api: 5 of 6 entries had the wrong org/name and
  # were silently 404'ing (arkashira/* — only surrogate-1-harvest lives there;
  # the rest are private under the axentx org). The agent pipeline's dev/qa/
  # reviewer/commit daemons sat idle for a full day because no repo had been
  # cloned for them to work on. Real paths confirmed via gh api repos/<org>/<name>:
  #   axentx/{Costinel,vanguard,airship,workio,axiomops,surrogate-1} → 200
  #   arkashira/{Costinel,vanguard,arkship,surrogate,workio,hermes-toolbelt} → 404
  # Note: 'arkship' was a typo for 'airship' (axentx/airship).
  for repo_spec in \
    "Costinel:axentx/Costinel" \
    "vanguard:axentx/vanguard" \
    "airship:axentx/airship" \
    "workio:axentx/workio" \
    "axiomops:axentx/axiomops" \
    "surrogate-1:axentx/surrogate-1"; do
    local_name="${repo_spec%%:*}"
    gh_path="${repo_spec##*:}"
    target="${PROJECTS_DIR}/${local_name}"
    (
      if [[ ! -d "$target/.git" ]]; then
        echo "[$(date +%H:%M:%S)] cloning $gh_path..." >> "$LOG_DIR/boot.log"
        timeout 30 git clone --depth 50 \
          "https://x-access-token:${GH_TOKEN}@github.com/${gh_path}.git" "$target" \
          >> "$LOG_DIR/git-clone.log" 2>&1 || \
          echo "[$(date +%H:%M:%S)] WARN: clone $gh_path failed/timed out" >> "$LOG_DIR/boot.log"
      else
        cd "$target" && timeout 20 git pull --rebase >> "$LOG_DIR/git-pull.log" 2>&1 || true
      fi
    ) &
  done
  # Don't wait — clones finish in the background while boot continues.

  # Persist the token for any push from auto-orchestrate.
  git config --global credential.helper "store --file=$HOME/.git-credentials"
  echo "https://x-access-token:${GH_TOKEN}@github.com" > ~/.git-credentials
  chmod 600 ~/.git-credentials
  echo "[$(date +%H:%M:%S)] git auth configured + clone jobs spawned" >> "$LOG_DIR/boot.log"
fi

# ── 4. Redis (TCP only) ─────────────────────────────────────────────────────
# Redis cap tightened on LOW_MEM (was 1gb → 256mb). The coordinator uses
# SQLite directly; redis is only a soft cache for work-queue priorities.
REDIS_MAX="${REDIS_MAX:-$([[ "$LOW_MEM" == "1" ]] && echo "256mb" || echo "1gb")}"
redis-server --daemonize yes --port 6379 --bind 127.0.0.1 \
  --maxmemory "$REDIS_MAX" --maxmemory-policy allkeys-lru
sleep 1
redis-cli -h 127.0.0.1 -p 6379 ping >> "$LOG_DIR/redis.log" 2>&1

# ── 5. Ollama — DISABLED on cpu-basic (16 GB limit) ───────────────────────
# Root cause of the 7-hr Runtime Error on 2026-04-29: ollama loading
# qwen3-coder:30b (~17 GB Q4) + qwen2.5-coder:14b (~9 GB) + granite (~5 GB)
# = ~31 GB of model weights against a 16 GB cap → instant OOM on any inference.
#
# On cpu-basic the FREE LLM LADDER (cerebras/groq/openrouter/gemini/chutes)
# is faster anyway — wafer-scale inference beats CPU x86 by 50-200×.
# Ollama is only worth running once the Space upgrades to ≥cpu-upgrade (32 GB)
# OR moves to the OCI A1.Flex anchor (24 GB ARM, native ollama support).
#
# Set LOW_MEM=0 to re-enable on a bigger Space tier.
if [[ "$LOW_MEM" == "1" ]]; then
  echo "[$(date +%H:%M:%S)] ⚠ ollama SKIPPED (LOW_MEM=1, cpu-basic 16 GB)" \
    >> "$LOG_DIR/boot.log"
  echo "[$(date +%H:%M:%S)] → free LLM ladder serves all v2 inference" \
    >> "$LOG_DIR/boot.log"
else
  OLLAMA_MODELS="${HOME}/.ollama/models" \
  OLLAMA_HOST=127.0.0.1:11434 \
    nohup ollama serve > "$LOG_DIR/ollama.log" 2>&1 &
  sleep 6
  (
    if ! ollama list 2>/dev/null | grep -q "nomic-embed-text"; then
      ollama pull nomic-embed-text > "$LOG_DIR/ollama-pull-embed.log" 2>&1
    fi
    if ! ollama list 2>/dev/null | grep -q "qwen2.5-coder:3b"; then
      # Smallest coder that's actually useful — fits any tier.
      ollama pull qwen2.5-coder:3b > "$LOG_DIR/ollama-pull-3b.log" 2>&1
    fi
  ) &
fi
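# The ollama branch above is all-or-nothing on LOW_MEM. A finer-grained guard,
# if this ever re-enables on an unknown tier, could gate on MemTotal
# (illustrative sketch — comments only, not executed):
#   mem_gb=$(( $(awk '/MemTotal/ {print $2}' /proc/meminfo) / 1024 / 1024 ))
#   (( mem_gb >= 32 )) || echo "tier too small (${mem_gb} GB) for 30b-class pulls"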
# ── 6. Discord bot (only if egress to discord.com is reachable) ────────────
# The HF Spaces free tier may block egress to discord.com — the bot would
# crash-loop. Pre-flight check: if discord.com is unreachable, skip the bot
# and fall back to webhook-only notifications.
if [[ -n "${DISCORD_BOT_TOKEN:-}" ]]; then
  if curl -sS -o /dev/null -w "%{http_code}" --max-time 6 https://discord.com 2>/dev/null \
      | grep -qE "^(200|301|302|307|308)$"; then
    set -a; source ~/.hermes/.env 2>/dev/null; set +a
    nohup python ~/.surrogate/bin/hermes-discord-bot.py >> "$LOG_DIR/discord-bot.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] discord bot started (gateway reachable)"
  else
    echo "[$(date +%H:%M:%S)] discord.com unreachable — skipping bot, using webhook-only" >> "$LOG_DIR/boot.log"
  fi
fi

# ── 7a. Continuous scrape daemon — concurrency tuned to LOW_MEM ────────────
SCRAPE_PARALLEL="${SCRAPE_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 8)}"
# Unquoted delimiter: ${SCRAPE_PARALLEL} expands at write time; the \$-escaped
# variables expand when the daemon runs.
cat > /tmp/scrape-daemon.sh <<SCRAPESH
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="\${HOME}/.surrogate/logs/scrape-continuous.log"
mkdir -p "\$(dirname "\$LOG")"
while true; do
  START=\$(date +%s)
  bash ~/.surrogate/bin/domain-scrape-loop.sh 1500 ${SCRAPE_PARALLEL} >> "\$LOG" 2>&1
  # Adaptive back-off: a quick exit means an empty queue — sleep longer.
  DUR=\$(( \$(date +%s) - START ))
  if [[ \$DUR -lt 30 ]]; then
    sleep 30
  elif [[ \$DUR -lt 120 ]]; then
    sleep 15
  else
    sleep 5
  fi
done
SCRAPESH
chmod +x /tmp/scrape-daemon.sh
nohup /tmp/scrape-daemon.sh > "$LOG_DIR/scrape-daemon.log" 2>&1 &
echo "[$(date +%H:%M:%S)] scrape daemon parallel=${SCRAPE_PARALLEL} (LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"

# ── 7b. Agentic crawler — DISABLED on LOW_MEM (the anchor takes this load) ─────
if [[ "$LOW_MEM" != "1" ]]; then
  CRAWLER_PARALLEL="${CRAWLER_PARALLEL:-6}"
  nohup bash ~/.surrogate/bin/agentic-crawler.sh "$CRAWLER_PARALLEL" \
    > "$LOG_DIR/agentic-crawler.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] agentic crawler parallel=$CRAWLER_PARALLEL" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ agentic-crawler SKIPPED (LOW_MEM); anchor handles it" >> "$LOG_DIR/boot.log"
fi

# ── 7b2. GitHub-specific agentic crawler (lightweight, always on) ──────────
nohup bash ~/.surrogate/bin/github-agentic-crawler.sh > "$LOG_DIR/github-agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] github-agentic-crawler started" >> "$LOG_DIR/boot.log"

# ── 7b3. HF Dataset Discoverer — DISABLED (replaced by continuous-discoverer) ─
# Round 10 (a27499d): bin/v2/continuous-discoverer.sh covers HF + arxiv +
# Stack Exchange + GH trending in one daemon. The old hf-dataset-discoverer.sh
# is now redundant and pure memory pressure on cpu-basic.
echo "[$(date +%H:%M:%S)] ⚠ hf-dataset-discoverer SKIPPED (replaced by continuous-discoverer)" >> "$LOG_DIR/boot.log"

# ── 7e. auto-orchestrate-continuous — DISABLED on LOW_MEM (cron handles it) ─
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/auto-orchestrate-continuous.sh > "$LOG_DIR/auto-orchestrate-continuous.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] auto-orchestrate-continuous started (4 parallel workers)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ auto-orchestrate-continuous SKIPPED (LOW_MEM); cron slot at M%20==0 covers it" >> "$LOG_DIR/boot.log"
fi

# ── 7e1. SELF-HEAL WATCHDOG — must start BEFORE the memory-hungry workers ───────
# Monitors RAM usage every 60 s; preempts the youngest dataset-enrich shard if
# usage ≥85%, to dodge the cpu-basic 16Gi OOM kill that would otherwise crash
# the whole container. Also restarts stuck ingest / kicks a stale uploader.
nohup bash ~/.surrogate/bin/self-heal-watchdog.sh > "$LOG_DIR/self-heal-watchdog.log" 2>&1 &
echo "[$(date +%H:%M:%S)] self-heal-watchdog started (mem<85%, ingest<20m, push<10m)" >> "$LOG_DIR/boot.log"
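# Minimal sketch of the ≥85% preemption the watchdog performs (assumed shape —
# the real logic lives in self-heal-watchdog.sh; comments only, not executed):
#   used_pct=$(free | awk '/^Mem:/ {printf "%d", $3*100/$2}')
#   (( used_pct >= 85 )) && pkill -n -f dataset-enrich   # -n = newest matching process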
# ── 7e2. GH-ACTIONS TICKER — burst-dispatch external runners every 60 s ──────
# Fires workflow_dispatch on the arkashira/ashiradevops-alt runner repos every
# 60 s, bypassing GitHub's */5 cron minimum. Combined with 8-min runner
# timeouts, the 20-concurrent free-tier slot cap stays saturated.
# Skips silently if GH_TOKEN_ARKASHIRA / GH_TOKEN_DEVOPS aren't set as
# Space secrets — the operator can add them later without a restart.
nohup bash ~/.surrogate/bin/gh-actions-ticker.sh > "$LOG_DIR/gh-actions-ticker.log" 2>&1 &
echo "[$(date +%H:%M:%S)] gh-actions-ticker started (60s tick, dispatches arkashira+ashiradevops-alt)" >> "$LOG_DIR/boot.log"

# ── 7e3. LLM BURST GENERATOR — synthetic training pairs from 8 free LLMs ────
# Cerebras + Groq + OpenRouter + Gemini + Chutes + NV NIM + Samba + Kimi.
# Each cycle fires 3 prompts at every active provider in parallel and writes
# {prompt, response} pairs to training-pairs.jsonl. Combined free-tier
# budget: ~7000+ pairs/day. Skips any provider whose key env var is unset.
if [[ "$LOW_MEM" != "1" ]]; then
  nohup python3 ~/.surrogate/bin/llm-burst-generator.py > "$LOG_DIR/llm-burst-generator.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] llm-burst-generator started (8 LLM APIs in parallel, ~7K synthetic pairs/day)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ llm-burst-generator SKIPPED (LOW_MEM); ZeroGPU synth-puller covers it" >> "$LOG_DIR/boot.log"
fi

sleep 3  # Stagger spawns — avoid a memory burst at boot.

# ── 7f. PARALLEL BULK INGEST (slug-hash sharded; 6 shards on cpu-basic) ─────
# Was 16 shards, which caused 'Memory limit exceeded (16Gi)' OOM. Each shard
# peaks at ~1 GB while streaming via the 'datasets' lib. The watchdog above
# provides a second safety net if peaks still spike.
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/bulk-ingest-parallel.sh > "$LOG_DIR/bulk-ingest-parallel.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] bulk-ingest-parallel started (6 shards, 293M total cap)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ bulk-ingest-parallel SKIPPED (LOW_MEM); streaming-mirror-worker covers it" >> "$LOG_DIR/boot.log"
fi
sleep 3

# ── 7g. PARQUET-DIRECT INGEST (skips 'datasets' library overhead; 5-10× faster) ──
# Downloads parquet shards directly via the HF datasets-server API + a pyarrow
# filter. Targets only trillion-scale corpora where streaming is too slow.
# Parallel DLs reduced to 2 — combined with the 6 ingest shards this stays under 16Gi.
PARQUET_PARALLEL=2 nohup bash ~/.surrogate/bin/parquet-direct-ingest.sh > "$LOG_DIR/parquet-direct-ingest.log" 2>&1 &
echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$LOG_DIR/boot.log"

# ── 7c. Skill-synthesis daemon — DISABLED on LOW_MEM (heavy LLM calls) ────
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ skill-synthesis SKIPPED (LOW_MEM); anchor's voyager-skills.py covers it" >> "$LOG_DIR/boot.log"
fi
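# Sketch of the datasets-server call pattern that 7g above relies on (assumed
# response shape of the public HF endpoint; the real logic lives in
# parquet-direct-ingest.sh — comments only, not executed):
#   curl -s "https://datasets-server.huggingface.co/parquet?dataset=<slug>" \
#     | jq -r '.parquet_files[].url' | head -2    # first two shard URLs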
# ── 7d. Bulk mirror coordinator + parallel workers ──────────────────────────
# User feedback 2026-04-29 (translated from Thai): "all agents work together,
# and none revisits the same places."
# Coordinator = SQLite claim queue (~/.surrogate/state/bulk-mirror-claims.db).
# Each worker pulls the next pending dataset, mirrors+sanitizes+dedups it, and
# marks it done. 100+ massive datasets are listed in
# bin/v2/bulk-datasets-massive.txt (code/security/SDLC/agent/etc).
# Lease-based claims (15 min) — crashed workers auto-expire, so others pick up.
python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mirror-seed.log" 2>&1 || true

# Two worker types share the same coordinator queue:
#   bulk-mirror-worker.sh      — full download; suits small/medium datasets
#   streaming-mirror-worker.sh — HF datasets streaming; suits trillion-token corpora
#
# LOW_MEM tuning history for the cpu-basic 16 GB Space:
#   v1:     0 bulk + 2 stream  (Rounds 9-10 — tightened after OOM)
#   v2:     0 bulk + 1 stream  (Rounds 11-12 — tightened further after OOM)
#   v3 NOW: 1 bulk + 3 stream  (post Civo-pivot + 4-Space fan-out; the anchor
#           never came up so we can't rely on it for bulk, and the v2 setting
#           left ~8 GB of the 16 GB unused → reclaim it)
#
# Memory budget per Space (16 GB cpu-basic):
#   ~6 GB reserved:  OS + redis 256mb + continuous-discoverer +
#                    dataset-enrich + auto-startup-loop + push bursts
#   ~10 GB available for harvest workers
#   3 stream × 500 MB + 1 bulk × 600 MB = 2.1 GB used
#   ~8 GB headroom → memory-guard.sh kicks in at <3 GB free; safe
#
# Throughput delta: 4 workers/Space × 4 Spaces = 16 workers total (vs the
# previous 1 × 4 = 4 — a 4× increase). Combined with the enrich cron at
# M%30==5 (was M%60), expect a 3-5× commit rate before the HF soft-cap kicks in.
BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 3 || echo 6)}"
for i in $(seq 1 "$BULK_WORKERS"); do
  nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
    > "$LOG_DIR/bulk-worker-$i.log" 2>&1 &
done
for i in $(seq 1 "$STREAM_WORKERS"); do
  nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-w$i" \
    > "$LOG_DIR/stream-worker-$i.log" 2>&1 &
done
TOTAL_WORKERS=$((BULK_WORKERS + STREAM_WORKERS))
echo "[$(date +%H:%M:%S)] bulk-mirror coordinator + $BULK_WORKERS bulk + $STREAM_WORKERS streaming = $TOTAL_WORKERS workers (200+ datasets queued, LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"

# ── 7d2. Continuous multi-source dataset discoverer (boot daemon, never exits) ─
# Replaces the aggressive-harvester cron — always running, sweeps HF + arxiv + SE + GH.
if ! pgrep -f "continuous-discoverer.sh" >/dev/null; then
  nohup bash ~/.surrogate/bin/v2/continuous-discoverer.sh \
    > "$LOG_DIR/continuous-discoverer.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] continuous-discoverer started (HF + arxiv + SE + GH, ~5min cycle)" >> "$LOG_DIR/boot.log"
fi

# ── Auto-startup-loop: 45 personae × 9 LoRA clusters × auto-commit + auto-push ─
# CEO/CTO/CMO/CFO/COO/PM/UX/Designer/SRE/DevOps/Marketing/SDR/AE/Growth/CS/Legal/HR/etc.
# One role per 15-min cycle; chained roles fire downstream automatically.
if ! pgrep -f "auto-startup-loop.sh" >/dev/null; then
  nohup bash ~/.surrogate/bin/v2/auto-startup-loop.sh \
    > "$LOG_DIR/auto-startup-loop.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] auto-startup-loop started (45 personae cycle 15min, chains, auto-commit)" >> "$LOG_DIR/boot.log"
fi

# ── Train-ready pusher — disabled at boot for now. It caused a Space
# RUNTIME_ERROR on first deployment (2026-04-29). The script is kept at
# bin/train-ready-pusher.sh; launch it manually once the Space proves stable:
#   nohup bash ~/.surrogate/bin/train-ready-pusher.sh > /tmp/trp.log 2>&1 &
#   nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &
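# The two daemons above use a pgrep-based singleton guard. Illustrative
# general form (hypothetical helper, comments only — not defined in this script):
#   start_once() {   # $1 = daemon script path
#     pgrep -f "$(basename "$1")" >/dev/null && return 0
#     nohup bash "$1" > "$LOG_DIR/$(basename "$1" .sh).log" 2>&1 &
#   }
#   start_once ~/.surrogate/bin/v2/continuous-discoverer.sh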
# ── Cron loop — non-scrape daemons (scrape now runs continuously above) ─────
# Quoted 'CRONSH' delimiter: nothing expands at write time; the script below
# is written verbatim and expands its own variables at runtime.
cat > /tmp/hermes-cron.sh <<'CRONSH'
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
# The heredoc is quoted, so the parent's LOG_DIR/DATA never expanded here —
# define them locally (mirroring the boot script's values).
LOG_DIR="${HOME}/.surrogate/logs"
DATA="/data"
LOG="${HOME}/.surrogate/logs/cron.log"
mkdir -p "$(dirname "$LOG")"
while true; do
  M=$(($(date +%s) / 60))
  # Cron offsets are STAGGERED — a minute=0 burst was the OOM trigger.
  # Each major task picks its own M%X==N slot so the heavy ones never all
  # fire in the same minute.
  [[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
  [[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &

  # Auto-scaler — spawn/kill workers based on the free-memory tier (burst but don't die).
  [[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &

  # synth-puller — hit the surrogate1 ZeroGPU /api/synth_batch every 5 min.
  # Drains the free PRO 25K min/mo into Magpie-style training pairs (16 domains rotate).
  [[ $((M % 5)) -eq 3 ]] && bash ~/.surrogate/bin/v2/synth-puller.sh >> "$LOG" 2>&1 &

  # push-training-to-hf is gated by memory (it loads a big shard into RAM).
  # The anchor (24 GB) takes over when capacity arrives — see the anchor cron-loop.
  [[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &

  # auto-orchestrate now runs CONTINUOUSLY (4 parallel workers) — see step 7e.
  # Cron entry retained as a legacy single-fire boost: fire only in the M%20
  # slot, and only when the continuous daemon is not already running (the
  # negated pgrep guard keeps the fallback scoped to this slot).
  [[ $((M % 20)) -eq 0 ]] && ! pgrep -f "auto-orchestrate-continuous" >/dev/null \
    && bash ~/.surrogate/bin/auto-orchestrate-loop.sh >> "$LOG" 2>&1 &

  # Every 30 min: research-apply (pop queue → orchestrate → ship feature).
  [[ $((M % 30)) -eq 15 ]] && bash ~/.surrogate/bin/surrogate-research-apply.sh >> "$LOG" 2>&1 &

  # Every 60 min: keyword tuner (adapts the scrape queue based on yields).
  [[ $((M % 60)) -eq 4 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &

  # Every 6 hours: research-loop (discover new features from competitors/papers).
  [[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/surrogate-research-loop.sh >> "$LOG" 2>&1 &

  # Every 30 min: dataset enrich (was 60 min — doubled now that
  # 4 Spaces × (3 stream + 1 bulk) = 16 workers harvest in parallel,
  # producing more chunks per hour than the old 60-min push could drain).
  # Memory-guarded — a full HF Hub iteration is heavy.
  [[ $((M % 30)) -eq 5 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/dataset-enrich.sh >> "$LOG" 2>&1 &

  # Every 15 min: self-ingest training-pairs into the FTS index (closes the self-improvement loop).
  [[ $((M % 15)) -eq 3 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &

  # Every 30 min: build the vector-embeddings index (RAG semantic search).
  [[ $((M % 30)) -eq 12 ]] && bash ~/.surrogate/bin/rag-vector-builder.sh >> "$LOG" 2>&1 &

  # Every 30 min: synthetic data generation (REWORK→APPROVE DPO + distilabel rewrite).
  [[ $((M % 30)) -eq 7 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/synthetic-data-from-rework.sh >> "$LOG" 2>&1 &

  # Daily 04:00 UTC: refresh the CVE feed (NVD + CISA KEV) → security-knowledge dataset.
  [[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &

  # Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems).
  [[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
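  # Worked example of the slot arithmetic (comments only): M counts minutes
  # since the Unix epoch, which began at 00:00 UTC, so M % 1440 is the UTC
  # minute-of-day — M % 1440 == 240 fires exactly once per day, at 04:00 UTC.
  # Two slots M%a==r1 and M%b==r2 can coincide iff r1 ≡ r2 (mod gcd(a,b)):
  # e.g. M%30==5 vs M%60==35 share gcd 30 and 35 ≡ 5 (mod 30), so that pair
  # DOES overlap once per hour — both are memory-guarded, which absorbs it.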
  # Daily 06:00 UTC: LLM-expand role keywords (sends each role's skills to
  # Cerebras/Groq → ~80 extra job-description-style search terms per role).
  # The discoverer auto-uses the expanded list on its next cycle.
  [[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &

  # Every 90 min: kick a Kaggle T4 LoRA training run on the latest dataset
  # slice. Free Kaggle quota = 30 hr/week per account; one full run = 4-6 hr,
  # so we DO want to keep submitting — Kaggle queues if one is already running
  # and auto-cancels older runs if 5+ are pending. The short interval keeps
  # the GPU pipeline saturated.
  [[ $((M % 90)) -eq 5 ]] && bash ~/.surrogate/bin/kaggle-trainer.sh >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &

  # Every 6 hr: Lightning AI H200 training run (free 4-hr H200 quota ≈ 13 runs/mo).
  # H200 141 GB VRAM fits Qwen3-Coder-480B-A35B QLoRA — the biggest free training.
  [[ $((M % 360)) -eq 45 ]] && bash ~/.surrogate/bin/lightning-trainer.sh >> "$LOG_DIR/lightning-trainer.log" 2>&1 &

  # ── Round 5 (2026-04) sustainability loops ──────────────────────────
  # Every 6 hr (offset 90): self-improve loop — generate problems, judge them,
  # winners → training data, losers → reflexion-store.
  [[ $((M % 360)) -eq 90 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/v2/self-improve-loop.sh >> "$LOG_DIR/self-improve.log" 2>&1 &

  # Every 30 min (offset 22): mine new tool-call traces from logs into
  # SFT + DPO data, plus voyager skill candidates.
  [[ $((M % 30)) -eq 22 ]] && python3 ~/.surrogate/bin/v2/tool-trace-collector.py >> "$LOG_DIR/tool-trace.log" 2>&1 &

  # Every 60 min (offset 17): export promoted voyager skills to JSONL
  # (training-data slice + inference-time retrieval source).
  [[ $((M % 60)) -eq 17 ]] && python3 ~/.surrogate/bin/v2/voyager-skills.py export >> "$LOG_DIR/voyager.log" 2>&1 &

  # Daily 07:00 UTC: active-learning batch from one bulk-mirror file.
  # Skips silently if there is no pool yet.
  [[ $((M % 1440)) -eq 420 ]] && {
    POOL=$(ls -t "$DATA"/bulk-mirror/*.jsonl 2>/dev/null | head -1)
    [[ -n "$POOL" ]] && python3 ~/.surrogate/bin/v2/active-learning.py \
      --pool "$POOL" --n 200 --scan 1500 \
      >> "$LOG_DIR/active-learning.log" 2>&1 &
  }

  # Daily 08:00 UTC: constitutional self-critique on yesterday's
  # winners (pulls the latest self-improve winners file).
  [[ $((M % 1440)) -eq 480 ]] && {
    WIN=$(ls -t "$DATA"/v2/self-improve/winners-*.jsonl 2>/dev/null | head -1)
    [[ -n "$WIN" ]] && python3 ~/.surrogate/bin/v2/constitutional-loop.py \
      --input "$WIN" --n 200 \
      >> "$LOG_DIR/constitutional.log" 2>&1 &
  }

  # ── Round 7+8 (2026-04-30) — trillion-scale + harvester + enrich ──────
  # Every 30 min (offset 9): aggressive HF dataset discoverer (70-keyword sweep).
  [[ $((M % 30)) -eq 9 ]] && bash ~/.surrogate/bin/v2/aggressive-harvester.sh \
    >> "$LOG_DIR/aggressive-harvester.log" 2>&1 &

  # Every 60 min (offset 35): enrich newly-mirrored bulk files.
  [[ $((M % 60)) -eq 35 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/v2/enrich-pipeline.sh \
    >> "$LOG_DIR/enrich-pipeline.log" 2>&1 &
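  # Quota arithmetic for the 90-min Kaggle tick above (comments only): that
  # cadence is 10080/90 = 112 submissions/week, but the 30 GPU-hr/week quota
  # at 4-6 hr per run completes only ~5-7 runs/week — the frequent tick exists
  # to keep Kaggle's queue non-empty, not to actually run 112 jobs.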
  # Every 30 min (offset 25): spawn an extra streaming worker if the pool is empty.
  [[ $((M % 30)) -eq 25 ]] && {
    if ! pgrep -f "streaming-mirror-worker.sh" >/dev/null; then
      nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-cron-$(date +%s)" \
        > "$LOG_DIR/stream-worker-cron.log" 2>&1 &
    fi
  }

  # Daily 09:00 UTC: teachable-prompt filter on harvested data.
  [[ $((M % 1440)) -eq 540 ]] && {
    LATEST=$(ls -t "$DATA"/v2/enriched/*.jsonl 2>/dev/null | head -1)
    [[ -n "$LATEST" ]] && python3 ~/.surrogate/bin/v2/teachable-prompt-filter.py \
      --input "$LATEST" --out "$DATA"/v2/teachable-$(date +%Y%m%d).jsonl \
      --n 1000 --keep-target 200 \
      >> "$LOG_DIR/teachable.log" 2>&1 &
  }

  # Daily 11:00 UTC: regression test suite (catches breakage post-push).
  [[ $((M % 1440)) -eq 660 ]] && bash ~/.surrogate/bin/v2/regression-test.sh --quick \
    >> "$LOG_DIR/regression.log" 2>&1 &

  # Weekly, Thu 10:00 UTC: abstract-cot compresses reasoning data.
  # (The epoch began on a Thursday, so M % 10080 == 0 is Thursday 00:00 UTC
  # and offset 600 lands on Thursday 10:00 — not Sunday.)
  [[ $((M % 10080)) -eq 600 ]] && {
    for f in "$DATA"/v2/verify-traces.jsonl "$DATA"/v2/self-improve/winners-*.jsonl; do
      [[ -f "$f" ]] || continue
      python3 ~/.surrogate/bin/v2/abstract-cot-compressor.py \
        --input "$f" --out "${f%.jsonl}-compressed.jsonl" \
        >> "$LOG_DIR/abstract-cot.log" 2>&1
    done
  }

  sleep 60
done
CRONSH
chmod +x /tmp/hermes-cron.sh
nohup /tmp/hermes-cron.sh > "$LOG_DIR/cron-master.log" 2>&1 &
echo "[$(date +%H:%M:%S)] cron loop started" >> "$LOG_DIR/boot.log"

# ── 8. Status HTTP server on :7860 (FastAPI/uvicorn — robust binding) ──────
set +x  # silence trace for clean uvicorn logs
echo "[$(date +%H:%M:%S)] starting status server :7860" | tee -a "$LOG_DIR/boot.log"
# Verify deps before exec — print what's missing rather than crashing silently.
python3 -c "import fastapi, uvicorn; print(f' fastapi {fastapi.__version__} + uvicorn {uvicorn.__version__} ok')" || {
  echo "❌ fastapi/uvicorn not importable — falling back to plain http.server"
  exec python3 -m http.server 7860 --bind 0.0.0.0
}
# Run as PID 1 — exec replaces this shell, so uvicorn handles signals itself
# and auto-restarts on crash.
exec python3 ~/.surrogate/bin/hermes-status-server.py