# Hermes start orchestrator for HF Space.
# Boots: persistent /data mount → Redis → Ollama → axentx repos → daemons → status server.
set -uo pipefail
LOG_DIR="${HOME}/.surrogate/logs"
mkdir -p "$LOG_DIR"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start" >> "$LOG_DIR/boot.log"
# Trace mode for early steps only (no secrets here yet) - find hang point but stay safe
PS4='[trace ${LINENO}] '
set -x
# Echo stdout so HF run-logs see progress (safe steps before .env is loaded)
exec > >(tee -a "$LOG_DIR/boot.log") 2>&1
# ── Memory mode (must be set BEFORE any reference; we use `set -u`) ──────────
# CPU-Basic Space = 16 GB cap. With LOW_MEM=1 we skip the heavy harvest
# launchers (dataset-enrich, dataset-mirror, kaggle-trainer, lightning-trainer,
# dedup-bootstrap) - those run on GCP daemons instead. Set LOW_MEM=0 only
# on a paid Space tier (cpu-upgrade ≥32 GB).
LOW_MEM="${LOW_MEM:-1}"
# ── 1. Persistent data - symlink state subdirs to /data (HF persistent mount) ──
# bin/ is NOT persisted (baked into image, refreshed on every push).
# Persisted: state (DBs), logs, memory, skills, sessions, training pairs,
# workspace (hermes runtime), projects (axentx clones), ollama (model cache).
DATA="/data"
if [[ -d "$DATA" ]] && [[ -w "$DATA" ]]; then
  mkdir -p "$DATA"/{state,logs,memory,skills,sessions,workspace,projects,ollama,training,reflexion,index}
  # Migrate from any older layout (one-time): if /data/surrogate/state exists, move up one level
  if [[ -d "$DATA/surrogate/state" ]] && [[ ! -L "$DATA/state" ]]; then
    mv "$DATA/surrogate"/* "$DATA/" 2>/dev/null || true
    rmdir "$DATA/surrogate" 2>/dev/null || true
  fi
  for spec in \
    "${HOME}/.surrogate/state:${DATA}/state" \
    "${HOME}/.surrogate/logs:${DATA}/logs" \
    "${HOME}/.surrogate/memory:${DATA}/memory" \
    "${HOME}/.surrogate/skills:${DATA}/skills" \
    "${HOME}/.surrogate/sessions:${DATA}/sessions" \
    "${HOME}/.hermes/workspace:${DATA}/workspace" \
    "${HOME}/.ollama:${DATA}/ollama"; do
    target="${spec%%:*}"
    link="${spec##*:}"
    mkdir -p "$(dirname "$target")"
    # Always ensure backing directory exists + writable. If the persistent
    # /data mount becomes unavailable mid-run, daemon writes to symlinked
    # path fail with Errno 5 I/O error (audit 2026-04-29). Recreating the
    # link defensively each boot fixes stale-symlink cases.
    mkdir -p "$link" 2>/dev/null || true
    if [[ ! -L "$target" ]] || [[ ! -d "$target/" ]]; then
      # Either not-a-symlink OR broken symlink (target unreachable)
      rm -rf "$target" 2>/dev/null
      ln -sfn "$link" "$target"
    fi
    # Final sanity probe - write a marker; if it fails, the persistent
    # mount is broken regardless of the symlink, so log loudly.
    if ! touch "$target/.boot-marker" 2>/dev/null; then
      echo "[$(date +%H:%M:%S)] ⚠ FATAL: $target/ not writable → daemon log writes will Errno 5"
    fi
  done
  # training-pairs.jsonl - single file persistence
  if [[ ! -L "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
    rm -f "${HOME}/.surrogate/training-pairs.jsonl" 2>/dev/null
    touch "${DATA}/training-pairs.jsonl"
    ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
  fi
  # ── One-time offset reset: skip polluted agentic-crawler placeholder backlog ──
  if [[ ! -f "${HOME}/.surrogate/.offset-reset-done" ]] && [[ -f "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
    CUR=$(wc -l < "${HOME}/.surrogate/training-pairs.jsonl" | tr -d ' ')
    echo "$CUR" > "${HOME}/.surrogate/.training-push-offset"
    echo "$CUR" > "${HOME}/.surrogate/.self-ingest-offset"
    touch "${HOME}/.surrogate/.offset-reset-done"
    echo "[$(date +%H:%M:%S)] one-time offset reset → $CUR (skip placeholder backlog)" >> "$LOG_DIR/boot.log"
  fi
  # ── Boot-time dedup.db corruption check ─────────────────────────────────
  # 16 parallel shards previously corrupted the SQLite WAL. If the DB is
  # unreadable on boot, back it up and force re-bootstrap from scratch.
  DEDUP_DB="${HOME}/.surrogate/state/dedup.db"
  if [[ -f "$DEDUP_DB" ]]; then
    if ! sqlite3 "$DEDUP_DB" "SELECT 1 FROM seen_hashes LIMIT 1" >/dev/null 2>&1; then
      TS=$(date +%s)
      mv "$DEDUP_DB" "${DEDUP_DB}.corrupt-${TS}.bak" 2>/dev/null
      rm -f "${DEDUP_DB}-wal" "${DEDUP_DB}-shm"
      rm -f "${HOME}/.surrogate/.dedup-bootstrap-done"
      echo "[$(date +%H:%M:%S)] WIPED corrupt dedup.db → ${DEDUP_DB}.corrupt-${TS}.bak (forcing re-bootstrap)" >> "$LOG_DIR/boot.log"
    fi
  fi
  # ── Heavy harvest launchers - only on HIGH_MEM (LOW_MEM=0) ──────────────
  # On CPU-Basic (16 GB cap) launching 5 background bash + uvicorn + 5 harvest
  # workers blew through the cap and HF auto-killed the container ~5 min after
  # boot. These launchers are now scheduled on GCP via hermes-scheduler-daemon
  # (entries in data/hermes-jobs.json) so harvest still runs - just not from
  # inside the Space's RAM. Re-enable in-Space by setting LOW_MEM=0 once we
  # upgrade to a ≥32 GB tier.
  if [[ "$LOW_MEM" != "1" ]]; then
    # ── One-time central dedup bootstrap from existing data ──────────
    if [[ ! -f "${HOME}/.surrogate/.dedup-bootstrap-done" ]]; then
      echo "[$(date +%H:%M:%S)] running central dedup bootstrap (one-time)" >> "$LOG_DIR/boot.log"
      nohup bash "${HOME}/.surrogate/bin/dedup-bootstrap.sh" > "$LOG_DIR/dedup-bootstrap.log" 2>&1 &
    fi
    # ── BOOT-TIME enrich kickoff (trigger immediate pull, don't wait for cron)
    nohup bash "${HOME}/.surrogate/bin/dataset-enrich.sh" >> "$LOG_DIR/dataset-enrich.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time dataset-enrich kicked off" >> "$LOG_DIR/boot.log"
    # ── BOOT-TIME kaggle-trainer kickoff (don't wait for 90-min cron) ─
    nohup bash "${HOME}/.surrogate/bin/kaggle-trainer.sh" >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time kaggle-trainer kicked off" >> "$LOG_DIR/boot.log"
    # ── BOOT-TIME lightning-trainer kickoff - H200 4 hr free for big model
    nohup bash "${HOME}/.surrogate/bin/lightning-trainer.sh" >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time lightning-trainer kicked off (H200 4hr quota)" >> "$LOG_DIR/boot.log"
    # ── BOOT-TIME dataset-mirror - bulk-clone top community SFT mixes ─
    nohup bash "${HOME}/.surrogate/bin/dataset-mirror.sh" >> "$LOG_DIR/dataset-mirror.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time dataset-mirror kicked off (30 community sources)" >> "$LOG_DIR/boot.log"
  else
    echo "[$(date +%H:%M:%S)] LOW_MEM=1 → skipped 5 heavy harvest launchers (delegated to GCP daemons)" >> "$LOG_DIR/boot.log"
  fi
  echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] WARN: /data not writable → running ephemeral!" >> "$LOG_DIR/boot.log"
fi
# ── 2. Bind HF Space secrets → ~/.hermes/.env ────────────────────────────────
# 🔒 DISABLE shell trace before touching secret values.
set +x
echo "[$(date +%H:%M:%S)] writing ~/.hermes/.env from secret env vars (trace OFF)"
mkdir -p ~/.hermes
{
  echo "# Auto-generated from HF Space secrets at boot"
  for k in OPENROUTER_API_KEY GEMINI_API_KEY GEMINI_API_KEY_2 \
           GITHUB_TOKEN GITHUB_TOKEN_POOL DISCORD_BOT_TOKEN DISCORD_WEBHOOK \
           CEREBRAS_API_KEY GROQ_API_KEY SAMBANOVA_API_KEY \
           CLOUDFLARE_API_KEY NVIDIA_API_KEY CHUTES_API_KEY ANTHROPIC_API_KEY \
           HF_TOKEN HUGGING_FACE_HUB_TOKEN; do
    v="${!k:-}"
    [[ -n "$v" ]] && echo "${k}=${v}"
  done
} > ~/.hermes/.env
chmod 600 ~/.hermes/.env
| echo "[$(date +%H:%M:%S)] .env written ($(wc -l < ~/.hermes/.env) keys, perms 600)" | |
# Trace stays OFF for the rest of boot - we already have line numbers above and won't need them post-secrets.
# ── LOW_MEM short-circuit - skip ALL background daemons, exec status server ──
# CPU-Basic Space cap is 16 GB. Even after gating the 5 boot-time harvest
# launchers, the Space kept hitting the 16 GB cap and hanging at the HTTP
# layer every ~30-40 min. Investigation found 15+ MORE nohup'd background daemons
# below this point (scrape, agentic-crawler, github-crawler, self-heal, cron
# loop, bulk-mirror workers, streaming-mirror workers, parquet-ingest, etc.)
# that collectively grow into the cap within an hour.
#
# In LOW_MEM=1 mode the Space's only job is the FastAPI status server on
# :7860 that serves harvest-cursor advances to remote workers. Everything
# else (harvest, mirroring, agent pipeline, training pushes, dataset enrich)
# now runs on the GCP daemon fleet - see hermes-jobs.json (171 jobs scheduled
# via hermes-scheduler-daemon as of 2026-05-02).
#
# Set LOW_MEM=0 to re-enable in-Space launchers when on a paid tier (≥32 GB).
| if [[ "$LOW_MEM" == "1" ]]; then | |
| echo "[$(date +%H:%M:%S)] LOW_MEM=1 β skipping all bg daemons + cron, going straight to :7860 status server" | tee -a "$LOG_DIR/boot.log" | |
| set +x # silence trace | |
| # Verify deps before exec β print what's missing rather than silent crash | |
| if python3 -c "import fastapi, uvicorn" 2>/dev/null; then | |
| echo "[$(date +%H:%M:%S)] starting uvicorn :7860 (LOW_MEM fast-path)" | tee -a "$LOG_DIR/boot.log" | |
| exec python3 ~/.surrogate/bin/hermes-status-server.py | |
| else | |
| echo "β fastapi/uvicorn not importable β falling back to plain http.server" | |
| exec python3 -m http.server 7860 --bind 0.0.0.0 | |
| fi | |
| fi | |
| # ββ 3. Git config + clone axentx repos for auto-orchestrate auto-commit ββββ | |
| # Disable interactive prompts globally so failed-auth git ops fail fast. | |
| export GIT_TERMINAL_PROMPT=0 | |
| export GIT_ASKPASS=/bin/true | |
| GH_TOKEN=$(echo "${GITHUB_TOKEN_POOL:-}" | cut -d',' -f1) | |
| if [[ -n "$GH_TOKEN" ]]; then | |
| git config --global user.email "hermes@axentx.ai" | |
| git config --global user.name "Hermes (Surrogate-1)" | |
| git config --global init.defaultBranch main | |
| git config --global pull.rebase true | |
| git config --global push.default current | |
| PROJECTS_DIR="${DATA}/projects" | |
| mkdir -p "$PROJECTS_DIR" | |
| rm -rf ~/axentx 2>/dev/null | |
| ln -sfn "$PROJECTS_DIR" ~/axentx | |
| # Clone axentx repos in background with hard timeout β never blocks boot. | |
| # Verified 2026-05-02 via gh api: 5 of 6 entries had wrong org/name and | |
| # were silently 404'ing (arkashira/* β only surrogate-1-harvest is there; | |
| # the rest are private under axentx org). The agent pipeline's dev/qa/ | |
| # reviewer/commit daemons sat idle for a full day because no repo cloned | |
| # for them to work on. Real paths confirmed via /repos/<owner>/<name>: | |
| # axentx/{Costinel,vanguard,airship,workio,axiomops,surrogate-1} β 200 | |
| # arkashira/{Costinel,vanguard,arkship,surrogate,workio,hermes-toolbelt} β 404 | |
| # Note: 'arkship' was a typo for 'airship' (axentx/airship). | |
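  # For reference, the audit above can be reproduced with the gh CLI (illustrative
  # only; assumes gh is installed and authenticated with a token that can see the
  # private axentx org):
  #   gh api repos/axentx/airship --jq .full_name    # prints the canonical name on 200
  #   gh api repos/arkashira/arkship                  # errors with 404, as noted above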
  for repo_spec in \
    "Costinel:axentx/Costinel" \
    "vanguard:axentx/vanguard" \
    "airship:axentx/airship" \
    "workio:axentx/workio" \
    "axiomops:axentx/axiomops" \
    "surrogate-1:axentx/surrogate-1"; do
    local_name="${repo_spec%%:*}"
    gh_path="${repo_spec##*:}"
    target="${PROJECTS_DIR}/${local_name}"
    (
      if [[ ! -d "$target/.git" ]]; then
        echo "[$(date +%H:%M:%S)] cloning $gh_path..." >> "$LOG_DIR/boot.log"
        timeout 30 git clone --depth 50 \
          "https://x-access-token:${GH_TOKEN}@github.com/${gh_path}.git" "$target" \
          >> "$LOG_DIR/git-clone.log" 2>&1 || \
          echo "[$(date +%H:%M:%S)] WARN: clone $gh_path failed/timeout" >> "$LOG_DIR/boot.log"
      else
        cd "$target" && timeout 20 git pull --rebase >> "$LOG_DIR/git-pull.log" 2>&1 || true
      fi
    ) &
  done
  # Don't wait - let clones finish in background while boot continues
  # Persist token for any push from auto-orchestrate
  git config --global credential.helper "store --file=$HOME/.git-credentials"
  echo "https://x-access-token:${GH_TOKEN}@github.com" > ~/.git-credentials
  chmod 600 ~/.git-credentials
  echo "[$(date +%H:%M:%S)] git auth configured + clone jobs spawned" >> "$LOG_DIR/boot.log"
fi
# ── 4. Redis (TCP only) ──────────────────────────────────────────────────────
# redis cap tightened on LOW_MEM (was 1gb → 256mb). Coordinator uses
# SQLite directly; redis is only a soft cache for work-queue priorities.
REDIS_MAX="${REDIS_MAX:-$([[ "$LOW_MEM" == "1" ]] && echo "256mb" || echo "1gb")}"
redis-server --daemonize yes --port 6379 --bind 127.0.0.1 \
  --maxmemory "$REDIS_MAX" --maxmemory-policy allkeys-lru
sleep 1
redis-cli -h 127.0.0.1 -p 6379 ping >> "$LOG_DIR/redis.log" 2>&1
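# Illustrative only - how a producer/consumer pair might use redis as the soft
# priority cache mentioned above (the key name 'work:priorities' is hypothetical,
# not the coordinator's real schema; SQLite stays the source of truth):
#   redis-cli -h 127.0.0.1 ZADD work:priorities 5 "mirror:<dataset-id>"
#   redis-cli -h 127.0.0.1 ZPOPMAX work:priorities    # consumer takes the top-priority item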
# ── 5. Ollama - DISABLED on cpu-basic (16 GB limit) ──────────────────────────
# Root cause of 7-hr Runtime Error 2026-04-29: ollama loading qwen3-coder:30b
# (~17 GB Q4) + qwen2.5-coder:14b (~9 GB) + granite (~5 GB) = ~31 GB of model
# weights against a 16 GB cap → instant OOM on any inference.
#
# On cpu-basic the FREE LLM LADDER (cerebras/groq/openrouter/gemini/chutes)
# is faster anyway - wafer-scale inference beats CPU x86 by 50-200×.
# Ollama only worth running once Space upgrades to ≥cpu-upgrade (32 GB) OR
# moves to OCI A1.Flex anchor (24 GB ARM, native ollama support).
#
# Set LOW_MEM=0 to re-enable on bigger Space tier.
LOW_MEM="${LOW_MEM:-1}"
if [[ "$LOW_MEM" == "1" ]]; then
  echo "[$(date +%H:%M:%S)] ⏭ ollama SKIPPED (LOW_MEM=1, cpu-basic 16 GB)" \
    >> "$LOG_DIR/boot.log"
  echo "[$(date +%H:%M:%S)] → free LLM ladder serves all v2 inference" \
    >> "$LOG_DIR/boot.log"
else
  OLLAMA_MODELS="${HOME}/.ollama/models" \
    OLLAMA_HOST=127.0.0.1:11434 \
    nohup ollama serve > "$LOG_DIR/ollama.log" 2>&1 &
  sleep 6
  (
    if ! ollama list 2>/dev/null | grep -q "nomic-embed-text"; then
      ollama pull nomic-embed-text > "$LOG_DIR/ollama-pull-embed.log" 2>&1
    fi
    if ! ollama list 2>/dev/null | grep -q "qwen2.5-coder:3b"; then
      # Smallest coder that's actually useful - fits any tier
      ollama pull qwen2.5-coder:3b > "$LOG_DIR/ollama-pull-3b.log" 2>&1
    fi
  ) &
fi
# ── 6. Discord bot (only if egress to discord.com is reachable) ──────────────
# HF Spaces free tier may block egress to discord.com - bot would crash-loop.
# Pre-flight check: if discord.com unreachable, skip bot, use webhook-only.
if [[ -n "${DISCORD_BOT_TOKEN:-}" ]]; then
  if curl -sS -o /dev/null -w "%{http_code}" --max-time 6 https://discord.com 2>/dev/null | grep -qE "^(200|301|302|307|308)$"; then
    set -a; source ~/.hermes/.env 2>/dev/null; set +a
    nohup python ~/.surrogate/bin/hermes-discord-bot.py >> "$LOG_DIR/discord-bot.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] discord bot started (gateway reachable)"
  else
    echo "[$(date +%H:%M:%S)] discord.com unreachable → skipping bot, using webhook-only" >> "$LOG_DIR/boot.log"
  fi
fi
# ── 7a. Continuous scrape daemon - concurrency tuned to LOW_MEM ──────────────
SCRAPE_PARALLEL="${SCRAPE_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 8)}"
cat > /tmp/scrape-daemon.sh <<SCRAPESH
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="\${HOME}/.surrogate/logs/scrape-continuous.log"
mkdir -p "\$(dirname "\$LOG")"
while true; do
  START=\$(date +%s)
  bash ~/.surrogate/bin/domain-scrape-loop.sh 1500 ${SCRAPE_PARALLEL} >> "\$LOG" 2>&1
  DUR=\$(( \$(date +%s) - START ))
  if [[ \$DUR -lt 30 ]]; then sleep 30
  elif [[ \$DUR -lt 120 ]]; then sleep 15
  else sleep 5
  fi
done
SCRAPESH
chmod +x /tmp/scrape-daemon.sh
nohup /tmp/scrape-daemon.sh > "$LOG_DIR/scrape-daemon.log" 2>&1 &
echo "[$(date +%H:%M:%S)] scrape daemon parallel=${SCRAPE_PARALLEL} (LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"
# ── 7b. Agentic crawler - DISABLED on LOW_MEM (anchor takes this load) ───────
if [[ "$LOW_MEM" != "1" ]]; then
  CRAWLER_PARALLEL="${CRAWLER_PARALLEL:-6}"
  nohup bash ~/.surrogate/bin/agentic-crawler.sh "$CRAWLER_PARALLEL" \
    > "$LOG_DIR/agentic-crawler.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] agentic crawler parallel=$CRAWLER_PARALLEL" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⏭ agentic-crawler SKIPPED (LOW_MEM); anchor handles" >> "$LOG_DIR/boot.log"
fi
# ── 7b2. GitHub-specific agentic crawler (lightweight, keep on always) ───────
nohup bash ~/.surrogate/bin/github-agentic-crawler.sh > "$LOG_DIR/github-agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] github-agentic-crawler started" >> "$LOG_DIR/boot.log"
# ── 7b3. HF Dataset Discoverer - DISABLED (replaced by continuous-discoverer) ──
# Round 10 (a27499d): bin/v2/continuous-discoverer.sh covers HF + arxiv +
# Stack Exchange + GH trending in one daemon. Old hf-dataset-discoverer.sh
# is now redundant + memory pressure on cpu-basic.
echo "[$(date +%H:%M:%S)] ⏭ hf-dataset-discoverer SKIPPED (replaced by continuous-discoverer)" >> "$LOG_DIR/boot.log"
# ── 7e. auto-orchestrate-continuous - DISABLED on LOW_MEM (cron handles it) ──
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/auto-orchestrate-continuous.sh > "$LOG_DIR/auto-orchestrate-continuous.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] auto-orchestrate-continuous started (4 parallel workers)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⏭ auto-orchestrate-continuous SKIPPED (LOW_MEM); cron slot at M%20==0 covers it" >> "$LOG_DIR/boot.log"
fi
# ── 7e1. SELF-HEAL WATCHDOG - must start BEFORE memory-hungry workers ────────
# Monitors RAM usage every 60s; preempts youngest dataset-enrich shard if
# usage >= 85% to dodge the cpu-basic 16Gi OOM kill that would otherwise
# crash the entire container. Also restarts stuck ingest / kicks stale uploader.
nohup bash ~/.surrogate/bin/self-heal-watchdog.sh > "$LOG_DIR/self-heal-watchdog.log" 2>&1 &
echo "[$(date +%H:%M:%S)] self-heal-watchdog started (mem<85%, ingest<20m, push<10m)" >> "$LOG_DIR/boot.log"
# ── 7e2. GH-ACTIONS TICKER - burst-dispatch external runners every 60s ───────
# Fires workflow_dispatch on arkashira/ashiradevops-alt runner repos every
# 60s, bypassing GitHub's */5 cron minimum. Combined with 8-min runner
# timeouts, the 20-concurrent free-tier slot cap stays saturated.
# Skips silently if GH_TOKEN_ARKASHIRA / GH_TOKEN_DEVOPS aren't set as
# Space secrets - the operator can add them later without a restart.
nohup bash ~/.surrogate/bin/gh-actions-ticker.sh > "$LOG_DIR/gh-actions-ticker.log" 2>&1 &
echo "[$(date +%H:%M:%S)] gh-actions-ticker started (60s tick, dispatches arkashira+ashiradevops-alt)" >> "$LOG_DIR/boot.log"
# ── 7e3. LLM BURST GENERATOR - synthetic training pairs from 8 free LLMs ─────
# Cerebras + Groq + OpenRouter + Gemini + Chutes + NV NIM + Samba + Kimi.
# Each cycle fires 3 prompts at every active provider in parallel, writes
# {prompt, response} pairs to training-pairs.jsonl. Combined free-tier
# budget: ~7000+ pairs/day. Skips any provider whose key env is not set.
if [[ "$LOW_MEM" != "1" ]]; then
  nohup python3 ~/.surrogate/bin/llm-burst-generator.py > "$LOG_DIR/llm-burst-generator.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] llm-burst-generator started (8 LLM APIs in parallel, ~7K synthetic pairs/day)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⏭ llm-burst-generator SKIPPED (LOW_MEM); ZeroGPU synth-puller covers" >> "$LOG_DIR/boot.log"
fi
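# One burst against a single provider reduces to roughly this (sketch only; Groq's
# OpenAI-compatible endpoint shown, model id and prompt are placeholders):
#   R=$(curl -s https://api.groq.com/openai/v1/chat/completions \
#         -H "Authorization: Bearer $GROQ_API_KEY" -H "Content-Type: application/json" \
#         -d '{"model":"<model-id>","messages":[{"role":"user","content":"<prompt>"}]}' \
#       | jq -r '.choices[0].message.content')
#   jq -cn --arg p "<prompt>" --arg r "$R" '{prompt:$p, response:$r}' >> ~/.surrogate/training-pairs.jsonl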
sleep 3  # Stagger spawns - avoid memory burst at boot
# ── 7f. PARALLEL BULK INGEST (slug-hash sharded; 6 shards on cpu-basic) ──────
# Was 16 shards but caused 'Memory limit exceeded (16Gi)' OOM. Each shard
# peaks ~1 GB while streaming via 'datasets' lib. Watchdog above provides
# a second safety net if peak still spikes.
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/bulk-ingest-parallel.sh > "$LOG_DIR/bulk-ingest-parallel.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] bulk-ingest-parallel started (6 shards, 293M total cap)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⏭ bulk-ingest-parallel SKIPPED (LOW_MEM); streaming-mirror-worker covers" >> "$LOG_DIR/boot.log"
fi
sleep 3
# ── 7g. PARQUET-DIRECT INGEST (skip 'datasets' library overhead, 5-10× faster) ──
# Downloads parquet shards directly via HF datasets-server API + pyarrow filter.
# Targets only trillion-scale corpora where streaming is too slow.
# DLs reduced to 2 parallel - combined with 6 ingest shards stays under 16Gi.
PARQUET_PARALLEL=2 nohup bash ~/.surrogate/bin/parquet-direct-ingest.sh > "$LOG_DIR/parquet-direct-ingest.log" 2>&1 &
echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$LOG_DIR/boot.log"
# ── 7c. Skill-synthesis daemon - DISABLED on LOW_MEM (heavy LLM calls) ───────
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⏭ skill-synthesis SKIPPED (LOW_MEM); anchor's voyager-skills.py covers" >> "$LOG_DIR/boot.log"
fi
# ── 7d. Bulk mirror coordinator + 4 parallel workers ─────────────────────────
# User feedback 2026-04-29 (Thai, roughly): "all agents work in parallel and don't duplicate work".
# Coordinator = SQLite claim queue (~/.surrogate/state/bulk-mirror-claims.db).
# Workers each pull next pending dataset, mirror+sanitize+dedup, mark done.
# 100+ massive datasets in bin/v2/bulk-datasets-massive.txt (code/security/SDLC/agent/etc).
# Lease-based claims (15 min) - crashes auto-expire so other workers pick up.
python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mirror-seed.log" 2>&1 || true
# Two worker types share the same coordinator queue:
#   bulk-mirror-worker.sh      - full-download, suits small/medium datasets
#   streaming-mirror-worker.sh - HF datasets streaming, suits trillion-token
# LOW_MEM tuning for cpu-basic 16GB Space (history):
#   v1:     0 bulk + 2 stream (Round 9-10 OOM tightened to 0+2)
#   v2:     0 bulk + 1 stream (Round 11-12 OOM further tightened)
#   v3 NOW: 1 bulk + 3 stream (post Civo-pivot + 4-Space fan-out;
#           anchor never came up so we can't rely on
#           it for bulk, and 16GB has ~8 GB unused
#           under the v2 setting → reclaim it)
#
# Memory budget per Space (16 GB cpu-basic):
#   ~6 GB reserved: OS + redis 256mb + continuous-discoverer +
#                   dataset-enrich + auto-startup-loop + push bursts
#   ~10 GB available for harvest workers
#   3 stream × 500 MB + 1 bulk × 600 MB = 2.1 GB used
#   ~8 GB headroom - memory-guard.sh kicks in at <3 GB free (sketch below), safe
#
# Throughput delta: 4 workers/Space × 4 Spaces = 16 workers total
# (vs previous 1×4 = 4). Combined with enrich cron M%30==5 (was M%60),
# expect 3-5× commit rate before HF soft-cap kicks in.
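# Minimal sketch of the <3 GB gate memory-guard.sh provides (illustrative; the real
# guard may check more than MemAvailable):
#   avail_gib=$(( $(awk '/MemAvailable/ {print $2}' /proc/meminfo) / 1024 / 1024 ))
#   (( avail_gib >= 3 )) || exit 1    # non-zero exit blocks the guarded task in the && chain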
BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 3 || echo 6)}"
for i in $(seq 1 "$BULK_WORKERS"); do
  nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
    > "$LOG_DIR/bulk-worker-$i.log" 2>&1 &
done
for i in $(seq 1 "$STREAM_WORKERS"); do
  nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-w$i" \
    > "$LOG_DIR/stream-worker-$i.log" 2>&1 &
done
TOTAL_WORKERS=$((BULK_WORKERS + STREAM_WORKERS))
echo "[$(date +%H:%M:%S)] bulk-mirror coordinator + $BULK_WORKERS bulk + $STREAM_WORKERS streaming = $TOTAL_WORKERS workers (200+ datasets queued, LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"
# ── 7d2. Continuous multi-source dataset discoverer (boot daemon, never exits) ──
# Replaces aggressive-harvester cron - runs always, sweeps HF + arxiv + SE + GH.
if ! pgrep -f "continuous-discoverer.sh" >/dev/null; then
  nohup bash ~/.surrogate/bin/v2/continuous-discoverer.sh \
    > "$LOG_DIR/continuous-discoverer.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] continuous-discoverer started (HF + arxiv + SE + GH, ~5min cycle)" >> "$LOG_DIR/boot.log"
fi
# ── Auto-startup-loop: 45 personae × 9 LoRA clusters × auto-commit + auto-push ──
# CEO/CTO/CMO/CFO/COO/PM/UX/Designer/SRE/DevOps/Marketing/SDR/AE/Growth/CS/Legal/HR/etc.
# 1 role per 15-min cycle; chained roles fire downstream automatically.
if ! pgrep -f "auto-startup-loop.sh" >/dev/null; then
  nohup bash ~/.surrogate/bin/v2/auto-startup-loop.sh \
    > "$LOG_DIR/auto-startup-loop.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] auto-startup-loop started (45 personae cycle 15min, chains, auto-commit)" >> "$LOG_DIR/boot.log"
fi
# ── 7d. Train-ready pusher - disabled at boot for now. Caused Space
#    RUNTIME_ERROR on first deployment (2026-04-29). Script kept at
#    bin/train-ready-pusher.sh; launch manually after Space proves stable:
#      nohup bash ~/.surrogate/bin/train-ready-pusher.sh > /tmp/trp.log 2>&1 &
# nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &
# ── 7b. Cron loop - non-scrape daemons (scrape now runs continuously above) ──
cat > /tmp/hermes-cron.sh <<'CRONSH'
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="${HOME}/.surrogate/logs/cron.log"
# Quoted heredoc: nothing expanded at write time, so declare the paths the
# tasks below reference ($LOG_DIR and $DATA would otherwise be unset here).
LOG_DIR="${HOME}/.surrogate/logs"
DATA="/data"
mkdir -p "$(dirname "$LOG")"
while true; do
  M=$(($(date +%s) / 60))
  # Cron offsets STAGGERED - minute=0 burst was OOM trigger.
  # Each major task picks a unique M%X==N offset so no two fire together.
  [[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
  [[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
  # Auto-scaler - spawn/kill workers based on free memory tier (burst-but-don't-die)
  [[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &
  # synth-puller - hit surrogate1 ZeroGPU /api/synth_batch every 5 min
  # Drains free PRO 25K min/mo into Magpie-style training pairs (16 domains rotate).
  [[ $((M % 5)) -eq 3 ]] && bash ~/.surrogate/bin/v2/synth-puller.sh >> "$LOG" 2>&1 &
  # push-training-to-hf gated by memory (loads big shard into RAM).
  # Anchor (24GB) takes over when capacity arrives - see anchor cron-loop.
  [[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &
  # auto-orchestrate now runs CONTINUOUSLY (4 parallel workers) - see step 7e above.
  # Cron entry retained for legacy single-fire boost, but only when the continuous
  # daemon is NOT already up (plain A && B || C would misfire on every other minute):
  [[ $((M % 20)) -eq 0 ]] && ! pgrep -f "auto-orchestrate-continuous" >/dev/null \
    && bash ~/.surrogate/bin/auto-orchestrate-loop.sh >> "$LOG" 2>&1 &
  # Every 30 min: research-apply (pop queue → orchestrate → ship feature)
  [[ $((M % 30)) -eq 15 ]] && bash ~/.surrogate/bin/surrogate-research-apply.sh >> "$LOG" 2>&1 &
  # Every 60 min: keyword tuner (adapts scrape queue based on yields)
  [[ $((M % 60)) -eq 4 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &
  # Every 6 hours: research-loop (discover new features from competitors/papers)
  [[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/surrogate-research-loop.sh >> "$LOG" 2>&1 &
  # Every 30 min: dataset enrich (was 60 min → bumped 2× now that we have
  # 4 Spaces × (3 stream + 1 bulk) = 16 workers harvesting in parallel,
  # producing more chunks per hour than the old 60-min push could drain).
  # Memory-guarded - full HF Hub iter is heavy.
  [[ $((M % 30)) -eq 5 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/dataset-enrich.sh >> "$LOG" 2>&1 &
  # Every 15 min: self-ingest training-pairs into FTS index (closes self-improvement)
  [[ $((M % 15)) -eq 3 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &
  # Every 30 min: build vector embeddings index (RAG semantic search)
  [[ $((M % 30)) -eq 12 ]] && bash ~/.surrogate/bin/rag-vector-builder.sh >> "$LOG" 2>&1 &
  # Every 30 min: synthetic data generation (REWORK→APPROVE DPO + distilabel rewrite)
  [[ $((M % 30)) -eq 7 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/synthetic-data-from-rework.sh >> "$LOG" 2>&1 &
  # Daily 04:00 UTC: refresh CVE feed (NVD + CISA KEV) → security-knowledge dataset
  [[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &
  # Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems)
  [[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
  # Daily 06:00 UTC: LLM-expand role keywords (sends each role's skills to
  # Cerebras/Groq → +80 specific job-description-style search terms each).
  # Discoverer auto-uses the expanded list on its next cycle.
  [[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &
  # Every 90 min: kick a Kaggle T4 LoRA training run on the latest dataset
  # slice. Free Kaggle quota = 30 hr/week per account; one full run = 4-6 hr,
  # so we DO want to keep submitting - Kaggle queues if 1 already running,
  # auto-cancels older if 5+ pending. With shorter interval we keep the
  # GPU pipeline saturated.
  [[ $((M % 90)) -eq 5 ]] && bash ~/.surrogate/bin/kaggle-trainer.sh >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
  # Every 6 hr: Lightning AI H200 training run (free 4hr H200 quota = ~13/mo).
  # H200 141GB VRAM fits Qwen3-Coder-480B-A35B QLoRA - biggest free training.
  [[ $((M % 360)) -eq 45 ]] && bash ~/.surrogate/bin/lightning-trainer.sh >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
  # ── Round 5 (2026-04) sustainability loops ──────────────────────────
  # Every 6 hr (offset 90): self-improve loop - gen problems, judge,
  # winners → training data, losers → reflexion-store.
  [[ $((M % 360)) -eq 90 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/v2/self-improve-loop.sh >> "$LOG_DIR/self-improve.log" 2>&1 &
  # Every 30 min (offset 22): mine new tool-call traces from logs into
  # SFT + DPO data, plus voyager skill candidates.
  [[ $((M % 30)) -eq 22 ]] && python3 ~/.surrogate/bin/v2/tool-trace-collector.py >> "$LOG_DIR/tool-trace.log" 2>&1 &
  # Every 60 min (offset 17): export promoted voyager skills to JSONL
  # (training-data slice + inference-time retrieval source).
  [[ $((M % 60)) -eq 17 ]] && python3 ~/.surrogate/bin/v2/voyager-skills.py export >> "$LOG_DIR/voyager.log" 2>&1 &
  # Daily 07:00 UTC: active-learning batch from one bulk-mirror file.
  # Skips silently if no pool yet.
  [[ $((M % 1440)) -eq 420 ]] && {
    POOL=$(ls -t "$DATA"/bulk-mirror/*.jsonl 2>/dev/null | head -1)
    [[ -n "$POOL" ]] && python3 ~/.surrogate/bin/v2/active-learning.py \
      --pool "$POOL" --n 200 --scan 1500 \
      >> "$LOG_DIR/active-learning.log" 2>&1 &
  }
  # Daily 08:00 UTC: constitutional self-critique on yesterday's
  # winners (pulls latest self-improve winners file).
  [[ $((M % 1440)) -eq 480 ]] && {
    WIN=$(ls -t "$DATA"/v2/self-improve/winners-*.jsonl 2>/dev/null | head -1)
    [[ -n "$WIN" ]] && python3 ~/.surrogate/bin/v2/constitutional-loop.py \
      --input "$WIN" --n 200 \
      >> "$LOG_DIR/constitutional.log" 2>&1 &
  }
  # ── Round 7+8 (2026-04-30) - trillion-scale + harvester + enrich ──────
  # Every 30 min (offset 9): aggressive HF dataset discoverer (70-keyword sweep)
  [[ $((M % 30)) -eq 9 ]] && bash ~/.surrogate/bin/v2/aggressive-harvester.sh \
    >> "$LOG_DIR/aggressive-harvester.log" 2>&1 &
  # Every 60 min (offset 35): enrich newly-mirrored bulk files
  [[ $((M % 60)) -eq 35 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/v2/enrich-pipeline.sh \
    >> "$LOG_DIR/enrich-pipeline.log" 2>&1 &
  # Every 30 min (offset 25): spawn extra streaming worker if pool empty
  [[ $((M % 30)) -eq 25 ]] && {
    if ! pgrep -f "streaming-mirror-worker.sh" >/dev/null; then
      nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-cron-$(date +%s)" \
        > "$LOG_DIR/stream-worker-cron.log" 2>&1 &
    fi
  }
  # Daily 09:00 UTC: teachable-prompt filter on harvested data
  [[ $((M % 1440)) -eq 540 ]] && {
    LATEST=$(ls -t "$DATA"/v2/enriched/*.jsonl 2>/dev/null | head -1)
    [[ -n "$LATEST" ]] && python3 ~/.surrogate/bin/v2/teachable-prompt-filter.py \
      --input "$LATEST" --out "$DATA"/v2/teachable-$(date +%Y%m%d).jsonl \
      --n 1000 --keep-target 200 \
      >> "$LOG_DIR/teachable.log" 2>&1 &
  }
  # Daily 11:00 UTC: regression test suite (catches breakage post-push)
  [[ $((M % 1440)) -eq 660 ]] && bash ~/.surrogate/bin/v2/regression-test.sh --quick \
    >> "$LOG_DIR/regression.log" 2>&1 &
  # Weekly Sun 10:00 UTC: abstract-cot compress reasoning data
  [[ $((M % 10080)) -eq 600 ]] && {
    for f in "$DATA"/v2/verify-traces.jsonl "$DATA"/v2/self-improve/winners-*.jsonl; do
      [[ -f "$f" ]] || continue
      python3 ~/.surrogate/bin/v2/abstract-cot-compressor.py \
        --input "$f" --out "${f%.jsonl}-compressed.jsonl" \
        >> "$LOG_DIR/abstract-cot.log" 2>&1
    done
  }
  sleep 60
done
CRONSH
chmod +x /tmp/hermes-cron.sh
nohup /tmp/hermes-cron.sh > "$LOG_DIR/cron-master.log" 2>&1 &
echo "[$(date +%H:%M:%S)] cron loop started" >> "$LOG_DIR/boot.log"
# ── 8. Status HTTP server on :7860 (FastAPI/uvicorn - robust binding) ────────
set +x  # silence trace for clean uvicorn logs
echo "[$(date +%H:%M:%S)] starting status server :7860" | tee -a "$LOG_DIR/boot.log"
# Verify deps before exec - print what's missing rather than silent crash
python3 -c "import fastapi, uvicorn; print(f' fastapi {fastapi.__version__} + uvicorn {uvicorn.__version__} ok')" || {
  echo "⚠ fastapi/uvicorn not importable → falling back to plain http.server"
  exec python3 -m http.server 7860 --bind 0.0.0.0
}
# Run as PID 1 - uvicorn handles signals + auto-restart on crash
exec python3 ~/.surrogate/bin/hermes-status-server.py