#!/usr/bin/env bash
# Hermes start orchestrator for HF Space.
# Boots: persistent /data mount → Redis → Ollama → axentx repos → daemons → status server.
set -uo pipefail
LOG_DIR="${HOME}/.surrogate/logs"
mkdir -p "$LOG_DIR"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start" >> "$LOG_DIR/boot.log"
# Trace mode for early steps only (no secrets here yet): find the hang point but stay safe.
PS4='[trace ${LINENO}] '
set -x
# Echo stdout so HF run-logs see progress (safe steps before .env is loaded)
exec > >(tee -a "$LOG_DIR/boot.log") 2>&1
# ── Memory mode (must be set BEFORE any reference; we use `set -u`) ─────────
# CPU-Basic Space = 16 GB cap. With LOW_MEM=1 we skip the heavy harvest
# launchers (dataset-enrich, dataset-mirror, kaggle-trainer, lightning-trainer,
# dedup-bootstrap); those run on GCP daemons instead. Set LOW_MEM=0 only
# on a paid Space tier (cpu-upgrade, ≥32 GB).
LOW_MEM="${LOW_MEM:-1}"
# ── 1. Persistent data: symlink state subdirs to /data (HF persistent mount) ──
# bin/ is NOT persisted (baked into image, refreshed on every push).
# Persisted: state (DBs), logs, memory, skills, sessions, training pairs,
# workspace (hermes runtime), projects (axentx clones), ollama (model cache).
DATA="/data"
if [[ -d "$DATA" ]] && [[ -w "$DATA" ]]; then
mkdir -p "$DATA"/{state,logs,memory,skills,sessions,workspace,projects,ollama,training,reflexion,index}
# Migrate from any older layout (one-time): if /data/surrogate/state exists, move up one level
if [[ -d "$DATA/surrogate/state" ]] && [[ ! -L "$DATA/state" ]]; then
mv "$DATA/surrogate"/* "$DATA/" 2>/dev/null || true
rmdir "$DATA/surrogate" 2>/dev/null || true
fi
for spec in \
"${HOME}/.surrogate/state:${DATA}/state" \
"${HOME}/.surrogate/logs:${DATA}/logs" \
"${HOME}/.surrogate/memory:${DATA}/memory" \
"${HOME}/.surrogate/skills:${DATA}/skills" \
"${HOME}/.surrogate/sessions:${DATA}/sessions" \
"${HOME}/.hermes/workspace:${DATA}/workspace" \
"${HOME}/.ollama:${DATA}/ollama"; do
target="${spec%%:*}"
link="${spec##*:}"
mkdir -p "$(dirname "$target")"
# Always ensure backing directory exists + writable. If the persistent
# /data mount becomes unavailable mid-run, daemon writes to symlinked
# path fail with Errno 5 I/O error (audit 2026-04-29). Recreating the
# link defensively each boot fixes stale-symlink cases.
mkdir -p "$link" 2>/dev/null || true
if [[ ! -L "$target" ]] || [[ ! -d "$target/" ]]; then
# Either not-a-symlink OR broken symlink (target unreachable)
rm -rf "$target" 2>/dev/null
ln -sfn "$link" "$target"
fi
# Final sanity probe β write a marker; if it fails, the persistent
# mount is broken regardless of the symlink, so log loudly.
if ! touch "$target/.boot-marker" 2>/dev/null; then
echo "[$(date +%H:%M:%S)] β FATAL: $target/ not writable β daemon log writes will Errno 5"
fi
done
# training-pairs.jsonl: single-file persistence
if [[ ! -L "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
rm -f "${HOME}/.surrogate/training-pairs.jsonl" 2>/dev/null
touch "${DATA}/training-pairs.jsonl"
ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
fi
# ── One-time offset reset: skip polluted agentic-crawler placeholder backlog ──
if [[ ! -f "${HOME}/.surrogate/.offset-reset-done" ]] && [[ -f "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
CUR=$(wc -l < "${HOME}/.surrogate/training-pairs.jsonl" | tr -d ' ')
echo "$CUR" > "${HOME}/.surrogate/.training-push-offset"
echo "$CUR" > "${HOME}/.surrogate/.self-ingest-offset"
touch "${HOME}/.surrogate/.offset-reset-done"
echo "[$(date +%H:%M:%S)] one-time offset reset β $CUR (skip placeholder backlog)" >> "$LOG_DIR/boot.log"
fi
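# How a consumer is expected to use these offset files (illustrative sketch,
# not executed here; push-training-to-hf.sh / surrogate-self-ingest.sh own
# the real logic):
#   OFF=$(cat "${HOME}/.surrogate/.training-push-offset" 2>/dev/null || echo 0)
#   tail -n +"$((OFF + 1))" "${HOME}/.surrogate/training-pairs.jsonl" > /tmp/new-pairs.jsonl
#   wc -l < "${HOME}/.surrogate/training-pairs.jsonl" > "${HOME}/.surrogate/.training-push-offset"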
# ── Boot-time dedup.db corruption check ──────────────────────────────
# 16 parallel shards previously corrupted the SQLite WAL. If the DB is
# unreadable on boot, back it up and force re-bootstrap from scratch.
DEDUP_DB="${HOME}/.surrogate/state/dedup.db"
if [[ -f "$DEDUP_DB" ]]; then
if ! sqlite3 "$DEDUP_DB" "SELECT 1 FROM seen_hashes LIMIT 1" >/dev/null 2>&1; then
TS=$(date +%s)
mv "$DEDUP_DB" "${DEDUP_DB}.corrupt-${TS}.bak" 2>/dev/null
rm -f "${DEDUP_DB}-wal" "${DEDUP_DB}-shm"
rm -f "${HOME}/.surrogate/.dedup-bootstrap-done"
echo "[$(date +%H:%M:%S)] WIPED corrupt dedup.db β ${DEDUP_DB}.corrupt-${TS}.bak (forcing re-bootstrap)" >> "$LOG_DIR/boot.log"
fi
fi
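# The SELECT probe above only proves the seen_hashes table is readable. A
# stricter (slower) probe, should partial WAL corruption ever slip past it,
# would be SQLite's built-in checker; sketch only, not currently wired in:
#   if [[ "$(sqlite3 "$DEDUP_DB" "PRAGMA integrity_check;" 2>/dev/null)" != "ok" ]]; then
#     echo "dedup.db failed integrity_check" >> "$LOG_DIR/boot.log"
#   fi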
# ── Heavy harvest launchers: only on HIGH_MEM (LOW_MEM=0) ────────────
# On CPU-Basic (16 GB cap) launching 5 background bash + uvicorn + 5 harvest
# workers blew through the cap and HF auto-killed the container ~5 min after
# boot. These launchers are now scheduled on GCP via hermes-scheduler-daemon
# (entries in data/hermes-jobs.json) so harvest still runs, just not from
# inside the Space's RAM. Re-enable in-Space by setting LOW_MEM=0 once we
# upgrade to a ≥32 GB tier.
if [[ "$LOW_MEM" != "1" ]]; then
# ── One-time central dedup bootstrap from existing data ──────────
if [[ ! -f "${HOME}/.surrogate/.dedup-bootstrap-done" ]]; then
echo "[$(date +%H:%M:%S)] running central dedup bootstrap (one-time)" >> "$LOG_DIR/boot.log"
nohup bash "${HOME}/.surrogate/bin/dedup-bootstrap.sh" > "$LOG_DIR/dedup-bootstrap.log" 2>&1 &
fi
# ── BOOT-TIME enrich kickoff (trigger an immediate pull; don't wait for cron)
nohup bash "${HOME}/.surrogate/bin/dataset-enrich.sh" >> "$LOG_DIR/dataset-enrich.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time dataset-enrich kicked off" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME kaggle-trainer kickoff (don't wait for the 90-min cron) ─
nohup bash "${HOME}/.surrogate/bin/kaggle-trainer.sh" >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time kaggle-trainer kicked off" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME lightning-trainer kickoff: H200 4 hr free for the big model
nohup bash "${HOME}/.surrogate/bin/lightning-trainer.sh" >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time lightning-trainer kicked off (H200 4hr quota)" >> "$LOG_DIR/boot.log"
# ── BOOT-TIME dataset-mirror: bulk-clone top community SFT mixes ─
nohup bash "${HOME}/.surrogate/bin/dataset-mirror.sh" >> "$LOG_DIR/dataset-mirror.log" 2>&1 &
echo "[$(date +%H:%M:%S)] boot-time dataset-mirror kicked off (30 community sources)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] LOW_MEM=1 β skipped 5 heavy harvest launchers (delegated to GCP daemons)" >> "$LOG_DIR/boot.log"
fi
echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] WARN: /data not writable β running ephemeral!" >> "$LOG_DIR/boot.log"
fi
# ── 2. Bind HF Space secrets → ~/.hermes/.env ───────────────────────────────
# DISABLE shell trace before touching secret values.
set +x
echo "[$(date +%H:%M:%S)] writing ~/.hermes/.env from secret env vars (trace OFF)"
mkdir -p ~/.hermes
{
echo "# Auto-generated from HF Space secrets at boot"
for k in OPENROUTER_API_KEY GEMINI_API_KEY GEMINI_API_KEY_2 \
GITHUB_TOKEN GITHUB_TOKEN_POOL DISCORD_BOT_TOKEN DISCORD_WEBHOOK \
CEREBRAS_API_KEY GROQ_API_KEY SAMBANOVA_API_KEY \
CLOUDFLARE_API_KEY NVIDIA_API_KEY CHUTES_API_KEY ANTHROPIC_API_KEY \
HF_TOKEN HUGGING_FACE_HUB_TOKEN; do
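# ${!k} is bash indirect expansion: the value of the variable whose NAME is
# stored in $k; the :- guard keeps set -u from aborting on unset secrets.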
v="${!k:-}"
[[ -n "$v" ]] && echo "${k}=${v}"
done
} > ~/.hermes/.env
chmod 600 ~/.hermes/.env
echo "[$(date +%H:%M:%S)] .env written ($(wc -l < ~/.hermes/.env) keys, perms 600)"
# Trace stays OFF for the rest of boot; we already have line numbers above and won't need them post-secrets.
# ── LOW_MEM short-circuit: skip ALL background daemons, exec status server ──
# CPU-Basic Space cap is 16 GB. Even after gating the 5 boot-time harvest
# launchers, the Space kept hitting the 16 GB cap and hanging at the HTTP
# layer every ~30-40 min. Investigation found 15+ MORE nohup'd background
# daemons below this point (scrape, agentic-crawler, github-crawler,
# self-heal, cron loop, bulk-mirror workers, streaming-mirror workers,
# parquet-ingest, etc.) that collectively grow into the cap within an hour.
#
# In LOW_MEM=1 mode the Space's only job is the FastAPI status server on
# :7860 that serves harvest cursor advance to remote workers. Everything
# else (harvest, mirroring, agent pipeline, training pushes, dataset enrich)
# now runs on the GCP daemon fleet; see hermes-jobs.json (171 jobs scheduled
# via hermes-scheduler-daemon as of 2026-05-02).
#
# Set LOW_MEM=0 to re-enable in-Space launchers when on a paid tier (≥32 GB).
if [[ "$LOW_MEM" == "1" ]]; then
echo "[$(date +%H:%M:%S)] LOW_MEM=1 β skipping all bg daemons + cron, going straight to :7860 status server" | tee -a "$LOG_DIR/boot.log"
set +x # silence trace
# Verify deps before exec β print what's missing rather than silent crash
if python3 -c "import fastapi, uvicorn" 2>/dev/null; then
echo "[$(date +%H:%M:%S)] starting uvicorn :7860 (LOW_MEM fast-path)" | tee -a "$LOG_DIR/boot.log"
exec python3 ~/.surrogate/bin/hermes-status-server.py
else
echo "β fastapi/uvicorn not importable β falling back to plain http.server"
exec python3 -m http.server 7860 --bind 0.0.0.0
fi
fi
# ── 3. Git config + clone axentx repos for auto-orchestrate auto-commit ────
# Disable interactive prompts globally so failed-auth git ops fail fast.
export GIT_TERMINAL_PROMPT=0
export GIT_ASKPASS=/bin/true
GH_TOKEN=$(echo "${GITHUB_TOKEN_POOL:-}" | cut -d',' -f1)
if [[ -n "$GH_TOKEN" ]]; then
git config --global user.email "hermes@axentx.ai"
git config --global user.name "Hermes (Surrogate-1)"
git config --global init.defaultBranch main
git config --global pull.rebase true
git config --global push.default current
PROJECTS_DIR="${DATA}/projects"
mkdir -p "$PROJECTS_DIR"
rm -rf ~/axentx 2>/dev/null
ln -sfn "$PROJECTS_DIR" ~/axentx
# Clone axentx repos in background with a hard timeout; never blocks boot.
# Verified 2026-05-02 via gh api: 5 of 6 entries had the wrong org/name and
# were silently 404'ing (arkashira/*: only surrogate-1-harvest is there;
# the rest are private under the axentx org). The agent pipeline's dev/qa/
# reviewer/commit daemons sat idle for a full day because no repo cloned
# for them to work on. Real paths confirmed via /repos/<owner>/<name>:
# axentx/{Costinel,vanguard,airship,workio,axiomops,surrogate-1} → 200
# arkashira/{Costinel,vanguard,arkship,surrogate,workio,hermes-toolbelt} → 404
# Note: 'arkship' was a typo for 'airship' (axentx/airship).
for repo_spec in \
"Costinel:axentx/Costinel" \
"vanguard:axentx/vanguard" \
"airship:axentx/airship" \
"workio:axentx/workio" \
"axiomops:axentx/axiomops" \
"surrogate-1:axentx/surrogate-1"; do
local_name="${repo_spec%%:*}"
gh_path="${repo_spec##*:}"
target="${PROJECTS_DIR}/${local_name}"
(
if [[ ! -d "$target/.git" ]]; then
echo "[$(date +%H:%M:%S)] cloning $gh_path..." >> "$LOG_DIR/boot.log"
timeout 30 git clone --depth 50 \
"https://x-access-token:${GH_TOKEN}@github.com/${gh_path}.git" "$target" \
>> "$LOG_DIR/git-clone.log" 2>&1 || \
echo "[$(date +%H:%M:%S)] WARN: clone $gh_path failed/timeout" >> "$LOG_DIR/boot.log"
else
cd "$target" && timeout 20 git pull --rebase >> "$LOG_DIR/git-pull.log" 2>&1 || true
fi
) &
done
# Don't wait; let clones finish in the background while boot continues.
# Persist token for any push from auto-orchestrate
git config --global credential.helper "store --file=$HOME/.git-credentials"
echo "https://x-access-token:${GH_TOKEN}@github.com" > ~/.git-credentials
chmod 600 ~/.git-credentials
echo "[$(date +%H:%M:%S)] git auth configured + clone jobs spawned" >> "$LOG_DIR/boot.log"
fi
# ── 4. Redis (TCP only) ─────────────────────────────────────────────────────
# Redis cap tightened on LOW_MEM (was 1gb → 256mb). The coordinator uses
# SQLite directly; redis is only a soft cache for work-queue priorities.
REDIS_MAX="${REDIS_MAX:-$([[ "$LOW_MEM" == "1" ]] && echo "256mb" || echo "1gb")}"
redis-server --daemonize yes --port 6379 --bind 127.0.0.1 \
--maxmemory "$REDIS_MAX" --maxmemory-policy allkeys-lru
sleep 1
redis-cli -h 127.0.0.1 -p 6379 ping >> "$LOG_DIR/redis.log" 2>&1
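# Sketch of the soft-cache pattern described above (illustrative only; the
# key "wq:prio" and the work-queue.db path/schema are hypothetical, not the
# daemons' real names):
#   PRIO=$(redis-cli -h 127.0.0.1 GET "wq:prio" 2>/dev/null)
#   if [[ -z "$PRIO" ]]; then   # cache miss or redis down → fall back to SQLite
#     PRIO=$(sqlite3 "${HOME}/.surrogate/state/work-queue.db" \
#       "SELECT priority FROM queue ORDER BY priority DESC LIMIT 1" 2>/dev/null)
#     redis-cli -h 127.0.0.1 SET "wq:prio" "$PRIO" EX 300 >/dev/null 2>&1
#   fi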
# ── 5. Ollama: DISABLED on cpu-basic (16 GB limit) ──────────────────────────
# Root cause of the 7-hr Runtime Error 2026-04-29: ollama loading qwen3-coder:30b
# (~17 GB Q4) + qwen2.5-coder:14b (~9 GB) + granite (~5 GB) = ~31 GB of model
# weights against a 16 GB cap → instant OOM on any inference.
#
# On cpu-basic the FREE LLM LADDER (cerebras/groq/openrouter/gemini/chutes)
# is faster anyway; wafer-scale inference beats CPU x86 by 50-200×.
# Ollama is only worth running once the Space upgrades to ≥cpu-upgrade (32 GB)
# OR moves to the OCI A1.Flex anchor (24 GB ARM, native ollama support).
#
# Set LOW_MEM=0 to re-enable on a bigger Space tier.
LOW_MEM="${LOW_MEM:-1}"
if [[ "$LOW_MEM" == "1" ]]; then
echo "[$(date +%H:%M:%S)] β ollama SKIPPED (LOW_MEM=1, cpu-basic 16 GB)" \
>> "$LOG_DIR/boot.log"
echo "[$(date +%H:%M:%S)] β free LLM ladder serves all v2 inference" \
>> "$LOG_DIR/boot.log"
else
OLLAMA_MODELS="${HOME}/.ollama/models" \
OLLAMA_HOST=127.0.0.1:11434 \
nohup ollama serve > "$LOG_DIR/ollama.log" 2>&1 &
sleep 6
(
if ! ollama list 2>/dev/null | grep -q "nomic-embed-text"; then
ollama pull nomic-embed-text > "$LOG_DIR/ollama-pull-embed.log" 2>&1
fi
if ! ollama list 2>/dev/null | grep -q "qwen2.5-coder:3b"; then
# Smallest coder that's actually useful; fits any tier
ollama pull qwen2.5-coder:3b > "$LOG_DIR/ollama-pull-3b.log" 2>&1
fi
) &
fi
# ── 6. Discord bot (only if egress to discord.com is reachable) ────────────
# The HF Spaces free tier may block egress to discord.com; the bot would
# crash-loop. Pre-flight check: if discord.com is unreachable, skip the bot
# and use webhook-only.
if [[ -n "${DISCORD_BOT_TOKEN:-}" ]]; then
if curl -sS -o /dev/null -w "%{http_code}" --max-time 6 https://discord.com 2>/dev/null | grep -qE "^(200|301|302|307|308)$"; then
set -a; source ~/.hermes/.env 2>/dev/null; set +a
nohup python ~/.surrogate/bin/hermes-discord-bot.py >> "$LOG_DIR/discord-bot.log" 2>&1 &
echo "[$(date +%H:%M:%S)] discord bot started (gateway reachable)"
else
echo "[$(date +%H:%M:%S)] discord.com unreachable β skipping bot, using webhook-only" >> "$LOG_DIR/boot.log"
fi
fi
# ── 7a. Continuous scrape daemon: concurrency tuned to LOW_MEM ─────────────
SCRAPE_PARALLEL="${SCRAPE_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 8)}"
cat > /tmp/scrape-daemon.sh <<SCRAPESH
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="\${HOME}/.surrogate/logs/scrape-continuous.log"
mkdir -p "\$(dirname "\$LOG")"
while true; do
START=\$(date +%s)
bash ~/.surrogate/bin/domain-scrape-loop.sh 1500 ${SCRAPE_PARALLEL} >> "\$LOG" 2>&1
DUR=\$(( \$(date +%s) - START ))
if [[ \$DUR -lt 30 ]]; then sleep 30
elif [[ \$DUR -lt 120 ]]; then sleep 15
else sleep 5
fi
done
SCRAPESH
chmod +x /tmp/scrape-daemon.sh
nohup /tmp/scrape-daemon.sh > "$LOG_DIR/scrape-daemon.log" 2>&1 &
echo "[$(date +%H:%M:%S)] scrape daemon parallel=${SCRAPE_PARALLEL} (LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"
# ── 7b. Agentic crawler: DISABLED on LOW_MEM (the anchor takes this load) ──
if [[ "$LOW_MEM" != "1" ]]; then
CRAWLER_PARALLEL="${CRAWLER_PARALLEL:-6}"
nohup bash ~/.surrogate/bin/agentic-crawler.sh "$CRAWLER_PARALLEL" \
> "$LOG_DIR/agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] agentic crawler parallel=$CRAWLER_PARALLEL" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β agentic-crawler SKIPPED (LOW_MEM); anchor handles" >> "$LOG_DIR/boot.log"
fi
# ── 7b2. GitHub-specific agentic crawler (lightweight; keep on always) ─────
nohup bash ~/.surrogate/bin/github-agentic-crawler.sh > "$LOG_DIR/github-agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] github-agentic-crawler started" >> "$LOG_DIR/boot.log"
# ── 7b3. HF Dataset Discoverer: DISABLED (replaced by continuous-discoverer) ─
# Round 10 (a27499d): bin/v2/continuous-discoverer.sh covers HF + arxiv +
# Stack Exchange + GH trending in one daemon. The old hf-dataset-discoverer.sh
# is now redundant and pure memory pressure on cpu-basic.
echo "[$(date +%H:%M:%S)] hf-dataset-discoverer SKIPPED (replaced by continuous-discoverer)" >> "$LOG_DIR/boot.log"
# ── 7e. auto-orchestrate-continuous: DISABLED on LOW_MEM (cron handles it) ─
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/auto-orchestrate-continuous.sh > "$LOG_DIR/auto-orchestrate-continuous.log" 2>&1 &
echo "[$(date +%H:%M:%S)] auto-orchestrate-continuous started (4 parallel workers)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β auto-orchestrate-continuous SKIPPED (LOW_MEM); cron slot at M%20==0 covers it" >> "$LOG_DIR/boot.log"
fi
# ── 7e1. SELF-HEAL WATCHDOG: must start BEFORE memory-hungry workers ────────
# Monitors RAM usage every 60s; preempts youngest dataset-enrich shard if
# usage >= 85% to dodge the cpu-basic 16Gi OOM kill that would otherwise
# crash the entire container. Also restarts stuck ingest / kicks stale uploader.
nohup bash ~/.surrogate/bin/self-heal-watchdog.sh > "$LOG_DIR/self-heal-watchdog.log" 2>&1 &
echo "[$(date +%H:%M:%S)] self-heal-watchdog started (mem<85%, ingest<20m, push<10m)" >> "$LOG_DIR/boot.log"
# ── 7e2. GH-ACTIONS TICKER: burst-dispatch external runners every 60s ───────
# Fires workflow_dispatch on arkashira/ashiradevops-alt runner repos every
# 60s, bypassing GitHub's */5 cron minimum. Combined with 8-min runner
# timeouts, the 20-concurrent free-tier slot cap stays saturated.
# Skips silently if GH_TOKEN_ARKASHIRA / GH_TOKEN_DEVOPS aren't set as
# Space secrets; the operator can add them later without a restart.
nohup bash ~/.surrogate/bin/gh-actions-ticker.sh > "$LOG_DIR/gh-actions-ticker.log" 2>&1 &
echo "[$(date +%H:%M:%S)] gh-actions-ticker started (60s tick, dispatches arkashira+ashiradevops-alt)" >> "$LOG_DIR/boot.log"
# ── 7e3. LLM BURST GENERATOR: synthetic training pairs from 8 free LLMs ─────
# Cerebras + Groq + OpenRouter + Gemini + Chutes + NV NIM + Samba + Kimi.
# Each cycle fires 3 prompts at every active provider in parallel, writes
# {prompt, response} pairs to training-pairs.jsonl. Combined free-tier
# budget: ~7000+ pairs/day. Skips any provider whose key env is not set.
if [[ "$LOW_MEM" != "1" ]]; then
nohup python3 ~/.surrogate/bin/llm-burst-generator.py > "$LOG_DIR/llm-burst-generator.log" 2>&1 &
echo "[$(date +%H:%M:%S)] llm-burst-generator started (8 LLM APIs in parallel, ~7K synthetic pairs/day)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β llm-burst-generator SKIPPED (LOW_MEM); ZeroGPU synth-puller covers" >> "$LOG_DIR/boot.log"
fi
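# Shape of one burst request, sketched in shell (the real generator is the
# python daemon above; the endpoint is Groq's OpenAI-compatible API and the
# model name is only illustrative):
#   curl -sS https://api.groq.com/openai/v1/chat/completions \
#     -H "Authorization: Bearer ${GROQ_API_KEY:-}" -H "Content-Type: application/json" \
#     -d '{"model":"llama-3.1-8b-instant","messages":[{"role":"user","content":"<prompt>"}]}'
# The JSON answer is then reduced to a {"prompt": ..., "response": ...} line
# appended to training-pairs.jsonl.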
sleep 3 # Stagger spawns; avoids a memory burst at boot
# ── 7f. PARALLEL BULK INGEST (slug-hash sharded; 6 shards on cpu-basic) ─────
# Was 16 shards, but that caused a 'Memory limit exceeded (16Gi)' OOM. Each
# shard peaks at ~1 GB while streaming via the 'datasets' lib. The watchdog
# above provides a second safety net if a peak still spikes.
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/bulk-ingest-parallel.sh > "$LOG_DIR/bulk-ingest-parallel.log" 2>&1 &
echo "[$(date +%H:%M:%S)] bulk-ingest-parallel started (6 shards, 293M total cap)" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β bulk-ingest-parallel SKIPPED (LOW_MEM); streaming-mirror-worker covers" >> "$LOG_DIR/boot.log"
fi
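# "Slug-hash sharded" means each worker owns the datasets whose slug hashes
# to its shard id, so no two shards ever touch the same dataset. A sketch of
# the assignment (6 shards; the real worker may hash differently):
#   SHARDS=6
#   slug_shard() {  # prints 0..SHARDS-1 for a dataset slug
#     local h; h=$(printf '%s' "$1" | md5sum | cut -c1-8)
#     echo $(( 0x$h % SHARDS ))
#   }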
sleep 3
# ── 7g. PARQUET-DIRECT INGEST (skips 'datasets' library overhead; 5-10× faster) ──
# Downloads parquet shards directly via the HF datasets-server API + a pyarrow
# filter. Targets only trillion-scale corpora where streaming is too slow.
# DLs reduced to 2 parallel; combined with the 6 ingest shards this stays under 16Gi.
PARQUET_PARALLEL=2 nohup bash ~/.surrogate/bin/parquet-direct-ingest.sh > "$LOG_DIR/parquet-direct-ingest.log" 2>&1 &
echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$LOG_DIR/boot.log"
# ── 7c. Skill-synthesis daemon: DISABLED on LOW_MEM (heavy LLM calls) ─────
if [[ "$LOW_MEM" != "1" ]]; then
nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
else
echo "[$(date +%H:%M:%S)] β skill-synthesis SKIPPED (LOW_MEM); anchor's voyager-skills.py covers" >> "$LOG_DIR/boot.log"
fi
# ── 7d. Bulk mirror coordinator + parallel workers ──────────────────────────
# User feedback 2026-04-29 (Thai, paraphrased): every agent should work
# together in parallel, with no duplicated work.
# Coordinator = SQLite claim queue (~/.surrogate/state/bulk-mirror-claims.db).
# Workers each pull the next pending dataset, mirror+sanitize+dedup, mark done.
# 100+ massive datasets in bin/v2/bulk-datasets-massive.txt (code/security/SDLC/agent/etc).
# Lease-based claims (15 min): a crashed worker's lease auto-expires so other
# workers pick it up; a sketch of the claim transaction follows the seed call below.
python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mirror-seed.log" 2>&1 || true
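# Sketch of a lease-based claim as one atomic SQLite update (illustrative;
# the real schema in bulk-mirror-claims.db may use different table/column
# names, and $WORKER here is a hypothetical worker id):
#   sqlite3 "${HOME}/.surrogate/state/bulk-mirror-claims.db" "
#     UPDATE claims SET worker='$WORKER', lease_until=strftime('%s','now')+900
#     WHERE id=(SELECT id FROM claims
#               WHERE status='pending' OR lease_until<strftime('%s','now')
#               ORDER BY id LIMIT 1);"
# SQLite serializes writers, so two workers can never win the same row.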
# Two worker types share the same coordinator queue:
# bulk-mirror-worker.sh      → full download; suits small/medium datasets
# streaming-mirror-worker.sh → HF datasets streaming; suits trillion-token corpora
# LOW_MEM tuning for the cpu-basic 16 GB Space (history):
# v1:     0 bulk + 2 stream (Round 9-10 OOM tightened to 0+2)
# v2:     0 bulk + 1 stream (Round 11-12 OOM tightened further)
# v3 NOW: 1 bulk + 3 stream (post Civo-pivot + 4-Space fan-out; the anchor
#                            never came up so we can't rely on it for bulk,
#                            and 16 GB had ~8 GB unused under the v2
#                            setting; reclaim it)
#
# Memory budget per Space (16 GB cpu-basic):
#   ~6 GB reserved: OS + redis 256mb + continuous-discoverer +
#                   dataset-enrich + auto-startup-loop + push bursts
#   ~10 GB available for harvest workers
#   3 stream × 500 MB + 1 bulk × 600 MB = 2.1 GB used
#   ~8 GB headroom; memory-guard.sh kicks in at <3 GB free, so this is safe
#
# Throughput delta: 4 workers/Space × 4 Spaces = 16 workers total (vs the
# previous 1 × 4 = 4, a 4× jump). Combined with the enrich cron at M%30==5
# (was M%60), expect 3-5× the commit rate before the HF soft-cap kicks in.
BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 3 || echo 6)}"
for i in $(seq 1 "$BULK_WORKERS"); do
nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
> "$LOG_DIR/bulk-worker-$i.log" 2>&1 &
done
for i in $(seq 1 "$STREAM_WORKERS"); do
nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-w$i" \
> "$LOG_DIR/stream-worker-$i.log" 2>&1 &
done
TOTAL_WORKERS=$((BULK_WORKERS + STREAM_WORKERS))
echo "[$(date +%H:%M:%S)] bulk-mirror coordinator + $BULK_WORKERS bulk + $STREAM_WORKERS streaming = $TOTAL_WORKERS workers (200+ datasets queued, LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"
# ── 7d2. Continuous multi-source dataset discoverer (boot daemon, never exits) ─
# Replaces the aggressive-harvester cron: runs always, sweeps HF + arxiv + SE + GH.
if ! pgrep -f "continuous-discoverer.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/continuous-discoverer.sh \
> "$LOG_DIR/continuous-discoverer.log" 2>&1 &
echo "[$(date +%H:%M:%S)] continuous-discoverer started (HF + arxiv + SE + GH, ~5min cycle)" >> "$LOG_DIR/boot.log"
fi
# ── Auto-startup-loop: 45 personae × 9 LoRA clusters × auto-commit + auto-push ─
# CEO/CTO/CMO/CFO/COO/PM/UX/Designer/SRE/DevOps/Marketing/SDR/AE/Growth/CS/Legal/HR/etc.
# 1 role per 15-min cycle; chained roles fire downstream automatically.
if ! pgrep -f "auto-startup-loop.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/auto-startup-loop.sh \
> "$LOG_DIR/auto-startup-loop.log" 2>&1 &
echo "[$(date +%H:%M:%S)] auto-startup-loop started (45 personae cycle 15min, chains, auto-commit)" >> "$LOG_DIR/boot.log"
fi
# ── 7h. Train-ready pusher: disabled at boot for now. Caused a Space
# RUNTIME_ERROR on first deployment (2026-04-29). The script is kept at
# bin/train-ready-pusher.sh; launch manually after the Space proves stable:
# nohup bash ~/.surrogate/bin/train-ready-pusher.sh > /tmp/trp.log 2>&1 &
# nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &
# ── 7i. Cron loop: non-scrape daemons (scrape now runs continuously above) ──
cat > /tmp/hermes-cron.sh <<'CRONSH'
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="${HOME}/.surrogate/logs/cron.log"
mkdir -p "$(dirname "$LOG")"
while true; do
M=$(($(date +%s) / 60))
# Cron offsets are STAGGERED: the minute=0 burst was an OOM trigger.
# Each major task picks a unique M%X==N offset so no two fire together.
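# Example: with M = floor(epoch_seconds/60), and because 60 is a multiple
# of 5, "M%5==2" fires at wall-clock minutes 02,07,12,... while "M%5==4"
# fires at 04,09,14,... so two tasks on the same modulus with different
# offsets can never coincide.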
[[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
[[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
# Auto-scaler β spawn/kill workers based on free memory tier (burst-but-don't-die)
[[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &
# synth-puller: hit the surrogate1 ZeroGPU /api/synth_batch every 5 min.
# Drains the free PRO 25K min/mo into Magpie-style training pairs (16 domains rotate).
[[ $((M % 5)) -eq 3 ]] && bash ~/.surrogate/bin/v2/synth-puller.sh >> "$LOG" 2>&1 &
# push-training-to-hf is gated by memory (it loads a big shard into RAM).
# The anchor (24 GB) takes over when capacity arrives; see the anchor cron-loop.
[[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &
# auto-orchestrate now runs CONTINUOUSLY (4 parallel workers); see step 7e above.
# Cron entry retained as a legacy single-fire boost, skipped when continuous is up.
# (The old "cond && pgrep || run" chain fired every minute whenever cond was
# false; rewritten as a pure AND chain.)
[[ $((M % 20)) -eq 0 ]] && ! pgrep -f "auto-orchestrate-continuous" >/dev/null \
&& bash ~/.surrogate/bin/auto-orchestrate-loop.sh >> "$LOG" 2>&1 &
# Every 30 min: research-apply (pop queue β orchestrate β ship feature)
[[ $((M % 30)) -eq 15 ]] && bash ~/.surrogate/bin/surrogate-research-apply.sh >> "$LOG" 2>&1 &
# Every 60 min: keyword tuner (adapts scrape queue based on yields)
[[ $((M % 60)) -eq 4 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &
# Every 6 hours: research-loop (discover new features from competitors/papers)
[[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/surrogate-research-loop.sh >> "$LOG" 2>&1 &
# Every 30 min: dataset enrich (was 60 min; bumped 2× now that we have
# 4 Spaces × (3 stream + 1 bulk) = 16 workers harvesting in parallel,
# producing more chunks per hour than the old 60-min push could drain).
# Memory-guarded: a full HF Hub iteration is heavy.
[[ $((M % 30)) -eq 5 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/dataset-enrich.sh >> "$LOG" 2>&1 &
# Every 15 min: self-ingest training-pairs into the FTS index (closes the self-improvement loop)
[[ $((M % 15)) -eq 3 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &
# Every 30 min: build vector embeddings index (RAG semantic search)
[[ $((M % 30)) -eq 12 ]] && bash ~/.surrogate/bin/rag-vector-builder.sh >> "$LOG" 2>&1 &
# Every 30 min: synthetic data generation (REWORK→APPROVE DPO + distilabel rewrite)
[[ $((M % 30)) -eq 7 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/synthetic-data-from-rework.sh >> "$LOG" 2>&1 &
# Daily 04:00 UTC: refresh CVE feed (NVD + CISA KEV) → security-knowledge dataset
[[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &
# Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems)
[[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
# Daily 06:00 UTC: LLM-expand role keywords (sends each role's skills to
# Cerebras/Groq → +80 specific job-description-style search terms each).
# The discoverer auto-uses the expanded list on its next cycle.
[[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &
# Every 90 min: kick a Kaggle T4 LoRA training run on the latest dataset
# slice. Free Kaggle quota = 30 hr/week per account; one full run = 4-6 hr,
# so we DO want to keep submitting: Kaggle queues if one is already running
# and auto-cancels older submissions if 5+ are pending. The shorter interval
# keeps the GPU pipeline saturated.
[[ $((M % 90)) -eq 5 ]] && bash ~/.surrogate/bin/kaggle-trainer.sh >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
# Every 6 hr: Lightning AI H200 training run (free 4 hr H200 quota, ~13 runs/mo).
# H200 141 GB VRAM fits Qwen3-Coder-480B-A35B QLoRA → the biggest free training run.
[[ $((M % 360)) -eq 45 ]] && bash ~/.surrogate/bin/lightning-trainer.sh >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
# ── Round 5 (2026-04) sustainability loops ──────────────────────────
# Every 6 hr (offset 90): self-improve loop → generate problems, judge;
# winners → training data, losers → reflexion-store.
[[ $((M % 360)) -eq 90 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/v2/self-improve-loop.sh >> "$LOG_DIR/self-improve.log" 2>&1 &
# Every 30 min (offset 22): mine new tool-call traces from logs into
# SFT + DPO data, plus voyager skill candidates.
[[ $((M % 30)) -eq 22 ]] && python3 ~/.surrogate/bin/v2/tool-trace-collector.py >> "$LOG_DIR/tool-trace.log" 2>&1 &
# Every 60 min (offset 17): export promoted voyager skills to JSONL
# (training-data slice + inference-time retrieval source).
[[ $((M % 60)) -eq 17 ]] && python3 ~/.surrogate/bin/v2/voyager-skills.py export >> "$LOG_DIR/voyager.log" 2>&1 &
# Daily 07:00 UTC: active-learning batch from one bulk-mirror file.
# Skips silently if no pool yet.
[[ $((M % 1440)) -eq 420 ]] && {
POOL=$(ls -t "$DATA"/bulk-mirror/*.jsonl 2>/dev/null | head -1)
[[ -n "$POOL" ]] && python3 ~/.surrogate/bin/v2/active-learning.py \
--pool "$POOL" --n 200 --scan 1500 \
>> "$LOG_DIR/active-learning.log" 2>&1 &
}
# Daily 08:00 UTC: constitutional self-critique on yesterday's
# winners (pulls latest self-improve winners file).
[[ $((M % 1440)) -eq 480 ]] && {
WIN=$(ls -t "$DATA"/v2/self-improve/winners-*.jsonl 2>/dev/null | head -1)
[[ -n "$WIN" ]] && python3 ~/.surrogate/bin/v2/constitutional-loop.py \
--input "$WIN" --n 200 \
>> "$LOG_DIR/constitutional.log" 2>&1 &
}
# ── Round 7+8 (2026-04-30): trillion-scale + harvester + enrich ──────
# Every 30 min (offset 9): aggressive HF dataset discoverer (70-keyword sweep)
[[ $((M % 30)) -eq 9 ]] && bash ~/.surrogate/bin/v2/aggressive-harvester.sh \
>> "$LOG_DIR/aggressive-harvester.log" 2>&1 &
# Every 60 min (offset 35): enrich newly-mirrored bulk files
[[ $((M % 60)) -eq 35 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
&& bash ~/.surrogate/bin/v2/enrich-pipeline.sh \
>> "$LOG_DIR/enrich-pipeline.log" 2>&1 &
# Every 30 min (offset 25): spawn extra streaming worker if pool empty
[[ $((M % 30)) -eq 25 ]] && {
if ! pgrep -f "streaming-mirror-worker.sh" >/dev/null; then
nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-cron-$(date +%s)" \
> "$LOG_DIR/stream-worker-cron.log" 2>&1 &
fi
}
# Daily 09:00 UTC: teachable-prompt filter on harvested data
[[ $((M % 1440)) -eq 540 ]] && {
LATEST=$(ls -t "$DATA"/v2/enriched/*.jsonl 2>/dev/null | head -1)
[[ -n "$LATEST" ]] && python3 ~/.surrogate/bin/v2/teachable-prompt-filter.py \
--input "$LATEST" --out "$DATA"/v2/teachable-$(date +%Y%m%d).jsonl \
--n 1000 --keep-target 200 \
>> "$LOG_DIR/teachable.log" 2>&1 &
}
# Daily 11:00 UTC: regression test suite (catches breakage post-push)
[[ $((M % 1440)) -eq 660 ]] && bash ~/.surrogate/bin/v2/regression-test.sh --quick \
>> "$LOG_DIR/regression.log" 2>&1 &
# Weekly Sun 10:00 UTC: abstract-cot compress reasoning data
[[ $((M % 10080)) -eq 600 ]] && {
for f in "$DATA"/v2/verify-traces.jsonl "$DATA"/v2/self-improve/winners-*.jsonl; do
[[ -f "$f" ]] || continue
python3 ~/.surrogate/bin/v2/abstract-cot-compressor.py \
--input "$f" --out "${f%.jsonl}-compressed.jsonl" \
>> "$LOG_DIR/abstract-cot.log" 2>&1
done
}
sleep 60
done
CRONSH
chmod +x /tmp/hermes-cron.sh
nohup /tmp/hermes-cron.sh > "$LOG_DIR/cron-master.log" 2>&1 &
echo "[$(date +%H:%M:%S)] cron loop started" >> "$LOG_DIR/boot.log"
# ── 8. Status HTTP server on :7860 (FastAPI/uvicorn → robust binding) ──────
set +x # silence trace for clean uvicorn logs
echo "[$(date +%H:%M:%S)] starting status server :7860" | tee -a "$LOG_DIR/boot.log"
# Verify deps before exec β print what's missing rather than silent crash
python3 -c "import fastapi, uvicorn; print(f' fastapi {fastapi.__version__} + uvicorn {uvicorn.__version__} ok')" || {
echo "β fastapi/uvicorn not importable β falling back to plain http.server"
exec python3 -m http.server 7860 --bind 0.0.0.0
}
# Run as PID 1; uvicorn handles signals + auto-restart on crash
exec python3 ~/.surrogate/bin/hermes-status-server.py