#!/usr/bin/env bash
# Hermes start orchestrator for HF Space.
# Boots: persistent /data mount → Redis → Ollama → axentx repos → daemons → status server.
set -uo pipefail

LOG_DIR="${HOME}/.surrogate/logs"
mkdir -p "$LOG_DIR"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start" >> "$LOG_DIR/boot.log"

# Trace mode for the early steps only (no secrets loaded yet) — locates hang points safely.
PS4='[trace ${LINENO}] '
set -x

# Mirror stdout/stderr into boot.log so HF run-logs show progress (safe steps before .env is loaded).
exec > >(tee -a "$LOG_DIR/boot.log") 2>&1

# ── Memory mode (must be set BEFORE any reference; we run under `set -u`) ───────
# CPU-Basic Space = 16 GB cap. With LOW_MEM=1 we skip the heavy harvest
# launchers (dataset-enrich, dataset-mirror, kaggle-trainer, lightning-trainer,
# dedup-bootstrap) — those run on GCP daemons instead. Set LOW_MEM=0 only
# on a paid Space tier (cpu-upgrade, ≥32 GB).
LOW_MEM="${LOW_MEM:-1}"

# ── 1. Persistent data — symlink state subdirs to /data (HF persistent mount) ──
# bin/ is NOT persisted (baked into the image, refreshed on every push).
# Persisted: state (DBs), logs, memory, skills, sessions, training pairs,
# workspace (hermes runtime), projects (axentx clones), ollama (model cache).
DATA="/data"
if [[ -d "$DATA" ]] && [[ -w "$DATA" ]]; then
  mkdir -p "$DATA"/{state,logs,memory,skills,sessions,workspace,projects,ollama,training,reflexion,index}

  # Migrate from any older layout (one-time): if /data/surrogate/state exists, move it up one level.
  if [[ -d "$DATA/surrogate/state" ]] && [[ ! -L "$DATA/state" ]]; then
    mv "$DATA/surrogate"/* "$DATA/" 2>/dev/null || true
    rmdir "$DATA/surrogate" 2>/dev/null || true
  fi

  # Each spec is "symlink_path:backing_dir" — the symlink lives under $HOME,
  # the backing directory on the persistent /data mount.
  for spec in \
    "${HOME}/.surrogate/state:${DATA}/state" \
    "${HOME}/.surrogate/logs:${DATA}/logs" \
    "${HOME}/.surrogate/memory:${DATA}/memory" \
    "${HOME}/.surrogate/skills:${DATA}/skills" \
    "${HOME}/.surrogate/sessions:${DATA}/sessions" \
    "${HOME}/.hermes/workspace:${DATA}/workspace" \
    "${HOME}/.ollama:${DATA}/ollama"; do
    link_path="${spec%%:*}"
    backing="${spec##*:}"
    mkdir -p "$(dirname "$link_path")"
    # Always ensure the backing directory exists and is writable. If the
    # persistent /data mount becomes unavailable mid-run, daemon writes to the
    # symlinked path fail with Errno 5 I/O errors (audit 2026-04-29).
    # Recreating the link defensively on every boot fixes stale-symlink cases.
    mkdir -p "$backing" 2>/dev/null || true
    if [[ ! -L "$link_path" ]] || [[ ! -d "$link_path" ]]; then
      # Either not a symlink, or a broken symlink (backing dir unreachable).
      rm -rf "$link_path" 2>/dev/null
      ln -sfn "$backing" "$link_path"
    fi
    # Final sanity probe — write a marker file; if that fails, the persistent
    # mount is broken regardless of the symlink, so log loudly.
    if ! touch "$link_path/.boot-marker" 2>/dev/null; then
      echo "[$(date +%H:%M:%S)] ⚠ FATAL: $link_path/ not writable — daemon log writes will hit Errno 5"
    fi
  done

  # training-pairs.jsonl — single-file persistence.
  if [[ ! -L "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
    rm -f "${HOME}/.surrogate/training-pairs.jsonl" 2>/dev/null
    touch "${DATA}/training-pairs.jsonl"
    ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
  fi
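  # Quick manual probe of the persistence wiring (comments only — run these by
  # hand in a Space shell if /data trouble recurs; not executed at boot):
  #   ls -ld ~/.surrogate/state           # expect: ... state -> /data/state
  #   touch /data/state/.probe && rm /data/state/.probe   # backing dir writable?
  #   readlink ~/.surrogate/training-pairs.jsonl          # expect: /data/training-pairs.jsonl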
  # ── One-time offset reset: skip the polluted agentic-crawler placeholder backlog ──
  if [[ ! -f "${HOME}/.surrogate/.offset-reset-done" ]] && [[ -f "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
    CUR=$(wc -l < "${HOME}/.surrogate/training-pairs.jsonl" | tr -d ' ')
    echo "$CUR" > "${HOME}/.surrogate/.training-push-offset"
    echo "$CUR" > "${HOME}/.surrogate/.self-ingest-offset"
    touch "${HOME}/.surrogate/.offset-reset-done"
    echo "[$(date +%H:%M:%S)] one-time offset reset → $CUR (skip placeholder backlog)" >> "$LOG_DIR/boot.log"
  fi

  # ── Boot-time dedup.db corruption check ──────────────────────────────
  # 16 parallel shards previously corrupted the SQLite WAL. If the DB is
  # unreadable on boot, back it up and force a re-bootstrap from scratch.
  DEDUP_DB="${HOME}/.surrogate/state/dedup.db"
  if [[ -f "$DEDUP_DB" ]]; then
    if ! sqlite3 "$DEDUP_DB" "SELECT 1 FROM seen_hashes LIMIT 1" >/dev/null 2>&1; then
      TS=$(date +%s)
      mv "$DEDUP_DB" "${DEDUP_DB}.corrupt-${TS}.bak" 2>/dev/null
      rm -f "${DEDUP_DB}-wal" "${DEDUP_DB}-shm"
      rm -f "${HOME}/.surrogate/.dedup-bootstrap-done"
      echo "[$(date +%H:%M:%S)] WIPED corrupt dedup.db → ${DEDUP_DB}.corrupt-${TS}.bak (forcing re-bootstrap)" >> "$LOG_DIR/boot.log"
    fi
  fi

  # ── Heavy harvest launchers — only on HIGH_MEM (LOW_MEM=0) ───────────
  # On CPU-Basic (16 GB cap), launching 5 background bash jobs + uvicorn +
  # 5 harvest workers blew through the cap and HF auto-killed the container
  # ~5 min after boot. These launchers are now scheduled on GCP via
  # hermes-scheduler-daemon (entries in data/hermes-jobs.json), so harvest
  # still runs — just not inside the Space's RAM. Re-enable in-Space by
  # setting LOW_MEM=0 once we upgrade to a ≥32 GB tier.
  if [[ "$LOW_MEM" != "1" ]]; then
    # ── One-time central dedup bootstrap from existing data ──────────
    if [[ ! -f "${HOME}/.surrogate/.dedup-bootstrap-done" ]]; then
      echo "[$(date +%H:%M:%S)] running central dedup bootstrap (one-time)" >> "$LOG_DIR/boot.log"
      nohup bash "${HOME}/.surrogate/bin/dedup-bootstrap.sh" > "$LOG_DIR/dedup-bootstrap.log" 2>&1 &
    fi

    # ── BOOT-TIME enrich kickoff (trigger an immediate pull; don't wait for cron)
    nohup bash "${HOME}/.surrogate/bin/dataset-enrich.sh" >> "$LOG_DIR/dataset-enrich.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time dataset-enrich kicked off" >> "$LOG_DIR/boot.log"

    # ── BOOT-TIME kaggle-trainer kickoff (don't wait for the 90-min cron) ─
    nohup bash "${HOME}/.surrogate/bin/kaggle-trainer.sh" >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time kaggle-trainer kicked off" >> "$LOG_DIR/boot.log"

    # ── BOOT-TIME lightning-trainer kickoff — 4 free H200 hours for the big model
    nohup bash "${HOME}/.surrogate/bin/lightning-trainer.sh" >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time lightning-trainer kicked off (H200 4hr quota)" >> "$LOG_DIR/boot.log"

    # ── BOOT-TIME dataset-mirror — bulk-clone top community SFT mixes ─
    nohup bash "${HOME}/.surrogate/bin/dataset-mirror.sh" >> "$LOG_DIR/dataset-mirror.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] boot-time dataset-mirror kicked off (30 community sources)" >> "$LOG_DIR/boot.log"
  else
    echo "[$(date +%H:%M:%S)] LOW_MEM=1 → skipped 5 heavy harvest launchers (delegated to GCP daemons)" >> "$LOG_DIR/boot.log"
  fi

  echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] WARN: /data not writable — running ephemeral!" >> "$LOG_DIR/boot.log"
fi
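# If the dedup.db corruption check above keeps firing, a stricter probe than
# the one-row SELECT is SQLite's own page walk (comments only, not executed at
# boot; quick_check skips index-order checks, so it is much faster than
# a full integrity_check):
#   sqlite3 "$DEDUP_DB" "PRAGMA quick_check;" | grep -qx ok || echo "dedup.db corrupt"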
# ── 2. Bind HF Space secrets → ~/.hermes/.env ───────────────────────────────
# 🔒 DISABLE the shell trace before touching secret values.
set +x
echo "[$(date +%H:%M:%S)] writing ~/.hermes/.env from secret env vars (trace OFF)"
mkdir -p ~/.hermes
{
  echo "# Auto-generated from HF Space secrets at boot"
  for k in OPENROUTER_API_KEY GEMINI_API_KEY GEMINI_API_KEY_2 \
           GITHUB_TOKEN GITHUB_TOKEN_POOL DISCORD_BOT_TOKEN DISCORD_WEBHOOK \
           CEREBRAS_API_KEY GROQ_API_KEY SAMBANOVA_API_KEY \
           CLOUDFLARE_API_KEY NVIDIA_API_KEY CHUTES_API_KEY ANTHROPIC_API_KEY \
           HF_TOKEN HUGGING_FACE_HUB_TOKEN; do
    v="${!k:-}"                      # indirect expansion: value of the var named by $k
    [[ -n "$v" ]] && echo "${k}=${v}"
  done
} > ~/.hermes/.env
chmod 600 ~/.hermes/.env
# grep -c '=' counts only KEY=VALUE lines, excluding the header comment.
echo "[$(date +%H:%M:%S)] .env written ($(grep -c '=' ~/.hermes/.env) keys, perms 600)"
# Trace stays OFF for the rest of boot — the traced line numbers above already
# bracket any early hang, and post-secrets tracing would leak key values.

# ── LOW_MEM short-circuit — skip ALL background daemons, exec the status server ──
# The CPU-Basic Space cap is 16 GB. Even after gating the 5 boot-time harvest
# launchers, the Space kept hitting the 16 GB cap and hanging at the HTTP
# layer every ~30-40 min. Investigation found 15+ MORE nohup'd background
# daemons below this point (scrape, agentic-crawler, github-crawler,
# self-heal, cron loop, bulk-mirror workers, streaming-mirror workers,
# parquet-ingest, etc.) that collectively grow into the cap within an hour.
#
# In LOW_MEM=1 mode the Space's only job is the FastAPI status server on
# :7860 that serves harvest-cursor advances to remote workers. Everything
# else (harvest, mirroring, agent pipeline, training pushes, dataset enrich)
# now runs on the GCP daemon fleet — see hermes-jobs.json (171 jobs scheduled
# via hermes-scheduler-daemon as of 2026-05-02).
#
# Set LOW_MEM=0 to re-enable in-Space launchers on a paid tier (≥32 GB).
if [[ "$LOW_MEM" == "1" ]]; then
  echo "[$(date +%H:%M:%S)] LOW_MEM=1 → skipping all bg daemons + cron, going straight to :7860 status server" | tee -a "$LOG_DIR/boot.log"
  # Verify deps before exec — print what's missing rather than crashing silently.
  if python3 -c "import fastapi, uvicorn" 2>/dev/null; then
    echo "[$(date +%H:%M:%S)] starting uvicorn :7860 (LOW_MEM fast-path)" | tee -a "$LOG_DIR/boot.log"
    exec python3 ~/.surrogate/bin/hermes-status-server.py
  else
    echo "❌ fastapi/uvicorn not importable — falling back to plain http.server"
    exec python3 -m http.server 7860 --bind 0.0.0.0
  fi
fi

# ── 3. Git config + clone axentx repos for auto-orchestrate auto-commit ────
# Disable interactive prompts globally so failed-auth git ops fail fast.
export GIT_TERMINAL_PROMPT=0
export GIT_ASKPASS=/bin/true
GH_TOKEN=$(echo "${GITHUB_TOKEN_POOL:-}" | cut -d',' -f1)
if [[ -n "$GH_TOKEN" ]]; then
  git config --global user.email "hermes@axentx.ai"
  git config --global user.name "Hermes (Surrogate-1)"
  git config --global init.defaultBranch main
  git config --global pull.rebase true
  git config --global push.default current

  PROJECTS_DIR="${DATA}/projects"
  mkdir -p "$PROJECTS_DIR"
  rm -rf ~/axentx 2>/dev/null
  ln -sfn "$PROJECTS_DIR" ~/axentx
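  # GH_TOKEN above pins the FIRST entry of the comma-separated pool. An
  # illustrative rotation variant if one token keeps rate-limiting
  # (hypothetical — comments only, not enabled here):
  #   IFS=',' read -ra POOL <<< "${GITHUB_TOKEN_POOL:-}"
  #   (( ${#POOL[@]} )) && GH_TOKEN="${POOL[$(( 10#$(date +%j) % ${#POOL[@]} ))]}"   # rotate daily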
  # Clone axentx repos in the background with a hard timeout — never block boot.
  # Verified 2026-05-02 via gh api: 5 of 6 entries had the wrong org/name and
  # were silently 404'ing (arkashira/* — only surrogate-1-harvest lives there;
  # the rest are private under the axentx org). The agent pipeline's dev/qa/
  # reviewer/commit daemons sat idle for a full day because no repo had been
  # cloned for them to work on. Real paths confirmed via gh api repos/<org>/<name>:
  #   axentx/{Costinel,vanguard,airship,workio,axiomops,surrogate-1} → 200
  #   arkashira/{Costinel,vanguard,arkship,surrogate,workio,hermes-toolbelt} → 404
  # Note: 'arkship' was a typo for 'airship' (axentx/airship).
  for repo_spec in \
    "Costinel:axentx/Costinel" \
    "vanguard:axentx/vanguard" \
    "airship:axentx/airship" \
    "workio:axentx/workio" \
    "axiomops:axentx/axiomops" \
    "surrogate-1:axentx/surrogate-1"; do
    local_name="${repo_spec%%:*}"
    gh_path="${repo_spec##*:}"
    target="${PROJECTS_DIR}/${local_name}"
    (
      if [[ ! -d "$target/.git" ]]; then
        echo "[$(date +%H:%M:%S)] cloning $gh_path..." >> "$LOG_DIR/boot.log"
        timeout 30 git clone --depth 50 \
          "https://x-access-token:${GH_TOKEN}@github.com/${gh_path}.git" "$target" \
          >> "$LOG_DIR/git-clone.log" 2>&1 || \
          echo "[$(date +%H:%M:%S)] WARN: clone $gh_path failed/timed out" >> "$LOG_DIR/boot.log"
      else
        cd "$target" && timeout 20 git pull --rebase >> "$LOG_DIR/git-pull.log" 2>&1 || true
      fi
    ) &
  done
  # Don't wait — clones finish in the background while boot continues.

  # Persist the token for any push from auto-orchestrate.
  git config --global credential.helper "store --file=$HOME/.git-credentials"
  echo "https://x-access-token:${GH_TOKEN}@github.com" > ~/.git-credentials
  chmod 600 ~/.git-credentials
  echo "[$(date +%H:%M:%S)] git auth configured + clone jobs spawned" >> "$LOG_DIR/boot.log"
fi

# ── 4. Redis (TCP only) ─────────────────────────────────────────────────────
# Redis cap tightened on LOW_MEM (was 1gb → 256mb). The coordinator uses
# SQLite directly; redis is only a soft cache for work-queue priorities.
REDIS_MAX="${REDIS_MAX:-$([[ "$LOW_MEM" == "1" ]] && echo "256mb" || echo "1gb")}"
redis-server --daemonize yes --port 6379 --bind 127.0.0.1 \
  --maxmemory "$REDIS_MAX" --maxmemory-policy allkeys-lru
sleep 1
redis-cli -h 127.0.0.1 -p 6379 ping >> "$LOG_DIR/redis.log" 2>&1

# ── 5. Ollama — DISABLED on cpu-basic (16 GB limit) ───────────────────────
# Root cause of the 7-hr Runtime Error on 2026-04-29: ollama loading
# qwen3-coder:30b (~17 GB Q4) + qwen2.5-coder:14b (~9 GB) + granite (~5 GB)
# = ~31 GB of model weights against a 16 GB cap → instant OOM on any inference.
#
# On cpu-basic the FREE LLM LADDER (cerebras/groq/openrouter/gemini/chutes)
# is faster anyway — wafer-scale inference beats CPU x86 by 50-200×.
# Ollama is only worth running once the Space upgrades to ≥cpu-upgrade (32 GB)
# OR moves to the OCI A1.Flex anchor (24 GB ARM, native ollama support).
#
# Set LOW_MEM=0 to re-enable on a bigger Space tier.
if [[ "$LOW_MEM" == "1" ]]; then
  echo "[$(date +%H:%M:%S)] ⚠ ollama SKIPPED (LOW_MEM=1, cpu-basic 16 GB)" \
    >> "$LOG_DIR/boot.log"
  echo "[$(date +%H:%M:%S)] → free LLM ladder serves all v2 inference" \
    >> "$LOG_DIR/boot.log"
else
  OLLAMA_MODELS="${HOME}/.ollama/models" \
  OLLAMA_HOST=127.0.0.1:11434 \
    nohup ollama serve > "$LOG_DIR/ollama.log" 2>&1 &
  sleep 6
  (
    if ! ollama list 2>/dev/null | grep -q "nomic-embed-text"; then
      ollama pull nomic-embed-text > "$LOG_DIR/ollama-pull-embed.log" 2>&1
    fi
    if ! ollama list 2>/dev/null | grep -q "qwen2.5-coder:3b"; then
      # Smallest coder that's actually useful — fits any tier.
      ollama pull qwen2.5-coder:3b > "$LOG_DIR/ollama-pull-3b.log" 2>&1
    fi
  ) &
fi
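# The ollama branch above is all-or-nothing on LOW_MEM. A finer-grained guard,
# if this ever re-enables on an unknown tier, could gate on MemTotal
# (illustrative sketch — comments only, not executed):
#   mem_gb=$(( $(awk '/MemTotal/ {print $2}' /proc/meminfo) / 1024 / 1024 ))
#   (( mem_gb >= 32 )) || echo "tier too small (${mem_gb} GB) for 30b-class pulls"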
# ── 6. Discord bot (only if egress to discord.com is reachable) ────────────
# The HF Spaces free tier may block egress to discord.com — the bot would
# crash-loop. Pre-flight check: if discord.com is unreachable, skip the bot
# and fall back to webhook-only notifications.
if [[ -n "${DISCORD_BOT_TOKEN:-}" ]]; then
  if curl -sS -o /dev/null -w "%{http_code}" --max-time 6 https://discord.com 2>/dev/null \
      | grep -qE "^(200|301|302|307|308)$"; then
    set -a; source ~/.hermes/.env 2>/dev/null; set +a
    nohup python ~/.surrogate/bin/hermes-discord-bot.py >> "$LOG_DIR/discord-bot.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] discord bot started (gateway reachable)"
  else
    echo "[$(date +%H:%M:%S)] discord.com unreachable — skipping bot, using webhook-only" >> "$LOG_DIR/boot.log"
  fi
fi

# ── 7a. Continuous scrape daemon — concurrency tuned to LOW_MEM ────────────
SCRAPE_PARALLEL="${SCRAPE_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 8)}"
# Unquoted delimiter: ${SCRAPE_PARALLEL} expands at write time; the \$-escaped
# variables expand when the daemon runs.
cat > /tmp/scrape-daemon.sh <<SCRAPESH
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="\${HOME}/.surrogate/logs/scrape-continuous.log"
mkdir -p "\$(dirname "\$LOG")"
while true; do
  START=\$(date +%s)
  bash ~/.surrogate/bin/domain-scrape-loop.sh 1500 ${SCRAPE_PARALLEL} >> "\$LOG" 2>&1
  # Adaptive back-off: a quick exit means an empty queue — sleep longer.
  DUR=\$(( \$(date +%s) - START ))
  if [[ \$DUR -lt 30 ]]; then
    sleep 30
  elif [[ \$DUR -lt 120 ]]; then
    sleep 15
  else
    sleep 5
  fi
done
SCRAPESH
chmod +x /tmp/scrape-daemon.sh
nohup /tmp/scrape-daemon.sh > "$LOG_DIR/scrape-daemon.log" 2>&1 &
echo "[$(date +%H:%M:%S)] scrape daemon parallel=${SCRAPE_PARALLEL} (LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"

# ── 7b. Agentic crawler — DISABLED on LOW_MEM (the anchor takes this load) ─────
if [[ "$LOW_MEM" != "1" ]]; then
  CRAWLER_PARALLEL="${CRAWLER_PARALLEL:-6}"
  nohup bash ~/.surrogate/bin/agentic-crawler.sh "$CRAWLER_PARALLEL" \
    > "$LOG_DIR/agentic-crawler.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] agentic crawler parallel=$CRAWLER_PARALLEL" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ agentic-crawler SKIPPED (LOW_MEM); anchor handles it" >> "$LOG_DIR/boot.log"
fi

# ── 7b2. GitHub-specific agentic crawler (lightweight, always on) ──────────
nohup bash ~/.surrogate/bin/github-agentic-crawler.sh > "$LOG_DIR/github-agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] github-agentic-crawler started" >> "$LOG_DIR/boot.log"

# ── 7b3. HF Dataset Discoverer — DISABLED (replaced by continuous-discoverer) ─
# Round 10 (a27499d): bin/v2/continuous-discoverer.sh covers HF + arxiv +
# Stack Exchange + GH trending in one daemon. The old hf-dataset-discoverer.sh
# is now redundant and pure memory pressure on cpu-basic.
echo "[$(date +%H:%M:%S)] ⚠ hf-dataset-discoverer SKIPPED (replaced by continuous-discoverer)" >> "$LOG_DIR/boot.log"

# ── 7e. auto-orchestrate-continuous — DISABLED on LOW_MEM (cron handles it) ─
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/auto-orchestrate-continuous.sh > "$LOG_DIR/auto-orchestrate-continuous.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] auto-orchestrate-continuous started (4 parallel workers)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ auto-orchestrate-continuous SKIPPED (LOW_MEM); cron slot at M%20==0 covers it" >> "$LOG_DIR/boot.log"
fi

# ── 7e1. SELF-HEAL WATCHDOG — must start BEFORE the memory-hungry workers ───────
# Monitors RAM usage every 60 s; preempts the youngest dataset-enrich shard if
# usage ≥85%, to dodge the cpu-basic 16Gi OOM kill that would otherwise crash
# the whole container. Also restarts stuck ingest / kicks a stale uploader.
nohup bash ~/.surrogate/bin/self-heal-watchdog.sh > "$LOG_DIR/self-heal-watchdog.log" 2>&1 &
echo "[$(date +%H:%M:%S)] self-heal-watchdog started (mem<85%, ingest<20m, push<10m)" >> "$LOG_DIR/boot.log"
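# Minimal sketch of the ≥85% preemption the watchdog performs (assumed shape —
# the real logic lives in self-heal-watchdog.sh; comments only, not executed):
#   used_pct=$(free | awk '/^Mem:/ {printf "%d", $3*100/$2}')
#   (( used_pct >= 85 )) && pkill -n -f dataset-enrich   # -n = newest matching process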
# ── 7e2. GH-ACTIONS TICKER — burst-dispatch external runners every 60 s ──────
# Fires workflow_dispatch on the arkashira/ashiradevops-alt runner repos every
# 60 s, bypassing GitHub's */5 cron minimum. Combined with 8-min runner
# timeouts, the 20-concurrent free-tier slot cap stays saturated.
# Skips silently if GH_TOKEN_ARKASHIRA / GH_TOKEN_DEVOPS aren't set as
# Space secrets — the operator can add them later without a restart.
nohup bash ~/.surrogate/bin/gh-actions-ticker.sh > "$LOG_DIR/gh-actions-ticker.log" 2>&1 &
echo "[$(date +%H:%M:%S)] gh-actions-ticker started (60s tick, dispatches arkashira+ashiradevops-alt)" >> "$LOG_DIR/boot.log"

# ── 7e3. LLM BURST GENERATOR — synthetic training pairs from 8 free LLMs ────
# Cerebras + Groq + OpenRouter + Gemini + Chutes + NV NIM + Samba + Kimi.
# Each cycle fires 3 prompts at every active provider in parallel and writes
# {prompt, response} pairs to training-pairs.jsonl. Combined free-tier
# budget: ~7000+ pairs/day. Skips any provider whose key env var is unset.
if [[ "$LOW_MEM" != "1" ]]; then
  nohup python3 ~/.surrogate/bin/llm-burst-generator.py > "$LOG_DIR/llm-burst-generator.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] llm-burst-generator started (8 LLM APIs in parallel, ~7K synthetic pairs/day)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ llm-burst-generator SKIPPED (LOW_MEM); ZeroGPU synth-puller covers it" >> "$LOG_DIR/boot.log"
fi

sleep 3  # Stagger spawns — avoid a memory burst at boot.

# ── 7f. PARALLEL BULK INGEST (slug-hash sharded; 6 shards on cpu-basic) ─────
# Was 16 shards, which caused 'Memory limit exceeded (16Gi)' OOM. Each shard
# peaks at ~1 GB while streaming via the 'datasets' lib. The watchdog above
# provides a second safety net if peaks still spike.
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/bulk-ingest-parallel.sh > "$LOG_DIR/bulk-ingest-parallel.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] bulk-ingest-parallel started (6 shards, 293M total cap)" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ bulk-ingest-parallel SKIPPED (LOW_MEM); streaming-mirror-worker covers it" >> "$LOG_DIR/boot.log"
fi
sleep 3

# ── 7g. PARQUET-DIRECT INGEST (skips 'datasets' library overhead; 5-10× faster) ──
# Downloads parquet shards directly via the HF datasets-server API + a pyarrow
# filter. Targets only trillion-scale corpora where streaming is too slow.
# Parallel DLs reduced to 2 — combined with the 6 ingest shards this stays under 16Gi.
PARQUET_PARALLEL=2 nohup bash ~/.surrogate/bin/parquet-direct-ingest.sh > "$LOG_DIR/parquet-direct-ingest.log" 2>&1 &
echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$LOG_DIR/boot.log"

# ── 7c. Skill-synthesis daemon — DISABLED on LOW_MEM (heavy LLM calls) ────
if [[ "$LOW_MEM" != "1" ]]; then
  nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
else
  echo "[$(date +%H:%M:%S)] ⚠ skill-synthesis SKIPPED (LOW_MEM); anchor's voyager-skills.py covers it" >> "$LOG_DIR/boot.log"
fi
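# Sketch of the datasets-server call pattern that 7g above relies on (assumed
# response shape of the public HF endpoint; the real logic lives in
# parquet-direct-ingest.sh — comments only, not executed):
#   curl -s "https://datasets-server.huggingface.co/parquet?dataset=<slug>" \
#     | jq -r '.parquet_files[].url' | head -2    # first two shard URLs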
# ── 7d. Bulk mirror coordinator + parallel workers ──────────────────────────
# User feedback 2026-04-29 (translated from Thai): "all agents work together,
# and none revisits the same places."
# Coordinator = SQLite claim queue (~/.surrogate/state/bulk-mirror-claims.db).
# Each worker pulls the next pending dataset, mirrors+sanitizes+dedups it, and
# marks it done. 100+ massive datasets are listed in
# bin/v2/bulk-datasets-massive.txt (code/security/SDLC/agent/etc).
# Lease-based claims (15 min) — crashed workers auto-expire, so others pick up.
python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mirror-seed.log" 2>&1 || true

# Two worker types share the same coordinator queue:
#   bulk-mirror-worker.sh      — full download; suits small/medium datasets
#   streaming-mirror-worker.sh — HF datasets streaming; suits trillion-token corpora
#
# LOW_MEM tuning history for the cpu-basic 16 GB Space:
#   v1:     0 bulk + 2 stream  (Rounds 9-10 — tightened after OOM)
#   v2:     0 bulk + 1 stream  (Rounds 11-12 — tightened further after OOM)
#   v3 NOW: 1 bulk + 3 stream  (post Civo-pivot + 4-Space fan-out; the anchor
#           never came up so we can't rely on it for bulk, and the v2 setting
#           left ~8 GB of the 16 GB unused → reclaim it)
#
# Memory budget per Space (16 GB cpu-basic):
#   ~6 GB reserved:  OS + redis 256mb + continuous-discoverer +
#                    dataset-enrich + auto-startup-loop + push bursts
#   ~10 GB available for harvest workers
#   3 stream × 500 MB + 1 bulk × 600 MB = 2.1 GB used
#   ~8 GB headroom → memory-guard.sh kicks in at <3 GB free; safe
#
# Throughput delta: 4 workers/Space × 4 Spaces = 16 workers total (vs the
# previous 1 × 4 = 4 — a 4× increase). Combined with the enrich cron at
# M%30==5 (was M%60), expect a 3-5× commit rate before the HF soft-cap kicks in.
BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 3 || echo 6)}"
for i in $(seq 1 "$BULK_WORKERS"); do
  nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
    > "$LOG_DIR/bulk-worker-$i.log" 2>&1 &
done
for i in $(seq 1 "$STREAM_WORKERS"); do
  nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-w$i" \
    > "$LOG_DIR/stream-worker-$i.log" 2>&1 &
done
TOTAL_WORKERS=$((BULK_WORKERS + STREAM_WORKERS))
echo "[$(date +%H:%M:%S)] bulk-mirror coordinator + $BULK_WORKERS bulk + $STREAM_WORKERS streaming = $TOTAL_WORKERS workers (200+ datasets queued, LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"

# ── 7d2. Continuous multi-source dataset discoverer (boot daemon, never exits) ─
# Replaces the aggressive-harvester cron — always running, sweeps HF + arxiv + SE + GH.
if ! pgrep -f "continuous-discoverer.sh" >/dev/null; then
  nohup bash ~/.surrogate/bin/v2/continuous-discoverer.sh \
    > "$LOG_DIR/continuous-discoverer.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] continuous-discoverer started (HF + arxiv + SE + GH, ~5min cycle)" >> "$LOG_DIR/boot.log"
fi

# ── Auto-startup-loop: 45 personae × 9 LoRA clusters × auto-commit + auto-push ─
# CEO/CTO/CMO/CFO/COO/PM/UX/Designer/SRE/DevOps/Marketing/SDR/AE/Growth/CS/Legal/HR/etc.
# One role per 15-min cycle; chained roles fire downstream automatically.
if ! pgrep -f "auto-startup-loop.sh" >/dev/null; then
  nohup bash ~/.surrogate/bin/v2/auto-startup-loop.sh \
    > "$LOG_DIR/auto-startup-loop.log" 2>&1 &
  echo "[$(date +%H:%M:%S)] auto-startup-loop started (45 personae cycle 15min, chains, auto-commit)" >> "$LOG_DIR/boot.log"
fi

# ── Train-ready pusher — disabled at boot for now. It caused a Space
# RUNTIME_ERROR on first deployment (2026-04-29). The script is kept at
# bin/train-ready-pusher.sh; launch it manually once the Space proves stable:
#   nohup bash ~/.surrogate/bin/train-ready-pusher.sh > /tmp/trp.log 2>&1 &
#   nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &
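# The two daemons above use a pgrep-based singleton guard. Illustrative
# general form (hypothetical helper, comments only — not defined in this script):
#   start_once() {   # $1 = daemon script path
#     pgrep -f "$(basename "$1")" >/dev/null && return 0
#     nohup bash "$1" > "$LOG_DIR/$(basename "$1" .sh).log" 2>&1 &
#   }
#   start_once ~/.surrogate/bin/v2/continuous-discoverer.sh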
# ── Cron loop — non-scrape daemons (scrape now runs continuously above) ─────
# Quoted 'CRONSH' delimiter: nothing expands at write time; the script below
# is written verbatim and expands its own variables at runtime.
cat > /tmp/hermes-cron.sh <<'CRONSH'
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
# The heredoc is quoted, so the parent's LOG_DIR/DATA never expanded here —
# define them locally (mirroring the boot script's values).
LOG_DIR="${HOME}/.surrogate/logs"
DATA="/data"
LOG="${HOME}/.surrogate/logs/cron.log"
mkdir -p "$(dirname "$LOG")"
while true; do
  M=$(($(date +%s) / 60))
  # Cron offsets are STAGGERED — a minute=0 burst was the OOM trigger.
  # Each major task picks its own M%X==N slot so the heavy ones never all
  # fire in the same minute.
  [[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
  [[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &

  # Auto-scaler — spawn/kill workers based on the free-memory tier (burst but don't die).
  [[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &

  # synth-puller — hit the surrogate1 ZeroGPU /api/synth_batch every 5 min.
  # Drains the free PRO 25K min/mo into Magpie-style training pairs (16 domains rotate).
  [[ $((M % 5)) -eq 3 ]] && bash ~/.surrogate/bin/v2/synth-puller.sh >> "$LOG" 2>&1 &

  # push-training-to-hf is gated by memory (it loads a big shard into RAM).
  # The anchor (24 GB) takes over when capacity arrives — see the anchor cron-loop.
  [[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &

  # auto-orchestrate now runs CONTINUOUSLY (4 parallel workers) — see step 7e.
  # Cron entry retained as a legacy single-fire boost: fire only in the M%20
  # slot, and only when the continuous daemon is not already running (the
  # negated pgrep guard keeps the fallback scoped to this slot).
  [[ $((M % 20)) -eq 0 ]] && ! pgrep -f "auto-orchestrate-continuous" >/dev/null \
    && bash ~/.surrogate/bin/auto-orchestrate-loop.sh >> "$LOG" 2>&1 &

  # Every 30 min: research-apply (pop queue → orchestrate → ship feature).
  [[ $((M % 30)) -eq 15 ]] && bash ~/.surrogate/bin/surrogate-research-apply.sh >> "$LOG" 2>&1 &

  # Every 60 min: keyword tuner (adapts the scrape queue based on yields).
  [[ $((M % 60)) -eq 4 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &

  # Every 6 hours: research-loop (discover new features from competitors/papers).
  [[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/surrogate-research-loop.sh >> "$LOG" 2>&1 &

  # Every 30 min: dataset enrich (was 60 min — doubled now that
  # 4 Spaces × (3 stream + 1 bulk) = 16 workers harvest in parallel,
  # producing more chunks per hour than the old 60-min push could drain).
  # Memory-guarded — a full HF Hub iteration is heavy.
  [[ $((M % 30)) -eq 5 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/dataset-enrich.sh >> "$LOG" 2>&1 &

  # Every 15 min: self-ingest training-pairs into the FTS index (closes the self-improvement loop).
  [[ $((M % 15)) -eq 3 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &

  # Every 30 min: build the vector-embeddings index (RAG semantic search).
  [[ $((M % 30)) -eq 12 ]] && bash ~/.surrogate/bin/rag-vector-builder.sh >> "$LOG" 2>&1 &

  # Every 30 min: synthetic data generation (REWORK→APPROVE DPO + distilabel rewrite).
  [[ $((M % 30)) -eq 7 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/synthetic-data-from-rework.sh >> "$LOG" 2>&1 &

  # Daily 04:00 UTC: refresh the CVE feed (NVD + CISA KEV) → security-knowledge dataset.
  [[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &

  # Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems).
  [[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
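  # Worked example of the slot arithmetic (comments only): M counts minutes
  # since the Unix epoch, which began at 00:00 UTC, so M % 1440 is the UTC
  # minute-of-day — M % 1440 == 240 fires exactly once per day, at 04:00 UTC.
  # Two slots M%a==r1 and M%b==r2 can coincide iff r1 ≡ r2 (mod gcd(a,b)):
  # e.g. M%30==5 vs M%60==35 share gcd 30 and 35 ≡ 5 (mod 30), so that pair
  # DOES overlap once per hour — both are memory-guarded, which absorbs it.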
  # Daily 06:00 UTC: LLM-expand role keywords (sends each role's skills to
  # Cerebras/Groq → ~80 extra job-description-style search terms per role).
  # The discoverer auto-uses the expanded list on its next cycle.
  [[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &

  # Every 90 min: kick a Kaggle T4 LoRA training run on the latest dataset
  # slice. Free Kaggle quota = 30 hr/week per account; one full run = 4-6 hr,
  # so we DO want to keep submitting — Kaggle queues if one is already running
  # and auto-cancels older runs if 5+ are pending. The short interval keeps
  # the GPU pipeline saturated.
  [[ $((M % 90)) -eq 5 ]] && bash ~/.surrogate/bin/kaggle-trainer.sh >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &

  # Every 6 hr: Lightning AI H200 training run (free 4-hr H200 quota ≈ 13 runs/mo).
  # H200 141 GB VRAM fits Qwen3-Coder-480B-A35B QLoRA — the biggest free training.
  [[ $((M % 360)) -eq 45 ]] && bash ~/.surrogate/bin/lightning-trainer.sh >> "$LOG_DIR/lightning-trainer.log" 2>&1 &

  # ── Round 5 (2026-04) sustainability loops ──────────────────────────
  # Every 6 hr (offset 90): self-improve loop — generate problems, judge them,
  # winners → training data, losers → reflexion-store.
  [[ $((M % 360)) -eq 90 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/v2/self-improve-loop.sh >> "$LOG_DIR/self-improve.log" 2>&1 &

  # Every 30 min (offset 22): mine new tool-call traces from logs into
  # SFT + DPO data, plus voyager skill candidates.
  [[ $((M % 30)) -eq 22 ]] && python3 ~/.surrogate/bin/v2/tool-trace-collector.py >> "$LOG_DIR/tool-trace.log" 2>&1 &

  # Every 60 min (offset 17): export promoted voyager skills to JSONL
  # (training-data slice + inference-time retrieval source).
  [[ $((M % 60)) -eq 17 ]] && python3 ~/.surrogate/bin/v2/voyager-skills.py export >> "$LOG_DIR/voyager.log" 2>&1 &

  # Daily 07:00 UTC: active-learning batch from one bulk-mirror file.
  # Skips silently if there is no pool yet.
  [[ $((M % 1440)) -eq 420 ]] && {
    POOL=$(ls -t "$DATA"/bulk-mirror/*.jsonl 2>/dev/null | head -1)
    [[ -n "$POOL" ]] && python3 ~/.surrogate/bin/v2/active-learning.py \
      --pool "$POOL" --n 200 --scan 1500 \
      >> "$LOG_DIR/active-learning.log" 2>&1 &
  }

  # Daily 08:00 UTC: constitutional self-critique on yesterday's
  # winners (pulls the latest self-improve winners file).
  [[ $((M % 1440)) -eq 480 ]] && {
    WIN=$(ls -t "$DATA"/v2/self-improve/winners-*.jsonl 2>/dev/null | head -1)
    [[ -n "$WIN" ]] && python3 ~/.surrogate/bin/v2/constitutional-loop.py \
      --input "$WIN" --n 200 \
      >> "$LOG_DIR/constitutional.log" 2>&1 &
  }

  # ── Round 7+8 (2026-04-30) — trillion-scale + harvester + enrich ──────
  # Every 30 min (offset 9): aggressive HF dataset discoverer (70-keyword sweep).
  [[ $((M % 30)) -eq 9 ]] && bash ~/.surrogate/bin/v2/aggressive-harvester.sh \
    >> "$LOG_DIR/aggressive-harvester.log" 2>&1 &

  # Every 60 min (offset 35): enrich newly-mirrored bulk files.
  [[ $((M % 60)) -eq 35 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
    && bash ~/.surrogate/bin/v2/enrich-pipeline.sh \
    >> "$LOG_DIR/enrich-pipeline.log" 2>&1 &
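  # Quota arithmetic for the 90-min Kaggle tick above (comments only): that
  # cadence is 10080/90 = 112 submissions/week, but the 30 GPU-hr/week quota
  # at 4-6 hr per run completes only ~5-7 runs/week — the frequent tick exists
  # to keep Kaggle's queue non-empty, not to actually run 112 jobs.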
  # Every 30 min (offset 25): spawn an extra streaming worker if the pool is empty.
  [[ $((M % 30)) -eq 25 ]] && {
    if ! pgrep -f "streaming-mirror-worker.sh" >/dev/null; then
      nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-cron-$(date +%s)" \
        > "$LOG_DIR/stream-worker-cron.log" 2>&1 &
    fi
  }

  # Daily 09:00 UTC: teachable-prompt filter on harvested data.
  [[ $((M % 1440)) -eq 540 ]] && {
    LATEST=$(ls -t "$DATA"/v2/enriched/*.jsonl 2>/dev/null | head -1)
    [[ -n "$LATEST" ]] && python3 ~/.surrogate/bin/v2/teachable-prompt-filter.py \
      --input "$LATEST" --out "$DATA"/v2/teachable-$(date +%Y%m%d).jsonl \
      --n 1000 --keep-target 200 \
      >> "$LOG_DIR/teachable.log" 2>&1 &
  }

  # Daily 11:00 UTC: regression test suite (catches breakage post-push).
  [[ $((M % 1440)) -eq 660 ]] && bash ~/.surrogate/bin/v2/regression-test.sh --quick \
    >> "$LOG_DIR/regression.log" 2>&1 &

  # Weekly, Thu 10:00 UTC: abstract-cot compresses reasoning data.
  # (The epoch began on a Thursday, so M % 10080 == 0 is Thursday 00:00 UTC
  # and offset 600 lands on Thursday 10:00 — not Sunday.)
  [[ $((M % 10080)) -eq 600 ]] && {
    for f in "$DATA"/v2/verify-traces.jsonl "$DATA"/v2/self-improve/winners-*.jsonl; do
      [[ -f "$f" ]] || continue
      python3 ~/.surrogate/bin/v2/abstract-cot-compressor.py \
        --input "$f" --out "${f%.jsonl}-compressed.jsonl" \
        >> "$LOG_DIR/abstract-cot.log" 2>&1
    done
  }

  sleep 60
done
CRONSH
chmod +x /tmp/hermes-cron.sh
nohup /tmp/hermes-cron.sh > "$LOG_DIR/cron-master.log" 2>&1 &
echo "[$(date +%H:%M:%S)] cron loop started" >> "$LOG_DIR/boot.log"

# ── 8. Status HTTP server on :7860 (FastAPI/uvicorn — robust binding) ──────
set +x  # silence trace for clean uvicorn logs
echo "[$(date +%H:%M:%S)] starting status server :7860" | tee -a "$LOG_DIR/boot.log"
# Verify deps before exec — print what's missing rather than crashing silently.
python3 -c "import fastapi, uvicorn; print(f' fastapi {fastapi.__version__} + uvicorn {uvicorn.__version__} ok')" || {
  echo "❌ fastapi/uvicorn not importable — falling back to plain http.server"
  exec python3 -m http.server 7860 --bind 0.0.0.0
}
# Run as PID 1 — exec replaces this shell, so uvicorn handles signals itself
# and auto-restarts on crash.
exec python3 ~/.surrogate/bin/hermes-status-server.py