#!/usr/bin/env bash
# Hermes start orchestrator for HF Space.
# Boots: persistent /data mount β†’ Redis β†’ Ollama β†’ axentx repos β†’ daemons β†’ status server.
set -uo pipefail

LOG_DIR="${HOME}/.surrogate/logs"
mkdir -p "$LOG_DIR"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start"
echo "[$(date +%H:%M:%S)] hermes-hf-space boot start" >> "$LOG_DIR/boot.log"

# Trace mode for the early steps only (no secrets loaded yet) — locate any hang point while staying safe
PS4='[trace ${LINENO}] '
set -x

# Echo stdout so HF run-logs see progress (safe steps before .env is loaded)
exec > >(tee -a "$LOG_DIR/boot.log") 2>&1

# ── Memory mode (must be set BEFORE any reference; we use `set -u`) ───────
# CPU-Basic Space = 16 GB cap. With LOW_MEM=1 we skip the heavy harvest
# launchers (dataset-enrich, dataset-mirror, kaggle-trainer, lightning-trainer,
# dedup-bootstrap) β€” those run on GCP daemons instead. Set LOW_MEM=0 only
# on a paid Space tier (cpu-upgrade β‰₯32 GB).
LOW_MEM="${LOW_MEM:-1}"

# ── 1. Persistent data β€” symlink state subdirs to /data (HF persistent mount) ──
# bin/ is NOT persisted (baked into image, refreshed on every push).
# Persisted: state (DBs), logs, memory, skills, sessions, training pairs,
#            workspace (hermes runtime), projects (axentx clones), ollama (model cache).
DATA="/data"
if [[ -d "$DATA" ]] && [[ -w "$DATA" ]]; then
    mkdir -p "$DATA"/{state,logs,memory,skills,sessions,workspace,projects,ollama,training,reflexion,index}
    # Migrate from any older layout (one-time): if /data/surrogate/state exists, move up one level
    if [[ -d "$DATA/surrogate/state" ]] && [[ ! -L "$DATA/state" ]]; then
        mv "$DATA/surrogate"/* "$DATA/" 2>/dev/null || true
        rmdir "$DATA/surrogate" 2>/dev/null || true
    fi

    for spec in \
        "${HOME}/.surrogate/state:${DATA}/state" \
        "${HOME}/.surrogate/logs:${DATA}/logs" \
        "${HOME}/.surrogate/memory:${DATA}/memory" \
        "${HOME}/.surrogate/skills:${DATA}/skills" \
        "${HOME}/.surrogate/sessions:${DATA}/sessions" \
        "${HOME}/.hermes/workspace:${DATA}/workspace" \
        "${HOME}/.ollama:${DATA}/ollama"; do
        target="${spec%%:*}"
        link="${spec##*:}"
        mkdir -p "$(dirname "$target")"
        # Always ensure backing directory exists + writable. If the persistent
        # /data mount becomes unavailable mid-run, daemon writes to symlinked
        # path fail with Errno 5 I/O error (audit 2026-04-29). Recreating the
        # link defensively each boot fixes stale-symlink cases.
        mkdir -p "$link" 2>/dev/null || true
        if [[ ! -L "$target" ]] || [[ ! -d "$target/" ]]; then
            # Either not-a-symlink OR broken symlink (target unreachable)
            rm -rf "$target" 2>/dev/null
            ln -sfn "$link" "$target"
        fi
        # Final sanity probe β€” write a marker; if it fails, the persistent
        # mount is broken regardless of the symlink, so log loudly.
        if ! touch "$target/.boot-marker" 2>/dev/null; then
            echo "[$(date +%H:%M:%S)] ⚠ FATAL: $target/ not writable β€” daemon log writes will Errno 5"
        fi
    done

    # training-pairs.jsonl β€” single file persistence
    if [[ ! -L "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
        rm -f "${HOME}/.surrogate/training-pairs.jsonl" 2>/dev/null
        touch "${DATA}/training-pairs.jsonl"
        ln -sfn "${DATA}/training-pairs.jsonl" "${HOME}/.surrogate/training-pairs.jsonl"
    fi
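
    # Spot-check the layout after boot (read-only, illustrative):
    #   stat -c '%N' ~/.surrogate/state     # prints '...' -> '/data/state'
    #   ls ~/.surrogate/state/.boot-marker  # probe file written by the loop above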

    # ── One-time offset reset: skip polluted agentic-crawler placeholder backlog ──
    if [[ ! -f "${HOME}/.surrogate/.offset-reset-done" ]] && [[ -f "${HOME}/.surrogate/training-pairs.jsonl" ]]; then
        CUR=$(wc -l < "${HOME}/.surrogate/training-pairs.jsonl" | tr -d ' ')
        echo "$CUR" > "${HOME}/.surrogate/.training-push-offset"
        echo "$CUR" > "${HOME}/.surrogate/.self-ingest-offset"
        touch "${HOME}/.surrogate/.offset-reset-done"
        echo "[$(date +%H:%M:%S)] one-time offset reset β†’ $CUR (skip placeholder backlog)" >> "$LOG_DIR/boot.log"
    fi
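
    # To force this reset to run again (operator action), remove the sentinel;
    # the offsets are then recomputed from the current line count on next boot:
    #   rm ~/.surrogate/.offset-reset-done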

    # ── Boot-time dedup.db corruption check ──────────────────────────────
    # 16 parallel shards previously corrupted the SQLite WAL. If the DB is
    # unreadable on boot, back it up and force re-bootstrap from scratch.
    DEDUP_DB="${HOME}/.surrogate/state/dedup.db"
    if [[ -f "$DEDUP_DB" ]]; then
        if ! sqlite3 "$DEDUP_DB" "SELECT 1 FROM seen_hashes LIMIT 1" >/dev/null 2>&1; then
            TS=$(date +%s)
            mv "$DEDUP_DB" "${DEDUP_DB}.corrupt-${TS}.bak" 2>/dev/null
            rm -f "${DEDUP_DB}-wal" "${DEDUP_DB}-shm"
            rm -f "${HOME}/.surrogate/.dedup-bootstrap-done"
            echo "[$(date +%H:%M:%S)] WIPED corrupt dedup.db β†’ ${DEDUP_DB}.corrupt-${TS}.bak (forcing re-bootstrap)" >> "$LOG_DIR/boot.log"
        fi
    fi
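
    # Deeper manual checks than the SELECT probe, for hand-debugging:
    #   sqlite3 "$DEDUP_DB" "PRAGMA integrity_check;"           # prints 'ok' when healthy
    #   sqlite3 "$DEDUP_DB" "PRAGMA wal_checkpoint(TRUNCATE);"  # flush WAL before backing up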

    # ── Heavy harvest launchers β€” only on HIGH_MEM (LOW_MEM=0) ───────────
    # On CPU-Basic (16 GB cap) launching 5 background bash + uvicorn + 5 harvest
    # workers blew through the cap and HF auto-killed the container ~5 min after
    # boot. These launchers are now scheduled on GCP via hermes-scheduler-daemon
    # (entries in data/hermes-jobs.json) so harvest still runs β€” just not from
    # inside the Space's RAM. Re-enable in-Space by setting LOW_MEM=0 once we
    # upgrade to a β‰₯32 GB tier.
    if [[ "$LOW_MEM" != "1" ]]; then
        # ── One-time central dedup bootstrap from existing data ──────────
        if [[ ! -f "${HOME}/.surrogate/.dedup-bootstrap-done" ]]; then
            echo "[$(date +%H:%M:%S)] running central dedup bootstrap (one-time)" >> "$LOG_DIR/boot.log"
            nohup bash "${HOME}/.surrogate/bin/dedup-bootstrap.sh" > "$LOG_DIR/dedup-bootstrap.log" 2>&1 &
        fi

        # ── BOOT-TIME enrich kickoff (trigger immediate pull, don't wait for cron)
        nohup bash "${HOME}/.surrogate/bin/dataset-enrich.sh" >> "$LOG_DIR/dataset-enrich.log" 2>&1 &
        echo "[$(date +%H:%M:%S)] boot-time dataset-enrich kicked off" >> "$LOG_DIR/boot.log"

        # ── BOOT-TIME kaggle-trainer kickoff (don't wait for 90-min cron) ─
        nohup bash "${HOME}/.surrogate/bin/kaggle-trainer.sh" >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
        echo "[$(date +%H:%M:%S)] boot-time kaggle-trainer kicked off" >> "$LOG_DIR/boot.log"

        # ── BOOT-TIME lightning-trainer kickoff β€” H200 4 hr free for big model
        nohup bash "${HOME}/.surrogate/bin/lightning-trainer.sh" >> "$LOG_DIR/lightning-trainer.log" 2>&1 &
        echo "[$(date +%H:%M:%S)] boot-time lightning-trainer kicked off (H200 4hr quota)" >> "$LOG_DIR/boot.log"

        # ── BOOT-TIME dataset-mirror β€” bulk-clone top community SFT mixes ─
        nohup bash "${HOME}/.surrogate/bin/dataset-mirror.sh" >> "$LOG_DIR/dataset-mirror.log" 2>&1 &
        echo "[$(date +%H:%M:%S)] boot-time dataset-mirror kicked off (30 community sources)" >> "$LOG_DIR/boot.log"
    else
        echo "[$(date +%H:%M:%S)] LOW_MEM=1 β†’ skipped 5 heavy harvest launchers (delegated to GCP daemons)" >> "$LOG_DIR/boot.log"
    fi

    echo "[$(date +%H:%M:%S)] persistent /data linked (state, logs, memory, skills, sessions, workspace, ollama, training-pairs)" >> "$LOG_DIR/boot.log"
else
    echo "[$(date +%H:%M:%S)] WARN: /data not writable β€” running ephemeral!" >> "$LOG_DIR/boot.log"
fi

# ── 2. Bind HF Space secrets β†’ ~/.hermes/.env ───────────────────────────────
# πŸ”’ DISABLE shell trace before touching secret values.
set +x
echo "[$(date +%H:%M:%S)] writing ~/.hermes/.env from secret env vars (trace OFF)"
mkdir -p ~/.hermes
{
    echo "# Auto-generated from HF Space secrets at boot"
    for k in OPENROUTER_API_KEY GEMINI_API_KEY GEMINI_API_KEY_2 \
             GITHUB_TOKEN GITHUB_TOKEN_POOL DISCORD_BOT_TOKEN DISCORD_WEBHOOK \
             CEREBRAS_API_KEY GROQ_API_KEY SAMBANOVA_API_KEY \
             CLOUDFLARE_API_KEY NVIDIA_API_KEY CHUTES_API_KEY ANTHROPIC_API_KEY \
             HF_TOKEN HUGGING_FACE_HUB_TOKEN; do
        v="${!k:-}"
        [[ -n "$v" ]] && echo "${k}=${v}"
    done
} > ~/.hermes/.env
chmod 600 ~/.hermes/.env
echo "[$(date +%H:%M:%S)] .env written ($(wc -l < ~/.hermes/.env) keys, perms 600)"
# Trace OFF for the rest of boot β€” we already have line numbers above and won't need them post-secrets.

# ── LOW_MEM short-circuit β€” skip ALL background daemons, exec status server ──
# CPU-Basic Space cap is 16 GB. Even after gating the 5 boot-time harvest
# launchers, the Space kept hitting the 16 GB cap and hanging at the HTTP
# layer every ~30-40 min. Investigation found 15+ MORE nohup'd background
# daemons below this point (scrape, agentic-crawler, github-crawler,
# self-heal, cron loop, bulk-mirror workers, streaming-mirror workers,
# parquet-ingest, etc.) that collectively grow past the cap within an hour.
#
# In LOW_MEM=1 mode the Space's only job is the FastAPI status server on
# :7860, which serves harvest-cursor advances to remote workers. Everything
# else (harvest, mirroring, agent pipeline, training pushes, dataset enrich)
# now runs on the GCP daemon fleet β€” see hermes-jobs.json (171 jobs scheduled
# via hermes-scheduler-daemon as of 2026-05-02).
#
# Set LOW_MEM=0 to re-enable in-Space launchers when on a paid tier (β‰₯32GB).
if [[ "$LOW_MEM" == "1" ]]; then
    echo "[$(date +%H:%M:%S)] LOW_MEM=1 β†’ skipping all bg daemons + cron, going straight to :7860 status server" | tee -a "$LOG_DIR/boot.log"
    set +x   # defensive; trace has been off since the secrets step
    # Verify deps before exec β€” print what's missing rather than silent crash
    if python3 -c "import fastapi, uvicorn" 2>/dev/null; then
        echo "[$(date +%H:%M:%S)] starting uvicorn :7860 (LOW_MEM fast-path)" | tee -a "$LOG_DIR/boot.log"
        exec python3 ~/.surrogate/bin/hermes-status-server.py
    else
        echo "❌ fastapi/uvicorn not importable β€” falling back to plain http.server"
        exec python3 -m http.server 7860 --bind 0.0.0.0
    fi
fi
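
# Smoke-test the server from outside once the Space is up (the route shown is
# illustrative; check hermes-status-server.py for the real endpoints):
#   curl -s https://<owner>-<space>.hf.space/ | head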

# ── 3. Git config + clone axentx repos for auto-orchestrate auto-commit ────
# Disable interactive prompts globally so failed-auth git ops fail fast.
export GIT_TERMINAL_PROMPT=0
export GIT_ASKPASS=/bin/true

GH_TOKEN=$(echo "${GITHUB_TOKEN_POOL:-}" | cut -d',' -f1)
if [[ -n "$GH_TOKEN" ]]; then
    git config --global user.email "hermes@axentx.ai"
    git config --global user.name  "Hermes (Surrogate-1)"
    git config --global init.defaultBranch main
    git config --global pull.rebase true
    git config --global push.default current

    PROJECTS_DIR="${DATA}/projects"
    mkdir -p "$PROJECTS_DIR"
    rm -rf ~/axentx 2>/dev/null
    ln -sfn "$PROJECTS_DIR" ~/axentx

    # Clone axentx repos in background with hard timeout β€” never blocks boot.
    # Verified 2026-05-02 via gh api: 5 of 6 entries had wrong org/name and
    # were silently 404'ing (arkashira/* β€” only surrogate-1-harvest is there;
    # the rest are private under axentx org). The agent pipeline's dev/qa/
    # reviewer/commit daemons sat idle for a full day because no repo cloned
    # for them to work on. Real paths confirmed via /repos/<owner>/<name>:
    #   axentx/{Costinel,vanguard,airship,workio,axiomops,surrogate-1}  β†’ 200
    #   arkashira/{Costinel,vanguard,arkship,surrogate,workio,hermes-toolbelt} β†’ 404
    # Note: 'arkship' was a typo for 'airship' (axentx/airship).
    for repo_spec in \
        "Costinel:axentx/Costinel" \
        "vanguard:axentx/vanguard" \
        "airship:axentx/airship" \
        "workio:axentx/workio" \
        "axiomops:axentx/axiomops" \
        "surrogate-1:axentx/surrogate-1"; do
        local_name="${repo_spec%%:*}"
        gh_path="${repo_spec##*:}"
        target="${PROJECTS_DIR}/${local_name}"
        (
            if [[ ! -d "$target/.git" ]]; then
                echo "[$(date +%H:%M:%S)] cloning $gh_path..." >> "$LOG_DIR/boot.log"
                timeout 30 git clone --depth 50 \
                    "https://x-access-token:${GH_TOKEN}@github.com/${gh_path}.git" "$target" \
                    >> "$LOG_DIR/git-clone.log" 2>&1 || \
                    echo "[$(date +%H:%M:%S)] WARN: clone $gh_path failed/timeout" >> "$LOG_DIR/boot.log"
            else
                cd "$target" && timeout 20 git pull --rebase >> "$LOG_DIR/git-pull.log" 2>&1 || true
            fi
        ) &
    done
    # Don't wait β€” let clones finish in background while boot continues
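
    # To spot-check a repo path the same way the 2026-05-02 audit did
    # (GitHub REST, read-only; 200 = repo exists and the token can see it):
    #   curl -s -o /dev/null -w '%{http_code}\n' \
    #        -H "Authorization: Bearer $GH_TOKEN" \
    #        https://api.github.com/repos/axentx/airship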

    # Persist token for any push from auto-orchestrate
    git config --global credential.helper "store --file=$HOME/.git-credentials"
    echo "https://x-access-token:${GH_TOKEN}@github.com" > ~/.git-credentials
    chmod 600 ~/.git-credentials
    echo "[$(date +%H:%M:%S)] git auth configured + clone jobs spawned" >> "$LOG_DIR/boot.log"
fi

# ── 4. Redis (TCP only) ─────────────────────────────────────────────────────
# redis cap tightened on LOW_MEM (was 1gb β†’ 256mb). Coordinator uses
# SQLite directly; redis is only a soft cache for work-queue priorities.
REDIS_MAX="${REDIS_MAX:-$([[ "$LOW_MEM" == "1" ]] && echo "256mb" || echo "1gb")}"
redis-server --daemonize yes --port 6379 --bind 127.0.0.1 \
    --maxmemory "$REDIS_MAX" --maxmemory-policy allkeys-lru
sleep 1
redis-cli -h 127.0.0.1 -p 6379 ping >> "$LOG_DIR/redis.log" 2>&1
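
# Verify the cap actually took effect (read-only):
#   redis-cli config get maxmemory          # 268435456 bytes for 256mb
#   redis-cli config get maxmemory-policy   # allkeys-lru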

# ── 5. Ollama β€” DISABLED on cpu-basic (16 GB limit) ───────────────────────
# Root cause of 7-hr Runtime Error 2026-04-29: ollama loading qwen3-coder:30b
# (~17 GB Q4) + qwen2.5-coder:14b (~9 GB) + granite (~5 GB) = ~31 GB of model
# weights against a 16 GB cap β†’ instant OOM on any inference.
#
# On cpu-basic the FREE LLM LADDER (cerebras/groq/openrouter/gemini/chutes)
# is faster anyway β€” wafer-scale inference beats CPU x86 by 50-200Γ—.
# Ollama is only worth running once the Space upgrades to β‰₯cpu-upgrade (32 GB)
# OR moves to the OCI A1.Flex anchor (24 GB ARM, native ollama support).
#
# Set LOW_MEM=0 to re-enable on bigger Space tier.
LOW_MEM="${LOW_MEM:-1}"   # defensive re-default; already set at the top of this script
if [[ "$LOW_MEM" == "1" ]]; then
    echo "[$(date +%H:%M:%S)] ⚠ ollama SKIPPED (LOW_MEM=1, cpu-basic 16 GB)" \
        >> "$LOG_DIR/boot.log"
    echo "[$(date +%H:%M:%S)]   β†’ free LLM ladder serves all v2 inference" \
        >> "$LOG_DIR/boot.log"
else
    OLLAMA_MODELS="${HOME}/.ollama/models" \
    OLLAMA_HOST=127.0.0.1:11434 \
    nohup ollama serve > "$LOG_DIR/ollama.log" 2>&1 &
    sleep 6
    (
        if ! ollama list 2>/dev/null | grep -q "nomic-embed-text"; then
            ollama pull nomic-embed-text > "$LOG_DIR/ollama-pull-embed.log" 2>&1
        fi
        if ! ollama list 2>/dev/null | grep -q "qwen2.5-coder:3b"; then
            # Smallest coder that's actually useful β€” fits any tier
            ollama pull qwen2.5-coder:3b > "$LOG_DIR/ollama-pull-3b.log" 2>&1
        fi
    ) &
fi
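
# Rough sizing rule behind the math above (assumption, not a guarantee):
# Q4-quantized weights take ~0.5 bytes/param plus ~1-2 GB runtime overhead,
# e.g. 30B Γ— 0.5 B β‰ˆ 15 GB weights β†’ ~17 GB resident, matching the figure cited.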

# ── 6. Discord bot (only if egress to discord.com is reachable) ────────────
# HF Spaces free tier may block egress to discord.com β€” bot would crash-loop.
# Pre-flight check: if discord.com unreachable, skip bot, use webhook-only.
if [[ -n "${DISCORD_BOT_TOKEN:-}" ]]; then
    if curl -sS -o /dev/null -w "%{http_code}" --max-time 6 https://discord.com 2>/dev/null | grep -qE "^(200|301|302|307|308)$"; then
        set -a; source ~/.hermes/.env 2>/dev/null; set +a
        nohup python ~/.surrogate/bin/hermes-discord-bot.py >> "$LOG_DIR/discord-bot.log" 2>&1 &
        echo "[$(date +%H:%M:%S)] discord bot started (gateway reachable)"
    else
        echo "[$(date +%H:%M:%S)] discord.com unreachable β€” skipping bot, using webhook-only" >> "$LOG_DIR/boot.log"
    fi
fi

# ── 7a. Continuous scrape daemon β€” concurrency tuned to LOW_MEM ────────────
SCRAPE_PARALLEL="${SCRAPE_PARALLEL:-$([[ "$LOW_MEM" == "1" ]] && echo 2 || echo 8)}"
cat > /tmp/scrape-daemon.sh <<SCRAPESH
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="\${HOME}/.surrogate/logs/scrape-continuous.log"
mkdir -p "\$(dirname "\$LOG")"
while true; do
    START=\$(date +%s)
    bash ~/.surrogate/bin/domain-scrape-loop.sh 1500 ${SCRAPE_PARALLEL} >> "\$LOG" 2>&1
    DUR=\$(( \$(date +%s) - START ))
    if [[ \$DUR -lt 30 ]]; then sleep 30
    elif [[ \$DUR -lt 120 ]]; then sleep 15
    else sleep 5
    fi
done
SCRAPESH
chmod +x /tmp/scrape-daemon.sh
nohup /tmp/scrape-daemon.sh > "$LOG_DIR/scrape-daemon.log" 2>&1 &
echo "[$(date +%H:%M:%S)] scrape daemon parallel=${SCRAPE_PARALLEL} (LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"

# ── 7b. Agentic crawler β€” DISABLED on LOW_MEM (anchor takes this load) ─────
if [[ "$LOW_MEM" != "1" ]]; then
    CRAWLER_PARALLEL="${CRAWLER_PARALLEL:-6}"
    nohup bash ~/.surrogate/bin/agentic-crawler.sh "$CRAWLER_PARALLEL" \
        > "$LOG_DIR/agentic-crawler.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] agentic crawler parallel=$CRAWLER_PARALLEL" >> "$LOG_DIR/boot.log"
else
    echo "[$(date +%H:%M:%S)] ⚠ agentic-crawler SKIPPED (LOW_MEM); anchor handles" >> "$LOG_DIR/boot.log"
fi

# ── 7b2. GitHub-specific agentic crawler (lightweight, keep on always) ─────
nohup bash ~/.surrogate/bin/github-agentic-crawler.sh > "$LOG_DIR/github-agentic-crawler.log" 2>&1 &
echo "[$(date +%H:%M:%S)] github-agentic-crawler started" >> "$LOG_DIR/boot.log"

# ── 7b3. HF Dataset Discoverer β€” DISABLED (replaced by continuous-discoverer) ─
# Round 10 (a27499d): bin/v2/continuous-discoverer.sh covers HF + arxiv +
# Stack Exchange + GH trending in one daemon. The old hf-dataset-discoverer.sh
# is now redundant and just adds memory pressure on cpu-basic.
echo "[$(date +%H:%M:%S)] ⚠ hf-dataset-discoverer SKIPPED (replaced by continuous-discoverer)" >> "$LOG_DIR/boot.log"

# ── 7e. auto-orchestrate-continuous β€” DISABLED on LOW_MEM (cron handles it) ─
if [[ "$LOW_MEM" != "1" ]]; then
    nohup bash ~/.surrogate/bin/auto-orchestrate-continuous.sh > "$LOG_DIR/auto-orchestrate-continuous.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] auto-orchestrate-continuous started (4 parallel workers)" >> "$LOG_DIR/boot.log"
else
    echo "[$(date +%H:%M:%S)] ⚠ auto-orchestrate-continuous SKIPPED (LOW_MEM); cron slot at M%20==0 covers it" >> "$LOG_DIR/boot.log"
fi

# ── 7e1. SELF-HEAL WATCHDOG β€” must start BEFORE memory-hungry workers ───────
# Monitors RAM usage every 60s; preempts youngest dataset-enrich shard if
# usage >= 85% to dodge the cpu-basic 16Gi OOM kill that would otherwise
# crash the entire container. Also restarts stuck ingest / kicks stale uploader.
nohup bash ~/.surrogate/bin/self-heal-watchdog.sh > "$LOG_DIR/self-heal-watchdog.log" 2>&1 &
echo "[$(date +%H:%M:%S)] self-heal-watchdog started (mem<85%, ingest<20m, push<10m)" >> "$LOG_DIR/boot.log"

# ── 7e2. GH-ACTIONS TICKER β€” burst-dispatch external runners every 60s ──────
# Fires workflow_dispatch on arkashira/ashiradevops-alt runner repos every
# 60s, bypassing GitHub's */5 cron minimum. Combined with 8-min runner
# timeouts, the 20-concurrent free-tier slot cap stays saturated.
# Skips silently if GH_TOKEN_ARKASHIRA / GH_TOKEN_DEVOPS aren't set as
# Space secrets β€” the operator can add them later without a restart.
nohup bash ~/.surrogate/bin/gh-actions-ticker.sh > "$LOG_DIR/gh-actions-ticker.log" 2>&1 &
echo "[$(date +%H:%M:%S)] gh-actions-ticker started (60s tick, dispatches arkashira+ashiradevops-alt)" >> "$LOG_DIR/boot.log"

# ── 7e3. LLM BURST GENERATOR β€” synthetic training pairs from 8 free LLMs ────
# Cerebras + Groq + OpenRouter + Gemini + Chutes + NV NIM + Samba + Kimi.
# Each cycle fires 3 prompts at every active provider in parallel, writes
# {prompt, response} pairs to training-pairs.jsonl. Combined free-tier
# budget: ~7000+ pairs/day. Skips any provider whose key env is not set.
if [[ "$LOW_MEM" != "1" ]]; then
    nohup python3 ~/.surrogate/bin/llm-burst-generator.py > "$LOG_DIR/llm-burst-generator.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] llm-burst-generator started (8 LLM APIs in parallel, ~7K synthetic pairs/day)" >> "$LOG_DIR/boot.log"
else
    echo "[$(date +%H:%M:%S)] ⚠ llm-burst-generator SKIPPED (LOW_MEM); ZeroGPU synth-puller covers" >> "$LOG_DIR/boot.log"
fi
sleep 3   # Stagger spawns β€” avoid memory burst at boot

# ── 7f. PARALLEL BULK INGEST (slug-hash sharded; 6 shards on cpu-basic) ─────
# Was 16 shards but caused 'Memory limit exceeded (16Gi)' OOM. Each shard
# peaks ~1 GB while streaming via 'datasets' lib. Watchdog above provides
# a second safety net if peak still spikes.
if [[ "$LOW_MEM" != "1" ]]; then
    nohup bash ~/.surrogate/bin/bulk-ingest-parallel.sh > "$LOG_DIR/bulk-ingest-parallel.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] bulk-ingest-parallel started (6 shards, 293M total cap)" >> "$LOG_DIR/boot.log"
else
    echo "[$(date +%H:%M:%S)] ⚠ bulk-ingest-parallel SKIPPED (LOW_MEM); streaming-mirror-worker covers" >> "$LOG_DIR/boot.log"
fi
sleep 3

# ── 7g. PARQUET-DIRECT INGEST (skip 'datasets' library overhead, 5-10Γ— faster) ──
# Downloads parquet shards directly via HF datasets-server API + pyarrow filter.
# Targets only trillion-scale corpora where streaming is too slow.
# Downloads reduced to 2 parallel β€” combined with the 6 ingest shards this stays under 16Gi.
PARQUET_PARALLEL=2 nohup bash ~/.surrogate/bin/parquet-direct-ingest.sh > "$LOG_DIR/parquet-direct-ingest.log" 2>&1 &
echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$LOG_DIR/boot.log"

# ── 7c. Skill-synthesis daemon β€” DISABLED on LOW_MEM (heavy LLM calls) ────
if [[ "$LOW_MEM" != "1" ]]; then
    nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
else
    echo "[$(date +%H:%M:%S)] ⚠ skill-synthesis SKIPPED (LOW_MEM); anchor's voyager-skills.py covers" >> "$LOG_DIR/boot.log"
fi

# ── 7d. Bulk mirror coordinator + 4 parallel workers ────────────────────────
# User feedback 2026-04-29 (translated from Thai): "all agents work together, and none hits the same place twice".
# Coordinator = SQLite claim queue (~/.surrogate/state/bulk-mirror-claims.db).
# Workers each pull next pending dataset, mirror+sanitize+dedup, mark done.
# 100+ massive datasets in bin/v2/bulk-datasets-massive.txt (code/security/SDLC/agent/etc).
# Lease-based claims (15 min) β€” crashes auto-expire so other workers pick up.
python3 ~/.surrogate/bin/v2/bulk-mirror-coordinator.py seed >> "$LOG_DIR/bulk-mirror-seed.log" 2>&1 || true

# Two worker types share the same coordinator queue:
#   bulk-mirror-worker.sh    β€” full-download, suits small/medium datasets
#   streaming-mirror-worker.sh β€” HF datasets streaming, suits trillion-token
# LOW_MEM tuning for cpu-basic 16GB Space (history):
#   v1: 0 bulk + 2 stream (Round 9-10 OOM tightened to 0+2)
#   v2: 0 bulk + 1 stream (Round 11-12 OOM further tightened)
#   v3 NOW: 1 bulk + 3 stream  (post Civo-pivot + 4-Space fan-out;
#                                anchor never came up so we can't rely on
#                                it for bulk, and 16GB has ~8 GB unused
#                                under the v2 setting β†’ reclaim it)
#
# Memory budget per Space (16 GB cpu-basic):
#   ~6 GB reserved: OS + redis 256mb + continuous-discoverer +
#                    dataset-enrich + auto-startup-loop + push bursts
#   ~10 GB available for harvest workers
#   3 stream Γ— 500 MB + 1 bulk Γ— 600 MB = 2.1 GB used
#   ~8 GB headroom β†’ memory-guard.sh kicks in at <3 GB free, safe
#
# Throughput delta: 4 workers/Space Γ— 4 Spaces = 16 workers total
# (vs the previous 1 Γ— 4 = 4). Combined with enrich cron M%30==5 (was M%60),
# expect a 3-5Γ— commit rate before the HF soft-cap kicks in.
BULK_WORKERS="${BULK_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 1 || echo 4)}"
STREAM_WORKERS="${STREAM_WORKERS:-$([[ "$LOW_MEM" == "1" ]] && echo 3 || echo 6)}"

for i in $(seq 1 "$BULK_WORKERS"); do
    nohup bash ~/.surrogate/bin/v2/bulk-mirror-worker.sh "bulk-w$i" \
        > "$LOG_DIR/bulk-worker-$i.log" 2>&1 &
done
for i in $(seq 1 "$STREAM_WORKERS"); do
    nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-w$i" \
        > "$LOG_DIR/stream-worker-$i.log" 2>&1 &
done
TOTAL_WORKERS=$((BULK_WORKERS + STREAM_WORKERS))
echo "[$(date +%H:%M:%S)] bulk-mirror coordinator + $BULK_WORKERS bulk + $STREAM_WORKERS streaming = $TOTAL_WORKERS workers (200+ datasets queued, LOW_MEM=$LOW_MEM)" >> "$LOG_DIR/boot.log"

# ── 7d2. Continuous multi-source dataset discoverer (boot daemon, never exits) ─
# Replaces aggressive-harvester cron β€” runs always, sweeps HF + arxiv + SE + GH.
if ! pgrep -f "continuous-discoverer.sh" >/dev/null; then
    nohup bash ~/.surrogate/bin/v2/continuous-discoverer.sh \
        > "$LOG_DIR/continuous-discoverer.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] continuous-discoverer started (HF + arxiv + SE + GH, ~5min cycle)" >> "$LOG_DIR/boot.log"
fi

# ── Auto-startup-loop: 45 personae Γ— 9 LoRA clusters Γ— auto-commit + auto-push ─
# CEO/CTO/CMO/CFO/COO/PM/UX/Designer/SRE/DevOps/Marketing/SDR/AE/Growth/CS/Legal/HR/etc.
# 1 role per 15-min cycle; chained roles fire downstream automatically.
if ! pgrep -f "auto-startup-loop.sh" >/dev/null; then
    nohup bash ~/.surrogate/bin/v2/auto-startup-loop.sh \
        > "$LOG_DIR/auto-startup-loop.log" 2>&1 &
    echo "[$(date +%H:%M:%S)] auto-startup-loop started (45 personae cycle 15min, chains, auto-commit)" >> "$LOG_DIR/boot.log"
fi

# ── 7h. Train-ready pusher β€” disabled at boot for now. Caused Space
#       RUNTIME_ERROR on first deployment (2026-04-29). Script kept at
#       bin/train-ready-pusher.sh; launch manually after Space proves stable:
#         nohup bash ~/.surrogate/bin/train-ready-pusher.sh > /tmp/trp.log 2>&1 &
# nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &

# ── 7i. Cron loop β€” non-scrape daemons (scrape now runs continuously above) ─
cat > /tmp/hermes-cron.sh <<'CRONSH'
#!/bin/bash
set -a; source ~/.hermes/.env 2>/dev/null; set +a
LOG="${HOME}/.surrogate/logs/cron.log"
mkdir -p "$(dirname "$LOG")"
while true; do
    M=$(($(date +%s) / 60))
    # Cron offsets STAGGERED β€” a simultaneous minute=0 burst was an OOM trigger.
    # Each major task picks its own M%X==N offset so the heavy ones rarely
    # land on the same minute.
    [[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
    [[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
    # Auto-scaler β€” spawn/kill workers based on free memory tier (burst-but-don't-die)
    [[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &
    # synth-puller β€” hit surrogate1 ZeroGPU /api/synth_batch every 5 min
    # Drains free PRO 25K min/mo into Magpie-style training pairs (16 domains rotate).
    [[ $((M % 5)) -eq 3 ]] && bash ~/.surrogate/bin/v2/synth-puller.sh >> "$LOG" 2>&1 &
    # push-training-to-hf gated by memory (loads big shard into RAM).
    # Anchor (24GB) takes over when capacity arrives β€” see anchor cron-loop.
    [[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
        && bash ~/.surrogate/bin/push-training-to-hf.sh >> "$LOG" 2>&1 &
    # auto-orchestrate now runs CONTINUOUSLY (4 parallel workers) β€” see step 7e above.
    # Cron entry retained as a legacy single-fire boost; it fires only when the
    # continuous daemon is not already up. Grouping matters: the previous bare
    # 'a && b || c' form ran c on every non-matching minute.
    [[ $((M % 20)) -eq 0 ]] && { pgrep -f "auto-orchestrate-continuous" >/dev/null || bash ~/.surrogate/bin/auto-orchestrate-loop.sh >> "$LOG" 2>&1; } &
    # Every 30 min: research-apply (pop queue β†’ orchestrate β†’ ship feature)
    [[ $((M % 30)) -eq 15 ]] && bash ~/.surrogate/bin/surrogate-research-apply.sh >> "$LOG" 2>&1 &
    # Every 60 min: keyword tuner (adapts scrape queue based on yields)
    [[ $((M % 60)) -eq 4 ]] && bash ~/.surrogate/bin/scrape-keyword-tuner.sh >> "$LOG" 2>&1 &
    # Every 6 hours: research-loop (discover new features from competitors/papers)
    [[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/surrogate-research-loop.sh >> "$LOG" 2>&1 &
    # Every 30 min: dataset enrich (was 60 min β€” bumped 2Γ— now that we have
    # 4 Spaces Γ— (3 stream + 1 bulk) = 16 workers harvesting in parallel,
    # producing more chunks per hour than the old 60-min push could drain).
    # Memory-guarded β€” full HF Hub iter is heavy.
    [[ $((M % 30)) -eq 5 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
        && bash ~/.surrogate/bin/dataset-enrich.sh >> "$LOG" 2>&1 &
    # Every 15 min: self-ingest training-pairs into FTS index (closes self-improvement)
    [[ $((M % 15)) -eq 3 ]] && bash ~/.surrogate/bin/surrogate-self-ingest.sh >> "$LOG" 2>&1 &
    # Every 30 min: build vector embeddings index (RAG semantic search)
    [[ $((M % 30)) -eq 12 ]] && bash ~/.surrogate/bin/rag-vector-builder.sh >> "$LOG" 2>&1 &
    # Every 30 min: synthetic data generation (REWORK→APPROVE DPO + distilabel rewrite)
    [[ $((M % 30)) -eq 7 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
        && bash ~/.surrogate/bin/synthetic-data-from-rework.sh >> "$LOG" 2>&1 &
    # Daily 04:00 UTC: refresh CVE feed (NVD + CISA KEV) β†’ security-knowledge dataset
    [[ $((M % 1440)) -eq 240 ]] && bash ~/.surrogate/bin/refresh-cve-feed.sh >> "$LOG" 2>&1 &
    # Daily 05:00 UTC: scrape SRE postmortems (danluu list + awesome-tech-postmortems)
    [[ $((M % 1440)) -eq 300 ]] && bash ~/.surrogate/bin/scrape-sre-postmortems.sh >> "$LOG" 2>&1 &
    # Daily 06:00 UTC: LLM-expand role keywords (sends each role's skills to
    # Cerebras/Groq β†’ +80 specific job-description-style search terms each).
    # Discoverer auto-uses the expanded list on its next cycle.
    [[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &
    # Every 90 min: kick a Kaggle T4 LoRA training run on the latest dataset
    # slice. Free Kaggle quota = 30 hr/week per account; one full run = 4-6 hr,
    # so we DO want to keep submitting β€” Kaggle queues if 1 already running,
    # auto-cancels older if 5+ pending. With shorter interval we keep the
    # GPU pipeline saturated.
    [[ $((M % 90)) -eq 5 ]] && bash ~/.surrogate/bin/kaggle-trainer.sh >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
    # Every 6 hr: Lightning AI H200 training run (free 4hr H200 quota = ~13/mo).
    # H200 141GB VRAM fits Qwen3-Coder-480B-A35B QLoRA β€” biggest free training.
    [[ $((M % 360)) -eq 45 ]] && bash ~/.surrogate/bin/lightning-trainer.sh >> "$LOG_DIR/lightning-trainer.log" 2>&1 &

    # ── Round 5 (2026-04) sustainability loops ──────────────────────────
    # Every 6 hr (offset 90): self-improve loop β€” gen problems, judge,
    # winners β†’ training data, losers β†’ reflexion-store.
    [[ $((M % 360)) -eq 90 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
        && bash ~/.surrogate/bin/v2/self-improve-loop.sh >> "$LOG_DIR/self-improve.log" 2>&1 &
    # Every 30 min (offset 22): mine new tool-call traces from logs into
    # SFT + DPO data, plus voyager skill candidates.
    [[ $((M % 30)) -eq 22 ]] && python3 ~/.surrogate/bin/v2/tool-trace-collector.py >> "$LOG_DIR/tool-trace.log" 2>&1 &
    # Every 60 min (offset 17): export promoted voyager skills to JSONL
    # (training-data slice + inference-time retrieval source).
    [[ $((M % 60)) -eq 17 ]] && python3 ~/.surrogate/bin/v2/voyager-skills.py export >> "$LOG_DIR/voyager.log" 2>&1 &
    # Daily 07:00 UTC: active-learning batch from one bulk-mirror file.
    # Skips silently if no pool yet.
    [[ $((M % 1440)) -eq 420 ]] && {
        POOL=$(ls -t "$DATA"/bulk-mirror/*.jsonl 2>/dev/null | head -1)
        [[ -n "$POOL" ]] && python3 ~/.surrogate/bin/v2/active-learning.py \
            --pool "$POOL" --n 200 --scan 1500 \
            >> "$LOG_DIR/active-learning.log" 2>&1 &
    }
    # Daily 08:00 UTC: constitutional self-critique on yesterday's
    # winners (pulls latest self-improve winners file).
    [[ $((M % 1440)) -eq 480 ]] && {
        WIN=$(ls -t "$DATA"/v2/self-improve/winners-*.jsonl 2>/dev/null | head -1)
        [[ -n "$WIN" ]] && python3 ~/.surrogate/bin/v2/constitutional-loop.py \
            --input "$WIN" --n 200 \
            >> "$LOG_DIR/constitutional.log" 2>&1 &
    }

    # ── Round 7+8 (2026-04-30) β€” trillion-scale + harvester + enrich ──────
    # Every 30 min (offset 9): aggressive HF dataset discoverer (70-keyword sweep)
    [[ $((M % 30)) -eq 9 ]] && bash ~/.surrogate/bin/v2/aggressive-harvester.sh \
        >> "$LOG_DIR/aggressive-harvester.log" 2>&1 &
    # Every 60 min (offset 35): enrich newly-mirrored bulk files
    [[ $((M % 60)) -eq 35 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
        && bash ~/.surrogate/bin/v2/enrich-pipeline.sh \
        >> "$LOG_DIR/enrich-pipeline.log" 2>&1 &
    # Every 30 min (offset 25): spawn extra streaming worker if pool empty
    [[ $((M % 30)) -eq 25 ]] && {
        if ! pgrep -f "streaming-mirror-worker.sh" >/dev/null; then
            nohup bash ~/.surrogate/bin/v2/streaming-mirror-worker.sh "stream-cron-$(date +%s)" \
                > "$LOG_DIR/stream-worker-cron.log" 2>&1 &
        fi
    }
    # Daily 09:00 UTC: teachable-prompt filter on harvested data
    [[ $((M % 1440)) -eq 540 ]] && {
        LATEST=$(ls -t "$DATA"/v2/enriched/*.jsonl 2>/dev/null | head -1)
        [[ -n "$LATEST" ]] && python3 ~/.surrogate/bin/v2/teachable-prompt-filter.py \
            --input "$LATEST" --out "$DATA"/v2/teachable-$(date +%Y%m%d).jsonl \
            --n 1000 --keep-target 200 \
            >> "$LOG_DIR/teachable.log" 2>&1 &
    }
    # Daily 11:00 UTC: regression test suite (catches breakage post-push)
    [[ $((M % 1440)) -eq 660 ]] && bash ~/.surrogate/bin/v2/regression-test.sh --quick \
        >> "$LOG_DIR/regression.log" 2>&1 &
    # Weekly Sun 10:00 UTC: abstract-cot compress reasoning data
    [[ $((M % 10080)) -eq 600 ]] && {
        for f in "$DATA"/v2/verify-traces.jsonl "$DATA"/v2/self-improve/winners-*.jsonl; do
            [[ -f "$f" ]] || continue
            python3 ~/.surrogate/bin/v2/abstract-cot-compressor.py \
                --input "$f" --out "${f%.jsonl}-compressed.jsonl" \
                >> "$LOG_DIR/abstract-cot.log" 2>&1
        done
    }
    sleep 60
done
CRONSH
chmod +x /tmp/hermes-cron.sh
nohup /tmp/hermes-cron.sh > "$LOG_DIR/cron-master.log" 2>&1 &
echo "[$(date +%H:%M:%S)] cron loop started" >> "$LOG_DIR/boot.log"

# ── 8. Status HTTP server on :7860 (FastAPI/uvicorn β€” robust binding) ──────
set +x   # silence trace for clean uvicorn logs
echo "[$(date +%H:%M:%S)] starting status server :7860" | tee -a "$LOG_DIR/boot.log"

# Verify deps before exec β€” print what's missing rather than silent crash
python3 -c "import fastapi, uvicorn; print(f'  fastapi {fastapi.__version__} + uvicorn {uvicorn.__version__} ok')" || {
    echo "❌ fastapi/uvicorn not importable β€” falling back to plain http.server"
    exec python3 -m http.server 7860 --bind 0.0.0.0
}

# Run as PID 1 β€” uvicorn handles signals + auto-restart on crash
exec python3 ~/.surrogate/bin/hermes-status-server.py