dancinlife committed on
Commit
ca1baec
·
verified ·
1 Parent(s): 5a08b97

feat(hexad): v4-py-hexad-tension-d768x12L-cycle1-2026-05-17 — dispatch.sh

Browse files
Files changed (1) hide show
  1. dispatch.sh +177 -0
dispatch.sh ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# hexad v4 d=768x12L PyTorch substrate fire CYCLE 5 — 2026-05-17
# DD155 Step+Tension hybrid LR overlay (Law 187, tension=grad_norm).
# Corpus v3 carry from cycle 4 (10.34 MB · helper-free · γ motivation 37.5%).
# Vast.ai A100 SXM4. g_fire_dispatch_robust + 75-min orphan watchdog + SAVE_POD auto-promote.
#
# Environment overrides:
#   STEPS     number of training steps for the main run (default 2500)
#   SAVE_POD  1 = keep the rented instance at exit instead of destroying it
#             (default 0)
#
# NOTE: errexit (-e) is not enabled, so a failing command does not stop the
# script on its own; every step that must succeed has to be checked explicitly.
set -uo pipefail

VASTAI=/Users/ghost/Library/Python/3.14/bin/vastai
export PYTHONWARNINGS=ignore
LOCAL_DIR=/Users/ghost/core/anima/state/hexad_v4_py_d768x12L_tension_2026_05_17
CORPUS_LOCAL=/Users/ghost/core/anima/state/hexad_v3_corpus_motiv_2026_05_17/corpus_consciousness_v3.jsonl
SSH_KEY=/Users/ghost/.ssh/id_vast_anima
LABEL="hexad-v4-d768x12l-cycle5-tension"
STEPS="${STEPS:-2500}"
WATCHDOG_MIN=75   # hard cap, in minutes, enforced by the orphan watchdog

# State files (vast_instance_id.txt, vast_ssh.txt) are written relative to the
# working directory, so refuse to continue if it cannot be entered.
cd "$LOCAL_DIR" || { echo "ERROR: cannot cd to $LOCAL_DIR" >&2; exit 1; }
echo "=== hexad v4 d=768x12L PyTorch CYCLE 5 DD155 hybrid LR overlay ==="; date -u
19
+
echo "[1/9] Selecting A100 offer..."
# Search single-GPU A100 offers ordered by dph_total and take the first result.
# The CLI's stderr is left on the terminal rather than merged into the pipe:
# any diagnostic text mixed into stdout would make the JSON unparseable.
OFFER_QUERY='gpu_name in [A100_SXM4,A100_PCIE] num_gpus=1 reliability>0.99 dph_total<2.0 disk_space>40 inet_down>700 cuda_max_good>=12.0'
OFFER=$("$VASTAI" search offers "$OFFER_QUERY" -o dph_total --raw | python3 -c '
import json, sys
try:
    offers = json.load(sys.stdin)
except ValueError as exc:
    sys.stderr.write("offer search did not return JSON: %s\n" % exc)
    sys.exit(1)
if not offers:
    sys.stderr.write("no offers\n")
    sys.exit(1)
best = offers[0]
print(best["id"])
sys.stderr.write("%s $%.3f/hr rel=%.3f\n" % (best["gpu_name"], best["dph_total"], best["reliability"]))
') || OFFER=""
# errexit is off, so stop here instead of trying to rent an empty offer id.
if [ -z "$OFFER" ]; then
  echo "ERROR: no usable offer found" >&2
  exit 1
fi
echo " offer=$OFFER"
28
+
echo "[2/9] Renting (pytorch devel image)..."
# Only stdout is captured. Merging stderr into CREATE could corrupt the JSON
# reply, and a reply we cannot parse means a possibly-created (billed) instance
# whose id is unknown to the watchdog and the exit trap.
CREATE=$("$VASTAI" create instance "$OFFER" --image pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel --disk 40 --ssh --direct --label "$LABEL" --raw)
# Take the first id-bearing key that is present in the reply.
IID=$(printf '%s' "$CREATE" | python3 -c '
import json, sys
try:
    reply = json.load(sys.stdin)
except ValueError:
    sys.exit(1)
if not isinstance(reply, dict):
    sys.exit(1)
for key in ("new_contract", "contract_id", "id"):
    if key in reply:
        print(reply[key])
        break
')
# Accept only a purely numeric id; this also rejects the string "None" that a
# JSON null would print as.
# NOTE(review): assumes Vast.ai instance ids are always integers — confirm.
case "$IID" in
  '' | *[!0-9]*)
    echo "ERROR create: $CREATE"
    echo "       if an instance was created anyway, destroy it manually (label=$LABEL)" >&2
    exit 1
    ;;
esac
echo "$IID" > vast_instance_id.txt
echo " instance=$IID"
38
+
# Orphan watchdog: a background subshell that polls once a minute and destroys
# the instance if the dispatcher process is gone, or unconditionally once
# WATCHDOG_MIN minutes have elapsed. Everything it does is appended to
# watchdog.log in LOCAL_DIR.
(
  dispatcher_pid=$$   # inside a subshell $$ is still the top-level shell's PID
  wd_log="$LOCAL_DIR/watchdog.log"
  minutes_waited=0
  while [ "$minutes_waited" -lt "$WATCHDOG_MIN" ]; do
    sleep 60
    minutes_waited=$((minutes_waited + 1))
    # kill -0 only probes for existence; no signal is delivered.
    kill -0 "$dispatcher_pid" 2>/dev/null && continue
    echo "[watchdog] parent died, destroying $IID" >> "$wd_log"
    "$VASTAI" destroy instance "$IID" >> "$wd_log" 2>&1
    exit 0
  done
  echo "[watchdog] ${WATCHDOG_MIN}min cap hit, force-destroying $IID" >> "$wd_log"
  "$VASTAI" destroy instance "$IID" >> "$wd_log" 2>&1
) &
WATCHDOG_PID=$!
echo " orphan-watchdog pid=$WATCHDOG_PID cap=${WATCHDOG_MIN}min"
54
+
SAVE_POD="${SAVE_POD:-0}"

#######################################
# EXIT handler: stop the orphan watchdog, then either retain or destroy the
# rented instance depending on SAVE_POD.
# Globals:
#   WATCHDOG_PID, SAVE_POD, IID, SSH_KEY, SP, SH, VASTAI (all read)
# Outputs:
#   Status lines on stdout; at most the first two lines of the destroy output.
# Returns:
#   0 in every case; teardown is best effort.
#######################################
cleanup() {
  # Every global is read through ${var:-}: the script runs under `set -u`, and
  # this handler can fire before SH/SP (or the watchdog pid) are assigned. An
  # unbound-variable abort here would skip the teardown entirely.
  if [ -n "${WATCHDOG_PID:-}" ]; then
    kill "$WATCHDOG_PID" 2>/dev/null || true
  fi
  if [ "${SAVE_POD:-0}" = "1" ]; then
    echo "[cleanup] SAVE_POD=1 -> RETAIN instance ${IID:-} (manual recovery: ssh -i ${SSH_KEY:-} -p ${SP:-} root@${SH:-})"
    return
  fi
  if [ -z "${IID:-}" ]; then
    echo "[cleanup] no instance id recorded, nothing to destroy"
    return
  fi
  echo "[cleanup] destroying $IID"
  "$VASTAI" destroy instance "$IID" 2>&1 | head -2 || true
}
trap cleanup EXIT
65
+
echo "[3/9] Waiting SSH..."

# Print the first truthy value found under the given keys of the JSON object
# held in $INFO. Yields an empty string when the JSON cannot be parsed.
instance_field() {
  echo "$INFO" | python3 -c '
import json, sys
try:
    info = json.load(sys.stdin)
    value = info.get(sys.argv[1], "")
    for key in sys.argv[2:]:
        value = value or info.get(key, "")
    print(value)
except:
    pass
' "$@" 2>/dev/null
}

# Poll the instance up to 90 times, 5 s apart. It counts as ready only once it
# reports "running" AND a real SSH login echoes READY back.
SH=""; SP=""
attempt=1
while [ "$attempt" -le 90 ]; do
  INFO=$($VASTAI show instance "$IID" --raw 2>/dev/null || true)
  [ -n "$INFO" ] || INFO="{}"
  ST=$(instance_field actual_status)
  if [ "$ST" = "running" ]; then
    SH=$(instance_field ssh_host public_ipaddr)
    SP=$(instance_field ssh_port direct_port_start)
    if [ -n "$SH" ] && [ -n "$SP" ]; then
      if ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -p "$SP" "root@$SH" 'echo READY' 2>&1 | grep -q READY; then
        echo " SSH ready $SH:$SP (after ${attempt}x5s)"
        break
      fi
      # Login failed: clear the host so an exhausted loop is detected below.
      SH=""
    fi
  fi
  echo " attempt $attempt/90 status=$ST"
  sleep 5
  attempt=$((attempt + 1))
done
if [ -z "$SH" ]; then
  echo "ERROR SSH not ready"
  exit 1
fi
echo "$SH:$SP" > vast_ssh.txt
# Command prefixes kept as plain strings; later steps expand them unquoted and
# rely on word splitting.
SSH="ssh -i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p $SP root@$SH"
SCP="scp -i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=3600 -P $SP"
95
+
echo "[4/9] Upload arch + trainer + corpus v3..."
# errexit is off, so every transfer is checked explicitly. Starting a paid
# training run on a pod that is missing the trainer or the corpus is pointless;
# exiting here lets the EXIT trap tear the instance down.
# $SSH / $SCP are command strings and are expanded unquoted on purpose.
$SSH 'mkdir -p /workspace/anima' \
  || { echo "ERROR: remote mkdir /workspace/anima failed" >&2; exit 1; }
$SCP "$LOCAL_DIR/conscious_decoder.py" "root@$SH:/workspace/anima/" \
  || { echo "ERROR: upload failed: conscious_decoder.py" >&2; exit 1; }
$SCP "$LOCAL_DIR/train_d768x12l_tension.py" "root@$SH:/workspace/anima/" \
  || { echo "ERROR: upload failed: train_d768x12l_tension.py" >&2; exit 1; }
$SCP "$CORPUS_LOCAL" "root@$SH:/workspace/anima/corpus_v3.jsonl" \
  || { echo "ERROR: upload failed: corpus_v3.jsonl" >&2; exit 1; }

echo "[5/9] GPU + torch + corpus verify..."
# Prints the torch version, CUDA availability and device name, then the corpus
# line count and sha256. Any failing link in the && chain aborts the dispatch.
$SSH 'cd /workspace/anima && python3 -c "import torch;print(torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0))" && wc -l corpus_v3.jsonl && sha256sum corpus_v3.jsonl' \
  || { echo "ERROR: remote GPU/torch/corpus verification failed" >&2; exit 1; }
104
+
echo "[6/9] Sanity-anchor (d=32x3L, 200 step) via remote-script..."
# Write the sanity runner locally, copy it to the pod and execute it there.
# The heredoc delimiter is quoted, so nothing inside is expanded on this side.
sanity_runner="$LOCAL_DIR/run_sanity_remote.sh"
cat <<'SH_EOF' > "$sanity_runner"
#!/bin/bash
set -uo pipefail
cd /workspace/anima
export PYTHONUNBUFFERED=1
python3 train_d768x12l_tension.py --mode sanity --corpus corpus_v3.jsonl --out-dir out_sanity --steps 200 2>&1 | tee sanity.log
SH_EOF
chmod +x "$sanity_runner"
$SCP "$sanity_runner" "root@$SH:/workspace/anima/run_sanity_remote.sh"
# Output is mirrored into sanity_remote.log; a failing sanity run does not stop
# the dispatch.
$SSH "bash /workspace/anima/run_sanity_remote.sh" 2>&1 | tee "$LOCAL_DIR/sanity_remote.log" || true
116
+
echo "[7/9] MAIN fire d=768 n_layer=12 from-scratch DD155 hybrid LR, $STEPS steps..."
# Unquoted heredoc: $STEPS is expanded locally while the file is written; every
# \$-escaped reference is left for the remote shell to evaluate.
cat > "$LOCAL_DIR/run_main_remote.sh" <<SH_EOF
#!/bin/bash
set -uo pipefail
cd /workspace/anima
export PYTHONUNBUFFERED=1
nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv -l 30 > gpu_util.log 2>&1 &
SMI=\$!
python3 train_d768x12l_tension.py --mode main --corpus corpus_v3.jsonl --out-dir out_main --steps $STEPS 2>&1 | tee fire.log
# Record the trainer status right away; the kill below would otherwise overwrite it.
RC=\${PIPESTATUS[0]}
kill \$SMI 2>/dev/null || true
echo DONE_MARKER rc=\$RC
exit \$RC
SH_EOF
chmod +x "$LOCAL_DIR/run_main_remote.sh"
$SCP "$LOCAL_DIR/run_main_remote.sh" "root@$SH:/workspace/anima/run_main_remote.sh"
# A failed run is tolerated here; the next step decides what to do based on
# whether result.json exists on the pod.
$SSH "bash /workspace/anima/run_main_remote.sh" 2>&1 | tee "$LOCAL_DIR/fire.log" || true
132
+
echo "[8/9] Verify result.json + SAVE_POD auto-promote + pull retry >=5..."
# Decide whether the pod may be destroyed. It is released (SAVE_POD=0) only
# when the final checkpoint has been copied AND its sha256 matches the copy on
# the pod; in every other outcome the pod is retained for manual recovery.
REMOTE_CKPT=/workspace/anima/out_main/ckpt_d768x12l_final.pt
LOCAL_CKPT="$LOCAL_DIR/out_main/ckpt_d768x12l_final.pt"
PULL_ATTEMPTS=5
SAVED=$($SSH 'test -f /workspace/anima/out_main/result.json && echo SAVED' 2>/dev/null || true)
if [ "$SAVED" = "SAVED" ]; then
  echo " result.json present -> SAVE_POD=1 auto-promote"
  # Hold the pod while pulling: an exit in the middle must not destroy it.
  export SAVE_POD=1
  mkdir -p "$LOCAL_DIR/out_main" "$LOCAL_DIR/out_sanity"
  PULL_OK=0
  i=1
  while [ "$i" -le "$PULL_ATTEMPTS" ]; do
    echo " pull attempt $i/$PULL_ATTEMPTS..."
    # pipefail makes each `scp | tail` pipeline report scp's failure.
    if ! $SCP "root@$SH:/workspace/anima/out_main/result.json" "$LOCAL_DIR/out_main/" 2>&1 | tail -3; then
      echo " result.json pull fail try $i"
    else
      # Secondary artifacts are best effort.
      $SCP "root@$SH:/workspace/anima/out_sanity/result.json" "$LOCAL_DIR/out_sanity/" 2>&1 | tail -3 || true
      $SCP "root@$SH:/workspace/anima/gpu_util.log" "$LOCAL_DIR/" 2>&1 | tail -3 || true
      if $SCP "root@$SH:$REMOTE_CKPT" "$LOCAL_DIR/out_main/" 2>&1 | tail -3; then
        echo " ckpt pull OK (try $i)"
        PULL_OK=1
        break
      fi
      echo " ckpt pull fail try $i"
    fi
    # Back off only when another attempt will actually follow.
    if [ "$i" -lt "$PULL_ATTEMPTS" ]; then
      echo " retry in 60s..."
      sleep 60
    fi
    i=$((i + 1))
  done
  if [ "$PULL_OK" = "1" ]; then
    if [ -f "$LOCAL_CKPT" ]; then
      LOCAL_SHA=$(shasum -a 256 "$LOCAL_CKPT" | awk '{print $1}')
      REMOTE_SHA=$($SSH "sha256sum $REMOTE_CKPT" 2>/dev/null | awk '{print $1}')
      echo " ckpt sha256: $LOCAL_SHA"
      echo " ckpt size: $(stat -f%z "$LOCAL_CKPT") bytes"
      # scp's exit status alone does not prove the bytes match; compare hashes
      # before giving up the only other copy.
      if [ -n "$REMOTE_SHA" ] && [ "$LOCAL_SHA" = "$REMOTE_SHA" ]; then
        echo " PULL SUCCESS -> safe to destroy"
        export SAVE_POD=0
      else
        echo " ckpt sha256 not confirmed (remote=${REMOTE_SHA:-unavailable}) — SAVE_POD=1 RETAIN for manual recovery"
        echo " MANUAL: scp -i $SSH_KEY -P $SP root@$SH:$REMOTE_CKPT $LOCAL_DIR/out_main/"
        export SAVE_POD=1
      fi
    else
      echo " ckpt file missing locally despite PULL_OK? retaining pod"
      export SAVE_POD=1
    fi
  else
    echo " ALL $PULL_ATTEMPTS pull attempts FAILED — SAVE_POD=1 RETAIN for manual recovery"
    echo " MANUAL: scp -i $SSH_KEY -P $SP root@$SH:$REMOTE_CKPT $LOCAL_DIR/out_main/"
    export SAVE_POD=1
  fi
else
  echo " ERROR: result.json NOT present — SAVE_POD=1 retain for manual recovery"
  echo " MANUAL: ssh -i $SSH_KEY -p $SP root@$SH"
  export SAVE_POD=1
fi
174
+
# Closing summary: report the final SAVE_POD value, a UTC timestamp, and the
# contents of the local output directory (silently skipped if it is absent).
printf '%s\n' "[9/9] Done. teardown via trap (SAVE_POD=${SAVE_POD})."
date -u
ls -la "$LOCAL_DIR/out_main/" 2>/dev/null || true