feat(hexad): v4-py-hexad-tension-d768x12L-cycle1-2026-05-17 — dispatch.sh
Browse files- dispatch.sh +177 -0
dispatch.sh
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# hexad v4 d=768x12L PyTorch substrate fire CYCLE 5 — 2026-05-17
# DD155 Step+Tension hybrid LR overlay (Law 187, tension=grad_norm).
# Corpus v3 carry from cycle 4 (10.34 MB · helper-free · γ motivation 37.5%).
# Vast.ai A100 SXM4. g_fire_dispatch_robust + 75-min orphan watchdog + SAVE_POD auto-promote.
#
# Environment overrides:
#   STEPS     number of main-run training steps (default 2500)
#   SAVE_POD  1 = keep the rented instance alive on exit (default 0)
#
# `set -e` is intentionally NOT enabled: later steps tolerate failures with
# `|| true` and decide explicitly whether to retain or destroy the instance.
set -uo pipefail

VASTAI=/Users/ghost/Library/Python/3.14/bin/vastai
# Presumably keeps Python warnings out of CLI output that is parsed as JSON.
export PYTHONWARNINGS=ignore
LOCAL_DIR=/Users/ghost/core/anima/state/hexad_v4_py_d768x12L_tension_2026_05_17
CORPUS_LOCAL=/Users/ghost/core/anima/state/hexad_v3_corpus_motiv_2026_05_17/corpus_consciousness_v3.jsonl
SSH_KEY=/Users/ghost/.ssh/id_vast_anima
LABEL="hexad-v4-d768x12l-cycle5-tension"
STEPS="${STEPS:-2500}"
WATCHDOG_MIN=75

# STEPS is interpolated verbatim into the generated remote script, so reject
# anything that is not a plain non-negative integer before it gets that far.
case "$STEPS" in
  '' | *[!0-9]*)
    echo "ERROR STEPS must be a non-negative integer (got '$STEPS')" >&2
    exit 1
    ;;
esac

# Preflight: fail before any money is spent if a local input is missing.
[ -x "$VASTAI" ] || { echo "ERROR vastai CLI not executable: $VASTAI" >&2; exit 1; }
for required in \
  "$SSH_KEY" \
  "$CORPUS_LOCAL" \
  "$LOCAL_DIR/conscious_decoder.py" \
  "$LOCAL_DIR/train_d768x12l_tension.py"; do
  [ -r "$required" ] || { echo "ERROR missing or unreadable: $required" >&2; exit 1; }
done

# Relative paths written later (instance id, ssh endpoint) depend on this cd.
cd "$LOCAL_DIR" || { echo "ERROR cannot cd to $LOCAL_DIR" >&2; exit 1; }
echo "=== hexad v4 d=768x12L PyTorch CYCLE 5 DD155 hybrid LR overlay ==="; date -u
|
| 19 |
+
|
| 20 |
+
echo "[1/9] Selecting A100 offer..."
# Query offers sorted by dph_total and take the first entry. The Python
# filter prints the offer id on stdout (captured into OFFER) and a one-line
# human summary on stderr.
#
# vastai's stderr is deliberately NOT merged into the pipe: any warning text
# mixed into the JSON stream would make json.load fail and abort selection.
OFFER=$($VASTAI search offers 'gpu_name in [A100_SXM4,A100_PCIE] num_gpus=1 reliability>0.99 dph_total<2.0 disk_space>40 inet_down>700 cuda_max_good>=12.0' -o dph_total --raw | python3 -c "
import json,sys
try:
    d=json.load(sys.stdin)
except ValueError as e:
    sys.stderr.write(f'offer search returned non-JSON output: {e}\n'); sys.exit(1)
if not d: sys.stderr.write('no offers\n'); sys.exit(1)
o=d[0]; print(o['id']); sys.stderr.write(f\"{o['gpu_name']} \${o['dph_total']:.3f}/hr rel={o['reliability']:.3f}\n\")
")
# Without this guard an empty OFFER would be passed on to `create instance`.
[ -n "$OFFER" ] || { echo "ERROR no usable offer selected" >&2; exit 1; }
echo " offer=$OFFER"
|
| 28 |
+
|
| 29 |
+
echo "[2/9] Renting (pytorch devel image)..."
# stderr is kept out of CREATE so that a CLI warning cannot corrupt the JSON
# reply: if the reply failed to parse after the instance was actually created,
# the script would exit without an id and leave a billed instance behind.
CREATE=$($VASTAI create instance "$OFFER" --image pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel --disk 40 --ssh --direct --label "$LABEL" --raw)
# The id is looked up under several keys; a null value falls through to the
# next key instead of being printed as the literal string "None".
IID=$(printf '%s' "$CREATE" | python3 -c "import json,sys
try:
    d=json.load(sys.stdin)
except Exception:
    sys.exit(1)
if not isinstance(d, dict):
    sys.exit(1)
print(d.get('new_contract') or d.get('contract_id') or d.get('id') or '')")
if [ -z "$IID" ]; then
  echo "ERROR create: $CREATE"
  echo "       no instance id parsed; check for a stray instance labelled '$LABEL'" >&2
  exit 1
fi
echo "$IID" > "$LOCAL_DIR/vast_instance_id.txt"
echo " instance=$IID"
|
| 38 |
+
|
| 39 |
+
# Orphan watchdog: a background subshell that wakes once a minute. It destroys
# the instance if the dispatching shell has disappeared, and destroys it
# unconditionally once WATCHDOG_MIN minutes have elapsed. All of its output
# goes to watchdog.log in LOCAL_DIR.
(
  owner_pid=$$ # inside a subshell, $$ is still the PID of the main script
  minutes_left=$WATCHDOG_MIN
  while [ "$minutes_left" -gt 0 ]; do
    sleep 60
    kill -0 "$owner_pid" 2>/dev/null || {
      echo "[watchdog] parent died, destroying $IID" >> "$LOCAL_DIR/watchdog.log"
      $VASTAI destroy instance "$IID" >> "$LOCAL_DIR/watchdog.log" 2>&1
      exit 0
    }
    minutes_left=$((minutes_left - 1))
  done
  echo "[watchdog] ${WATCHDOG_MIN}min cap hit, force-destroying $IID" >> "$LOCAL_DIR/watchdog.log"
  $VASTAI destroy instance "$IID" >> "$LOCAL_DIR/watchdog.log" 2>&1
) &
WATCHDOG_PID=$!
echo " orphan-watchdog pid=$WATCHDOG_PID cap=${WATCHDOG_MIN}min"
|
| 54 |
+
|
| 55 |
+
SAVE_POD="${SAVE_POD:-0}"

#######################################
# EXIT handler: stop the watchdog, then either retain or destroy the instance.
# Globals (read): WATCHDOG_PID, SAVE_POD, IID, SSH_KEY, SP, SH, VASTAI
# Outputs: progress on stdout; a warning on stderr if the destroy call fails.
# Returns: always 0, so the handler itself never fails.
#######################################
cleanup() {
  # Stop the watchdog first so it cannot destroy a pod we decide to retain.
  if [ -n "${WATCHDOG_PID:-}" ]; then
    kill "$WATCHDOG_PID" 2>/dev/null || true
  fi

  if [ "${SAVE_POD:-0}" = "1" ]; then
    # SP/SH are assigned after the trap is armed; default them so that an
    # early exit under `set -u` still prints the recovery hint.
    echo "[cleanup] SAVE_POD=1 -> RETAIN instance ${IID:-} (manual recovery: ssh -i $SSH_KEY -p ${SP:-} root@${SH:-})"
    return 0
  fi

  if [ -z "${IID:-}" ]; then
    echo "[cleanup] no instance id recorded, nothing to destroy"
    return 0
  fi

  echo "[cleanup] destroying $IID"
  local destroy_out destroy_rc=0
  # $VASTAI is left unquoted to match how the rest of the script invokes it.
  destroy_out=$($VASTAI destroy instance "$IID" 2>&1) || destroy_rc=$?
  if [ -n "$destroy_out" ]; then
    printf '%s\n' "$destroy_out" | head -2
  fi
  if [ "$destroy_rc" -ne 0 ]; then
    # A failed destroy means the instance may still be billing; say so loudly.
    echo "[cleanup] WARNING: destroy of $IID exited $destroy_rc - verify manually: $VASTAI destroy instance $IID" >&2
  fi
  return 0
}
trap cleanup EXIT
|
| 65 |
+
|
| 66 |
+
echo "[3/9] Waiting SSH..."
SH=""; SP=""
# Readiness is tracked explicitly. Inferring it from a non-empty SH is unsafe:
# SH can stay populated from an iteration in which SP came back empty.
SSH_READY=0

# Reads a JSON object on stdin and prints the first non-empty value among the
# two given keys. Prints an empty line if both are missing/null, and nothing
# at all on malformed JSON.
instance_field() {
  python3 -c "import json,sys
try:
    d=json.load(sys.stdin)
    print(d.get(sys.argv[1]) or d.get(sys.argv[2]) or '')
except Exception:
    pass" "$1" "$2" 2>/dev/null
}

# Up to 90 polls with a 5 s pause between them.
for ((i = 1; i <= 90; i++)); do
  INFO=$($VASTAI show instance "$IID" --raw 2>/dev/null || true)
  [ -n "$INFO" ] || INFO="{}"
  ST=$(printf '%s' "$INFO" | python3 -c "import json,sys
try: print(json.load(sys.stdin).get('actual_status',''))
except: print('')" 2>/dev/null)
  if [ "$ST" = "running" ]; then
    SH=$(printf '%s' "$INFO" | instance_field ssh_host public_ipaddr)
    SP=$(printf '%s' "$INFO" | instance_field ssh_port direct_port_start)
    # "running" alone is not enough: require a successful round trip over SSH.
    if [ -n "$SH" ] && [ -n "$SP" ] &&
      ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -p "$SP" "root@$SH" 'echo READY' 2>&1 | grep -q READY; then
      SSH_READY=1
      echo " SSH ready $SH:$SP (after ${i}x5s)"
      break
    fi
  fi
  echo " attempt $i/90 status=$ST"; sleep 5
done
[ "$SSH_READY" = "1" ] || { echo "ERROR SSH not ready"; exit 1; }
echo "$SH:$SP" > "$LOCAL_DIR/vast_ssh.txt"
# Command prefixes kept as plain strings and expanded unquoted by later steps;
# this relies on SSH_KEY, SP and SH containing no whitespace.
SSH="ssh -i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p $SP root@$SH"
SCP="scp -i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=3600 -P $SP"
|
| 95 |
+
|
| 96 |
+
echo "[4/9] Upload arch + trainer + corpus v3..."
# Every command in steps 4 and 5 is a hard prerequisite for training. If one
# fails there is nothing on the pod worth keeping, so abort immediately; the
# EXIT trap then tears the instance down unless SAVE_POD=1 was requested.
abort_step() {
  echo "ERROR $*" >&2
  exit 1
}

$SSH 'mkdir -p /workspace/anima' ||
  abort_step "remote mkdir /workspace/anima failed"
$SCP "$LOCAL_DIR/conscious_decoder.py" "root@$SH:/workspace/anima/" ||
  abort_step "upload failed: conscious_decoder.py"
$SCP "$LOCAL_DIR/train_d768x12l_tension.py" "root@$SH:/workspace/anima/" ||
  abort_step "upload failed: train_d768x12l_tension.py"
# The corpus is renamed to corpus_v3.jsonl on the remote side.
$SCP "$CORPUS_LOCAL" "root@$SH:/workspace/anima/corpus_v3.jsonl" ||
  abort_step "upload failed: $CORPUS_LOCAL"

echo "[5/9] GPU + torch + corpus verify..."
# Prints torch version / CUDA availability / device name, then the corpus line
# count and sha256. A non-zero exit from any part of the chain aborts the run.
$SSH 'cd /workspace/anima && python3 -c "import torch;print(torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0))" && wc -l corpus_v3.jsonl && sha256sum corpus_v3.jsonl' ||
  abort_step "remote GPU/torch/corpus verification failed"
|
| 104 |
+
|
| 105 |
+
echo "[6/9] Sanity-anchor (d=32x3L, 200 step) via remote-script..."
sanity_script="$LOCAL_DIR/run_sanity_remote.sh"
# Quoted delimiter: the script body is written out verbatim, nothing expands
# on the local side.
cat <<'SH_EOF' > "$sanity_script"
#!/bin/bash
set -uo pipefail
cd /workspace/anima
export PYTHONUNBUFFERED=1
python3 train_d768x12l_tension.py --mode sanity --corpus corpus_v3.jsonl --out-dir out_sanity --steps 200 2>&1 | tee sanity.log
SH_EOF
chmod +x "$sanity_script"
$SCP "$sanity_script" "root@$SH:/workspace/anima/run_sanity_remote.sh"
# Best-effort: the remote output is mirrored into sanity_remote.log and a
# failing sanity run does not stop the dispatch.
$SSH "bash /workspace/anima/run_sanity_remote.sh" 2>&1 | tee "$LOCAL_DIR/sanity_remote.log" || true
|
| 116 |
+
|
| 117 |
+
echo "[7/9] MAIN fire d=768 n_layer=12 from-scratch DD155 hybrid LR, $STEPS steps..."
# Unquoted delimiter: $STEPS is expanded locally while the script is written;
# every \$ below is escaped so that it is evaluated on the remote host.
cat > "$LOCAL_DIR/run_main_remote.sh" <<SH_EOF
#!/bin/bash
set -uo pipefail
cd /workspace/anima
export PYTHONUNBUFFERED=1
nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv -l 30 > gpu_util.log 2>&1 &
SMI=\$!
python3 train_d768x12l_tension.py --mode main --corpus corpus_v3.jsonl --out-dir out_main --steps $STEPS 2>&1 | tee fire.log
# Capture the trainer's own status right away: by the time the marker is
# printed, \$? would only reflect the "kill ... || true" line (always 0).
RC=\${PIPESTATUS[0]}
kill \$SMI 2>/dev/null || true
echo DONE_MARKER rc=\$RC
SH_EOF
chmod +x "$LOCAL_DIR/run_main_remote.sh"
$SCP "$LOCAL_DIR/run_main_remote.sh" "root@$SH:/workspace/anima/run_main_remote.sh"
# Failure is tolerated here; step 8 decides what to do based on result.json.
$SSH "bash /workspace/anima/run_main_remote.sh" 2>&1 | tee "$LOCAL_DIR/fire.log" || true
|
| 132 |
+
|
| 133 |
+
echo "[8/9] Verify result.json + SAVE_POD auto-promote + pull retry >=5..."
remote_root="root@$SH:/workspace/anima"
ckpt_name="ckpt_d768x12l_final.pt"
ckpt_local="$LOCAL_DIR/out_main/$ckpt_name"

# Outcome matrix:
#   no result.json on the pod          -> retain pod (SAVE_POD=1)
#   result.json present, pull succeeds -> release pod (SAVE_POD=0)
#   result.json present, pull fails    -> retain pod (SAVE_POD=1)
SAVED=$($SSH 'test -f /workspace/anima/out_main/result.json && echo SAVED' 2>/dev/null || true)
if [ "$SAVED" != "SAVED" ]; then
  echo " ERROR: result.json NOT present — SAVE_POD=1 retain for manual recovery"
  echo " MANUAL: ssh -i $SSH_KEY -p $SP root@$SH"
  export SAVE_POD=1
else
  echo " result.json present -> SAVE_POD=1 auto-promote"
  # Retain the pod for as long as the artifacts exist only remotely.
  export SAVE_POD=1
  mkdir -p "$LOCAL_DIR/out_main" "$LOCAL_DIR/out_sanity"
  PULL_OK=0
  for ((try = 1; try <= 5; try++)); do
    echo " pull attempt $try/5..."
    # The pipeline status reflects scp only because pipefail is enabled.
    if ! $SCP "$remote_root/out_main/result.json" "$LOCAL_DIR/out_main/" 2>&1 | tail -3; then
      echo " result.json pull fail try $try"
      sleep 60
      continue
    fi
    # Optional artifacts: a failed copy is ignored.
    $SCP "$remote_root/out_sanity/result.json" "$LOCAL_DIR/out_sanity/" 2>&1 | tail -3 || true
    $SCP "$remote_root/gpu_util.log" "$LOCAL_DIR/" 2>&1 | tail -3 || true
    if ! $SCP "$remote_root/out_main/$ckpt_name" "$LOCAL_DIR/out_main/" 2>&1 | tail -3; then
      echo " ckpt pull fail try $try, retry in 60s..."
      sleep 60
      continue
    fi
    echo " ckpt pull OK (try $try)"
    PULL_OK=1
    break
  done

  if [ "$PULL_OK" != "1" ]; then
    echo " ALL 5 pull attempts FAILED — SAVE_POD=1 RETAIN for manual recovery"
    echo " MANUAL: scp -i $SSH_KEY -P $SP $remote_root/out_main/$ckpt_name $LOCAL_DIR/out_main/"
    export SAVE_POD=1
  elif [ -f "$ckpt_local" ]; then
    echo " ckpt sha256: $(shasum -a 256 "$ckpt_local" | awk '{print $1}')"
    echo " ckpt size: $(stat -f%z "$ckpt_local") bytes"
    echo " PULL SUCCESS -> safe to destroy"
    export SAVE_POD=0
  else
    echo " ckpt file missing locally despite PULL_OK? retaining pod"
    export SAVE_POD=1
  fi
fi

# The actual teardown (or retention) happens in the EXIT trap.
printf '%s\n' "[9/9] Done. teardown via trap (SAVE_POD=${SAVE_POD})."
date -u
ls -la "$LOCAL_DIR/out_main/" 2>/dev/null || true
|