#!/bin/bash
# hexad v4 d=768x12L PyTorch substrate fire CYCLE 5 — 2026-05-17
# DD155 Step+Tension hybrid LR overlay (Law 187, tension=grad_norm).
# Corpus v3 carry from cycle 4 (10.34 MB · helper-free · γ motivation 37.5%).
# Vast.ai A100 SXM4. g_fire_dispatch_robust + 75-min orphan watchdog + SAVE_POD auto-promote.
# Shell options: unset variables are errors and a pipeline fails if any stage
# fails. `-e` is not enabled, so every critical step below must check its own
# exit status.
set -uo pipefail

# --- Configuration ----------------------------------------------------------
VASTAI=/Users/ghost/Library/Python/3.14/bin/vastai
export PYTHONWARNINGS=ignore
LOCAL_DIR=/Users/ghost/core/anima/state/hexad_v4_py_d768x12L_tension_2026_05_17
CORPUS_LOCAL=/Users/ghost/core/anima/state/hexad_v3_corpus_motiv_2026_05_17/corpus_consciousness_v3.jsonl
SSH_KEY=/Users/ghost/.ssh/id_vast_anima
LABEL="hexad-v4-d768x12l-cycle5-tension"
STEPS="${STEPS:-2500}"   # overridable from the environment
WATCHDOG_MIN=75

# --- Preflight (runs before any paid instance is rented) --------------------
# STEPS is pasted verbatim into a generated remote shell script, so only a
# plain decimal integer is accepted.
case "$STEPS" in
  '' | *[!0-9]*)
    echo "ERROR STEPS must be a non-negative integer, got '$STEPS'" >&2
    exit 1
    ;;
esac

[ -x "$VASTAI" ] || { echo "ERROR vastai CLI not executable: $VASTAI" >&2; exit 1; }

# Every file that is uploaded or used for authentication later must exist now.
for required in \
  "$SSH_KEY" \
  "$CORPUS_LOCAL" \
  "$LOCAL_DIR/conscious_decoder.py" \
  "$LOCAL_DIR/train_d768x12l_tension.py"; do
  [ -r "$required" ] || { echo "ERROR required file not readable: $required" >&2; exit 1; }
done

# State files (vast_instance_id.txt, vast_ssh.txt) are written relative to the
# working directory, so a failed cd must stop the run.
cd "$LOCAL_DIR" || { echo "ERROR cannot cd to $LOCAL_DIR" >&2; exit 1; }

echo "=== hexad v4 d=768x12L PyTorch CYCLE 5 DD155 hybrid LR overlay ==="
date -u
echo "[1/9] Selecting A100 offer..."
# Search single-GPU A100 offers ordered by dph_total and take the first result.
# The CLI's stderr is deliberately NOT merged into the pipe: any diagnostic
# text mixed into stdout would make the JSON unparseable.
# The parser prints the offer id on stdout (captured into OFFER) and a
# one-line human-readable summary on stderr.
OFFER=$($VASTAI search offers 'gpu_name in [A100_SXM4,A100_PCIE] num_gpus=1 reliability>0.99 dph_total<2.0 disk_space>40 inet_down>700 cuda_max_good>=12.0' -o dph_total --raw | python3 -c '
import json, sys

offers = json.load(sys.stdin)
if not offers:
    sys.stderr.write("no offers\n")
    sys.exit(1)
best = offers[0]
print(best["id"])
sys.stderr.write("{} ${:.3f}/hr rel={:.3f}\n".format(
    best["gpu_name"], best["dph_total"], best["reliability"]))
')
# Without `set -e` a failed search would otherwise fall through to
# `create instance ""`; stop here instead.
[ -n "$OFFER" ] || { echo "ERROR no usable offer (search failed or returned nothing)" >&2; exit 1; }
echo " offer=$OFFER"
echo "[2/9] Renting (pytorch devel image)..."
# stderr stays on the terminal so CLI diagnostics cannot corrupt the JSON
# reply; losing the instance id after a successful create would leave a
# billed instance that this script can no longer destroy.
CREATE=$($VASTAI create instance "$OFFER" --image pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel --disk 40 --ssh --direct --label "$LABEL" --raw)
# Extract the instance id from the first of new_contract / contract_id / id
# that is present in the reply.
IID=$(printf '%s' "$CREATE" | python3 -c '
import json, sys

try:
    reply = json.load(sys.stdin)
except ValueError:
    sys.exit(1)
if not isinstance(reply, dict):
    sys.exit(1)
print(reply.get("new_contract", reply.get("contract_id", reply.get("id", ""))))
')
# Accept only a plain decimal id: this also rejects "None" (a JSON null) and
# anything else that must never be handed to `destroy instance`.
# NOTE(review): assumes Vast.ai instance ids are always integers — confirm.
case "$IID" in
  '' | *[!0-9]*)
    echo "ERROR create: $CREATE"
    echo "       if an instance was created anyway, look for label '$LABEL' and destroy it manually" >&2
    exit 1
    ;;
esac
printf '%s\n' "$IID" > vast_instance_id.txt
echo " instance=$IID"
# Orphan watchdog: a background subshell that wakes once a minute.
#  - If the launching shell is gone, it destroys the instance and stops.
#  - After WATCHDOG_MIN wake-ups it destroys the instance unconditionally.
# Both outcomes are appended to $LOCAL_DIR/watchdog.log.
(
  # Inside a subshell `$$` still expands to the PID of the invoking shell.
  owner_pid=$$
  wd_log="$LOCAL_DIR/watchdog.log"
  minute=0
  while [ "$minute" -lt "$WATCHDOG_MIN" ]; do
    minute=$((minute + 1))
    sleep 60
    # Signal 0 only probes for existence; nothing is delivered.
    kill -0 $owner_pid 2>/dev/null && continue
    echo "[watchdog] parent died, destroying $IID" >> "$wd_log"
    $VASTAI destroy instance "$IID" >> "$wd_log" 2>&1
    exit 0
  done
  echo "[watchdog] ${WATCHDOG_MIN}min cap hit, force-destroying $IID" >> "$wd_log"
  $VASTAI destroy instance "$IID" >> "$wd_log" 2>&1
) &
WATCHDOG_PID=$!
echo " orphan-watchdog pid=$WATCHDOG_PID cap=${WATCHDOG_MIN}min"
# Keep a caller-supplied SAVE_POD; otherwise default to 0 (destroy on exit).
: "${SAVE_POD:=0}"

#######################################
# Exit handler: stops the watchdog, then either keeps or destroys the instance.
# Globals (read): WATCHDOG_PID, SAVE_POD, IID, VASTAI, SSH_KEY, SP, SH
# Outputs: one status line, plus at most the first two lines printed by the
#          destroy command.
# Returns: 0 on both paths.
#######################################
cleanup() {
  kill $WATCHDOG_PID 2>/dev/null || :
  if [ "${SAVE_POD:-0}" != "1" ]; then
    echo "[cleanup] destroying $IID"
    $VASTAI destroy instance "$IID" 2>&1 | head -2 || :
    return
  fi
  echo "[cleanup] SAVE_POD=1 -> RETAIN instance $IID (manual recovery: ssh -i $SSH_KEY -p $SP root@$SH)"
}

# Run the handler on every exit path of the script.
trap cleanup EXIT
echo "[3/9] Waiting SSH..."

# Print the first non-empty value found under the given keys of the JSON
# object on stdin. Prints nothing when the input is not a JSON object or none
# of the keys holds a usable value (so a JSON null never becomes "None").
_vast_field() {
  python3 -c '
import json, sys

try:
    info = json.load(sys.stdin)
    for key in sys.argv[1:]:
        value = info.get(key, "")
        if value:
            print(value)
            break
except Exception:
    pass
' "$@" 2>/dev/null
}

# Poll up to 90 times, 5 s apart. SH/SP are cleared at the start of every
# attempt and readiness is tracked by an explicit flag, so a host left over
# from an attempt where the port was missing or the probe failed can never be
# mistaken for a verified connection.
SH=""; SP=""
SSH_READY=0
for ((i = 1; i <= 90; i++)); do
  INFO=$($VASTAI show instance "$IID" --raw 2>/dev/null || true)
  [ -z "$INFO" ] && INFO="{}"
  ST=$(printf '%s' "$INFO" | _vast_field actual_status)
  SH=""; SP=""
  if [ "$ST" = "running" ]; then
    SH=$(printf '%s' "$INFO" | _vast_field ssh_host public_ipaddr)
    SP=$(printf '%s' "$INFO" | _vast_field ssh_port direct_port_start)
    # The instance counts as ready only once a real command round-trips.
    if [ -n "$SH" ] && [ -n "$SP" ] \
      && ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -p "$SP" "root@$SH" 'echo READY' 2>&1 | grep -q READY; then
      echo " SSH ready $SH:$SP (after ${i}x5s)"
      SSH_READY=1
      break
    fi
  fi
  echo " attempt $i/90 status=$ST"
  sleep 5
done
[ "$SSH_READY" = "1" ] || { echo "ERROR SSH not ready"; exit 1; }
echo "$SH:$SP" > vast_ssh.txt
# Command prefixes reused by the remaining steps. They are plain strings that
# are expanded unquoted ($SSH / $SCP), so SSH_KEY, SP and SH must not contain
# whitespace.
SSH="ssh -i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -p $SP root@$SH"
SCP="scp -i $SSH_KEY -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=3600 -P $SP"

# Every step here is a hard prerequisite for training. The script runs without
# `set -e`, so each one is checked explicitly; exiting fires the script's EXIT
# trap instead of carrying on with a pod that cannot train.
echo "[4/9] Upload arch + trainer + corpus v3..."
$SSH 'mkdir -p /workspace/anima' \
  || { echo "ERROR remote mkdir /workspace/anima failed" >&2; exit 1; }
$SCP "$LOCAL_DIR/conscious_decoder.py" "root@$SH:/workspace/anima/" \
  || { echo "ERROR upload failed: conscious_decoder.py" >&2; exit 1; }
$SCP "$LOCAL_DIR/train_d768x12l_tension.py" "root@$SH:/workspace/anima/" \
  || { echo "ERROR upload failed: train_d768x12l_tension.py" >&2; exit 1; }
# The corpus is renamed on upload to the name the remote scripts expect.
$SCP "$CORPUS_LOCAL" "root@$SH:/workspace/anima/corpus_v3.jsonl" \
  || { echo "ERROR upload failed: $CORPUS_LOCAL" >&2; exit 1; }

echo "[5/9] GPU + torch + corpus verify..."
# Prints torch version, CUDA availability and device name, then the corpus
# line count and checksum as seen on the remote side.
$SSH 'cd /workspace/anima && python3 -c "import torch;print(torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0))" && wc -l corpus_v3.jsonl && sha256sum corpus_v3.jsonl' \
  || { echo "ERROR remote GPU/torch/corpus verification failed" >&2; exit 1; }
echo "[6/9] Sanity-anchor (d=32x3L, 200 step) via remote-script..."
# Generate the sanity runner locally (kept in LOCAL_DIR), copy it to the pod
# and execute it there. Its output is mirrored to sanity_remote.log; a failing
# sanity run does not stop the script.
sanity_script="$LOCAL_DIR/run_sanity_remote.sh"
printf '%s\n' \
  '#!/bin/bash' \
  'set -uo pipefail' \
  'cd /workspace/anima' \
  'export PYTHONUNBUFFERED=1' \
  'python3 train_d768x12l_tension.py --mode sanity --corpus corpus_v3.jsonl --out-dir out_sanity --steps 200 2>&1 | tee sanity.log' \
  > "$sanity_script"
chmod +x "$sanity_script"
$SCP "$sanity_script" "root@$SH:/workspace/anima/run_sanity_remote.sh"
$SSH "bash /workspace/anima/run_sanity_remote.sh" 2>&1 | tee "$LOCAL_DIR/sanity_remote.log" || :
echo "[7/9] MAIN fire d=768 n_layer=12 from-scratch DD155 hybrid LR, $STEPS steps..."
# The heredoc delimiter is unquoted on purpose: $STEPS is expanded locally
# while every escaped \$ is left for the remote shell.
# The generated script samples GPU utilisation every 30 s in the background,
# runs the trainer, and reports the trainer's exit status in DONE_MARKER.
cat > "$LOCAL_DIR/run_main_remote.sh" <<SH_EOF
#!/bin/bash
set -uo pipefail
cd /workspace/anima
export PYTHONUNBUFFERED=1
nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv -l 30 > gpu_util.log 2>&1 &
SMI=\$!
python3 train_d768x12l_tension.py --mode main --corpus corpus_v3.jsonl --out-dir out_main --steps $STEPS 2>&1 | tee fire.log
# Save the trainer's status now; the kill below would otherwise overwrite it.
rc=\${PIPESTATUS[0]}
kill \$SMI 2>/dev/null || true
echo DONE_MARKER rc=\$rc
exit \$rc
SH_EOF
chmod +x "$LOCAL_DIR/run_main_remote.sh"
$SCP "$LOCAL_DIR/run_main_remote.sh" "root@$SH:/workspace/anima/run_main_remote.sh"
# A non-zero remote status is tolerated here; the next step decides what to do
# based on whether result.json exists.
$SSH "bash /workspace/anima/run_main_remote.sh" 2>&1 | tee "$LOCAL_DIR/fire.log" || true
echo "[8/9] Verify result.json + SAVE_POD auto-promote + pull retry >=5..."
# SAVE_POD is what the exit handler consults. It is raised to 1 as soon as a
# result exists remotely and lowered to 0 again only after the final
# checkpoint is confirmed on local disk.
SAVED=$($SSH 'test -f /workspace/anima/out_main/result.json && echo SAVED' 2>/dev/null || true)
if [ "$SAVED" != "SAVED" ]; then
  echo " ERROR: result.json NOT present — SAVE_POD=1 retain for manual recovery"
  echo " MANUAL: ssh -i $SSH_KEY -p $SP root@$SH"
  export SAVE_POD=1
else
  echo " result.json present -> SAVE_POD=1 auto-promote"
  export SAVE_POD=1
  mkdir -p "$LOCAL_DIR/out_main" "$LOCAL_DIR/out_sanity"
  remote_root="root@$SH:/workspace/anima"
  local_ckpt="$LOCAL_DIR/out_main/ckpt_d768x12l_final.pt"

  # Up to five attempts, 60 s apart. An attempt succeeds only when both
  # result.json and the final checkpoint were copied; the sanity result and
  # the GPU log are copied best-effort.
  PULL_OK=0
  try=0
  while [ "$try" -lt 5 ]; do
    try=$((try + 1))
    echo " pull attempt $try/5..."
    # With pipefail set, a failing scp fails the whole `scp | tail` pipeline.
    if ! $SCP "$remote_root/out_main/result.json" "$LOCAL_DIR/out_main/" 2>&1 | tail -3; then
      echo " result.json pull fail try $try"
      sleep 60
      continue
    fi
    $SCP "$remote_root/out_sanity/result.json" "$LOCAL_DIR/out_sanity/" 2>&1 | tail -3 || :
    $SCP "$remote_root/gpu_util.log" "$LOCAL_DIR/" 2>&1 | tail -3 || :
    if ! $SCP "$remote_root/out_main/ckpt_d768x12l_final.pt" "$LOCAL_DIR/out_main/" 2>&1 | tail -3; then
      echo " ckpt pull fail try $try, retry in 60s..."
      sleep 60
      continue
    fi
    echo " ckpt pull OK (try $try)"
    PULL_OK=1
    break
  done

  if [ "$PULL_OK" != "1" ]; then
    echo " ALL 5 pull attempts FAILED — SAVE_POD=1 RETAIN for manual recovery"
    echo " MANUAL: scp -i $SSH_KEY -P $SP root@$SH:/workspace/anima/out_main/ckpt_d768x12l_final.pt $LOCAL_DIR/out_main/"
    export SAVE_POD=1
  elif [ ! -f "$local_ckpt" ]; then
    echo " ckpt file missing locally despite PULL_OK? retaining pod"
    export SAVE_POD=1
  else
    echo " ckpt sha256: $(shasum -a 256 "$local_ckpt" | awk '{print $1}')"
    echo " ckpt size: $(stat -f%z "$local_ckpt") bytes"
    echo " PULL SUCCESS -> safe to destroy"
    export SAVE_POD=0
  fi
fi

echo "[9/9] Done. teardown via trap (SAVE_POD=${SAVE_POD})."
date -u
ls -la "$LOCAL_DIR/out_main/" 2>/dev/null || :
# End of fire script.