#!/usr/bin/env bash
# Full redeploy to an existing AMD MI300X droplet.
#
# 1. Generate a fresh bearer token
# 2. scripts/deploy_droplet.sh <ip> <token> (bring up vLLM + riprap-models)
# 3. scripts/update_hf_env.sh <ip> <token> (update HF Space vars + restart)
# 4. .venv/bin/python scripts/probe_addresses.py (5/5 must pass)
#
# Usage: scripts/redeploy.sh <droplet-ip>
#
# Requires:
# HF auth — either `huggingface-cli login` (preferred) or HF_TOKEN env var
# .venv Python virtual environment with probe_addresses.py deps
# SSH access to the droplet (ssh-agent or SSH_KEY env var)
#
# Exit codes:
# 0 all three steps passed
# 1 deploy_droplet.sh failed (HF Space NOT touched)
# 1 update_hf_env.sh failed (droplet is up but HF Space NOT updated)
# 1 probe_addresses.py failed (deploy + HF update succeeded; not rolled back)
# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Require exactly one non-empty positional argument: the droplet address.
# The emptiness test matters because `redeploy.sh ""` satisfies the count
# check, and the empty string would then be handed to every later step as
# the target host.
if [[ "$#" -ne 1 || -z "${1:-}" ]]; then
  echo "Usage: $0 <droplet-ip>" >&2
  exit 1
fi
IP="$1"
# Pre-flight: confirm Hugging Face credentials work before starting the long
# droplet build. HfApi() uses whichever of HF_TOKEN or a cached CLI login is
# available. Only stdout is discarded; the Python diagnostics are written to
# stderr and remain visible to the caller.
python3 -c "
import sys
from huggingface_hub import HfApi

try:
    HfApi().whoami()
except Exception as err:
    print(f'HF auth check failed: {err}', file=sys.stderr)
    print('Run: huggingface-cli login (or: export HF_TOKEN=...)', file=sys.stderr)
    sys.exit(1)
" >/dev/null || exit 1
# Absolute path of the repository root, i.e. the parent of the directory
# that holds this script.
SCRIPT_DIR="$(dirname "$0")"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Reference point for the "Total time" line of the summary.
START_SECONDS=$SECONDS

# Every step is reported as FAIL until it has demonstrably succeeded.
DEPLOY_STATUS="FAIL" HF_STATUS="FAIL" PROBE_STATUS="FAIL"

# ---- 1. Generate a fresh bearer token ------------------------------------
# 24 random bytes encode to 32 base64 characters; the characters '/', '+'
# and '=' are then removed so the token stays URL-safe (which can make it
# shorter than 32 characters).
RAW_TOKEN=$(openssl rand -base64 24)
TOKEN=$(tr -d '/+=' <<<"${RAW_TOKEN}")

printf '==> Deploying to %s with fresh token...\n\n' "${IP}"
# ---- Summary helper ------------------------------------------------------
# Print the end-of-run summary to stdout. Shared by every exit path so the
# report has one definition instead of three copies.
# Globals:   IP, DEPLOY_STATUS, HF_STATUS, PROBE_STATUS, START_SECONDS (read)
# Arguments: $1 - text shown on the "Token" line
print_summary() {
  local token_note="$1"
  local elapsed=$(( SECONDS - START_SECONDS ))
  echo
  echo "=== redeploy summary ==="
  echo "Droplet IP : ${IP}"
  echo "Token : ${token_note}"
  echo "Deploy : ${DEPLOY_STATUS}"
  echo "HF Space : ${HF_STATUS}"
  echo "E2E probe : ${PROBE_STATUS}"
  printf "Total time : %dm%02ds\n" $(( elapsed / 60 )) $(( elapsed % 60 ))
}

# ---- 2. deploy_droplet.sh ------------------------------------------------
if bash "${REPO_ROOT}/scripts/deploy_droplet.sh" "$IP" "$TOKEN"; then
  DEPLOY_STATUS="PASS"
else
  echo "deploy_droplet.sh failed" >&2
  # Print the summary before exiting so the caller sees the partial state.
  print_summary "(not set — deploy failed before token was registered)"
  exit 1
fi

echo
echo "==> Deploy succeeded. Updating HF Space..."
echo

# ---- 3. update_hf_env.sh -------------------------------------------------
if bash "${REPO_ROOT}/scripts/update_hf_env.sh" "$IP" "$TOKEN"; then
  HF_STATUS="PASS"
else
  echo "update_hf_env.sh failed. HF Space NOT updated." >&2
  # The deploy step above already received the new token, but this step
  # failed, so the summary must not point the reader at the HF Space vars.
  print_summary "(regenerated on droplet; NOT yet in HF Space vars)"
  exit 1
fi

echo
echo "==> HF Space updated. Running end-to-end probe..."
echo

# ---- 4. probe_addresses.py -----------------------------------------------
# probe_addresses.py exits 0 only when 5/5 pass (from docs/DROPLET-RUNBOOK.md).
# `|| PROBE_EXIT=$?` records the exit status without tripping `set -e`, so the
# summary below is printed even when the probe fails.
PROBE_EXIT=0
"${REPO_ROOT}/.venv/bin/python" "${REPO_ROOT}/scripts/probe_addresses.py" \
  || PROBE_EXIT=$?

if [[ "$PROBE_EXIT" -eq 0 ]]; then
  PROBE_STATUS="PASS"
else
  PROBE_STATUS="FAIL"
fi

# ---- 5. Summary ----------------------------------------------------------
print_summary "(regenerated, see HF Space vars)"

# Exit 1 if the probe failed; deploy + HF update already succeeded and are
# not rolled back.
[[ "$PROBE_STATUS" == "PASS" ]] || exit 1