Spaces:
Paused
Stop training failures from killing the API server (fixes 500 on Space)
Browse files
After the previous restructure, scripts/space_start.sh ran with
`set -eu` and an EXIT trap that killed uvicorn. If
`osint-env train-self-play` crashed for any reason (e.g. hardware
mismatch with the L40S config, missing GPU, transient HF Hub
download error) the script exited, the trap fired, and uvicorn
died with it. Visitors then saw an internal server error / Space
unavailable response on every route, including the dashboards
which are completely independent of training.
Rewrite the start script so:
- `set -e` is dropped; non-zero exits no longer cascade.
- uvicorn runs as the persistent process the script `wait`s on, so
the container stays alive as long as the API is healthy.
- training runs in the background with its output redirected to
/tmp/self_play_training.log; a supervising subshell reports its exit
code and never propagates failure to the parent script.
- The signal trap is INT/TERM only (real platform shutdown),
forwarding signals to both children, instead of EXIT which fired
on every script-level exit.
Made-with: Cursor
- scripts/space_start.sh +51 -36
|
@@ -1,5 +1,9 @@
|
|
| 1 |
#!/bin/sh
|
| 2 |
-
set -
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
_is_true() {
|
| 5 |
case "${1:-}" in
|
|
@@ -15,37 +19,38 @@ RUN_FLAG="${RUN_SELF_PLAY_TRAINING:-1}"
|
|
| 15 |
DRY_RUN_FLAG="${RUN_SELF_PLAY_DRY_RUN:-0}"
|
| 16 |
SERVE_API_FLAG="${RUN_SPACE_API_SERVER:-1}"
|
| 17 |
PORT_VALUE="${PORT:-7860}"
|
| 18 |
-
|
| 19 |
|
| 20 |
UVICORN_PID=""
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
if ! _is_true "$SERVE_API_FLAG"; then
|
| 24 |
-
echo "[space_start] RUN_SPACE_API_SERVER disabled.
|
| 25 |
-
|
| 26 |
fi
|
| 27 |
-
echo "[space_start] Starting API server
|
| 28 |
-
|
| 29 |
-
# primary process. If HF infrastructure SIGTERMs the container we still
|
| 30 |
-
# want training to receive the signal and flush a final checkpoint, not
|
| 31 |
-
# to silently die because PID 1 (uvicorn previously) exited first.
|
| 32 |
-
uvicorn server:app --host 0.0.0.0 --port "${PORT_VALUE}" \
|
| 33 |
-
>"${UVICORN_LOG_PATH}" 2>&1 &
|
| 34 |
UVICORN_PID=$!
|
| 35 |
echo "[space_start] uvicorn pid=${UVICORN_PID}"
|
| 36 |
}
|
| 37 |
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then
|
| 40 |
echo "[space_start] Stopping uvicorn pid=${UVICORN_PID}."
|
| 41 |
kill "${UVICORN_PID}" 2>/dev/null || true
|
| 42 |
-
wait "${UVICORN_PID}" 2>/dev/null || true
|
| 43 |
fi
|
| 44 |
}
|
| 45 |
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
| 49 |
if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
|
| 50 |
OUTPUT_ARG="--train-output-dir ${TRAIN_OUTPUT_DIR}"
|
| 51 |
else
|
|
@@ -53,19 +58,35 @@ _train_self_play() {
|
|
| 53 |
fi
|
| 54 |
|
| 55 |
if _is_true "$DRY_RUN_FLAG"; then
|
| 56 |
-
echo "[space_start] Running self-play in dry-run mode."
|
| 57 |
# shellcheck disable=SC2086
|
| 58 |
-
osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} --dry-run
|
|
|
|
| 59 |
else
|
| 60 |
-
echo "[space_start] Running self-play training (
|
| 61 |
# shellcheck disable=SC2086
|
| 62 |
-
osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG}
|
|
|
|
| 63 |
fi
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
}
|
| 68 |
|
|
|
|
|
|
|
| 69 |
if _is_true "$RUN_FLAG"; then
|
| 70 |
echo "[space_start] RUN_SELF_PLAY_TRAINING enabled."
|
| 71 |
echo "[space_start] Training start: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
|
|
@@ -77,20 +98,14 @@ if _is_true "$RUN_FLAG"; then
|
|
| 77 |
if [ -n "${OSINT_HF_CHECKPOINT_REPO_ID:-}" ]; then
|
| 78 |
echo "[space_start] HF checkpoint repo: ${OSINT_HF_CHECKPOINT_REPO_ID}"
|
| 79 |
fi
|
| 80 |
-
|
| 81 |
-
# Run training in the FOREGROUND so the script (and therefore PID 1)
|
| 82 |
-
# blocks until training is finished. A graceful SIGTERM from HF will
|
| 83 |
-
# propagate to the training process via the shell's signal handling
|
| 84 |
-
# and the trap above will cleanly stop uvicorn afterwards.
|
| 85 |
-
_train_self_play
|
| 86 |
-
echo "[space_start] Training finished. Keeping API server alive for log inspection."
|
| 87 |
-
if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then
|
| 88 |
-
wait "${UVICORN_PID}"
|
| 89 |
-
fi
|
| 90 |
else
|
| 91 |
echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run."
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
| 96 |
fi
|
|
|
|
| 1 |
#!/bin/sh
|
| 2 |
+
# Intentionally NOT running with `set -e`: a training failure must not
|
| 3 |
+
# bring down the API server. The Space's HTTP endpoints (dashboards,
|
| 4 |
+
# /healthz, /api/environment) need to stay reachable even if the
|
| 5 |
+
# self-play job crashes.
|
| 6 |
+
set -u
|
| 7 |
|
| 8 |
_is_true() {
|
| 9 |
case "${1:-}" in
|
|
|
|
| 19 |
DRY_RUN_FLAG="${RUN_SELF_PLAY_DRY_RUN:-0}"
|
| 20 |
SERVE_API_FLAG="${RUN_SPACE_API_SERVER:-1}"
|
| 21 |
PORT_VALUE="${PORT:-7860}"
|
| 22 |
+
TRAIN_LOG_PATH="${TRAIN_LOG_PATH:-/tmp/self_play_training.log}"
|
| 23 |
|
| 24 |
UVICORN_PID=""
|
| 25 |
+
TRAIN_PID=""
|
| 26 |
|
| 27 |
+
# Start the uvicorn API server and record its PID in UVICORN_PID.
# Globals: SERVE_API_FLAG (read), PORT_VALUE (read), UVICORN_PID (written).
# If SERVE_API_FLAG is not truthy, prints a notice and exits the WHOLE
# script with status 0 (the function runs in the current shell, so `exit`
# is not limited to the function).
_start_api_server_foreground_or_die() {
|
| 28 |
if ! _is_true "$SERVE_API_FLAG"; then
|
| 29 |
+
echo "[space_start] RUN_SPACE_API_SERVER disabled. Nothing to serve."
|
| 30 |
+
exit 0
|
| 31 |
fi
|
| 32 |
+
echo "[space_start] Starting API server (foreground) on port ${PORT_VALUE}."
|
| 33 |
+
# Despite the "(foreground)" wording, the server is launched with `&`;
# the PID is captured from $! right after.
uvicorn server:app --host 0.0.0.0 --port "${PORT_VALUE}" &
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
UVICORN_PID=$!
|
| 35 |
echo "[space_start] uvicorn pid=${UVICORN_PID}"
|
| 36 |
}
|
| 37 |
|
| 38 |
+
# Forward a shutdown request to the training and uvicorn processes.
# Globals: TRAIN_PID (read), UVICORN_PID (read).
# Each PID is signalled only if it is non-empty and `kill -0` reports the
# process as still alive. `kill` is called without a signal option, i.e.
# the default SIGTERM. The function does not wait for either process.
_stop_children() {
|
| 39 |
+
if [ -n "${TRAIN_PID}" ] && kill -0 "${TRAIN_PID}" 2>/dev/null; then
|
| 40 |
+
echo "[space_start] Forwarding shutdown to training pid=${TRAIN_PID}."
|
| 41 |
+
# `|| true`: the process may exit between the liveness probe and the kill.
kill "${TRAIN_PID}" 2>/dev/null || true
|
| 42 |
+
fi
|
| 43 |
if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then
|
| 44 |
echo "[space_start] Stopping uvicorn pid=${UVICORN_PID}."
|
| 45 |
# `|| true`: same race tolerance as for the training process above.
kill "${UVICORN_PID}" 2>/dev/null || true
|
|
|
|
|
| 46 |
fi
|
| 47 |
}
|
| 48 |
|
| 49 |
+
# Only forward shutdown signals; do NOT kill children on every EXIT
# (otherwise a crashed training run would tear down uvicorn too).
#
# After forwarding, block in `wait` until every child has exited before
# leaving the script. Exiting straight away would give the children no
# time to react to SIGTERM (e.g. flush a final checkpoint) when this
# script is the container's entrypoint process.
trap '_stop_children; wait; exit 0' INT TERM
|
| 52 |
|
| 53 |
+
# Launch self-play training in the background under a supervising subshell.
#
# Globals read:    TRAIN_OUTPUT_DIR, DRY_RUN_FLAG, ENV_CONFIG_PATH,
#                  TRAIN_CONFIG_PATH, TRAIN_LOG_PATH
# Globals written: OUTPUT_ARG, TRAIN_MODE_ARG, TRAIN_PID
# Outputs:         status lines on stdout; the training command's own
#                  stdout/stderr go to TRAIN_LOG_PATH.
# Returns:         0; a training failure is reported, never propagated.
#
# TRAIN_PID is the PID of the supervising subshell. Sending it SIGTERM is
# forwarded to the training process, so callers can keep treating it as the
# handle for the training job.
_run_training_supervised() {
  if [ -n "${TRAIN_OUTPUT_DIR:-}" ]; then
    OUTPUT_ARG="--train-output-dir ${TRAIN_OUTPUT_DIR}"
  else
    # NOTE(review): this branch's body is collapsed in the diff view; an
    # empty value is assumed -- confirm against the full file.
    OUTPUT_ARG=""
  fi

  if _is_true "$DRY_RUN_FLAG"; then
    echo "[space_start] Running self-play in dry-run mode (logs: ${TRAIN_LOG_PATH})."
    TRAIN_MODE_ARG="--dry-run"
  else
    echo "[space_start] Running self-play training in background (logs: ${TRAIN_LOG_PATH})."
    TRAIN_MODE_ARG=""
  fi

  # Supervisor subshell. The training command MUST be started from inside
  # it: `wait` only works on the calling shell's own children, so waiting
  # from a separate subshell on a PID started by the parent returns 127
  # immediately and the real exit status is never seen.
  (
    train_child=""
    # Relay a shutdown request to the trainer. Installed before the launch
    # so a signal arriving early is not lost to the default action.
    trap 'if [ -n "${train_child}" ]; then kill "${train_child}" 2>/dev/null; fi' TERM

    # shellcheck disable=SC2086
    osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} ${TRAIN_MODE_ARG} \
      > "${TRAIN_LOG_PATH}" 2>&1 &
    train_child=$!

    wait "${train_child}"
    rc=$?
    # A trapped signal interrupts `wait` before the child is reaped; keep
    # waiting until the trainer is really gone to get its true status.
    while kill -0 "${train_child}" 2>/dev/null; do
      wait "${train_child}"
      rc=$?
    done

    if [ "${rc}" -eq 0 ]; then
      echo "[space_start] Self-play training finished cleanly (rc=0)."
    else
      echo "[space_start] Self-play training exited rc=${rc}. API server will stay up; see ${TRAIN_LOG_PATH}."
    fi
  ) &
  TRAIN_PID=$!
  echo "[space_start] training pid=${TRAIN_PID}"
}
|
| 87 |
|
| 88 |
+
_start_api_server_foreground_or_die
|
| 89 |
+
|
| 90 |
if _is_true "$RUN_FLAG"; then
|
| 91 |
echo "[space_start] RUN_SELF_PLAY_TRAINING enabled."
|
| 92 |
echo "[space_start] Training start: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
|
|
|
|
| 98 |
if [ -n "${OSINT_HF_CHECKPOINT_REPO_ID:-}" ]; then
|
| 99 |
echo "[space_start] HF checkpoint repo: ${OSINT_HF_CHECKPOINT_REPO_ID}"
|
| 100 |
fi
|
| 101 |
+
_run_training_supervised
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
else
|
| 103 |
echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run."
|
| 104 |
+
fi
|
| 105 |
+
|
| 106 |
+
# Block on uvicorn so the container stays alive as long as the API
|
| 107 |
+
# server is healthy. If uvicorn exits (e.g. real platform shutdown),
|
| 108 |
+
# the script ends here; `wait` is the last command run, so its status
# (uvicorn's exit status) becomes the script's exit status.
|
| 109 |
+
if [ -n "${UVICORN_PID}" ]; then
|
| 110 |
+
wait "${UVICORN_PID}"
|
| 111 |
fi
|