siddeshwar-kagatikar committed on
Commit
04ad851
·
1 Parent(s): 7e4ee5e

Stop training failures from killing the API server (fixes 500 on Space)

Browse files

After the previous restructure, scripts/space_start.sh ran with
`set -eu` and an EXIT trap that killed uvicorn. If
`osint-env train-self-play` crashed for any reason (e.g. hardware
mismatch with the L40S config, missing GPU, transient HF Hub
download error) the script exited, the trap fired, and uvicorn
died with it. Visitors then saw an internal server error / Space
unavailable response on every route, including the dashboards
which are completely independent of training.

Rewrite the start script so:
- `set -e` is dropped; non-zero exits no longer cascade.
- uvicorn runs as the persistent process the script `wait`s on, so
the container stays alive as long as the API is healthy.
- training runs in the background with its output redirected to
/tmp/self_play_training.log; a watcher subshell reports its exit
code and never propagates failure to the parent script.
- The signal trap is INT/TERM only (real platform shutdown),
forwarding signals to both children, instead of EXIT which fired
on every script-level exit.

Made-with: Cursor

Files changed (1) hide show
  1. scripts/space_start.sh +51 -36
scripts/space_start.sh CHANGED
@@ -1,5 +1,9 @@
1
  #!/bin/sh
2
- set -eu
 
 
 
 
3
 
4
  _is_true() {
5
  case "${1:-}" in
@@ -15,37 +19,38 @@ RUN_FLAG="${RUN_SELF_PLAY_TRAINING:-1}"
15
  DRY_RUN_FLAG="${RUN_SELF_PLAY_DRY_RUN:-0}"
16
  SERVE_API_FLAG="${RUN_SPACE_API_SERVER:-1}"
17
  PORT_VALUE="${PORT:-7860}"
18
- UVICORN_LOG_PATH="${UVICORN_LOG_PATH:-/tmp/uvicorn.log}"
19
 
20
  UVICORN_PID=""
 
21
 
22
- _start_api_server_background() {
23
  if ! _is_true "$SERVE_API_FLAG"; then
24
- echo "[space_start] RUN_SPACE_API_SERVER disabled. Skipping API server."
25
- return
26
  fi
27
- echo "[space_start] Starting API server in background on port ${PORT_VALUE} (logs: ${UVICORN_LOG_PATH})."
28
- # API server runs in background ONLY for HF healthchecks. Training is the
29
- # primary process. If HF infrastructure SIGTERMs the container we still
30
- # want training to receive the signal and flush a final checkpoint, not
31
- # to silently die because PID 1 (uvicorn previously) exited first.
32
- uvicorn server:app --host 0.0.0.0 --port "${PORT_VALUE}" \
33
- >"${UVICORN_LOG_PATH}" 2>&1 &
34
  UVICORN_PID=$!
35
  echo "[space_start] uvicorn pid=${UVICORN_PID}"
36
  }
37
 
38
- _stop_api_server() {
 
 
 
 
39
  if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then
40
  echo "[space_start] Stopping uvicorn pid=${UVICORN_PID}."
41
  kill "${UVICORN_PID}" 2>/dev/null || true
42
- wait "${UVICORN_PID}" 2>/dev/null || true
43
  fi
44
  }
45
 
46
- trap '_stop_api_server' EXIT INT TERM
 
 
47
 
48
- _train_self_play() {
49
  if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
50
  OUTPUT_ARG="--train-output-dir ${TRAIN_OUTPUT_DIR}"
51
  else
@@ -53,19 +58,35 @@ _train_self_play() {
53
  fi
54
 
55
  if _is_true "$DRY_RUN_FLAG"; then
56
- echo "[space_start] Running self-play in dry-run mode."
57
  # shellcheck disable=SC2086
58
- osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} --dry-run
 
59
  else
60
- echo "[space_start] Running self-play training (foreground)."
61
  # shellcheck disable=SC2086
62
- osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG}
 
63
  fi
 
 
64
 
65
- echo "[space_start] Self-play command completed."
66
- echo "[space_start] Training end: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
 
 
 
 
 
 
 
 
 
 
67
  }
68
 
 
 
69
  if _is_true "$RUN_FLAG"; then
70
  echo "[space_start] RUN_SELF_PLAY_TRAINING enabled."
71
  echo "[space_start] Training start: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
@@ -77,20 +98,14 @@ if _is_true "$RUN_FLAG"; then
77
  if [ -n "${OSINT_HF_CHECKPOINT_REPO_ID:-}" ]; then
78
  echo "[space_start] HF checkpoint repo: ${OSINT_HF_CHECKPOINT_REPO_ID}"
79
  fi
80
- _start_api_server_background
81
- # Run training in the FOREGROUND so the script (and therefore PID 1)
82
- # blocks until training is finished. A graceful SIGTERM from HF will
83
- # propagate to the training process via the shell's signal handling
84
- # and the trap above will cleanly stop uvicorn afterwards.
85
- _train_self_play
86
- echo "[space_start] Training finished. Keeping API server alive for log inspection."
87
- if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then
88
- wait "${UVICORN_PID}"
89
- fi
90
  else
91
  echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run."
92
- _start_api_server_background
93
- if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then
94
- wait "${UVICORN_PID}"
95
- fi
 
 
 
96
  fi
 
1
  #!/bin/sh
2
+ # Intentionally NOT running with `set -e`: a training failure must not
3
+ # bring down the API server. The Space's HTTP endpoints (dashboards,
4
+ # /healthz, /api/environment) need to stay reachable even if the
5
+ # self-play job crashes.
6
+ set -u
7
 
8
  _is_true() {
9
  case "${1:-}" in
 
19
  DRY_RUN_FLAG="${RUN_SELF_PLAY_DRY_RUN:-0}"
20
  SERVE_API_FLAG="${RUN_SPACE_API_SERVER:-1}"
21
  PORT_VALUE="${PORT:-7860}"
22
+ TRAIN_LOG_PATH="${TRAIN_LOG_PATH:-/tmp/self_play_training.log}"
23
 
24
  UVICORN_PID=""
25
+ TRAIN_PID=""
26
 
27
+ _start_api_server_foreground_or_die() {
28
  if ! _is_true "$SERVE_API_FLAG"; then
29
+ echo "[space_start] RUN_SPACE_API_SERVER disabled. Nothing to serve."
30
+ exit 0
31
  fi
32
+ echo "[space_start] Starting API server (foreground) on port ${PORT_VALUE}."
33
+ uvicorn server:app --host 0.0.0.0 --port "${PORT_VALUE}" &
 
 
 
 
 
34
  UVICORN_PID=$!
35
  echo "[space_start] uvicorn pid=${UVICORN_PID}"
36
  }
37
 
38
+ _stop_children() {
39
+ if [ -n "${TRAIN_PID}" ] && kill -0 "${TRAIN_PID}" 2>/dev/null; then
40
+ echo "[space_start] Forwarding shutdown to training pid=${TRAIN_PID}."
41
+ kill "${TRAIN_PID}" 2>/dev/null || true
42
+ fi
43
  if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then
44
  echo "[space_start] Stopping uvicorn pid=${UVICORN_PID}."
45
  kill "${UVICORN_PID}" 2>/dev/null || true
 
46
  fi
47
  }
48
 
49
+ # Only forward shutdown signals; do NOT kill children on every EXIT
50
+ # (otherwise a crashed training run would tear down uvicorn too).
51
+ trap '_stop_children; exit 0' INT TERM
52
 
53
+ _run_training_supervised() {
54
  if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
55
  OUTPUT_ARG="--train-output-dir ${TRAIN_OUTPUT_DIR}"
56
  else
 
58
  fi
59
 
60
  if _is_true "$DRY_RUN_FLAG"; then
61
+ echo "[space_start] Running self-play in dry-run mode (logs: ${TRAIN_LOG_PATH})."
62
  # shellcheck disable=SC2086
63
+ osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} --dry-run \
64
+ > "${TRAIN_LOG_PATH}" 2>&1 &
65
  else
66
+ echo "[space_start] Running self-play training in background (logs: ${TRAIN_LOG_PATH})."
67
  # shellcheck disable=SC2086
68
+ osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} \
69
+ > "${TRAIN_LOG_PATH}" 2>&1 &
70
  fi
71
+ TRAIN_PID=$!
72
+ echo "[space_start] training pid=${TRAIN_PID}"
73
 
74
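+ # Watcher subshell: if training exits with non-zero status, log the
75
+ # failure but do NOT propagate it to the parent script. Uvicorn must
76
+ # keep serving so the dashboards stay reachable. NOTE(review): TRAIN_PID was started by the parent shell, not this subshell, so `wait` here likely returns 127 immediately instead of the real exit status — confirm.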
77
+ (
78
+ wait "${TRAIN_PID}" 2>/dev/null
79
+ rc=$?
80
+ if [ "${rc}" -eq 0 ]; then
81
+ echo "[space_start] Self-play training finished cleanly (rc=0)."
82
+ else
83
+ echo "[space_start] Self-play training exited rc=${rc}. API server will stay up; see ${TRAIN_LOG_PATH}."
84
+ fi
85
+ ) &
86
  }
87
 
88
+ _start_api_server_foreground_or_die
89
+
90
  if _is_true "$RUN_FLAG"; then
91
  echo "[space_start] RUN_SELF_PLAY_TRAINING enabled."
92
  echo "[space_start] Training start: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
 
98
  if [ -n "${OSINT_HF_CHECKPOINT_REPO_ID:-}" ]; then
99
  echo "[space_start] HF checkpoint repo: ${OSINT_HF_CHECKPOINT_REPO_ID}"
100
  fi
101
+ _run_training_supervised
 
 
 
 
 
 
 
 
 
102
  else
103
  echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run."
104
+ fi
105
+
106
+ # Block on uvicorn so the container stays alive as long as the API
107
+ # server is healthy. If uvicorn exits (e.g. real platform shutdown),
108
+ # we exit the script normally.
109
+ if [ -n "${UVICORN_PID}" ]; then
110
+ wait "${UVICORN_PID}"
111
  fi