#!/bin/sh
# Intentionally NOT running with `set -e`: a training failure must not
# bring down the API server. The Space's HTTP endpoints (dashboards,
# /healthz, /api/environment) need to stay reachable even if the
# self-play job crashes.
set -u
# Report whether a flag value is one of the accepted "truthy" spellings.
# Arguments: $1 - value to test (may be omitted or empty)
# Returns:   0 for 1/true/TRUE/yes/YES/on/ON, 1 for anything else.
_is_true() {
  case "${1:-}" in
    1 | true | TRUE | yes | YES | on | ON)
      return 0
      ;;
  esac
  # Anything not matched above (including a missing argument) is false.
  return 1
}
# ---- Runtime configuration -------------------------------------------------
# Every setting is taken from the environment when present (and non-empty),
# otherwise the default shown here is used.

# Config files and optional output directory passed to the training CLI.
ENV_CONFIG_PATH=${TRAIN_ENV_CONFIG_PATH:-config/shared_config.json}
TRAIN_CONFIG_PATH=${TRAIN_SELF_PLAY_CONFIG_PATH:-config/self_play_training_hf_l40s_full.json}
TRAIN_OUTPUT_DIR=${TRAIN_SELF_PLAY_OUTPUT_DIR:-}

# Feature switches; interpreted with _is_true.
RUN_FLAG=${RUN_SELF_PLAY_TRAINING:-1}
DRY_RUN_FLAG=${RUN_SELF_PLAY_DRY_RUN:-0}
SERVE_API_FLAG=${RUN_SPACE_API_SERVER:-1}

# Port the API server listens on, and where training output is logged.
PORT_VALUE=${PORT:-7860}
: "${TRAIN_LOG_PATH:=/tmp/self_play_training.log}"

# PIDs of the background children; empty until the child has been started.
UVICORN_PID=
TRAIN_PID=
# Launch uvicorn for `server:app` as a background job and remember its PID.
# Globals read:    SERVE_API_FLAG, PORT_VALUE
# Globals written: UVICORN_PID
# Exits the whole script with status 0 when SERVE_API_FLAG is not truthy.
_start_api_server_foreground_or_die() {
  _is_true "$SERVE_API_FLAG" || {
    echo "[space_start] RUN_SPACE_API_SERVER disabled. Nothing to serve."
    exit 0
  }

  echo "[space_start] Starting API server (foreground) on port ${PORT_VALUE}."
  # Started with `&` so the caller can go on to launch training; the script
  # blocks on this PID later.
  uvicorn server:app --host 0.0.0.0 --port "${PORT_VALUE}" &
  UVICORN_PID=$!
  echo "[space_start] uvicorn pid=${UVICORN_PID}"
}
# Send the default `kill` signal to the training job and to uvicorn, but only
# to those whose PID is recorded and still alive (`kill -0` probe).
# Globals read: TRAIN_PID, UVICORN_PID
# Always returns 0; a failing `kill` is deliberately ignored because the
# process may exit between the probe and the signal.
_stop_children() {
  if [ -n "${TRAIN_PID}" ]; then
    if kill -0 "${TRAIN_PID}" 2>/dev/null; then
      echo "[space_start] Forwarding shutdown to training pid=${TRAIN_PID}."
      kill "${TRAIN_PID}" 2>/dev/null || :
    fi
  fi

  if [ -n "${UVICORN_PID}" ]; then
    if kill -0 "${UVICORN_PID}" 2>/dev/null; then
      echo "[space_start] Stopping uvicorn pid=${UVICORN_PID}."
      kill "${UVICORN_PID}" 2>/dev/null || :
    fi
  fi
}
# Shutdown handling. The handler is bound to INT and TERM only and is
# deliberately NOT bound to EXIT: children must not be killed on every exit
# path, otherwise a crashed training run would tear down uvicorn too.
_on_shutdown_signal() {
  _stop_children
  exit 0
}
trap _on_shutdown_signal INT TERM
# Choose the command used to invoke the training CLI.
# Globals written: TRAIN_CMD
# The installed `osint-env` console script is used when `command -v` can
# find it; otherwise the module form `python -m osint_env.cli` is used so a
# bin directory missing from PATH does not make the launch fail.
_resolve_train_cmd() {
  if ! command -v osint-env >/dev/null 2>&1; then
    echo "[space_start] 'osint-env' not on PATH; falling back to 'python -m osint_env.cli'."
    TRAIN_CMD="python -m osint_env.cli"
    return 0
  fi
  TRAIN_CMD="osint-env"
}
# Start the self-play training CLI in the background under a supervisor
# subshell, mirroring its combined stdout/stderr to this script's stdout and
# appending it to ${TRAIN_LOG_PATH}.
#
# Globals read:    ENV_CONFIG_PATH, TRAIN_CONFIG_PATH, TRAIN_OUTPUT_DIR,
#                  DRY_RUN_FLAG, TRAIN_LOG_PATH
# Globals written: TRAIN_CMD (through _resolve_train_cmd), TRAIN_PID
# Returns:         0; a training failure is only logged, never propagated.
#
# Why a supervisor subshell plus a FIFO instead of `cmd | tee &`:
#   * After a background pipeline `$!` is the PID of the LAST stage (tee),
#     so the recorded PID was not the trainer and signalling it only hit tee.
#   * `wait PID` only works in the shell that is the parent of PID. A
#     separate watcher subshell is not, so its `wait` returned 127 at once
#     and a bogus "exited rc=127" was logged for every run.
# Here the supervisor is the real parent of the trainer, so it can report the
# true exit status. TRAIN_PID is the supervisor's PID; the supervisor forwards
# INT/TERM to the trainer, so `kill "$TRAIN_PID"` still stops training.
_run_training_supervised() {
  # Build the CLI arguments in this function's positional parameters (POSIX
  # sh has no arrays) so values containing whitespace stay single arguments.
  set -- train-self-play \
    --config "${ENV_CONFIG_PATH}" \
    --train-config "${TRAIN_CONFIG_PATH}"
  if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
    set -- "$@" --train-output-dir "${TRAIN_OUTPUT_DIR}"
  fi

  _resolve_train_cmd

  # Truncate the log inside a subshell: `:` is a special built-in, and a
  # failed redirection on a special built-in makes a strictly POSIX sh (e.g.
  # dash) exit the whole script, which would take the API server down.
  if ! ( : > "${TRAIN_LOG_PATH}" ) 2>/dev/null; then
    echo "[space_start] WARNING: cannot write ${TRAIN_LOG_PATH}; training output will only reach stdout."
  fi

  if _is_true "$DRY_RUN_FLAG"; then
    set -- "$@" --dry-run
    echo "[space_start] Running self-play in dry-run mode (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)."
  else
    echo "[space_start] Running self-play training in background (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)."
  fi

  (
    # Forward shutdown signals to the trainer once it exists. The trap is
    # installed before the trainer starts so no signal window is left open.
    train_child_pid=""
    trap '[ -z "${train_child_pid}" ] || kill "${train_child_pid}" 2>/dev/null' INT TERM

    fifo_dir=$(mktemp -d "${TMPDIR:-/tmp}/space_start.XXXXXX") || fifo_dir=""
    if [ -z "${fifo_dir}" ] || ! mkfifo "${fifo_dir}/train.out"; then
      echo "[space_start] Could not create the training output FIFO; self-play training was NOT started. API server will stay up."
      if [ -n "${fifo_dir}" ]; then
        rm -rf -- "${fifo_dir}"
      fi
      exit 1
    fi
    train_fifo="${fifo_dir}/train.out"

    # Reader side: mirror everything to stdout and append it to the log.
    tee -a "${TRAIN_LOG_PATH}" < "${train_fifo}" &
    tee_pid=$!

    # Writer side: the trainer itself. TRAIN_CMD may be several words
    # ("python -m osint_env.cli"), so it is intentionally left unquoted.
    # shellcheck disable=SC2086
    ${TRAIN_CMD} "$@" > "${train_fifo}" 2>&1 &
    train_child_pid=$!

    wait "${train_child_pid}"
    rc=$?

    # tee sees EOF once the trainer has closed the FIFO; waiting for it makes
    # sure the last lines are flushed before the summary is printed.
    wait "${tee_pid}" 2>/dev/null
    rm -rf -- "${fifo_dir}"

    if [ "${rc}" -eq 0 ]; then
      echo "[space_start] Self-play training finished cleanly (rc=0)."
    else
      echo "[space_start] Self-play training exited rc=${rc}. API server will stay up; see ${TRAIN_LOG_PATH}."
    fi
  ) &
  TRAIN_PID=$!
  echo "[space_start] training pid=${TRAIN_PID} (cmd: ${TRAIN_CMD} train-self-play)"
}
# ---- Main flow -------------------------------------------------------------
# 1. Bring up the API server (this call exits the script when serving is
#    disabled).
_start_api_server_foreground_or_die

# 2. Optionally kick off self-play training alongside it.
if ! _is_true "$RUN_FLAG"; then
  echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run."
else
  echo "[space_start] RUN_SELF_PLAY_TRAINING enabled."
  echo "[space_start] Training start: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "[space_start] Env config: ${ENV_CONFIG_PATH}"
  echo "[space_start] Train config: ${TRAIN_CONFIG_PATH}"
  # The next two lines are informational and only printed when the value is set.
  [ -z "${TRAIN_OUTPUT_DIR}" ] ||
    echo "[space_start] Train output dir: ${TRAIN_OUTPUT_DIR}"
  [ -z "${OSINT_HF_CHECKPOINT_REPO_ID:-}" ] ||
    echo "[space_start] HF checkpoint repo: ${OSINT_HF_CHECKPOINT_REPO_ID}"
  _run_training_supervised
fi

# 3. Block on uvicorn so the script stays alive for as long as the API server
#    runs; once uvicorn exits, the script ends with uvicorn's wait status.
[ -z "${UVICORN_PID}" ] || wait "${UVICORN_PID}"
|