Spaces:
Paused
Paused
| # Intentionally NOT running with `set -e`: a training failure must not | |
| # bring down the API server. The Space's HTTP endpoints (dashboards, | |
| # /healthz, /api/environment) need to stay reachable even if the | |
| # self-play job crashes. | |
| set -u | |
| _is_true() { | |
| case "${1:-}" in | |
| 1|true|TRUE|yes|YES|on|ON) return 0 ;; | |
| *) return 1 ;; | |
| esac | |
| } | |
| ENV_CONFIG_PATH="${TRAIN_ENV_CONFIG_PATH:-config/shared_config.json}" | |
| TRAIN_CONFIG_PATH="${TRAIN_SELF_PLAY_CONFIG_PATH:-config/self_play_training_hf_l40s_full.json}" | |
| TRAIN_OUTPUT_DIR="${TRAIN_SELF_PLAY_OUTPUT_DIR:-}" | |
| RUN_FLAG="${RUN_SELF_PLAY_TRAINING:-1}" | |
| DRY_RUN_FLAG="${RUN_SELF_PLAY_DRY_RUN:-0}" | |
| SERVE_API_FLAG="${RUN_SPACE_API_SERVER:-1}" | |
| PORT_VALUE="${PORT:-7860}" | |
| TRAIN_LOG_PATH="${TRAIN_LOG_PATH:-/tmp/self_play_training.log}" | |
| UVICORN_PID="" | |
| TRAIN_PID="" | |
| _start_api_server_foreground_or_die() { | |
| if ! _is_true "$SERVE_API_FLAG"; then | |
| echo "[space_start] RUN_SPACE_API_SERVER disabled. Nothing to serve." | |
| exit 0 | |
| fi | |
| echo "[space_start] Starting API server (foreground) on port ${PORT_VALUE}." | |
| uvicorn server:app --host 0.0.0.0 --port "${PORT_VALUE}" & | |
| UVICORN_PID=$! | |
| echo "[space_start] uvicorn pid=${UVICORN_PID}" | |
| } | |
| _stop_children() { | |
| if [ -n "${TRAIN_PID}" ] && kill -0 "${TRAIN_PID}" 2>/dev/null; then | |
| echo "[space_start] Forwarding shutdown to training pid=${TRAIN_PID}." | |
| kill "${TRAIN_PID}" 2>/dev/null || true | |
| fi | |
| if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then | |
| echo "[space_start] Stopping uvicorn pid=${UVICORN_PID}." | |
| kill "${UVICORN_PID}" 2>/dev/null || true | |
| fi | |
| } | |
| # Only forward shutdown signals; do NOT kill children on every EXIT | |
| # (otherwise a crashed training run would tear down uvicorn too). | |
| trap '_stop_children; exit 0' INT TERM | |
| _resolve_train_cmd() { | |
| # Prefer the installed console script when it is on PATH, otherwise | |
| # fall back to `python -m osint_env.cli`. The fallback avoids rc=127 | |
| # ("command not found") in case the user-site bin dir is missing | |
| # from PATH for some reason. | |
| if command -v osint-env >/dev/null 2>&1; then | |
| TRAIN_CMD="osint-env" | |
| else | |
| echo "[space_start] 'osint-env' not on PATH; falling back to 'python -m osint_env.cli'." | |
| TRAIN_CMD="python -m osint_env.cli" | |
| fi | |
| } | |
| _run_training_supervised() { | |
| if [ -n "${TRAIN_OUTPUT_DIR}" ]; then | |
| OUTPUT_ARG="--train-output-dir ${TRAIN_OUTPUT_DIR}" | |
| else | |
| OUTPUT_ARG="" | |
| fi | |
| _resolve_train_cmd | |
| : > "${TRAIN_LOG_PATH}" || true | |
| if _is_true "$DRY_RUN_FLAG"; then | |
| echo "[space_start] Running self-play in dry-run mode (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)." | |
| # shellcheck disable=SC2086 | |
| ${TRAIN_CMD} train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} --dry-run \ | |
| 2>&1 | tee -a "${TRAIN_LOG_PATH}" & | |
| else | |
| echo "[space_start] Running self-play training in background (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)." | |
| # shellcheck disable=SC2086 | |
| ${TRAIN_CMD} train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} \ | |
| 2>&1 | tee -a "${TRAIN_LOG_PATH}" & | |
| fi | |
| TRAIN_PID=$! | |
| echo "[space_start] training pid=${TRAIN_PID} (cmd: ${TRAIN_CMD} train-self-play)" | |
| # Watcher subshell: if training exits with non-zero status, log the | |
| # failure but do NOT propagate it to the parent script. Uvicorn must | |
| # keep serving so the dashboards stay reachable. | |
| ( | |
| wait "${TRAIN_PID}" 2>/dev/null | |
| rc=$? | |
| if [ "${rc}" -eq 0 ]; then | |
| echo "[space_start] Self-play training finished cleanly (rc=0)." | |
| else | |
| echo "[space_start] Self-play training exited rc=${rc}. API server will stay up; see ${TRAIN_LOG_PATH}." | |
| fi | |
| ) & | |
| } | |
| _start_api_server_foreground_or_die | |
| if _is_true "$RUN_FLAG"; then | |
| echo "[space_start] RUN_SELF_PLAY_TRAINING enabled." | |
| echo "[space_start] Training start: $(date -u +"%Y-%m-%dT%H:%M:%SZ")" | |
| echo "[space_start] Env config: ${ENV_CONFIG_PATH}" | |
| echo "[space_start] Train config: ${TRAIN_CONFIG_PATH}" | |
| if [ -n "${TRAIN_OUTPUT_DIR}" ]; then | |
| echo "[space_start] Train output dir: ${TRAIN_OUTPUT_DIR}" | |
| fi | |
| if [ -n "${OSINT_HF_CHECKPOINT_REPO_ID:-}" ]; then | |
| echo "[space_start] HF checkpoint repo: ${OSINT_HF_CHECKPOINT_REPO_ID}" | |
| fi | |
| _run_training_supervised | |
| else | |
| echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run." | |
| fi | |
| # Block on uvicorn so the container stays alive as long as the API | |
| # server is healthy. If uvicorn exits (e.g. real platform shutdown), | |
| # we exit the script normally. | |
| if [ -n "${UVICORN_PID}" ]; then | |
| wait "${UVICORN_PID}" | |
| fi | |