#!/bin/sh
# Space startup script: launches the uvicorn API server and, optionally, a
# supervised self-play training run next to it.
#
# Environment (all optional):
#   TRAIN_ENV_CONFIG_PATH        env config passed as --config
#                                (default: config/shared_config.json)
#   TRAIN_SELF_PLAY_CONFIG_PATH  training config passed as --train-config
#                                (default: config/self_play_training_hf_l40s_full.json)
#   TRAIN_SELF_PLAY_OUTPUT_DIR   if non-empty, passed as --train-output-dir
#   RUN_SELF_PLAY_TRAINING       truthy => start training (default: 1)
#   RUN_SELF_PLAY_DRY_RUN        truthy => add --dry-run (default: 0)
#   RUN_SPACE_API_SERVER         truthy => start uvicorn (default: 1)
#   PORT                         uvicorn port (default: 7860)
#   TRAIN_LOG_PATH               training log file
#                                (default: /tmp/self_play_training.log)
#   OSINT_HF_CHECKPOINT_REPO_ID  only echoed at startup when set
#
# Intentionally NOT running with `set -e`: a training failure must not
# bring down the API server. The Space's HTTP endpoints (dashboards,
# /healthz, /api/environment) need to stay reachable even if the
# self-play job crashes.
set -u

# Return 0 when $1 is one of the accepted "truthy" spellings, 1 otherwise
# (including when $1 is empty or missing).
_is_true() {
  case "${1:-}" in
    1|true|TRUE|yes|YES|on|ON) return 0 ;;
    *) return 1 ;;
  esac
}

ENV_CONFIG_PATH="${TRAIN_ENV_CONFIG_PATH:-config/shared_config.json}"
TRAIN_CONFIG_PATH="${TRAIN_SELF_PLAY_CONFIG_PATH:-config/self_play_training_hf_l40s_full.json}"
TRAIN_OUTPUT_DIR="${TRAIN_SELF_PLAY_OUTPUT_DIR:-}"
RUN_FLAG="${RUN_SELF_PLAY_TRAINING:-1}"
DRY_RUN_FLAG="${RUN_SELF_PLAY_DRY_RUN:-0}"
SERVE_API_FLAG="${RUN_SPACE_API_SERVER:-1}"
PORT_VALUE="${PORT:-7860}"
TRAIN_LOG_PATH="${TRAIN_LOG_PATH:-/tmp/self_play_training.log}"

UVICORN_PID=""
TRAIN_PID=""

# Start uvicorn as a background job and record its pid in UVICORN_PID; the
# script blocks on that pid at the very end. If the API server is disabled
# the whole script exits 0 here, i.e. before any training is started.
_start_api_server_foreground_or_die() {
  if ! _is_true "$SERVE_API_FLAG"; then
    echo "[space_start] RUN_SPACE_API_SERVER disabled. Nothing to serve."
    exit 0
  fi
  echo "[space_start] Starting API server (foreground) on port ${PORT_VALUE}."
  uvicorn server:app --host 0.0.0.0 --port "${PORT_VALUE}" &
  UVICORN_PID=$!
  echo "[space_start] uvicorn pid=${UVICORN_PID}"
}

# Send SIGTERM to the training supervisor (which forwards it to the actual
# training process) and to uvicorn, when they are still alive.
_stop_children() {
  if [ -n "${TRAIN_PID}" ] && kill -0 "${TRAIN_PID}" 2>/dev/null; then
    echo "[space_start] Forwarding shutdown to training pid=${TRAIN_PID}."
    kill "${TRAIN_PID}" 2>/dev/null || true
  fi
  if [ -n "${UVICORN_PID}" ] && kill -0 "${UVICORN_PID}" 2>/dev/null; then
    echo "[space_start] Stopping uvicorn pid=${UVICORN_PID}."
    kill "${UVICORN_PID}" 2>/dev/null || true
  fi
}

# Only forward shutdown signals; do NOT kill children on every EXIT
# (otherwise a crashed training run would tear down uvicorn too).
trap '_stop_children; exit 0' INT TERM

# Set TRAIN_CMD to the command used to launch training.
_resolve_train_cmd() {
  # Prefer the installed console script when it is on PATH, otherwise
  # fall back to `python -m osint_env.cli`. The fallback avoids rc=127
  # ("command not found") in case the user-site bin dir is missing
  # from PATH for some reason.
  if command -v osint-env >/dev/null 2>&1; then
    TRAIN_CMD="osint-env"
  else
    echo "[space_start] 'osint-env' not on PATH; falling back to 'python -m osint_env.cli'."
    TRAIN_CMD="python -m osint_env.cli"
  fi
}

# TERM handler of the supervisor subshell: remember the request and pass it
# on to the training process if it has already been started.
_forward_stop_to_training() {
  stop_requested=1
  if [ -n "${train_child}" ]; then
    kill "${train_child}" 2>/dev/null || true
  fi
}

# Run the training command ("$@" = its arguments) as a direct child, mirror
# its combined stdout/stderr to stdout and TRAIN_LOG_PATH, and report its
# real exit status.
#
# Must only be called inside a background subshell: it installs its own TERM
# trap and uses plain (non-local) variables.
#
# Why not `cmd | tee &` followed by `$!` and a separate watcher subshell:
#   * `$!` of a backgrounded pipeline is in practice the pid of its last
#     stage (tee), so both the kill and the reported status hit tee instead
#     of the training process;
#   * a shell can only `wait` for its own children, so a sibling watcher
#     subshell gets rc=127 immediately.
# Here the training process is our own child and tee is attached via a FIFO.
_supervise_training() {
  train_child=""
  tee_pid=""
  stop_requested=0
  # Only TERM is handled: that is what _stop_children sends, and INT is
  # ignored for background jobs of a non-interactive shell anyway.
  trap '_forward_stop_to_training' TERM

  fifo_dir=$(mktemp -d "${TMPDIR:-/tmp}/space_start.XXXXXX" 2>/dev/null) || fifo_dir=""
  if [ -n "${fifo_dir}" ] && mkfifo "${fifo_dir}/train.out" 2>/dev/null; then
    tee -a "${TRAIN_LOG_PATH}" < "${fifo_dir}/train.out" &
    tee_pid=$!
    # TRAIN_CMD is split on purpose ("python -m osint_env.cli").
    # shellcheck disable=SC2086
    ${TRAIN_CMD} "$@" > "${fifo_dir}/train.out" 2>&1 &
    train_child=$!
  else
    echo "[space_start] Could not create log FIFO; training output goes to stdout only."
    # shellcheck disable=SC2086
    ${TRAIN_CMD} "$@" 2>&1 &
    train_child=$!
  fi

  # A TERM that arrived before the child existed could not be forwarded yet.
  if [ "${stop_requested}" -eq 1 ]; then
    kill "${train_child}" 2>/dev/null || true
  fi

  rc=0
  wait "${train_child}" || rc=$?
  # A trapped TERM interrupts `wait` with rc>128 while the child may still be
  # shutting down; keep waiting until it is really gone to get its status.
  while kill -0 "${train_child}" 2>/dev/null; do
    rc=0
    wait "${train_child}" || rc=$?
  done

  # Log the outcome but do NOT propagate it: uvicorn must keep serving so
  # the dashboards stay reachable.
  if [ "${rc}" -eq 0 ]; then
    echo "[space_start] Self-play training finished cleanly (rc=0)."
  else
    echo "[space_start] Self-play training exited rc=${rc}. API server will stay up; see ${TRAIN_LOG_PATH}."
  fi

  # Let tee drain the remaining output, then remove the FIFO.
  if [ -n "${tee_pid}" ]; then
    wait "${tee_pid}" 2>/dev/null || true
  fi
  if [ -n "${fifo_dir}" ]; then
    rm -rf -- "${fifo_dir}"
  fi
}

# Build the training command line and start it under a background supervisor
# subshell. TRAIN_PID is set to the supervisor's pid; sending it TERM stops
# the training process.
_run_training_supervised() {
  _resolve_train_cmd

  # Collect the arguments in "$@" so that paths containing spaces survive.
  set -- train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}"
  if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
    set -- "$@" --train-output-dir "${TRAIN_OUTPUT_DIR}"
  fi

  # Start every run with an empty log; failure to truncate is not fatal.
  : > "${TRAIN_LOG_PATH}" || true

  if _is_true "$DRY_RUN_FLAG"; then
    set -- "$@" --dry-run
    echo "[space_start] Running self-play in dry-run mode (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)."
  else
    echo "[space_start] Running self-play training in background (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)."
  fi

  ( _supervise_training "$@" ) &
  TRAIN_PID=$!
  echo "[space_start] training supervisor pid=${TRAIN_PID} (cmd: ${TRAIN_CMD} train-self-play)"
}

_start_api_server_foreground_or_die

if _is_true "$RUN_FLAG"; then
  echo "[space_start] RUN_SELF_PLAY_TRAINING enabled."
  echo "[space_start] Training start: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  echo "[space_start] Env config: ${ENV_CONFIG_PATH}"
  echo "[space_start] Train config: ${TRAIN_CONFIG_PATH}"
  if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
    echo "[space_start] Train output dir: ${TRAIN_OUTPUT_DIR}"
  fi
  if [ -n "${OSINT_HF_CHECKPOINT_REPO_ID:-}" ]; then
    echo "[space_start] HF checkpoint repo: ${OSINT_HF_CHECKPOINT_REPO_ID}"
  fi
  _run_training_supervised
else
  echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run."
fi

# Block on uvicorn so the container stays alive as long as the API
# server is healthy. If uvicorn exits (e.g. real platform shutdown),
# we exit the script normally.
if [ -n "${UVICORN_PID}" ]; then
  wait "${UVICORN_PID}"
fi