File size: 4,604 Bytes
d814291
04ad851
 
 
 
 
d814291
 
 
 
 
 
 
 
 
e44cdee
fe1f842
e44cdee
d814291
2e14f6d
 
04ad851
2e14f6d
 
04ad851
2e14f6d
04ad851
2e14f6d
04ad851
 
2e14f6d
04ad851
 
2e14f6d
 
 
 
04ad851
 
 
 
 
2e14f6d
 
 
 
 
 
04ad851
 
 
fe1f842
8828fdd
 
 
 
 
 
 
 
 
 
 
 
 
04ad851
fe1f842
 
 
 
 
d814291
8828fdd
 
 
 
d814291
8828fdd
fe1f842
8828fdd
 
d814291
8828fdd
fe1f842
8828fdd
 
d814291
04ad851
8828fdd
fe1f842
04ad851
 
 
 
 
 
 
 
 
 
 
 
fe1f842
 
04ad851
 
fe1f842
 
 
 
 
 
 
 
e44cdee
 
 
04ad851
d814291
 
04ad851
 
 
 
 
 
 
d814291
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/bin/sh
# Entrypoint script: starts the uvicorn API server and, when enabled, a
# self-play training job that runs alongside it.
#
# Environment variables read by this script (default in parentheses):
#   TRAIN_ENV_CONFIG_PATH         (config/shared_config.json)
#   TRAIN_SELF_PLAY_CONFIG_PATH   (config/self_play_training_hf_l40s_full.json)
#   TRAIN_SELF_PLAY_OUTPUT_DIR    (empty: no --train-output-dir is passed)
#   RUN_SELF_PLAY_TRAINING        (1)
#   RUN_SELF_PLAY_DRY_RUN         (0)
#   RUN_SPACE_API_SERVER          (1)
#   PORT                          (7860)
#   TRAIN_LOG_PATH                (/tmp/self_play_training.log)
#   OSINT_HF_CHECKPOINT_REPO_ID   (no default; only echoed at startup when set)
#
# Intentionally NOT running with `set -e`: a training failure must not
# bring down the API server. The Space's HTTP endpoints (dashboards,
# /healthz, /api/environment) need to stay reachable even if the
# self-play job crashes.
# `set -u` is kept so that expanding an unset variable is an error.
set -u

# Return 0 when $1 is a "truthy" flag value, 1 otherwise.
#
# Accepted values: `1`, and `true` / `yes` / `on` in any letter case
# (so `True` or `Yes` work as well as `true` / `TRUE`). Anything else,
# including an empty or missing argument, is treated as false.
#
# Arguments: $1 - flag value to test (optional)
_is_true() {
  case "${1:-}" in
    # Bracket globs give case-insensitive matching without spawning `tr`.
    1 | [Tt][Rr][Uu][Ee] | [Yy][Ee][Ss] | [Oo][Nn]) return 0 ;;
    *) return 1 ;;
  esac
}

# --- Paths (each overridable through the environment) -----------------
# Environment config and training config handed to `train-self-play`.
ENV_CONFIG_PATH=${TRAIN_ENV_CONFIG_PATH:-config/shared_config.json}
TRAIN_CONFIG_PATH=${TRAIN_SELF_PLAY_CONFIG_PATH:-config/self_play_training_hf_l40s_full.json}
# Optional output directory; stays empty when the override is unset.
TRAIN_OUTPUT_DIR=${TRAIN_SELF_PLAY_OUTPUT_DIR:-}
# File that receives a copy of the training output.
TRAIN_LOG_PATH=${TRAIN_LOG_PATH:-/tmp/self_play_training.log}

# --- Feature switches (interpreted by _is_true) -----------------------
SERVE_API_FLAG=${RUN_SPACE_API_SERVER:-1}
RUN_FLAG=${RUN_SELF_PLAY_TRAINING:-1}
DRY_RUN_FLAG=${RUN_SELF_PLAY_DRY_RUN:-0}

# --- Network ----------------------------------------------------------
PORT_VALUE=${PORT:-7860}

# --- Child process bookkeeping (filled in once the jobs are started) --
TRAIN_PID=
UVICORN_PID=

# Launch the uvicorn API server, or terminate the whole script with
# status 0 when SERVE_API_FLAG is not truthy.
#
# Note: uvicorn is started as a background job (`&`); its PID is stored
# in UVICORN_PID so the caller can `wait` on it later.
#
# Globals: SERVE_API_FLAG, PORT_VALUE (read); UVICORN_PID (written)
_start_api_server_foreground_or_die() {
  if _is_true "$SERVE_API_FLAG"; then
    echo "[space_start] Starting API server (foreground) on port ${PORT_VALUE}."
    uvicorn server:app --host 0.0.0.0 --port "${PORT_VALUE}" &
    UVICORN_PID=$!
    echo "[space_start] uvicorn pid=${UVICORN_PID}"
    return
  fi

  # Server disabled: there is nothing left for this script to do.
  echo "[space_start] RUN_SPACE_API_SERVER disabled. Nothing to serve."
  exit 0
}

# Signal the recorded child processes, training first and uvicorn second.
#
# A PID is signalled only when it is non-empty and `kill -0` reports
# that it can still be signalled; failures of the real `kill` are
# ignored, so the function never fails.
#
# Globals: TRAIN_PID, UVICORN_PID (read)
_stop_children() {
  if [ -n "${TRAIN_PID}" ]; then
    if kill -0 "${TRAIN_PID}" 2>/dev/null; then
      echo "[space_start] Forwarding shutdown to training pid=${TRAIN_PID}."
      kill "${TRAIN_PID}" 2>/dev/null || :
    fi
  fi

  if [ -n "${UVICORN_PID}" ]; then
    if kill -0 "${UVICORN_PID}" 2>/dev/null; then
      echo "[space_start] Stopping uvicorn pid=${UVICORN_PID}."
      kill "${UVICORN_PID}" 2>/dev/null || :
    fi
  fi
}

# Only forward shutdown signals; do NOT kill children on every EXIT
# (otherwise a crashed training run would tear down uvicorn too).
# On INT or TERM the handler runs _stop_children and then ends the
# script with exit status 0.
trap '_stop_children; exit 0' INT TERM

# Decide how the training CLI is invoked and store the result in
# TRAIN_CMD.
#
# The installed `osint-env` console script wins when `command -v` can
# find it. Otherwise the module form `python -m osint_env.cli` is used,
# which sidesteps rc=127 ("command not found") when the directory that
# holds the console script is missing from PATH.
#
# Globals: TRAIN_CMD (written)
_resolve_train_cmd() {
  if ! command -v osint-env >/dev/null 2>&1; then
    echo "[space_start] 'osint-env' not on PATH; falling back to 'python -m osint_env.cli'."
    TRAIN_CMD="python -m osint_env.cli"
    return 0
  fi

  TRAIN_CMD="osint-env"
}

# Start the self-play training job in the background, mirroring its
# output to stdout and to TRAIN_LOG_PATH, and report its exit status
# when it finishes. A training failure is only logged; it is never
# propagated to the caller, so the API server keeps running.
#
# Globals:
#   ENV_CONFIG_PATH, TRAIN_CONFIG_PATH, TRAIN_OUTPUT_DIR, DRY_RUN_FLAG,
#   TRAIN_LOG_PATH (read); TRAIN_CMD (set via _resolve_train_cmd);
#   TRAIN_PID (written)
# Arguments: none (the function's positional parameters are reused
#            internally to build the trainer's argument list)
_run_training_supervised() {
  # Build the argument list in the positional parameters. Unlike a
  # space-joined string, this keeps paths that contain whitespace
  # intact as single arguments.
  set -- train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}"
  if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
    set -- "$@" --train-output-dir "${TRAIN_OUTPUT_DIR}"
  fi

  _resolve_train_cmd

  # Truncate the log. `true` is used instead of `:` because a failed
  # redirection on a special builtin such as `:` makes a non-interactive
  # POSIX shell exit before `||` is evaluated; with a regular builtin
  # the failure is just a non-zero status.
  true > "${TRAIN_LOG_PATH}" ||
    echo "[space_start] WARNING: could not truncate ${TRAIN_LOG_PATH}." >&2

  if _is_true "$DRY_RUN_FLAG"; then
    set -- "$@" --dry-run
    echo "[space_start] Running self-play in dry-run mode (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)."
  else
    echo "[space_start] Running self-play training in background (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)."
  fi

  # The trainer and its status report run together in the producer side
  # of the pipeline. That side is the trainer's parent, so `$?` is the
  # trainer's real exit status. (A separate watcher subshell cannot
  # `wait` on a job started by this shell: the PID is not its child and
  # `wait` returns 127 immediately.) The report is written through tee
  # and therefore also lands in the log file.
  {
    # TRAIN_CMD may be several words ("python -m osint_env.cli"), so it
    # must be word-split here.
    # shellcheck disable=SC2086
    ${TRAIN_CMD} "$@"
    rc=$?
    if [ "${rc}" -eq 0 ]; then
      echo "[space_start] Self-play training finished cleanly (rc=0)."
    else
      echo "[space_start] Self-play training exited rc=${rc}. API server will stay up; see ${TRAIN_LOG_PATH}."
    fi
  } 2>&1 | tee -a "${TRAIN_LOG_PATH}" &

  # NOTE: `$!` of a background pipeline is the PID of its last command,
  # i.e. `tee`, not the trainer process itself.
  TRAIN_PID=$!
  echo "[space_start] training pid=${TRAIN_PID} (cmd: ${TRAIN_CMD} train-self-play)"
}

# Bring up the API server first (this call ends the script when the
# server is disabled), then optionally start the training job next to it.
_start_api_server_foreground_or_die

if ! _is_true "$RUN_FLAG"; then
  echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run."
else
  echo "[space_start] RUN_SELF_PLAY_TRAINING enabled."
  training_started_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  echo "[space_start] Training start: ${training_started_at}"
  echo "[space_start] Env config: ${ENV_CONFIG_PATH}"
  echo "[space_start] Train config: ${TRAIN_CONFIG_PATH}"
  # The two settings below are optional; mention them only when present.
  [ -z "${TRAIN_OUTPUT_DIR}" ] ||
    echo "[space_start] Train output dir: ${TRAIN_OUTPUT_DIR}"
  [ -z "${OSINT_HF_CHECKPOINT_REPO_ID:-}" ] ||
    echo "[space_start] HF checkpoint repo: ${OSINT_HF_CHECKPOINT_REPO_ID}"
  _run_training_supervised
fi

# Stay attached to uvicorn so the script lives exactly as long as the
# API server does. Once uvicorn exits (e.g. real platform shutdown) the
# script ends with the status returned by `wait`.
[ -z "${UVICORN_PID}" ] || wait "${UVICORN_PID}"