siddeshwar-kagatikar committed on
Commit
8828fdd
·
1 Parent(s): 274f638

Fix training rc=127 by using python -m fallback and tee logs to stdout

Browse files
Files changed (1) hide show
  1. scripts/space_start.sh +24 -7
scripts/space_start.sh CHANGED
@@ -50,6 +50,19 @@ _stop_children() {
50
  # (otherwise a crashed training run would tear down uvicorn too).
51
  trap '_stop_children; exit 0' INT TERM
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  _run_training_supervised() {
54
  if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
55
  OUTPUT_ARG="--train-output-dir ${TRAIN_OUTPUT_DIR}"
@@ -57,19 +70,23 @@ _run_training_supervised() {
57
  OUTPUT_ARG=""
58
  fi
59
 
 
 
 
 
60
  if _is_true "$DRY_RUN_FLAG"; then
61
- echo "[space_start] Running self-play in dry-run mode (logs: ${TRAIN_LOG_PATH})."
62
  # shellcheck disable=SC2086
63
- osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} --dry-run \
64
- > "${TRAIN_LOG_PATH}" 2>&1 &
65
  else
66
- echo "[space_start] Running self-play training in background (logs: ${TRAIN_LOG_PATH})."
67
  # shellcheck disable=SC2086
68
- osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} \
69
- > "${TRAIN_LOG_PATH}" 2>&1 &
70
  fi
71
  TRAIN_PID=$!
72
- echo "[space_start] training pid=${TRAIN_PID}"
73
 
74
  # Watcher subshell: if training exits with non-zero status, log the
75
  # failure but do NOT propagate it to the parent script. Uvicorn must
 
50
  # (otherwise a crashed training run would tear down uvicorn too).
51
  trap '_stop_children; exit 0' INT TERM
52
 
53
+ _resolve_train_cmd() {
54
+ # Prefer the installed console script when it is on PATH, otherwise
55
+ # fall back to `python -m osint_env.cli`. The fallback avoids rc=127
56
+ # ("command not found") in case the user-site bin dir is missing
57
+ # from PATH for some reason.
58
+ if command -v osint-env >/dev/null 2>&1; then
59
+ TRAIN_CMD="osint-env"
60
+ else
61
+ echo "[space_start] 'osint-env' not on PATH; falling back to 'python -m osint_env.cli'."
62
+ TRAIN_CMD="python -m osint_env.cli"
63
+ fi
64
+ }
65
+
66
  _run_training_supervised() {
67
  if [ -n "${TRAIN_OUTPUT_DIR}" ]; then
68
  OUTPUT_ARG="--train-output-dir ${TRAIN_OUTPUT_DIR}"
 
70
  OUTPUT_ARG=""
71
  fi
72
 
73
+ _resolve_train_cmd
74
+
75
+ : > "${TRAIN_LOG_PATH}" || true
76
+
77
  if _is_true "$DRY_RUN_FLAG"; then
78
+ echo "[space_start] Running self-play in dry-run mode (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)."
79
  # shellcheck disable=SC2086
80
+ ${TRAIN_CMD} train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} --dry-run \
81
+ 2>&1 | tee -a "${TRAIN_LOG_PATH}" &
82
  else
83
+ echo "[space_start] Running self-play training in background (logs: ${TRAIN_LOG_PATH}; mirrored to stdout)."
84
  # shellcheck disable=SC2086
85
+ ${TRAIN_CMD} train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" ${OUTPUT_ARG} \
86
+ 2>&1 | tee -a "${TRAIN_LOG_PATH}" &
87
  fi
88
  TRAIN_PID=$!
89
+ echo "[space_start] training pid=${TRAIN_PID} (cmd: ${TRAIN_CMD} train-self-play)"
90
 
91
  # Watcher subshell: if training exits with non-zero status, log the
92
  # failure but do NOT propagate it to the parent script. Uvicorn must