#!/usr/bin/env bash
# OpenSleuth training Space entrypoint.
#
# Starts a tiny background HTTP server on $PORT (default 7860) so the HF
# Spaces health probe is satisfied, then runs the actual training script in
# the foreground. All training logs go to stdout and are visible in the
# Space's "Container logs" tab.
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Port the heartbeat listens on; falls back to 7860 when PORT is unset or empty.
: "${PORT:=7860}"

#######################################
# Print a message prefixed with "[entrypoint HH:MM:SS]" (UTC wall clock).
# Arguments: the message words; they are joined with single spaces.
# Outputs:   one line on stdout.
#######################################
log() {
  printf '[entrypoint %s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
# 1. Background heartbeat HTTP server. Answers every GET with 200 OK and a
#    short plain-text body; per-request access logging is silenced.
#
#    - The shell-resolved $PORT is passed to Python explicitly, so an
#      exported-but-empty PORT cannot make int() fail inside the server.
#    - allow_reuse_address lets a restarted container re-bind the port while
#      old connections are still in TIME_WAIT.
#    - The Python source is single-quoted so the shell never expands anything
#      inside it (keep apostrophes out of the Python code for that reason).
log "starting heartbeat server on :${PORT}"
PORT="$PORT" python -c '
import http.server, os, socketserver


class Heartbeat(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write(b"opensleuth-trainer alive\n")

    def log_message(self, *a, **kw):
        # Keep probe traffic out of the training logs.
        pass


class ReusableTCPServer(socketserver.TCPServer):
    allow_reuse_address = True


port = int(os.environ.get("PORT", "7860"))
with ReusableTCPServer(("0.0.0.0", port), Heartbeat) as srv:
    print(f"[heartbeat] listening on :{port}", flush=True)
    srv.serve_forever()
' &
HB_PID=$!
# Give the heartbeat a moment to bind before the orchestrator probes it.
sleep 2
# `set -e` does not cover background jobs, so check explicitly that the server
# survived start-up; a dead heartbeat is reported but is not fatal.
if ! kill -0 "$HB_PID" 2>/dev/null; then
  log "WARNING: heartbeat server (PID ${HB_PID}) is not running; health probes on :${PORT} will fail."
fi
# 2. Decide whether training should run at all.
#
#    The sentinel file marks a training run that already completed during an
#    earlier container start (the Space orchestrator restarts containers that
#    exit cleanly). When it is present we only idle on the heartbeat instead
#    of spending GPU time on a duplicate run.
#
#    FORCE_TRAIN overrides this: any non-empty value (documented usage is
#    FORCE_TRAIN=1) deletes the old sentinel up-front so training runs again.
#    This was added in v0.4 because a sentinel left by a v0.2 run
#    (Qwen 0.5B / 9 builtins) would otherwise block the v0.4 run
#    (Qwen 3B / 15 tasks) after a Space restart. The sentinel is only
#    re-created once a fresh training run has succeeded.
SENTINEL="/data/.opensleuth-trained"
if [[ -f "$SENTINEL" ]]; then
  if [[ -n "${FORCE_TRAIN:-}" ]]; then
    log "FORCE_TRAIN=1 set; removing stale sentinel $SENTINEL so we re-train."
    rm -f "$SENTINEL"
  else
    log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
    sleep infinity
  fi
fi
# 3. Run training in the foreground.
#    Success: record the sentinel, then idle so the container stays alive.
#    Failure: exit non-zero so the error surfaces.
log "starting training (PID $$)"
log "GPU info:"
python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')"
if python /app/train.py; then
  log "training succeeded; writing sentinel and idling so the container stays alive."
  # Writing the sentinel is best-effort. Under `set -e` a bare `touch` would
  # abort the script if the parent directory is missing or unwritable, and the
  # resulting container exit/restart would repeat the whole training run.
  if ! { mkdir -p -- "$(dirname -- "$SENTINEL")" && touch -- "$SENTINEL"; }; then
    log "WARNING: could not write sentinel $SENTINEL; a container restart will re-run training."
  fi
  sleep infinity
else
  # In the else branch $? still holds the exit status of the `if` condition.
  log "training failed (exit $?); container will exit so the error surfaces."
  exit 1
fi
|