#!/usr/bin/env bash
# OpenSleuth training Space entrypoint.
#
# Starts a tiny background HTTP server on $PORT (default 7860) so the HF
# Spaces health probe is satisfied, then runs the actual training script in
# the foreground. All training logs go to stdout and are visible in the
# Space's "Container logs" tab.

set -euo pipefail

PORT="${PORT:-7860}"

# Timestamped (UTC) log line on stdout.
log() { echo "[entrypoint $(date -u +%H:%M:%S)] $*"; }

# 1. Background heartbeat HTTP server. Just returns 200 OK on every request.
log "starting heartbeat server on :${PORT}"
python -c "
import http.server, socketserver, os, threading, time

class H(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-Type','text/plain')
        self.end_headers()
        self.wfile.write(b'opensleuth-trainer alive\n')
    def log_message(self, *a, **kw): pass

port = int(os.environ.get('PORT','7860'))
srv = socketserver.TCPServer(('0.0.0.0', port), H)
threading.Thread(target=srv.serve_forever, daemon=True).start()
print(f'[heartbeat] listening on :{port}', flush=True)
while True:
    time.sleep(3600)
" &
HB_PID=$!
# Reap the heartbeat on any exit path so a failing run doesn't leave a stray
# python process behind (matters if this script isn't PID 1 in the container).
trap 'kill "$HB_PID" 2>/dev/null || true' EXIT

# Give the heartbeat a moment to bind before the orchestrator probes it.
sleep 2

# 2. Run training. If a sentinel file exists, training already completed in a
#    previous container start (the Space orchestrator restarts containers
#    that exit cleanly), so just idle on the heartbeat to avoid wasting GPU
#    on duplicate runs. Set FORCE_TRAIN=1 to override.
#
#    v0.4 update: when FORCE_TRAIN=1 is set, we explicitly *delete* the old
#    sentinel up-front. Without this the sentinel from a previous v0.2 run
#    (Qwen 0.5B / 9 builtins) blocks the v0.4 run (Qwen 3B / 15 tasks) on
#    Space restart. The sentinel only ever gets re-touched after a fresh
#    successful training run completes below.
SENTINEL="/data/.opensleuth-trained"

# NOTE: the guard is "FORCE_TRAIN is non-empty", so any value (not just 1)
# forces a retrain — kept as-is for backward compatibility.
if [[ -n "${FORCE_TRAIN:-}" && -f "$SENTINEL" ]]; then
  log "FORCE_TRAIN=1 set; removing stale sentinel $SENTINEL so we re-train."
  rm -f "$SENTINEL"
fi

if [[ -f "$SENTINEL" && -z "${FORCE_TRAIN:-}" ]]; then
  log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
  sleep infinity
fi

log "starting training (PID $$)"
log "GPU info:"
python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')"

if python /app/train.py; then
  log "training succeeded; writing sentinel and idling so the container stays alive."
  # Ensure the sentinel's directory exists — under `set -e` a missing /data
  # would otherwise abort here *after* a successful training run.
  mkdir -p "$(dirname "$SENTINEL")"
  touch "$SENTINEL"
  sleep infinity
else
  # Capture immediately: any command run before logging would clobber $?.
  rc=$?
  log "training failed (exit $rc); container will exit so the error surfaces."
  # Propagate the real exit code (still non-zero, as before) instead of a
  # hard-coded 1, so the orchestrator/logs show the actual failure status.
  exit "$rc"
fi