# Commit: 78575eb (verified) — anugrah55
# trainer v0.4: switch to Qwen2.5-3B-Instruct, dynamic task discovery,
# delegated probe sampling, difficulty-weighted rollouts, push to
# opensleuth-qwen2.5-3b-grpo-v2; sentinel cleared on FORCE_TRAIN=1.
#!/usr/bin/env bash
# OpenSleuth training Space entrypoint.
#
# Starts a tiny background HTTP server on $PORT (default 7860) so the HF
# Spaces health probe is satisfied, then runs the actual training script in
# the foreground. All training logs go to stdout and are visible in the
# Space's "Container logs" tab.
set -euo pipefail

# Port the heartbeat server binds to; HF Spaces injects PORT, else 7860.
PORT="${PORT:-7860}"

# Timestamped logger: every diagnostic line shares the
# "[entrypoint HH:MM:SS]" prefix so it is easy to grep in container logs.
log() {
  printf '[entrypoint %s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
# 1. Background heartbeat HTTP server. Just returns 200 OK on every request.
#    allow_reuse_address is set so a rapid container restart can rebind the
#    port while the previous socket is still in TIME_WAIT (otherwise the
#    server dies with "Address already in use" and the probe never passes).
#    serve_forever() blocks the python process by itself, so no extra
#    daemon thread or sleep loop is needed to keep it alive.
log "starting heartbeat server on :${PORT}"
python -c "
import http.server, socketserver, os

class H(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain')
        self.end_headers()
        self.wfile.write(b'opensleuth-trainer alive\n')

    # Silence per-request access logging so training output stays readable.
    def log_message(self, *a, **kw):
        pass

port = int(os.environ.get('PORT', '7860'))
socketserver.TCPServer.allow_reuse_address = True
srv = socketserver.TCPServer(('0.0.0.0', port), H)
print(f'[heartbeat] listening on :{port}', flush=True)
srv.serve_forever()
" &
# PID kept in case a cleanup trap ever needs to kill the heartbeat; today the
# process simply dies with the container.
HB_PID=$!

# Give the heartbeat a moment to bind before the orchestrator probes it.
sleep 2
# 2. Run training. If a sentinel file exists, training already completed in a
#    previous container start (the Space orchestrator restarts containers
#    that exit cleanly), so just idle on the heartbeat to avoid wasting GPU
#    on duplicate runs. Set FORCE_TRAIN=1 to override.
#
# v0.4 update: when FORCE_TRAIN=1 is set, the stale sentinel is deleted
#    up-front. Without this, the sentinel from a previous v0.2 run
#    (Qwen 0.5B / 9 builtins) blocks the v0.4 run (Qwen 3B / 15 tasks) on
#    Space restart. The sentinel is only ever re-touched after a fresh
#    successful training run completes below.
SENTINEL="/data/.opensleuth-trained"
if [[ -f "$SENTINEL" ]]; then
  if [[ -n "${FORCE_TRAIN:-}" ]]; then
    # Force mode: clear the sentinel and fall through to training.
    log "FORCE_TRAIN=1 set; removing stale sentinel $SENTINEL so we re-train."
    rm -f "$SENTINEL"
  else
    # Normal restart after a completed run: keep serving the heartbeat only.
    log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
    sleep infinity
  fi
fi
# 3. Training run. On success, write the sentinel so subsequent container
#    starts idle instead of re-training. On failure, propagate train.py's
#    own exit code (captured into rc immediately, since any later command
#    would clobber $?) so the real status surfaces in the orchestrator,
#    instead of flattening every failure to exit 1.
log "starting training (PID $$)"
log "GPU info:"
python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')"
if python /app/train.py; then
  log "training succeeded; writing sentinel and idling so the container stays alive."
  touch "$SENTINEL"
  sleep infinity
else
  rc=$?  # capture before log/anything else overwrites $?
  log "training failed (exit $rc); container will exit so the error surfaces."
  exit "$rc"
fi