File size: 2,675 Bytes
d597642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c92f05
 
 
 
78575eb
 
 
 
 
 
8c92f05
78575eb
 
 
 
8c92f05
 
 
 
 
d597642
 
 
 
8c92f05
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env bash
# OpenSleuth training Space entrypoint.
#
# Starts a tiny background HTTP server on $PORT (default 7860) so the HF
# Spaces health probe is satisfied, then runs the actual training script in
# the foreground. All training logs go to stdout and are visible in the
# Space's "Container logs" tab.
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Port for the heartbeat server; falls back to 7860 when PORT is unset or empty.
: "${PORT:=7860}"

# Write one line to stdout of the form "[entrypoint HH:MM:SS] <message>",
# where the timestamp is the current UTC time and <message> is all arguments
# joined into a single string.
log() {
    printf '[entrypoint %s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}

# 1. Background heartbeat HTTP server. Just returns 200 OK on every GET.
#
#    * The Python program is fed through a quoted here-doc, so the shell never
#      expands or re-quotes anything inside it.
#    * PORT is handed to the child explicitly so it binds the same port that
#      is logged here, whether or not the caller exported PORT.
#    * allow_reuse_address lets a restarted process re-bind a port that still
#      has connections lingering in TIME_WAIT.
#    * ThreadingMixIn + daemon_threads: each request is served on its own
#      daemon thread, so one stalled client cannot block later probes.
log "starting heartbeat server on :${PORT}"
PORT="${PORT}" python - <<'PY' &
import http.server, os, socketserver


class H(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain')
        self.end_headers()
        self.wfile.write(b'opensleuth-trainer alive\n')

    def log_message(self, *a, **kw):
        pass  # keep probe traffic out of the training logs


class Server(socketserver.ThreadingMixIn, socketserver.TCPServer):
    allow_reuse_address = True
    daemon_threads = True


port = int(os.environ.get('PORT', '7860'))
srv = Server(('0.0.0.0', port), H)
print(f'[heartbeat] listening on :{port}', flush=True)
srv.serve_forever()
PY
HB_PID=$!

# Give the heartbeat a moment to bind before the orchestrator probes it, then
# confirm the process is still running. If it died (e.g. the port could not
# be bound) the health probe can never succeed, so fail fast instead of
# starting a training run behind a dead health endpoint.
sleep 2
if ! kill -0 "${HB_PID}" 2>/dev/null; then
    log "heartbeat server failed to start on :${PORT}; aborting before training."
    exit 1
fi

# 2. Decide whether training should run at all.
#
#    A sentinel file marks that training already finished during an earlier
#    container start (the Space orchestrator restarts containers that exit
#    cleanly). When it is present we only keep the heartbeat alive instead of
#    burning GPU time on a duplicate run.
#
#    FORCE_TRAIN overrides this: if it is set to any non-empty value
#    (conventionally FORCE_TRAIN=1) an existing sentinel is deleted up-front
#    and training proceeds. This was added in v0.4 because the sentinel left
#    by a v0.2 run (Qwen 0.5B / 9 builtins) otherwise blocked the v0.4 run
#    (Qwen 3B / 15 tasks) after a Space restart. The sentinel is only
#    re-created once a fresh training run has completed successfully.
SENTINEL="/data/.opensleuth-trained"
if [[ -f "$SENTINEL" ]]; then
    if [[ -n "${FORCE_TRAIN:-}" ]]; then
        log "FORCE_TRAIN=1 set; removing stale sentinel $SENTINEL so we re-train."
        rm -f "$SENTINEL"
    else
        log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
        sleep infinity
    fi
fi

log "starting training (PID $$)"
log "GPU info:"
# Diagnostic only: a failing probe is reported but must not abort the run
# under `set -e`; train.py is the step whose result matters.
python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')" \
    || log "GPU probe failed (exit $?); continuing to training."

# Run training in the foreground and capture its exit status explicitly so it
# can be both logged and propagated.
train_rc=0
python /app/train.py || train_rc=$?

if (( train_rc != 0 )); then
    log "training failed (exit ${train_rc}); container will exit so the error surfaces."
    # Propagate the real status (e.g. 137 for an OOM kill) rather than a flat 1.
    exit "${train_rc}"
fi

log "training succeeded; writing sentinel and idling so the container stays alive."
# The sentinel only exists to skip duplicate runs after a restart. If it
# cannot be written (missing or read-only directory), warn and keep idling:
# exiting here would turn a successful run into a failed container.
if ! { mkdir -p -- "${SENTINEL%/*}" && touch -- "$SENTINEL"; }; then
    log "WARNING: could not write sentinel $SENTINEL; a container restart will re-run training."
fi
sleep infinity