#!/usr/bin/env bash
#
# Container entrypoint: starts a small HTTP heartbeat server in the
# background, then runs /app/train.py unless a sentinel file shows that a
# previous training run already succeeded.
#
# Environment:
#   PORT         TCP port for the heartbeat server (default: 7860).
#   FORCE_TRAIN  Any non-empty value forces training to run again even when
#                the sentinel file exists.

# Abort on unhandled command failures, on use of unset variables, and when
# any stage of a pipeline fails.
set -euo pipefail

# Heartbeat port; an unset or empty PORT falls back to 7860.
PORT="${PORT:-7860}"

#######################################
# Print a timestamped message to stdout.
# Arguments: message words; they are joined by the first character of IFS
#            (a space by default).
# Outputs:   "[entrypoint HH:MM:SS] <message>", with the time in UTC.
#######################################
log() {
  printf '[entrypoint %s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}

# Start a minimal HTTP server in the background that answers every GET with
# 200 and the body "opensleuth-trainer alive", so the port keeps responding
# while the rest of this script runs.
log "starting heartbeat server on :${PORT}"
# PORT is passed explicitly so the server binds the value that was just
# logged even when PORT is a plain (non-exported) shell variable.
PORT="${PORT}" python - <<'PY' &
import http.server, os, socketserver


class H(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain')
        self.end_headers()
        self.wfile.write(b'opensleuth-trainer alive\n')

    def log_message(self, *a, **kw):
        # Suppress the default per-request access log.
        pass


class Server(socketserver.ThreadingMixIn, socketserver.TCPServer):
    # One thread per connection, so a stalled client cannot block later probes.
    daemon_threads = True
    # Allow the port to be re-bound immediately after a restart.
    allow_reuse_address = True


port = int(os.environ.get('PORT', '7860'))
# The socket is bound and listening as soon as the server object exists.
srv = Server(('0.0.0.0', port), H)
print(f'[heartbeat] listening on :{port}', flush=True)
srv.serve_forever()
PY
HB_PID=$!

# Do not leave the heartbeat process behind when this script exits.
trap 'kill "${HB_PID}" 2>/dev/null || true' EXIT

# Give the server a moment to bind before continuing.
sleep 2

# A failing background job does not trip `set -e`, so check it explicitly.
# A dead heartbeat is reported but does not stop the script.
if ! kill -0 "${HB_PID}" 2>/dev/null; then
  log "warning: heartbeat server (PID ${HB_PID}) is not running; continuing without it."
fi

# Marker file written after a successful training run. When it exists the
# script skips training and idles, unless FORCE_TRAIN is non-empty, in which
# case the marker is removed and training runs again.
SENTINEL="/data/.opensleuth-trained"
if [[ -f "${SENTINEL}" ]]; then
  if [[ -n "${FORCE_TRAIN:-}" ]]; then
    log "FORCE_TRAIN=1 set; removing stale sentinel $SENTINEL so we re-train."
    rm -f -- "${SENTINEL}"
  else
    log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
    # Block forever so the container (and its heartbeat) stays up.
    sleep infinity
  fi
fi

# Run the training job, then either idle (success) or exit non-zero (failure).
log "starting training (PID $$)"
log "GPU info:"
# Diagnostic only: report whether CUDA is visible. A failed probe is logged
# and must not prevent the training run from starting.
python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')" \
  || log "warning: GPU probe failed (exit $?); continuing."

if python /app/train.py; then
  log "training succeeded; writing sentinel and idling so the container stays alive."
  # Training already succeeded, so a failure to record the sentinel only
  # warns instead of letting `set -e` kill the container. The parent
  # directory is created first in case it does not exist yet.
  if ! { mkdir -p -- "${SENTINEL%/*}" && touch -- "${SENTINEL}"; }; then
    log "warning: could not write sentinel $SENTINEL; training will run again on the next start."
  fi
  sleep infinity
else
  # In the else branch, $? still holds the exit status of train.py.
  log "training failed (exit $?); container will exit so the error surfaces."
  exit 1
fi