anugrah55 commited on
Commit
8c92f05
·
verified ·
1 Parent(s): ae04e19

entrypoint: skip retraining if the sentinel exists; idle on the heartbeat after training succeeds (prevents an auto-restart loop from burning GPU)

Browse files
Files changed (1) hide show
  1. entrypoint.sh +18 -3
entrypoint.sh CHANGED
@@ -33,10 +33,25 @@ HB_PID=$!
33
  # Give the heartbeat a moment to bind before the orchestrator probes it.
34
  sleep 2
35
 
36
- # 2. Run training in the foreground. Crash here = container exits, which is
37
- # what we want: HF will mark the Space failed and surface the error.
 
 
 
 
 
 
 
 
38
  log "starting training (PID $$)"
39
  log "GPU info:"
40
  python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')"
41
 
42
- exec python /app/train.py
 
 
 
 
 
 
 
 
33
  # Give the heartbeat a moment to bind before the orchestrator probes it.
34
  sleep 2
35
 
36
+ # 2. Run training. If a sentinel file exists, training already completed in a
37
+ # previous container start (the Space orchestrator restarts containers
38
+ # that exit cleanly), so just idle on the heartbeat to avoid wasting GPU
39
+ # on duplicate runs. Set FORCE_TRAIN to any non-empty value (e.g. FORCE_TRAIN=1) to override.
40
+ SENTINEL="/data/.opensleuth-trained"
41
+ if [[ -f "$SENTINEL" && -z "${FORCE_TRAIN:-}" ]]; then
42
+ log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
43
+ sleep infinity
44
+ fi
45
+
46
  log "starting training (PID $$)"
47
  log "GPU info:"
48
  python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')"
49
 
50
+ if python /app/train.py; then
51
+ log "training succeeded; writing sentinel and idling so the container stays alive."
52
+ touch "$SENTINEL"
53
+ sleep infinity
54
+ else
55
+ log "training failed (exit $?); container will exit so the error surfaces."
56
+ exit 1
57
+ fi