Spaces:

anugrah55
/

opensleuth-training-gemini-cli

Paused

anugrah55 committed 13 days ago

Commit

8c92f05

verified ·

1 Parent(s): ae04e19

entrypoint: skip retraining if sentinel exists, idle on heartbeat after training succeeds (prevents auto-restart loop burning GPU)

Files changed (1) hide show

entrypoint.sh CHANGED Viewed

@@ -33,10 +33,25 @@ HB_PID=$!
 # Give the heartbeat a moment to bind before the orchestrator probes it.
 sleep 2
-# 2. Run training in the foreground. Crash here = container exits, which is
-#    what we want: HF will mark the Space failed and surface the error.
 log "starting training (PID $$)"
 log "GPU info:"
 python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')"
-exec python /app/train.py

 # Give the heartbeat a moment to bind before the orchestrator probes it.
 sleep 2
+# 2. Run training. If a sentinel file exists, training already completed in a
+#    previous container start (the Space orchestrator restarts containers
+#    that exit cleanly), so just idle on the heartbeat to avoid wasting GPU
+#    on duplicate runs. Set FORCE_TRAIN=1 to override.
+SENTINEL="/data/.opensleuth-trained"
+if [[ -f "$SENTINEL" && -z "${FORCE_TRAIN:-}" ]]; then
+    log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
+    sleep infinity
+fi
 log "starting training (PID $$)"
 log "GPU info:"
 python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')"
+if python /app/train.py; then
+    log "training succeeded; writing sentinel and idling so the container stays alive."
+    touch "$SENTINEL"
+    sleep infinity
+else
+    log "training failed (exit $?); container will exit so the error surfaces."
+    exit 1
+fi