Fix health check timeout: start UI server in background before training
Browse files
- entrypoint.sh +17 -1
- run_training.py +10 -4
entrypoint.sh
CHANGED
|
@@ -21,7 +21,23 @@ if [ "$MODE" = "training" ]; then
|
|
| 21 |
echo "========================================"
|
| 22 |
echo " OpenGrid — GRPO Training Mode"
|
| 23 |
echo "========================================"
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
else
|
| 26 |
echo "========================================"
|
| 27 |
echo " OpenGrid — Control Room Server"
|
|
|
|
| 21 |
echo "========================================"
|
| 22 |
echo " OpenGrid — GRPO Training Mode"
|
| 23 |
echo "========================================"
|
| 24 |
+
|
| 25 |
+
# Start the UI server in background IMMEDIATELY so HF health check passes.
|
| 26 |
+
# Training output is written to training/outputs/ and the UI will serve it
|
| 27 |
+
# once training completes. The server stays alive throughout training.
|
| 28 |
+
echo "Starting background UI server on port 7860 (health check)..."
|
| 29 |
+
uvicorn app:app --host 0.0.0.0 --port 7860 &
|
| 30 |
+
UI_PID=$!
|
| 31 |
+
|
| 32 |
+
# Give server a moment to bind the port before training grabs GPU memory
|
| 33 |
+
sleep 5
|
| 34 |
+
|
| 35 |
+
# Run training (foreground)
|
| 36 |
+
python run_training.py
|
| 37 |
+
|
| 38 |
+
# Training finished — server is already running, just wait for it
|
| 39 |
+
echo "Training complete. UI server (PID $UI_PID) continues serving results."
|
| 40 |
+
wait $UI_PID
|
| 41 |
else
|
| 42 |
echo "========================================"
|
| 43 |
echo " OpenGrid — Control Room Server"
|
run_training.py
CHANGED
|
@@ -414,7 +414,13 @@ if __name__ == "__main__":
|
|
| 414 |
|
| 415 |
# Start the full UI server (not a mini results server)
|
| 416 |
# This serves the control room + training results on port 7860
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
|
| 415 |
# Start the full UI server (not a mini results server)
|
| 416 |
# This serves the control room + training results on port 7860
|
| 417 |
+
# NOTE: In training mode, entrypoint.sh starts the server in background
|
| 418 |
+
# before training. This block is kept for standalone execution only.
|
| 419 |
+
if os.environ.get("OPENGRID_MODE") != "training":
|
| 420 |
+
print("\nTraining done. Starting full UI server on port 7860...")
|
| 421 |
+
import uvicorn
|
| 422 |
+
from app import app
|
| 423 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 424 |
+
else:
|
| 425 |
+
print("\nTraining done. UI server already running in background.")
|
| 426 |
+
|