K446 committed on
Commit
89992e4
·
1 Parent(s): 81257d9

Fix health check timeout: start UI server in background before training

Browse files
Files changed (2) hide show
  1. entrypoint.sh +17 -1
  2. run_training.py +10 -4
entrypoint.sh CHANGED
@@ -21,7 +21,23 @@ if [ "$MODE" = "training" ]; then
21
  echo "========================================"
22
  echo " OpenGrid — GRPO Training Mode"
23
  echo "========================================"
24
- exec python run_training.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  else
26
  echo "========================================"
27
  echo " OpenGrid — Control Room Server"
 
21
  echo "========================================"
22
  echo " OpenGrid — GRPO Training Mode"
23
  echo "========================================"
24
+
25
+ # Start the UI server in background IMMEDIATELY so HF health check passes.
26
+ # Training output is written to training/outputs/ and the UI will serve it
27
+ # once training completes. The server stays alive throughout training.
28
+ echo "Starting background UI server on port 7860 (health check)..."
29
+ uvicorn app:app --host 0.0.0.0 --port 7860 &
30
+ UI_PID=$!
31
+
32
+ # Give server a moment to bind the port before training grabs GPU memory
33
+ sleep 5
34
+
35
+ # Run training (foreground)
36
+ python run_training.py
37
+
38
+ # Training finished — server is already running, just wait for it
39
+ echo "Training complete. UI server (PID $UI_PID) continues serving results."
40
+ wait $UI_PID
41
  else
42
  echo "========================================"
43
  echo " OpenGrid — Control Room Server"
run_training.py CHANGED
@@ -414,7 +414,13 @@ if __name__ == "__main__":
414
 
415
  # Start the full UI server (not a mini results server)
416
  # This serves the control room + training results on port 7860
417
- print("\nTraining done. Starting full UI server on port 7860...")
418
- import uvicorn
419
- from app import app
420
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
414
 
415
  # Start the full UI server (not a mini results server)
416
  # This serves the control room + training results on port 7860
417
+ # NOTE: In training mode, entrypoint.sh starts the server in background
418
+ # before training. This block is kept for standalone execution only.
419
+ if os.environ.get("OPENGRID_MODE") != "training":
420
+ print("\nTraining done. Starting full UI server on port 7860...")
421
+ import uvicorn
422
+ from app import app
423
+ uvicorn.run(app, host="0.0.0.0", port=7860)
424
+ else:
425
+ print("\nTraining done. UI server already running in background.")
426
+