# OpenGrid entrypoint — switches between UI server and GRPO training
set -e

# Prepend a colon-separated list of directories to LD_LIBRARY_PATH and export it.
# Arguments: $1 - directories to prepend (colon-separated); empty means no-op.
# Globals:   LD_LIBRARY_PATH (read, written, exported)
# The ':' separator is only emitted when LD_LIBRARY_PATH already has a value.
# An empty element (such as a trailing ':') is interpreted by the dynamic
# loader as the current working directory, which must not end up on the
# library search path by accident.
prepend_ld_library_path() {
  if [ -z "$1" ]; then
    return 0
  fi
  LD_LIBRARY_PATH="$1${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  export LD_LIBRARY_PATH
}

# Dynamically find all pip-installed NVIDIA library paths.
# Best effort: stderr is discarded and any failure of the python call (missing
# interpreter, exception) yields an empty NVIDIA_LIBS, in which case
# LD_LIBRARY_PATH is left untouched.
NVIDIA_LIBS=$(python -c "
import glob
paths = glob.glob('/home/user/.local/lib/python3.10/site-packages/nvidia/*/lib')
print(':'.join(paths))
" 2>/dev/null || echo "")

if [ -n "$NVIDIA_LIBS" ]; then
  prepend_ld_library_path "$NVIDIA_LIBS"
  echo "Set LD_LIBRARY_PATH with NVIDIA libs: $NVIDIA_LIBS"
fi
# Stop the background UI server if one was started and is still running.
# Globals: UI_PID (read) - PID of the background uvicorn process, if any.
# Installed as the EXIT trap in training mode so the server is not left behind
# when the script terminates early (training failure, INT/TERM).
stop_ui_server() {
  if [ -n "${UI_PID:-}" ] && kill -0 "$UI_PID" 2>/dev/null; then
    kill "$UI_PID" 2>/dev/null || true
    wait "$UI_PID" 2>/dev/null || true
  fi
}

# OPENGRID_MODE selects the behaviour: "training" runs GRPO training alongside
# the UI server; any other value (default "server") runs the UI server only.
MODE="${OPENGRID_MODE:-server}"

if [ "$MODE" = "training" ]; then
  echo "========================================"
  echo " OpenGrid — GRPO Training Mode"
  echo "========================================"

  # Start the UI server in background IMMEDIATELY so HF health check passes.
  # Training output is written to training/outputs/ and the UI will serve it
  # once training completes. The server stays alive throughout training.
  echo "Starting background UI server on port 7860 (health check)..."
  uvicorn app:app --host 0.0.0.0 --port 7860 &
  UI_PID=$!

  # Make sure the background server is reaped on every exit path. The INT/TERM
  # traps turn the signal into a normal exit so the EXIT trap runs.
  # NOTE: the shell runs a trap only after the current foreground command
  # returns, so a signal received during training is acted on once
  # run_training.py has ended.
  trap stop_ui_server EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  # Give server a moment to bind the port before training grabs GPU memory
  sleep 5
  if ! kill -0 "$UI_PID" 2>/dev/null; then
    # Non-fatal: training still runs, but nothing will answer on port 7860.
    echo "Warning: UI server (PID $UI_PID) is not running after startup." >&2
  fi

  # Run training (foreground). Capture the status explicitly instead of relying
  # on 'set -e' so the failure is reported before the script exits.
  train_status=0
  python run_training.py || train_status=$?
  if [ "$train_status" -ne 0 ]; then
    echo "Training failed (exit code $train_status); stopping UI server (PID $UI_PID)." >&2
    exit "$train_status"
  fi

  # Training finished — server is already running, just wait for it
  echo "Training complete. UI server (PID $UI_PID) continues serving results."
  wait "$UI_PID"
else
  echo "========================================"
  echo " OpenGrid — Control Room Server"
  echo "========================================"
  # exec replaces the shell so uvicorn receives signals directly.
  exec uvicorn app:app --host 0.0.0.0 --port 7860
fi