File size: 1,599 Bytes
#!/bin/bash
# OpenGrid entrypoint — switches between UI server and GRPO training
set -e

#######################################
# Prepend a colon-separated directory list to LD_LIBRARY_PATH and export it.
# Does nothing when the list is empty.
# Globals:   LD_LIBRARY_PATH (read, written, exported)
# Arguments: $1 - colon-separated directories to prepend (may be empty)
# Outputs:   one status line on stdout when the variable is changed
# Returns:   0
#######################################
prepend_ld_library_path() {
  local extra_dirs=$1
  [[ -n "$extra_dirs" ]] || return 0
  # Only add the ':' separator when LD_LIBRARY_PATH already has content.
  # A trailing ':' creates an empty entry, which the dynamic loader treats
  # as the current working directory.
  export LD_LIBRARY_PATH="${extra_dirs}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  echo "Set LD_LIBRARY_PATH with NVIDIA libs: $extra_dirs"
}

# Dynamically find all pip-installed NVIDIA library paths.
# Best-effort on purpose: if python is missing or the snippet fails, stderr is
# discarded and NVIDIA_LIBS ends up empty, so LD_LIBRARY_PATH is left alone.
# The result is sorted so the search order is the same on every start.
NVIDIA_LIBS=$(python -c "
import glob
paths = sorted(glob.glob('/home/user/.local/lib/python3.10/site-packages/nvidia/*/lib'))
print(':'.join(paths))
" 2>/dev/null || echo "")
prepend_ld_library_path "$NVIDIA_LIBS"
# Run mode comes from OPENGRID_MODE. "training" runs GRPO training next to a
# background UI server; any other value (or unset) runs only the UI server.
MODE="${OPENGRID_MODE:-server}"

#######################################
# Terminate whichever background children of this script are still alive.
# Installed as the EXIT trap in training mode so the UI server and the
# training process never outlive the entrypoint.
# Globals:   TRAIN_PID, UI_PID (read; either may be unset or empty)
# Returns:   0
#######################################
stop_children() {
  local pid
  for pid in "${TRAIN_PID:-}" "${UI_PID:-}"; do
    if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
      kill "$pid" 2>/dev/null || true
    fi
  done
  # Reap whatever was just signalled; errors here must not mask the
  # script's real exit status.
  wait 2>/dev/null || true
}

if [[ "$MODE" == "training" ]]; then
  echo "========================================"
  echo " OpenGrid — GRPO Training Mode"
  echo "========================================"

  # Install handlers before starting any child. TERM/INT are turned into a
  # normal exit so the EXIT trap performs the cleanup in one place.
  trap stop_children EXIT
  trap 'exit 143' TERM
  trap 'exit 130' INT

  # Start the UI server in background IMMEDIATELY so HF health check passes.
  # Training output is written to training/outputs/ and the UI will serve it
  # once training completes. The server stays alive throughout training.
  echo "Starting background UI server on port 7860 (health check)..."
  uvicorn app:app --host 0.0.0.0 --port 7860 &
  UI_PID=$!

  # Give server a moment to bind the port before training grabs GPU memory
  sleep 5
  if ! kill -0 "$UI_PID" 2>/dev/null; then
    # Training still runs, as before; this only makes the failure visible.
    echo "WARNING: UI server (PID $UI_PID) is not running after startup; port 7860 will not answer." >&2
  fi

  # Run training as a waited-on background job rather than in the foreground:
  # bash only acts on a trapped signal between commands or while blocked in
  # `wait`, so this lets TERM/INT stop training promptly. `<&0` keeps the
  # script's stdin attached instead of the /dev/null default for async jobs.
  # Under `set -e`, a non-zero training status still aborts the script here.
  python run_training.py <&0 &
  TRAIN_PID=$!
  wait "$TRAIN_PID"
  TRAIN_PID=""

  # Training finished — server is already running, just wait for it
  echo "Training complete. UI server (PID $UI_PID) continues serving results."
  wait "$UI_PID"
else
  echo "========================================"
  echo " OpenGrid — Control Room Server"
  echo "========================================"
  # exec replaces the shell, so uvicorn receives signals directly.
  exec uvicorn app:app --host 0.0.0.0 --port 7860
fi
|