File size: 1,599 Bytes
1dfed79
 
 
 
 
c7e8b79
 
 
 
 
 
 
 
 
 
 
 
1dfed79
 
 
 
 
 
89992e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dfed79
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
# OpenGrid entrypoint — switches between UI server and GRPO training

set -e

# Prepend a colon-separated list of directories to LD_LIBRARY_PATH and export it.
#   $1 - directories to prepend (colon-separated, expected to be non-empty)
# The ':' separator is only emitted when LD_LIBRARY_PATH already has a value.
# Blindly writing "${new}:${LD_LIBRARY_PATH}" with an empty/unset variable
# leaves a trailing ':' — an empty element, which the dynamic loader treats as
# the current working directory.
_prepend_ld_library_path() {
    local new_dirs=$1
    export LD_LIBRARY_PATH="${new_dirs}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
}

# Collect the lib/ directory of every pip-installed NVIDIA package found under
# the user site-packages directory and join them with ':'.
# NOTE(review): the location is pinned to /home/user and python3.10 — confirm
# this matches the image if the base Python version or user ever changes.
# If python is missing or the snippet fails, fall back to an empty string so
# the entrypoint keeps going without touching LD_LIBRARY_PATH.
NVIDIA_LIBS=$(python -c "
import glob
paths = glob.glob('/home/user/.local/lib/python3.10/site-packages/nvidia/*/lib')
print(':'.join(paths))
" 2>/dev/null || echo "")

if [ -n "$NVIDIA_LIBS" ]; then
    _prepend_ld_library_path "$NVIDIA_LIBS"
    echo "Set LD_LIBRARY_PATH with NVIDIA libs: $NVIDIA_LIBS"
fi

# Run mode comes from OPENGRID_MODE and defaults to "server" when unset/empty.
# Only the exact value "training" selects the training path; every other
# value runs the plain UI server.
MODE="${OPENGRID_MODE:-server}"

if [ "$MODE" = "training" ]; then
    echo "========================================"
    echo "  OpenGrid — GRPO Training Mode"
    echo "========================================"

    # Start the UI server in background IMMEDIATELY so HF health check passes.
    # Training output is written to training/outputs/ and the UI will serve it
    # once training completes. The server stays alive throughout training.
    echo "Starting background UI server on port 7860 (health check)..."
    uvicorn app:app --host 0.0.0.0 --port 7860 &
    UI_PID=$!

    # If the script leaves early (e.g. training fails and `set -e` aborts),
    # stop the background server instead of leaving it orphaned on port 7860.
    # The trap does not call `exit`, so the original exit status is preserved.
    trap 'kill "$UI_PID" 2>/dev/null || true' EXIT

    # Give server a moment to bind the port before training grabs GPU memory
    sleep 5

    # Run training (foreground); a non-zero exit aborts the script via set -e.
    python run_training.py

    # Training finished — server is already running, just wait for it
    echo "Training complete. UI server (PID $UI_PID) continues serving results."

    # Capture the server's status without tripping set -e, then drop the trap:
    # once wait returns the server is gone and there is nothing left to kill.
    ui_status=0
    wait "$UI_PID" || ui_status=$?
    trap - EXIT
    exit "$ui_status"
else
    echo "========================================"
    echo "  OpenGrid — Control Room Server"
    echo "========================================"
    # exec replaces this shell so uvicorn receives signals directly.
    exec uvicorn app:app --host 0.0.0.0 --port 7860
fi