File size: 4,111 Bytes
d2b2154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env bash
# Boot the two in-Space vLLMs SEQUENTIALLY, then exec uvicorn for the
# physix FastAPI server (which serves the API + the React SPA).
#
# Why sequential, not parallel:
#   The first deploy attempt booted both vLLMs in parallel and the second
#   one died with "No available memory for the cache blocks." Reason: vLLM
#   reads `nvidia-smi`-style free memory at startup and reserves
#   `--gpu-memory-utilization * (free at this moment)` worth of VRAM.
#   When two processes start simultaneously, both see "all 24 GB free" and
#   both try to grab ~10 GB; the second one to finalize loses. Booting
#   sequentially makes the second one observe the post-first-process free
#   memory, so its allocation is sized correctly.
#
# Why --gpu-memory-utilization 0.40 each (= 80% total):
#   On L4 (24 GB), 40% = ~9.6 GB per process. Qwen2.5-3B fp16 weights are
#   ~6.2 GB; that leaves ~3.4 GB per process for KV cache + activations,
#   which sustains max_model_len=4096 with comfortable margin. The 20%
#   reserve covers CUDA workspace + uvicorn + Python heap. Pushing this
#   much higher (e.g. 0.45 each) is what failed on the first deploy
#   because once you account for the ~600 MB CUDA context + the second
#   process's overhead, weights+KV no longer fit.

set -euo pipefail

# All knobs are env-overridable; defaults match the deployed Space.
QWEN_MODEL="${QWEN_MODEL:-Qwen/Qwen2.5-3B-Instruct}"
PHYSIX_MODEL="${PHYSIX_MODEL:-Pratyush-01/physix-3b-rl}"
QWEN_GPU_FRAC="${QWEN_GPU_FRAC:-0.40}"
PHYSIX_GPU_FRAC="${PHYSIX_GPU_FRAC:-0.40}"
MAX_LEN="${MAX_LEN:-4096}"

LOG_DIR=/tmp/logs
mkdir -p "$LOG_DIR"

# Forward signals so HF's "Pause" / "Restart" actually shuts everything
# down cleanly — otherwise CUDA memory leaks across container restarts.
PIDS=()
cleanup() {
    echo "[entrypoint] SIGTERM/SIGINT — killing children: ${PIDS[*]:-}" >&2
    # NB: under `set -u` (and bash < 4.4), expanding an empty array needs the
    # `:-` guard — but that guard yields a single EMPTY word when PIDS=().
    # Skip that placeholder instead of calling `kill -TERM ""`.
    for pid in "${PIDS[@]:-}"; do
        [[ -n "$pid" ]] || continue
        kill -TERM "$pid" 2>/dev/null || true
    done
    # Reap all children so we exit only after they are actually gone.
    wait || true
    exit 0
}
trap cleanup TERM INT

#######################################
# Block until a background service reports healthy.
# Polls http://127.0.0.1:$port/health every 5s; gives up when the backing
# process dies or the time budget runs out, dumping the service log tail
# to stderr in either failure case.
# Arguments: $1 service name (log file stem), $2 port, $3 pid,
#            $4 budget in seconds (default 300)
# Returns:   0 once healthy, 1 on death or timeout
#######################################
wait_healthy() {
    local name="$1" port="$2" pid="$3" budget="${4:-300}"
    local url="http://127.0.0.1:${port}/health"
    local log="$LOG_DIR/${name}.log"
    local -i stop_at=$(( SECONDS + budget ))

    while (( SECONDS < stop_at )); do
        # A dead process can never become healthy — fail fast with its log.
        if ! kill -0 "$pid" 2>/dev/null; then
            echo "[entrypoint] FATAL: $name (pid $pid) died during boot. Tail of log:" >&2
            tail -n 80 "$log" >&2 || true
            return 1
        fi
        if curl -fsS "$url" >/dev/null 2>&1; then
            echo "[entrypoint] $name healthy on :$port (after ${SECONDS}s)"
            return 0
        fi
        sleep 5
    done

    echo "[entrypoint] FATAL: $name failed to become healthy in ${budget}s" >&2
    tail -n 80 "$log" >&2 || true
    return 1
}

#######################################
# Launch one vLLM OpenAI-compatible server in the background, register its
# pid with the signal trap, and block until /health answers (or die trying).
# The two boots below were copy-paste twins; this helper is the single
# source of truth for the vLLM flag set.
# Globals:   MAX_LEN, LOG_DIR (read); PIDS (appended)
# Arguments: $1 name (log file stem), $2 model id, $3 port, $4 gpu fraction
# Returns:   wait_healthy's status (script aborts via set -e on failure)
#######################################
boot_vllm() {
    local name="$1" model="$2" port="$3" gpu_frac="$4"
    # vllm/vllm-openai image only ships `python3` — no `python` symlink.
    python3 -m vllm.entrypoints.openai.api_server \
        --model "$model" \
        --served-model-name "$model" \
        --host 0.0.0.0 --port "$port" \
        --gpu-memory-utilization "$gpu_frac" \
        --max-model-len "$MAX_LEN" \
        --dtype auto \
        --disable-log-requests \
        > "$LOG_DIR/${name}.log" 2>&1 &
    local pid=$!
    PIDS+=("$pid")
    wait_healthy "$name" "$port" "$pid" 300
}

# Boot SEQUENTIALLY (see header comment): the second vLLM must observe the
# post-first-process free VRAM to size its cache allocation correctly.
echo "[entrypoint] step 1/3 — booting vLLM(qwen) = $QWEN_MODEL on :8001 (gpu=${QWEN_GPU_FRAC})"
boot_vllm qwen "$QWEN_MODEL" 8001 "$QWEN_GPU_FRAC"

echo "[entrypoint] step 2/3 — booting vLLM(physix) = $PHYSIX_MODEL on :8002 (gpu=${PHYSIX_GPU_FRAC})"
boot_vllm physix "$PHYSIX_MODEL" 8002 "$PHYSIX_GPU_FRAC"

# Default PORT exactly once: under `set -u`, expanding bare ${PORT} in the
# log line would abort with "PORT: unbound variable" whenever HF doesn't
# inject it — even though the exec line already defended with ${PORT:-7860}.
PORT="${PORT:-7860}"

echo "[entrypoint] step 3/3 — both vLLMs healthy; starting uvicorn on :${PORT}"
# `exec` so uvicorn becomes PID 1's foreground job and HF Spaces sees
# our process as healthy. The trap above forwards termination back to
# the vLLM children when the Space is paused.
exec python3 -m uvicorn _space_app:app \
    --host 0.0.0.0 --port "$PORT" \
    --log-level info