Only 3 t/s on dual Sparks with SGLang: what is missing from the command?
I get only 3 t/s on dual Sparks with SGLang. What is missing from the launch command?
# Docker Compose definition for the head node (--node-rank=0) of a two-node
# SGLang deployment (--nnodes=2, --tp-size=2) serving
# huihui-ai/Qwen2.5-72B-Instruct-abliterated on port 30000.
# The second node must run the same command with --node-rank=1.
services:
  sglang:
    image: scitrera/dgx-spark-sglang:0.5.9-dev1-329817e2-t5
    container_name: sglang-hhqwen2572b
    volumes:
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
      # If you use ModelScope, mount this directory as well:
      # - ${HOME}/.cache/modelscope:/root/.cache/modelscope
    restart: always
    network_mode: host  # required by RDMA
    privileged: true  # required by RDMA
    # Without host networking you could publish only the API port instead:
    # ports:
    #   - "30000:30000"
    environment:
      # Mapping form is used so that no literal quote characters end up inside
      # the values (in list form, KEY="value" keeps the quotes in the value).
      CUDA_HOME: /usr/local/cuda
      MPI_HOME: /usr/lib/aarch64-linux-gnu/openmpi
      # NOTE(review): ${HOME} is interpolated by Compose from the HOST
      # environment, and only ~/.cache/huggingface is mounted into the
      # container. Confirm that this NCCL build path exists inside the image,
      # otherwise drop NCCL_HOME and its LD_LIBRARY_PATH entry.
      NCCL_HOME: "${HOME}/nccl/build/"
      # Paths are spelled out literally: Compose resolves $VAR from the host
      # shell / .env file, not from the sibling entries above, so referencing
      # $CUDA_HOME etc. here would expand to empty strings unless the host
      # exports them. The trailing element is the host's LD_LIBRARY_PATH
      # (empty when unset).
      LD_LIBRARY_PATH: "${HOME}/nccl/build/lib:/usr/local/cuda/lib64/:/usr/lib/aarch64-linux-gnu/openmpi/lib:${LD_LIBRARY_PATH:-}"
      # Pin every communication layer to the inter-node interface.
      # NOTE(review): if NCCL falls back to plain TCP sockets on this
      # interface, tensor-parallel decode will be slow. Check the NCCL
      # startup log (NCCL_DEBUG=INFO) to confirm the RDMA transport is
      # selected - TODO confirm.
      UCX_NET_DEVICES: enp1s0f1np1
      NCCL_SOCKET_IFNAME: enp1s0f1np1
      OMPI_MCA_btl_tcp_if_include: enp1s0f1np1
      GLOO_SOCKET_IFNAME: enp1s0f1np1
    entrypoint: ["python3", "-m", "sglang.launch_server"]
    command:
      # NOTE(review): this checkpoint appears to be unquantized (16-bit)
      # weights; decode speed of a dense 72B model is bounded by memory
      # bandwidth, so a quantized checkpoint is probably the biggest
      # throughput lever - TODO confirm against the hardware specs.
      - --model-path=huihui-ai/Qwen2.5-72B-Instruct-abliterated
      - --host=0.0.0.0
      - --port=30000
      - --trust-remote-code
      - --mem-fraction-static=0.95
      # Qwen2.5-72B-Instruct ships with a 32768-token native window; the
      # previous value (262144) belongs to Qwen3-Coder style models and
      # exceeds what this model's config supports.
      - --context-length=32768
      - --attention-backend=triton
      # Qwen2.5 emits JSON tool calls inside <tool_call> tags, which is the
      # format handled by the "qwen" parser ("qwen3_coder" expects the
      # Qwen3-Coder XML format). No --reasoning-parser is set because
      # Qwen2.5-Instruct is not a thinking model.
      - --tool-call-parser=qwen
      - --tp-size=2
      - --nnodes=2
      - --node-rank=0
      - --dist-init-addr=10.0.0.1:50000
    ulimits:
      memlock: -1
      stack: 67108864
    ipc: host
    healthcheck:
      # Probes the server's /health endpoint on the API port.
      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]