Unable to run model
#1
by boshko - opened
The 4-bit quant works fine, but with this model vLLM throws the error below on 2 x 3090 with the following start arguments:
image: vllm/vllm-openai:latest
...
command:
"--model", "cpatonn/Qwen3-Coder-30B-A3B-Instruct-AWQ-8bit", # Works fine >
"--served-model-name", "Qwen3-Coder",
"--max-model-len", "62000", # max context size for this model max-num-seq>
"--gpu-memory-utilization", "0.90", # percentage of GPU memory usable by >
"--tensor-parallel-size", "2", # number of GPUs used for inference
"--dtype", "float16",
"--enable-auto-tool-choice",
"--tool-call-parser", "hermes",
"--max-num-seqs", "4", # maximum number of concurrent inference requests
"--uvicorn-log-level", "info",
Error:
(VllmWorker TP1 pid=245) INFO 09-02 08:10:07 [compressed_tensors_wNa16.py:95] Using MarlinLinearKernel for CompressedTensorsWNA16
(VllmWorker TP0 pid=244) INFO 09-02 08:10:07 [compressed_tensors_wNa16.py:95] Using MarlinLinearKernel for CompressedTensorsWNA16
(VllmWorker TP0 pid=244) INFO 09-02 08:10:07 [cuda.py:328] Using Flash Attention backend on V1 engine.
(VllmWorker TP1 pid=245) INFO 09-02 08:10:07 [cuda.py:328] Using Flash Attention backend on V1 engine.
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] WorkerProc failed to start.
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] Traceback (most recent call last):
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 533, in worker_main
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] worker = WorkerProc(*args, **kwargs)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 402, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.worker.load_model()
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 212, in load_model
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.model_runner.load_model(eep_scale_up=eep_scale_up)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1986, in load_model
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.model = model_loader.load_model(
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 44, in load_model
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] model = initialize_model(vllm_config=vllm_config,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 63, in initialize_model
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] return model_class(vllm_config=vllm_config, prefix=prefix)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 588, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.model = Qwen3MoeModel(vllm_config=vllm_config,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 183, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 380, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.start_layer, self.end_layer, self.layers = make_layers(
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 641, in make_layers
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 382, in <lambda>
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] lambda prefix: Qwen3MoeDecoderLayer(config=config,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 319, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.mlp = Qwen3MoeSparseMoeBlock(config=config,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 138, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.experts = FusedMoE(num_experts=self.n_routed_experts,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py", line 845, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] else quant_config.get_quant_method(self, prefix))
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py", line 121, in get_quant_method
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] return CompressedTensorsMoEMethod.get_moe_method(self, layer)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 72, in get_moe_method
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] KeyError: 'Linear'
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] WorkerProc failed to start.
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] Traceback (most recent call last):
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 533, in worker_main
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] worker = WorkerProc(*args, **kwargs)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 402, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.worker.load_model()
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 212, in load_model
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.model_runner.load_model(eep_scale_up=eep_scale_up)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1986, in load_model
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.model = model_loader.load_model(
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 44, in load_model
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] model = initialize_model(vllm_config=vllm_config,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 63, in initialize_model
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] return model_class(vllm_config=vllm_config, prefix=prefix)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 588, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.model = Qwen3MoeModel(vllm_config=vllm_config,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 183, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 380, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.start_layer, self.end_layer, self.layers = make_layers(
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 641, in make_layers
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 382, in <lambda>
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] lambda prefix: Qwen3MoeDecoderLayer(config=config,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 319, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.mlp = Qwen3MoeSparseMoeBlock(config=config,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 138, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] self.experts = FusedMoE(num_experts=self.n_routed_experts,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py", line 845, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] else quant_config.get_quant_method(self, prefix))
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py", line 121, in get_quant_method
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] return CompressedTensorsMoEMethod.get_moe_method(self, layer)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 72, in get_moe_method
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] KeyError: 'Linear'
4633b3315e44:245:245 [1] NCCL INFO cudaDriverVersion 12090
4633b3315e44:245:245 [1] NCCL INFO Bootstrap: Using eth0:172.18.0.2<0>
4633b3315e44:245:245 [1] NCCL INFO NCCL version 2.26.2+cuda12.2
4633b3315e44:245:245 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin.
4633b3315e44:245:245 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
4633b3315e44:245:245 [1] NCCL INFO NET/Socket : Using [0]eth0:172.18.0.2<0>
4633b3315e44:245:245 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
4633b3315e44:245:245 [1] NCCL INFO Using network Socket
4633b3315e44:245:245 [1] NCCL INFO ncclCommInitRank comm 0x2dcc6e30 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 65000 commId 0xf920a949a8413d48 - Init START
4633b3315e44:245:245 [1] NCCL INFO RAS client listening socket at ::1<28028>
4633b3315e44:245:245 [1] NCCL INFO Bootstrap timings total 0.000827 (create 0.000025, send 0.000075, recv 0.000347, ring 0.000020, delay 0.000000)
4633b3315e44:245:245 [1] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0.
4633b3315e44:245:245 [1] NCCL INFO Setting affinity for GPU 1 to 0f,ffffffff
4633b3315e44:245:245 [1] NCCL INFO comm 0x2dcc6e30 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
4633b3315e44:245:245 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
4633b3315e44:245:245 [1] NCCL INFO P2P Chunksize set to 131072
4633b3315e44:245:309 [1] NCCL INFO [Proxy Service] Device 1 CPU core 12
4633b3315e44:245:310 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 14
4633b3315e44:245:245 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
4633b3315e44:245:245 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
4633b3315e44:245:245 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
4633b3315e44:245:245 [1] NCCL INFO Connected all trees
4633b3315e44:245:312 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 17
4633b3315e44:245:245 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
4633b3315e44:245:245 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
4633b3315e44:245:245 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
4633b3315e44:245:245 [1] NCCL INFO ncclCommInitRank comm 0x2dcc6e30 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 65000 commId 0xf920a949a8413d48 - Init COMPLETE
4633b3315e44:245:245 [1] NCCL INFO Init timings - ncclCommInitRank: rank 1 nranks 2 total 0.18 (kernels 0.12, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.05, rest 0.01)
(VllmWorker TP0 pid=244) INFO 09-02 08:10:07 [multiproc_executor.py:520] Parent
process exited, terminating worker
(VllmWorker TP1 pid=245) INFO 09-02 08:10:07 [multiproc_executor.py:520] Parent
process exited, terminating worker
4633b3315e44:244:244 [0] NCCL INFO Bootstrap: Using eth0:172.18.0.2<0>
4633b3315e44:244:244 [0] NCCL INFO cudaDriverVersion 12090
4633b3315e44:244:244 [0] NCCL INFO NCCL version 2.26.2+cuda12.2
4633b3315e44:244:244 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin.
4633b3315e44:244:244 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
4633b3315e44:244:244 [0] NCCL INFO NET/Socket : Using [0]eth0:172.18.0.2<0>
4633b3315e44:244:244 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
4633b3315e44:244:244 [0] NCCL INFO Using network Socket
4633b3315e44:244:244 [0] NCCL INFO ncclCommInitRank comm 0x1f0ea920 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 17000 commId 0xf920a949a8413d48 - Init START
4633b3315e44:244:244 [0] NCCL INFO RAS client listening socket at ::1<28028>
4633b3315e44:244:244 [0] NCCL INFO Bootstrap timings total 0.002867 (create 0.000033, send 0.000111, recv 0.002231, ring 0.000019, delay 0.000000)
4633b3315e44:244:244 [0] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0.
4633b3315e44:244:244 [0] NCCL INFO Setting affinity for GPU 0 to 0f,ffffffff
4633b3315e44:244:244 [0] NCCL INFO comm 0x1f0ea920 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
4633b3315e44:244:244 [0] NCCL INFO Channel 00/02 : 0 1
4633b3315e44:244:244 [0] NCCL INFO Channel 01/02 : 0 1
4633b3315e44:244:244 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
4633b3315e44:244:244 [0] NCCL INFO P2P Chunksize set to 131072
4633b3315e44:244:244 [0] NCCL INFO Check P2P Type intraNodeP2pSupport 0 directMode 0
4633b3315e44:244:308 [0] NCCL INFO [Proxy Service] Device 0 CPU core 11
4633b3315e44:244:311 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 16
4633b3315e44:244:244 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
4633b3315e44:244:244 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
4633b3315e44:244:244 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
4633b3315e44:244:244 [0] NCCL INFO Connected all trees
4633b3315e44:244:313 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 27
4633b3315e44:244:244 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
4633b3315e44:244:244 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
4633b3315e44:244:244 [0] NCCL INFO CC Off, workFifoBytes 1048576
4633b3315e44:244:244 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
4633b3315e44:244:244 [0] NCCL INFO ncclCommInitRank comm 0x1f0ea920 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 17000 commId 0xf920a949a8413d48 - Init COMPLETE
4633b3315e44:244:244 [0] NCCL INFO Init timings - ncclCommInitRank: rank 0 nranks 2 total 0.18 (kernels 0.12, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.05, rest 0.00)
[rank0]:[W902 08:10:08.508797746 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
(EngineCore_0 pid=166) Process EngineCore_0:
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] EngineCore failed to start.
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] Traceback (most recent call last):
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 691, in run_engine_core
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 492, in __init__
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 80, in __init__
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] self.model_executor = executor_class(vllm_config)
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]
^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in __init__
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] self._init_executor()
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 96, in _init_executor
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] self.workers = WorkerProc.wait_for_ready(unready_workers)
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 472, in
wait_for_ready
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] raise e from None
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
(EngineCore_0 pid=166) Traceback (most recent call last):
(EngineCore_0 pid=166) File "/usr/lib/python3.12/multiprocessing/process.py",
line 314, in _bootstrap
(EngineCore_0 pid=166) self.run()
(EngineCore_0 pid=166) File "/usr/lib/python3.12/multiprocessing/process.py",
line 108, in run
(EngineCore_0 pid=166) self._target(*self._args, **self._kwargs)
(EngineCore_0 pid=166) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 704, in run_engine_core
(EngineCore_0 pid=166) raise e
(EngineCore_0 pid=166) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 691, in run_engine_core
(EngineCore_0 pid=166) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=166) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 492, in __init__
(EngineCore_0 pid=166) super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=166) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 80, in __init__
(EngineCore_0 pid=166) self.model_executor = executor_class(vllm_config)
(EngineCore_0 pid=166) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in __init__
(EngineCore_0 pid=166) self._init_executor()
(EngineCore_0 pid=166) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 96, in _init_executor
(EngineCore_0 pid=166) self.workers = WorkerProc.wait_for_ready(unready_workers)
(EngineCore_0 pid=166) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 472, in wait_for_ready
(EngineCore_0 pid=166) raise e from None
(EngineCore_0 pid=166) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
(APIServer pid=1) Traceback (most recent call last):
(APIServer pid=1) File "<frozen runpy>", line 198, in _run_module_as_main
(APIServer pid=1) File "<frozen runpy>", line 88, in _run_code
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1920, in <module>
(APIServer pid=1) uvloop.run(run_server(args))
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run
(APIServer pid=1) return __asyncio.run(
(APIServer pid=1) ^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in
run
(APIServer pid=1) return runner.run(main)
(APIServer pid=1) ^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in
run
(APIServer pid=1) return self._loop.run_until_complete(task)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper
(APIServer pid=1) return await main
(APIServer pid=1) ^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1850, in run_server
(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1870, in run_server_worker
(APIServer pid=1) async with build_async_engine_client(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=1) return await anext(self.gen)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 178, in build_async_engine_client
(APIServer pid=1) async with build_async_engine_client_from_engine_args(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=1) return await anext(self.gen)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 220, in build_async_engine_client_from_engine_args
(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 1557, in inner
(APIServer pid=1) return fn(*args, **kwargs)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 174, in from_vllm_config
(APIServer pid=1) return cls(
(APIServer pid=1) ^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 120, in __init__
(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
(APIServer pid=1) return AsyncMPClient(*client_args)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 767, in __init__
(APIServer pid=1) super().__init__(
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 446, in __init__
(APIServer pid=1) with launch_core_engines(vllm_config, executor_class,
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
(APIServer pid=1) next(self.gen)
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 706, in launch_core_engines
(APIServer pid=1) wait_for_engine_startup(
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 759, in wait_for_engine_startup
(APIServer pid=1) raise RuntimeError("Engine core initialization failed. "
(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
Working as expected now, thanks!
boshko changed discussion status to closed