Unable to run model

#1
by boshko - opened

4bit quant is working fine, but for this model vLLM is throwing an error (below) on 2 x 3090 with the following start arguments:

 image: vllm/vllm-openai:latest
 ...
 command:
      "--model", "cpatonn/Qwen3-Coder-30B-A3B-Instruct-AWQ-8bit", # Works fine >
      "--served-model-name", "Qwen3-Coder",
      "--max-model-len", "62000", # max context size for this model max-num-seq>
      "--gpu-memory-utilization", "0.90", # percentage of GPU memory usable by >
      "--tensor-parallel-size", "2", # number of GPUs used for inference
      "--dtype", "float16",
      "--enable-auto-tool-choice",
      "--tool-call-parser", "hermes",
      "--max-num-seqs", "4", # maximum number of concurrent inference requests
      "--uvicorn-log-level", "info",

Error:

(VllmWorker TP1 pid=245) INFO 09-02 08:10:07 [compressed_tensors_wNa16.py:95] Using MarlinLinearKernel for CompressedTensorsWNA16
(VllmWorker TP0 pid=244) INFO 09-02 08:10:07 [compressed_tensors_wNa16.py:95] Using MarlinLinearKernel for CompressedTensorsWNA16
(VllmWorker TP0 pid=244) INFO 09-02 08:10:07 [cuda.py:328] Using Flash Attention backend on V1 engine.
(VllmWorker TP1 pid=245) INFO 09-02 08:10:07 [cuda.py:328] Using Flash Attention backend on V1 engine.
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] WorkerProc failed to start.
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] Traceback (most recent call last):
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 533, in worker_main
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     worker = WorkerProc(*args, **kwargs)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 402, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.worker.load_model()
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 212, in load_model
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.model_runner.load_model(eep_scale_up=eep_scale_up)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1986, in load_model
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.model = model_loader.load_model(
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
           ^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 44, in load_model
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     model = initialize_model(vllm_config=vllm_config,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 63, in initialize_model
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     return model_class(vllm_config=vllm_config, prefix=prefix)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 588, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.model = Qwen3MoeModel(vllm_config=vllm_config,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line
183, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 380, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.start_layer, self.end_layer, self.layers = make_layers(
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
                                              ^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py",
line 641, in make_layers
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 382, in <lambda>
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     lambda prefix: Qwen3MoeDecoderLayer(config=config,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 319, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.mlp = Qwen3MoeSparseMoeBlock(config=config,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 138, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.experts = FusedMoE(num_experts=self.n_routed_experts,
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py", line 845, in __init__
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     else quant_config.get_quant_method(self, prefix))
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py", line 121, in get_quant_method
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     return CompressedTensorsMoEMethod.get_moe_method(self, layer)
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 72, in get_moe_method
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
             ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^
(VllmWorker TP1 pid=245) ERROR 09-02 08:10:07 [multiproc_executor.py:559] KeyError: 'Linear'
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] WorkerProc failed to start.
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] Traceback (most recent call last):
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 533, in worker_main
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     worker = WorkerProc(*args, **kwargs)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 402, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.worker.load_model()
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 212, in load_model
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.model_runner.load_model(eep_scale_up=eep_scale_up)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1986, in load_model
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.model = model_loader.load_model(
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
           ^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 44, in load_model
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     model = initialize_model(vllm_config=vllm_config,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 63, in initialize_model
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     return model_class(vllm_config=vllm_config, prefix=prefix)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 588, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.model = Qwen3MoeModel(vllm_config=vllm_config,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line
183, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 380, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.start_layer, self.end_layer, self.layers = make_layers(
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
                                              ^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py",
line 641, in make_layers
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 382, in <lambda>
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     lambda prefix: Qwen3MoeDecoderLayer(config=config,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 319, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.mlp = Qwen3MoeSparseMoeBlock(config=config,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 138, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     self.experts = FusedMoE(num_experts=self.n_routed_experts,
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py", line 845, in __init__
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     else quant_config.get_quant_method(self, prefix))
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py", line 121, in get_quant_method
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     return CompressedTensorsMoEMethod.get_moe_method(self, layer)
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 72, in get_moe_method
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]     weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559]
             ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^
(VllmWorker TP0 pid=244) ERROR 09-02 08:10:07 [multiproc_executor.py:559] KeyError: 'Linear'
4633b3315e44:245:245 [1] NCCL INFO cudaDriverVersion 12090
4633b3315e44:245:245 [1] NCCL INFO Bootstrap: Using eth0:172.18.0.2<0>
4633b3315e44:245:245 [1] NCCL INFO NCCL version 2.26.2+cuda12.2
4633b3315e44:245:245 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin.
4633b3315e44:245:245 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
4633b3315e44:245:245 [1] NCCL INFO NET/Socket : Using [0]eth0:172.18.0.2<0>
4633b3315e44:245:245 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
4633b3315e44:245:245 [1] NCCL INFO Using network Socket
4633b3315e44:245:245 [1] NCCL INFO ncclCommInitRank comm 0x2dcc6e30 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 65000 commId 0xf920a949a8413d48 - Init START
4633b3315e44:245:245 [1] NCCL INFO RAS client listening socket at ::1<28028>
4633b3315e44:245:245 [1] NCCL INFO Bootstrap timings total 0.000827 (create 0.000025, send 0.000075, recv 0.000347, ring 0.000020, delay 0.000000)
4633b3315e44:245:245 [1] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0.
4633b3315e44:245:245 [1] NCCL INFO Setting affinity for GPU 1 to 0f,ffffffff
4633b3315e44:245:245 [1] NCCL INFO comm 0x2dcc6e30 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
4633b3315e44:245:245 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
4633b3315e44:245:245 [1] NCCL INFO P2P Chunksize set to 131072
4633b3315e44:245:309 [1] NCCL INFO [Proxy Service] Device 1 CPU core 12
4633b3315e44:245:310 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 14
4633b3315e44:245:245 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
4633b3315e44:245:245 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
4633b3315e44:245:245 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
4633b3315e44:245:245 [1] NCCL INFO Connected all trees
4633b3315e44:245:312 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 17
4633b3315e44:245:245 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 5124633b3315e44:245:245 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
4633b3315e44:245:245 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
4633b3315e44:245:245 [1] NCCL INFO ncclCommInitRank comm 0x2dcc6e30 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 65000 commId 0xf920a949a8413d48 - Init COMPLETE
4633b3315e44:245:245 [1] NCCL INFO Init timings - ncclCommInitRank: rank 1 nranks 2 total 0.18 (kernels 0.12, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo
0.01, graphs 0.00, connections 0.05, rest 0.01)
(VllmWorker TP0 pid=244) INFO 09-02 08:10:07 [multiproc_executor.py:520] Parent
process exited, terminating worker
(VllmWorker TP1 pid=245) INFO 09-02 08:10:07 [multiproc_executor.py:520] Parent
process exited, terminating worker
4633b3315e44:244:244 [0] NCCL INFO Bootstrap: Using eth0:172.18.0.2<0>
4633b3315e44:244:244 [0] NCCL INFO cudaDriverVersion 12090
4633b3315e44:244:244 [0] NCCL INFO NCCL version 2.26.2+cuda12.2
4633b3315e44:244:244 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin.
4633b3315e44:244:244 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
4633b3315e44:244:244 [0] NCCL INFO NET/Socket : Using [0]eth0:172.18.0.2<0>
4633b3315e44:244:244 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
4633b3315e44:244:244 [0] NCCL INFO Using network Socket
4633b3315e44:244:244 [0] NCCL INFO ncclCommInitRank comm 0x1f0ea920 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 17000 commId 0xf920a949a8413d48 - Init START
4633b3315e44:244:244 [0] NCCL INFO RAS client listening socket at ::1<28028>
4633b3315e44:244:244 [0] NCCL INFO Bootstrap timings total 0.002867 (create 0.000033, send 0.000111, recv 0.002231, ring 0.000019, delay 0.000000)
4633b3315e44:244:244 [0] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0.
4633b3315e44:244:244 [0] NCCL INFO Setting affinity for GPU 0 to 0f,ffffffff
4633b3315e44:244:244 [0] NCCL INFO comm 0x1f0ea920 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
4633b3315e44:244:244 [0] NCCL INFO Channel 00/02 : 0 1
4633b3315e44:244:244 [0] NCCL INFO Channel 01/02 : 0 1
4633b3315e44:244:244 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
4633b3315e44:244:244 [0] NCCL INFO P2P Chunksize set to 131072
4633b3315e44:244:244 [0] NCCL INFO Check P2P Type intraNodeP2pSupport 0 directMode 0
4633b3315e44:244:308 [0] NCCL INFO [Proxy Service] Device 0 CPU core 11
4633b3315e44:244:311 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 16
4633b3315e44:244:244 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
4633b3315e44:244:244 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
4633b3315e44:244:244 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
4633b3315e44:244:244 [0] NCCL INFO Connected all trees
4633b3315e44:244:313 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 27
4633b3315e44:244:244 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 5124633b3315e44:244:244 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
4633b3315e44:244:244 [0] NCCL INFO CC Off, workFifoBytes 1048576
4633b3315e44:244:244 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
4633b3315e44:244:244 [0] NCCL INFO ncclCommInitRank comm 0x1f0ea920 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 17000 commId 0xf920a949a8413d48 - Init COMPLETE
4633b3315e44:244:244 [0] NCCL INFO Init timings - ncclCommInitRank: rank 0 nranks 2 total 0.18 (kernels 0.12, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo
0.01, graphs 0.00, connections 0.05, rest 0.00)
[rank0]:[W902 08:10:08.508797746 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
(EngineCore_0 pid=166) Process EngineCore_0:
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] EngineCore failed to start.
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] Traceback (most recent call last):
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 691, in run_engine_core
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]     engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 492, in __init__
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]     super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 80, in __init__
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]     self.model_executor = executor_class(vllm_config)
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in __init__
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]     self._init_executor()
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 96, in _init_executor
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]     self.workers = WorkerProc.wait_for_ready(unready_workers)
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 472, in
wait_for_ready
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700]     raise e from None
(EngineCore_0 pid=166) ERROR 09-02 08:10:09 [core.py:700] Exception: WorkerProc
initialization failed due to an exception in a background process. See stack trace for root cause.
(EngineCore_0 pid=166) Traceback (most recent call last):
(EngineCore_0 pid=166)   File "/usr/lib/python3.12/multiprocessing/process.py",
line 314, in _bootstrap
(EngineCore_0 pid=166)     self.run()
(EngineCore_0 pid=166)   File "/usr/lib/python3.12/multiprocessing/process.py",
line 108, in run
(EngineCore_0 pid=166)     self._target(*self._args, **self._kwargs)
(EngineCore_0 pid=166)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 704, in run_engine_core
(EngineCore_0 pid=166)     raise e
(EngineCore_0 pid=166)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 691, in run_engine_core
(EngineCore_0 pid=166)     engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=166)                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 492, in __init__
(EngineCore_0 pid=166)     super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=166)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 80, in __init__
(EngineCore_0 pid=166)     self.model_executor = executor_class(vllm_config)
(EngineCore_0 pid=166)                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166)   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in __init__
(EngineCore_0 pid=166)     self._init_executor()
(EngineCore_0 pid=166)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 96, in _init_executor
(EngineCore_0 pid=166)     self.workers = WorkerProc.wait_for_ready(unready_workers)
(EngineCore_0 pid=166)                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=166)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 472, in wait_for_ready
(EngineCore_0 pid=166)     raise e from None
(EngineCore_0 pid=166) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
(APIServer pid=1) Traceback (most recent call last):
(APIServer pid=1)   File "<frozen runpy>", line 198, in _run_module_as_main
(APIServer pid=1)   File "<frozen runpy>", line 88, in _run_code
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1920, in <module>
(APIServer pid=1)     uvloop.run(run_server(args))
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run
(APIServer pid=1)     return __asyncio.run(
(APIServer pid=1)            ^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/lib/python3.12/asyncio/runners.py", line 195, in
run
(APIServer pid=1)     return runner.run(main)
(APIServer pid=1)            ^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/lib/python3.12/asyncio/runners.py", line 118, in
run
(APIServer pid=1)     return self._loop.run_until_complete(task)
(APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper
(APIServer pid=1)     return await main
(APIServer pid=1)            ^^^^^^^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1850, in run_server
(APIServer pid=1)     await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1870, in run_server_worker
(APIServer pid=1)     async with build_async_engine_client(
(APIServer pid=1)                ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=1)     return await anext(self.gen)
(APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 178, in build_async_engine_client
(APIServer pid=1)     async with build_async_engine_client_from_engine_args(
(APIServer pid=1)                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=1)     return await anext(self.gen)
(APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 220, in build_async_engine_client_from_engine_args
(APIServer pid=1)     async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=1)                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 1557, in inner
(APIServer pid=1)     return fn(*args, **kwargs)
(APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 174, in from_vllm_config
(APIServer pid=1)     return cls(
(APIServer pid=1)            ^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 120, in __init__
(APIServer pid=1)     self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=1)                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
(APIServer pid=1)     return AsyncMPClient(*client_args)
(APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 767, in __init__
(APIServer pid=1)     super().__init__(
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 446, in __init__
(APIServer pid=1)     with launch_core_engines(vllm_config, executor_class,
(APIServer pid=1)          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
(APIServer pid=1)     next(self.gen)
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 706, in launch_core_engines
(APIServer pid=1)     wait_for_engine_startup(
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 759, in wait_for_engine_startup
(APIServer pid=1)     raise RuntimeError("Engine core initialization failed. "
(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown
  warnings.warn('resource_tracker: There appear to be %d '
cyankiwi org

Hi @boshko , thank you for bringing this to my attention. I forgot one of the final steps after quantizing the model.

Please redownload the config.json file for this model. It should work fine now, and please let me know if any errors occur.

Thanks, Ton.

Working as expected now, thanks!

boshko changed discussion status to closed

Sign up or log in to comment