The official vLLM example fails with an error during normal inference

#3
by SongXiaoMao - opened

vllm serve /home/cheng/model/Ling-2.6-flash-int4 \
--port 8000 \
--served-model-name gpt-3.5-turbo \
--trust-remote-code --tensor-parallel-size 4 \
--gpu-memory-utilization 0.85


(APIServer pid=15569) INFO: Started server process [15569]
(APIServer pid=15569) INFO: Waiting for application startup.
(APIServer pid=15569) INFO: Application startup complete.
(APIServer pid=15569) INFO: 192.168.66.17:4688 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] WorkerProc hit an exception.
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] Traceback (most recent call last):
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/executor/multiproc_executor.py", line 957, in worker_busy_loop
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output = func(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/worker_base.py", line 337, in execute_model
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.worker.execute_model(scheduler_output)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return func(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_worker.py", line 806, in execute_model
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output = self.model_runner.execute_model(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return func(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_model_runner.py", line 4055, in execute_model
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] model_output = self._model_forward(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_model_runner.py", line 3528, in _model_forward
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.model(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/cuda_graph.py", line 254, in call
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.runnable(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self._call_impl(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return forward_call(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 1175, in forward
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] hidden_states = self.model(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/decorators.py", line 520, in call
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.aot_compiled_fn(self, *args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/_dynamo/aot_compile.py", line 224, in call
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.fn(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 961, in forward
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] def forward(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/caching.py", line 215, in call
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.optimized_call(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "", line 155, in execution_fn
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "", line 5, in __vllm_inlined_submods__1
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/_ops.py", line 1269, in call
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self._op(*args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 472, in linear_attention
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] self._forward(hidden_states=hidden_states, output=output, positions=positions)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 723, in _forward
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] hidden = self._prefill_and_mix_infer(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 750, in prefill_and_mix_infer
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return linear_attention_prefill_and_mix(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 194, in linear_attention_prefill_and_mix
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] out_slice = prefix_fn(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 239, in jit_linear_forward_prefix
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output, kv_history = lightning_attention(
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/lightning_attn.py", line 585, in lightning_attention
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] o, kv = lightning_attention
(q1, k1, v, ed, kv_history)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/autograd/function.py", line 596, in apply
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return super().apply(*args, **kwargs) # type: ignore[misc]
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/lightning_attn.py", line 465, in forward
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] _fwd_kv_parallel[grid](
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/runtime/jit.py", line 370, in
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/runtime/jit.py", line 743, in run
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] launch_metadata = kernel.launch_metadata(grid, stream, *bound_args.values())
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 482, in launch_metadata
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] self.init_handles()
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 456, in init_handles
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] raise
(OutOfResources(self.metadata.shared, max_shared, "shared memory"))
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 448, in raise

(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] raise err
(Worker_TP3 pid=15749) ERROR 04-30 09:19:49 [multiproc_executor.py:962] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 131584, Hardware limit: 101376. Reducing block sizes or num_stages may help.
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] WorkerProc hit an exception.
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] Traceback (most recent call last):
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/executor/multiproc_executor.py", line 957, in worker_busy_loop
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output = func(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/worker_base.py", line 337, in execute_model
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.worker.execute_model(scheduler_output)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return func(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_worker.py", line 806, in execute_model
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output = self.model_runner.execute_model(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return func(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_model_runner.py", line 4055, in execute_model
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] model_output = self._model_forward(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_model_runner.py", line 3528, in _model_forward
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.model(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/cuda_graph.py", line 254, in call
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.runnable(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self._call_impl(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return forward_call(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 1175, in forward
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] hidden_states = self.model(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/decorators.py", line 520, in call
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.aot_compiled_fn(self, *args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/_dynamo/aot_compile.py", line 224, in call
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.fn(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 961, in forward
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] def forward(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/caching.py", line 215, in call
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.optimized_call(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "", line 155, in execution_fn
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "", line 5, in __vllm_inlined_submods__1
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/_ops.py", line 1269, in call
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self._op(*args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 472, in linear_attention
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] self._forward(hidden_states=hidden_states, output=output, positions=positions)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 723, in _forward
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] hidden = self._prefill_and_mix_infer(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 750, in prefill_and_mix_infer
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return linear_attention_prefill_and_mix(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 194, in linear_attention_prefill_and_mix
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] out_slice = prefix_fn(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 239, in jit_linear_forward_prefix
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output, kv_history = lightning_attention(
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/lightning_attn.py", line 585, in lightning_attention
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] o, kv = lightning_attention
(q1, k1, v, ed, kv_history)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/autograd/function.py", line 596, in apply
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return super().apply(*args, **kwargs) # type: ignore[misc]
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/lightning_attn.py", line 465, in forward
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] _fwd_kv_parallel[grid](
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/runtime/jit.py", line 370, in
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/runtime/jit.py", line 743, in run
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] launch_metadata = kernel.launch_metadata(grid, stream, *bound_args.values())
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 482, in launch_metadata
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] self.init_handles()
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 456, in init_handles
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] raise
(OutOfResources(self.metadata.shared, max_shared, "shared memory"))
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 448, in raise

(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] raise err
(Worker_TP2 pid=15748) ERROR 04-30 09:19:49 [multiproc_executor.py:962] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 131584, Hardware limit: 101376. Reducing block sizes or num_stages may help.
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] WorkerProc hit an exception.
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] Traceback (most recent call last):
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/executor/multiproc_executor.py", line 957, in worker_busy_loop
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output = func(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/worker_base.py", line 337, in execute_model
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.worker.execute_model(scheduler_output)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return func(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_worker.py", line 806, in execute_model
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output = self.model_runner.execute_model(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return func(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_model_runner.py", line 4055, in execute_model
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] model_output = self._model_forward(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_model_runner.py", line 3528, in _model_forward
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.model(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/cuda_graph.py", line 254, in call
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.runnable(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self._call_impl(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return forward_call(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 1175, in forward
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] hidden_states = self.model(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/decorators.py", line 520, in call
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.aot_compiled_fn(self, *args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/_dynamo/aot_compile.py", line 224, in call
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.fn(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 961, in forward
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] def forward(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/caching.py", line 215, in call
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.optimized_call(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "", line 155, in execution_fn
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "", line 5, in __vllm_inlined_submods__1
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/_ops.py", line 1269, in call
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self._op(*args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 472, in linear_attention
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] self._forward(hidden_states=hidden_states, output=output, positions=positions)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 723, in _forward
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] hidden = self._prefill_and_mix_infer(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 750, in prefill_and_mix_infer
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return linear_attention_prefill_and_mix(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 194, in linear_attention_prefill_and_mix
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] out_slice = prefix_fn(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 239, in jit_linear_forward_prefix
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output, kv_history = lightning_attention(
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/lightning_attn.py", line 585, in lightning_attention
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] o, kv = lightning_attention
(q1, k1, v, ed, kv_history)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/autograd/function.py", line 596, in apply
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return super().apply(*args, **kwargs) # type: ignore[misc]
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/lightning_attn.py", line 465, in forward
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] _fwd_kv_parallel[grid](
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/runtime/jit.py", line 370, in
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/runtime/jit.py", line 743, in run
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] launch_metadata = kernel.launch_metadata(grid, stream, *bound_args.values())
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 482, in launch_metadata
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] self.init_handles()
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 456, in init_handles
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] raise
(OutOfResources(self.metadata.shared, max_shared, "shared memory"))
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 448, in raise

(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] raise err
(Worker_TP1 pid=15747) ERROR 04-30 09:19:49 [multiproc_executor.py:962] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 131584, Hardware limit: 101376. Reducing block sizes or num_stages may help.
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] WorkerProc hit an exception.
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] Traceback (most recent call last):
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/executor/multiproc_executor.py", line 957, in worker_busy_loop
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output = func(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/worker_base.py", line 337, in execute_model
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.worker.execute_model(scheduler_output)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return func(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_worker.py", line 806, in execute_model
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output = self.model_runner.execute_model(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return func(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_model_runner.py", line 4055, in execute_model
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] model_output = self._model_forward(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/v1/worker/gpu_model_runner.py", line 3528, in _model_forward
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.model(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/cuda_graph.py", line 254, in call
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.runnable(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self._call_impl(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return forward_call(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 1175, in forward
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] hidden_states = self.model(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/decorators.py", line 520, in call
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.aot_compiled_fn(self, *args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/_dynamo/aot_compile.py", line 224, in call
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.fn(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 961, in forward
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] def forward(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/compilation/caching.py", line 215, in call
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self.optimized_call(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "<string>", line 155, in execution_fn
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "<string>", line 5, in __vllm_inlined_submods__1
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/_ops.py", line 1269, in __call__
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return self._op(*args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 472, in linear_attention
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] self._forward(hidden_states=hidden_states, output=output, positions=positions)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 723, in _forward
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] hidden = self._prefill_and_mix_infer(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/models/bailing_moe_linear.py", line 750, in _prefill_and_mix_infer
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return linear_attention_prefill_and_mix(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 194, in linear_attention_prefill_and_mix
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] out_slice = prefix_fn(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/mamba/linear_attn.py", line 239, in jit_linear_forward_prefix
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] output, kv_history = lightning_attention(
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/lightning_attn.py", line 585, in lightning_attention
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] o, kv = lightning_attention_(q1, k1, v, ed, kv_history)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/torch/autograd/function.py", line 596, in apply
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return super().apply(*args, **kwargs) # type: ignore[misc]
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/model/vllm/vllm/model_executor/layers/lightning_attn.py", line 465, in forward
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] _fwd_kv_parallel[grid](
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/runtime/jit.py", line 370, in <lambda>
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/runtime/jit.py", line 743, in run
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] launch_metadata = kernel.launch_metadata(grid, stream, *bound_args.values())
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 482, in launch_metadata
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] self.init_handles()
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 456, in init_handles
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] raise_(OutOfResources(self.metadata.shared, max_shared, "shared memory"))
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] File "/home/cheng/my_ling_env/lib/python3.10/site-packages/triton/compiler/compiler.py", line 448, in raise_
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] raise err
(Worker_TP0 pid=15746) ERROR 04-30 09:19:49 [multiproc_executor.py:962] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 131584, Hardware limit: 101376. Reducing block sizes or num_stages may help.
(EngineCore pid=15691) ERROR 04-30 09:19:49 [dump_input.py:72] Dumping input data for V1 LLM engine (v0.20.1rc1.dev88+g0ab67c022) with config: model='/home/cheng/model/Ling-2.6-flash-int4', speculative_config=None, tokenizer='/home/cheng/model/Ling-2.6-flash-int4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, quantization_config=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=gpt-3.5-turbo, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'ir_enable_torch_wrap': True, 'splitting_ops': ['vllm::unified_attention_with_output', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::gdn_attention_core_xpu', 'vllm::olmo_hybrid_gdn_full_forward', 
'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::deepseek_v4_attention', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_vision_items_per_batch': 0, 'encoder_cudagraph_max_frames_per_batch': None, 'compile_sizes': [], 'compile_ranges_endpoints': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': False, 'static_all_moe_layers': []}, kernel_config=KernelConfig(ir_op_priority=IrOpPriorityConfig(rms_norm=['native']), enable_flashinfer_autotune=True, moe_backend='auto'),
(EngineCore pid=15691) ERROR 04-30 09:19:49 [dump_input.py:79] Dumping scheduler output for model execution: SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=chatcmpl-ac1b9c9911ec3444-b880aab7,prompt_token_ids_len=19,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.7, top_p=0.8, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[156892], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=131053, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([1], [2], [3], [4], [5], [6], [7], [8]),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None)], scheduled_cached_reqs=CachedRequestData(req_ids=[],resumed_req_ids=set(),new_token_ids_lens=[],all_token_ids_lens={},new_block_ids=[],num_computed_tokens=[],num_output_tokens=[]), num_scheduled_tokens={chatcmpl-ac1b9c9911ec3444-b880aab7: 19}, total_num_scheduled_tokens=19, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[0, 0, 0, 0, 0, 0, 0, 0], finished_req_ids=[], free_encoder_mm_hashes=[], preempted_req_ids=[], has_structured_output_requests=false, pending_structured_output_tokens=false, num_invalid_spec_tokens=null, kv_connector_metadata=null, ec_connector_metadata=null, new_block_ids_to_zero=null)
(EngineCore pid=15691) ERROR 04-30 09:19:49 [dump_input.py:81] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, num_skipped_waiting_reqs=0, step_counter=0, current_wave=0, kv_cache_usage=0.0025789813023855412, prefix_cache_stats=PrefixCacheStats(reset=False, requests=0, queries=0, hits=0, preempted_requests=0, preempted_queries=0, preempted_hits=0), connector_prefix_cache_stats=None, kv_cache_eviction_events=[], spec_decoding_stats=None, kv_connector_stats=None, waiting_lora_adapters={}, running_lora_adapters={}, cudagraph_stats=None, perf_stats=None)
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] EngineCore encountered a fatal error.
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] Traceback (most recent call last):
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/home/cheng/model/vllm/vllm/v1/engine/core.py", line 1129, in run_engine_core
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] engine_core.run_busy_loop()
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/home/cheng/model/vllm/vllm/v1/engine/core.py", line 1170, in run_busy_loop
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] self._process_engine_step()
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/home/cheng/model/vllm/vllm/v1/engine/core.py", line 1209, in _process_engine_step
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] outputs, model_executed = self.step_fn()
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/home/cheng/model/vllm/vllm/v1/engine/core.py", line 525, in step_with_batch_queue
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] exec_model_fut.result()
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/home/cheng/model/vllm/vllm/v1/executor/multiproc_executor.py", line 90, in result
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] return super().result()
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/usr/lib/python3.10/concurrent/futures/_base.py", line 451, in result
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] return self.__get_result()
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/usr/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] raise self._exception
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/home/cheng/model/vllm/vllm/v1/executor/multiproc_executor.py", line 94, in _wait_for_response
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] response = self.aggregate(self.get_response())
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] File "/home/cheng/model/vllm/vllm/v1/executor/multiproc_executor.py", line 390, in get_response
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] raise RuntimeError(
(EngineCore pid=15691) ERROR 04-30 09:19:49 [core.py:1138] RuntimeError: Worker failed with error 'out of resource: shared memory, Required: 131584, Hardware limit: 101376. Reducing block sizes or num_stages may help.', please check the stack trace above for the root cause
(APIServer pid=15569) ERROR 04-30 09:19:49 [async_llm.py:699] AsyncLLM output_handler failed.
(APIServer pid=15569) ERROR 04-30 09:19:49 [async_llm.py:699] Traceback (most recent call last):
(APIServer pid=15569) ERROR 04-30 09:19:49 [async_llm.py:699] File "/home/cheng/model/vllm/vllm/v1/engine/async_llm.py", line 655, in output_handler
(APIServer pid=15569) ERROR 04-30 09:19:49 [async_llm.py:699] outputs = await engine_core.get_output_async()
(APIServer pid=15569) ERROR 04-30 09:19:49 [async_llm.py:699] File "/home/cheng/model/vllm/vllm/v1/engine/core_client.py", line 998, in get_output_async
(APIServer pid=15569) ERROR 04-30 09:19:49 [async_llm.py:699] raise self._format_exception(outputs) from None
(APIServer pid=15569) ERROR 04-30 09:19:49 [async_llm.py:699] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
(Worker_TP0 pid=15746) INFO 04-30 09:19:49 [multiproc_executor.py:775] Parent process exited, terminating worker queues
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] Error in chat completion stream generator.
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] Traceback (most recent call last):
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] File "/home/cheng/model/vllm/vllm/entrypoints/openai/chat_completion/serving.py", line 513, in chat_completion_stream_generator
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] async for res in result_generator:
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] File "/home/cheng/model/vllm/vllm/v1/engine/async_llm.py", line 574, in generate
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] out = q.get_nowait() or await q.get()
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] File "/home/cheng/model/vllm/vllm/v1/engine/output_processor.py", line 85, in get
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] raise output
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] File "/home/cheng/model/vllm/vllm/v1/engine/async_llm.py", line 655, in output_handler
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] outputs = await engine_core.get_output_async()
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] File "/home/cheng/model/vllm/vllm/v1/engine/core_client.py", line 998, in get_output_async
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] raise self._format_exception(outputs) from None
(APIServer pid=15569) ERROR 04-30 09:19:49 [serving.py:1137] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
(APIServer pid=15569) INFO: Shutting down
(APIServer pid=15569) INFO: Waiting for application shutdown.
(APIServer pid=15569) INFO: Application shutdown complete.
(APIServer pid=15569) INFO: Finished server process [15569]
(my_ling_env) cheng@cheng:~/model/vllm$ (Worker_TP1 pid=15747) INFO 04-30 09:19:52 [multiproc_executor.py:872] WorkerProc shutting down.
(Worker_TP2 pid=15748) INFO 04-30 09:19:52 [multiproc_executor.py:872] WorkerProc shutting down.
(Worker_TP0 pid=15746) INFO 04-30 09:19:52 [multiproc_executor.py:872] WorkerProc shutting down.
(Worker_TP3 pid=15749) INFO 04-30 09:19:52 [multiproc_executor.py:872] WorkerProc shutting down.
^C
(my_ling_env) cheng@cheng:~/model/vllm$ nvidia-smi
Thu Apr 30 09:20:12 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.126.09 Driver Version: 580.126.09 CUDA Version: 13.0 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 3090 Off | 00000000:18:00.0 Off | N/A |
| 38% 61C P8 33W / 180W | 18725MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA GeForce RTX 3090 Off | 00000000:51:00.0 Off | N/A |
| 36% 57C P8 30W / 180W | 18725MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA GeForce RTX 3090 Off | 00000000:8A:00.0 Off | N/A |
| 36% 46C P8 23W / 180W | 18725MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA GeForce RTX 3090 Off | 00000000:C3:00.0 Off | N/A |
| 36% 59C P8 20W / 180W | 18725MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 2743 G /usr/lib/xorg/Xorg 4MiB |
| 0 N/A N/A 15746 C VLLM::Worker_TP0 18702MiB |
| 1 N/A N/A 2743 G /usr/lib/xorg/Xorg 4MiB |
| 1 N/A N/A 15747 C VLLM::Worker_TP1 18702MiB |
| 2 N/A N/A 2743 G /usr/lib/xorg/Xorg 4MiB |
| 2 N/A N/A 15748 C VLLM::Worker_TP2 18702MiB |
| 3 N/A N/A 2743 G /usr/lib/xorg/Xorg 4MiB |
| 3 N/A N/A 15749 C VLLM::Worker_TP3 18702MiB |
+-----------------------------------------------------------------------------------------+
(my_ling_env) cheng@cheng:~/model/vllm$

Sign up or log in to comment