Appreciate the model, but prefix caching is not working
Hello,
I built the custom vLLM as described in the docs. The model is able to run my Hermes, which I find pretty impressive; however, I apparently ran into tool-calling issues with the pi agent.
I also noticed that prefix caching is not working: I saw a message in the vLLM log saying that this model doesn't support prefix caching.
Is this because of the new architecture? Looking forward to the updates!
(APIServer pid=2620540) INFO 05-16 12:47:32 [utils.py:240] non-default args: {'model_tag': 'Zyphra/ZAYA1-8B', 'enable_auto_tool_choice': True, 'tool_call_parser': 'zaya_xml', 'port': 8010, 'model': 'Zyphra/ZAYA1-8B', 'dtype': 'bfloat16', 'max_model_len': 131072, 'reasoning_parser': 'qwen3', 'kv_cache_dtype': 'fp8', 'enable_prefix_caching': True, 'mamba_cache_dtype': 'float32', 'max_num_seqs': 8}
'''
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] EngineCore failed to start.
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] Traceback (most recent call last):
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 1110, in run_engine_core
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] return func(*args, **kwargs)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 876, in init
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] super().init(
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 118, in init
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] self.model_executor = executor_class(vllm_config)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] return func(*args, **kwargs)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/v1/executor/abstract.py", line 109, in init
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] self._init_executor()
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/v1/executor/uniproc_executor.py", line 52, in _init_executor
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] self.driver_worker.load_model()
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 343, in load_model
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] self.model_runner.load_model(load_dummy_weights=load_dummy_weights)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] return func(*args, **kwargs)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4858, in load_model
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] self.model = model_loader.load_model(
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] return func(*args, **kwargs)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] model = initialize_model(
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] return func(*args, **kwargs)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/model_executor/model_loader/utils.py", line 61, in initialize_model
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] model = model_class(vllm_config=vllm_config, prefix=prefix)
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] File "/home/nimish/.vllm/lib/python3.12/site-packages/vllm/model_executor/models/zaya.py", line 773, in init
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] assert not cache_config.enable_prefix_caching, (
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=2620712) ERROR 05-16 12:47:42 [core.py:1136] AssertionError: Zaya currently does not support prefix caching
'''