Issue with running vLLM
I was able to run the model, but the output is not coming out correctly. I suspect something is wrong with the input tensor.
log
(EngineCore_DP0 pid=35123) /workspace/user/.venv/lib/python3.12/site-packages/vllm/model_executor/layers/fla/ops/utils.py:113: UserWarning: Input tensor shape suggests potential format mismatch: seq_len (16) < num_heads (64). This may indicate the inputs were passed in head-first format [B, H, T, ...] when head_first=False was specified. Please verify your input tensor format matches the expected shape [B, T, H, ...].
(EngineCore_DP0 pid=35123) return fn(*contiguous_args, **contiguous_kwargs)
(EngineCore_DP0 pid=35123) /workspace/user/.venv/lib/python3.12/site-packages/vllm/model_executor/layers/fla/ops/utils.py:113: UserWarning: Input tensor shape suggests potential format mismatch: seq_len (16) < num_heads (64). This may indicate the inputs were passed in head-first format [B, H, T, ...] when head_first=False was specified. Please verify your input tensor format matches the expected shape [B, T, H, ...].
(EngineCore_DP0 pid=35123) return fn(*contiguous_args, **contiguous_kwargs)
command
vllm serve Qwen/Qwen3.5-122B-A10B-FP8 --port 8070 --gpu-memory-utilization 0.95 --served-model-name qwen --mm-encoder-tp-mode data --allowed-local-media-path /workspace --attention-backend FLASH_ATTN
I've followed all the instructions you gave.
I'm running the vLLM nightly build.
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293]
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293] β β ββ ββ
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293] ββ ββ β β β βββ β version 0.16.0rc2.dev496+g4a9c07a0a
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293] ββββ β β β β model Qwen/Qwen3.5-122B-A10B-FP8
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293] ββ βββββ βββββ β β
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293]
output
root@9e309957d443:/workspace/user# curl http://localhost:8070/v1/chat/completions
-H "Content-Type: application/json"
-d '{
"model": "qwen",
"messages": [{"role": "user", "content": "Say hello in one word."}],
"max_tokens": 20,
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20
}'
{"id":"chatcmpl-b8456ddecf863d54","object":"chat.completion","created":1772092311,"model":"qwen","choices":[{"index":0,"message":{"role":"assistant","content":"do\n\n\n\n","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning":null},"logprobs":null,"finish_reason":"stop","stop_reason":null,"token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":16,"total_tokens":20,"completion_tokens":4,"prompt_tokens_details":null},"prompt_logprobs":null,"prompt_token_ids":null,"kv_transfer_params":null}
I'm looking for your help.
Create a file Makefile:
# Load .env if present (API_KEY, HF_TOKEN, etc.)
-include .env

# --- Tunables; every ?= can be overridden from .env or the command line,
# --- e.g. `make PORT=9000 start`.
IMAGE ?= vllm/vllm-openai:qwen3_5
API_KEY ?= local-dev-key
GPU_MEM_UTIL ?= 0.92
GPU_NO ?= 0
# := expands once at parse time, so nvidia-smi runs exactly once.
GPU_UUID := $(shell nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader,nounits -i $(GPU_NO))
# Quotes are part of the value here on purpose: the recipe passes
# `--gpus $(GPU_DEVICE)` unquoted and the shell consumes them.
GPU_DEVICE ?= "device=$(GPU_UUID)"
HF_HOME ?= /var/lib/docker/container_volumes/hf_models
VLLM_CACHE_ROOT ?= $(HF_HOME)/vllm_cache
CONTAINER ?= qwen3.5
PORT ?= 8909
MODEL ?= Qwen/Qwen3.5-122B-A10B-FP8
SERVED_NAME ?= qwen3.5-122B-A10B-FP8
SHM_SIZE ?= 32g
MAX_SEQS ?= 1
BATCH_TOKS ?= 4176
DTYPE ?= bfloat16
# No embedded quotes: the recipe already quotes this ("$(ATTN_BACKEND)");
# quoting in both places nests quotes and breaks if the value ever has spaces.
ATTN_BACKEND ?= FLASH_ATTN
TP ?= 1
SEED ?= 42
DATA_DIR ?= /var/lib/docker/projects/sample_data
EAGER_ENABLED ?= 0
# RoPE scaling: ROPE_ENABLED=1 stretches the context window by
# ROPE_SCALING_FACTOR via --hf-overrides in the start recipe.
ROPE_ENABLED ?= 0
ROPE_SCALING_FACTOR ?= 2.0
ROPE_SCALING_JSON = {"text_config": {"rope_parameters": {"mrope_interleaved": true, "mrope_section": [11, 11, 10], "rope_type": "linear", "rope_theta": 10000000, "partial_rotary_factor": 0.25, "factor": $(ROPE_SCALING_FACTOR), "original_max_position_embeddings": 262144}}}
# bc's "/ 1" truncates the (possibly fractional) product to an integer.
MAX_LEN ?= $(if $(filter 1,$(ROPE_ENABLED)),$(shell echo "$(ROPE_SCALING_FACTOR) * 262144 / 1" | bc),262144)

.PHONY: start stop logs status test
# Launch the vLLM OpenAI-compatible server for $(MODEL) in a detached
# container named $(CONTAINER). Any same-named container is removed first,
# so `make start` is idempotent. The GPU is pinned by UUID via
# $(GPU_DEVICE); the HF and vLLM caches are bind-mounted so downloads and
# compiled artifacts survive restarts. $(DATA_DIR) is mounted read-only at
# /sample_data and whitelisted with --allowed-local-media-path so
# multimodal requests may reference files there. EAGER_ENABLED=1 adds
# --enforce-eager (skip CUDA graph capture); ROPE_ENABLED=1 injects
# $(ROPE_SCALING_JSON) through --hf-overrides to extend the context window.
start:
	docker rm -f $(CONTAINER) >/dev/null 2>&1 || true
	docker run -d --name $(CONTAINER) \
	  --label "host.pwd=$$PWD" \
	  --gpus $(GPU_DEVICE) --ipc=host --shm-size $(SHM_SIZE) \
	  --ulimit memlock=-1 --ulimit stack=67108864 \
	  --restart unless-stopped \
	  -p $(PORT):8000 \
	  -e HF_HOME="$(HF_HOME)" \
	  -e HF_TOKEN \
	  -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
	  -e TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1 \
	  -e VLLM_CACHE_ROOT="$(VLLM_CACHE_ROOT)" \
	  -v $(HF_HOME):$(HF_HOME) \
	  -v $(VLLM_CACHE_ROOT):$(VLLM_CACHE_ROOT) \
	  -v $(DATA_DIR):/sample_data:ro \
	  $(IMAGE) \
	  --model "$(MODEL)" \
	  --served-model-name "$(SERVED_NAME)" \
	  --host 0.0.0.0 --port 8000 \
	  --api-key "$(API_KEY)" \
	  --dtype "$(DTYPE)" \
	  --gpu-memory-utilization "$(GPU_MEM_UTIL)" \
	  --max-model-len "$(MAX_LEN)" \
	  --enable-chunked-prefill \
	  --max-num-seqs "$(MAX_SEQS)" \
	  --max-num-batched-tokens "$(BATCH_TOKS)" \
	  --attention-backend "$(ATTN_BACKEND)" \
	  --mm-processor-kwargs '{"size": {"longest_edge": 234881024, "shortest_edge": 4096}}' \
	  --enable-auto-tool-choice \
	  --reasoning-parser deepseek_r1 \
	  --tool-call-parser qwen3_coder \
	  --enable-prefix-caching \
	  $(if $(filter 1,$(EAGER_ENABLED)),--enforce-eager) \
	  -tp $(TP) \
	  --seed $(SEED) \
	  --allowed-local-media-path /sample_data \
	  --media-io-kwargs '{"video":{"num_frames":-1}}' \
	  $(if $(filter 1,$(ROPE_ENABLED)),--hf-overrides '$(ROPE_SCALING_JSON)')
# Stop and remove the server container.
stop:
	docker rm --force $(CONTAINER)
# Tail the container's logs until interrupted.
logs:
	docker logs --follow $(CONTAINER)
# Show the container's state and probe the OpenAI /v1/models endpoint.
# Pretty-print with jq BEFORE truncating: the old `head -c 1200 | jq`
# handed jq a truncated JSON document, which fails to parse whenever the
# response exceeds 1200 bytes.
status:
	@echo "== $(CONTAINER) =="
	@docker ps --filter "name=$(CONTAINER)"
	@echo
	@curl -H "Authorization: Bearer $(API_KEY)" -s http://localhost:$(PORT)/v1/models | jq . | head -c 1200 || true
	@echo
# Smoke-test the running server. First attempt runs the suite through uvx
# (isolated env providing the `openai` package); if that fails, fall back
# to `uv run` on the local script, which reads its own inline deps.
test:
	OPENAI_BASE_URL=http://127.0.0.1:$(PORT)/v1 OPENAI_API_KEY=$(API_KEY) uvx --from openai python -m smoke_test || \
	OPENAI_BASE_URL=http://127.0.0.1:$(PORT)/v1 OPENAI_API_KEY=$(API_KEY) uv run smoke_test.py
Set the correct HF_HOME and PORT for your environment etc.
This solution assumes Docker + NVIDIA Container Toolkit installed on Linux.
Then run: make start -> starts the container, loads the image and model, and brings up the inference server.
Then run: make logs -> follows the logs
Then run: make stop -> stops the server
It should work out of the box. Please tell me if something is wrong. I'm still working on RoPE.
You can verify every model feature locally by running: uv run smoke_test.py all
Save as smoke_test.py
# /// script
# requires-python = ">=3.10"
# dependencies = ["openai"]
# ///
"""Qwen3.5 smoke test β vision + tool-calling via OpenAI-compatible API.
Usage:
    uv run smoke_test.py [vision|tools|reasoning|nonthinking|all]
Environment (loaded from .env, or set manually):
OPENAI_BASE_URL e.g. http://127.0.0.1:8909/v1
OPENAI_API_KEY e.g. local-dev-key
"""
import base64
import json
import os
import random
import shlex
import subprocess
import sys
from pathlib import Path

from openai import OpenAI
def _load_dotenv(path: Path = Path(__file__).parent / ".env") -> None:
"""Minimal .env loader β no external deps."""
if not path.is_file():
return
for line in path.read_text().splitlines():
line = line.strip()
if not line or line.startswith("#"):
continue
key, _, val = line.partition("=")
key, val = key.strip(), val.strip()
if key:
os.environ.setdefault(key, val)
_load_dotenv()
# ---------------------------------------------------------------------------
# Tool implementations
# ---------------------------------------------------------------------------
def add_number(a: str, b: str) -> float:
    """Return the sum of two numeric strings as a float."""
    return sum((float(a), float(b)))
def multiply_number(a: str, b: str) -> float:
    """Return the product of two numeric strings as a float."""
    x, y = float(a), float(b)
    return x * y
def substract_number(a: str, b: str) -> float:
    """Return a minus b, parsing both numeric strings as floats."""
    minuend, subtrahend = float(a), float(b)
    return minuend - subtrahend
def write_a_story() -> str:
    """Return one of a fixed set of story openers, chosen uniformly at random."""
    openers = (
        "A long time ago in a galaxy far far away...",
        "There were 2 friends who loved sloths and code...",
        "The world was ending because every sloth evolved to have superhuman intelligence...",
        "Unbeknownst to one friend, the other accidentally coded a program to evolve sloths...",
    )
    return random.choice(openers)
def terminal(command: str) -> str:
    """Run a shell command and return its stdout.

    Commands invoking a blocked program (rm, sudo, dd, chmod) are refused.
    The command is tokenised and tokens compared exactly: the previous
    substring check (`kw in command`) also refused harmless commands such
    as ``echo address`` ("dd" is a substring of "address").

    Args:
        command: The shell command line to execute (run with shell=True).

    Returns:
        The command's stdout on success, the refusal message for a blocked
        command, or ``"Command failed: <stderr>"`` on a non-zero exit.
    """
    blocked = ["rm", "sudo", "dd", "chmod"]
    try:
        tokens = shlex.split(command)
    except ValueError:
        # Unbalanced quotes etc. — fall back to a plain whitespace split.
        tokens = command.split()
    if any(tok in blocked for tok in tokens):
        msg = f"Cannot execute commands containing {blocked} since they are dangerous"
        print(msg)
        return msg
    print(f"Executing terminal command `{command}`")
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, shell=True, check=True,
        )
        return result.stdout
    except subprocess.CalledProcessError as e:
        return f"Command failed: {e.stderr}"
def python_exec(code: str) -> str:
    """Execute Python source and return the resulting namespace as a string."""
    namespace: dict = {}
    exec(code, namespace)  # noqa: S102 — local smoke test only
    namespace.pop("__builtins__", None)
    return str(namespace)
# Dispatch table: tool name (as advertised in the `tools` schemas below)
# -> local Python implementation. smoke_test_tools resolves handlers here.
MAP_FN = {
    "add_number": add_number,
    "multiply_number": multiply_number,
    "substract_number": substract_number,
    "write_a_story": write_a_story,
    "terminal": terminal,
    "python": python_exec,
}
# ---------------------------------------------------------------------------
# Tool schemas (OpenAI function-calling format)
# ---------------------------------------------------------------------------
# Each entry's "name" must have a matching implementation in MAP_FN.
# Numeric parameters are typed as strings; the implementations parse them.
tools = [
    # Calculator: addition.
    {
        "type": "function",
        "function": {
            "name": "add_number",
            "description": "Add two numbers.",
            "parameters": {
                "type": "object",
                "properties": {
                    "a": {"type": "string", "description": "The first number."},
                    "b": {"type": "string", "description": "The second number."},
                },
                "required": ["a", "b"],
            },
        },
    },
    # Calculator: multiplication.
    {
        "type": "function",
        "function": {
            "name": "multiply_number",
            "description": "Multiply two numbers.",
            "parameters": {
                "type": "object",
                "properties": {
                    "a": {"type": "string", "description": "The first number."},
                    "b": {"type": "string", "description": "The second number."},
                },
                "required": ["a", "b"],
            },
        },
    },
    # Calculator: subtraction (name intentionally matches MAP_FN's key,
    # including the "substract" spelling).
    {
        "type": "function",
        "function": {
            "name": "substract_number",
            "description": "Substract two numbers.",
            "parameters": {
                "type": "object",
                "properties": {
                    "a": {"type": "string", "description": "The first number."},
                    "b": {"type": "string", "description": "The second number."},
                },
                "required": ["a", "b"],
            },
        },
    },
    # Zero-argument tool returning a random story opener.
    {
        "type": "function",
        "function": {
            "name": "write_a_story",
            "description": "Writes a random story.",
            "parameters": {"type": "object", "properties": {}, "required": []},
        },
    },
    # Shell access (guarded by the blocklist in `terminal`).
    {
        "type": "function",
        "function": {
            "name": "terminal",
            "description": "Perform operations from the terminal.",
            "parameters": {
                "type": "object",
                "properties": {
                    "command": {
                        "type": "string",
                        "description": "The command you wish to launch, e.g. `ls`, `echo hello`, ...",
                    },
                },
                "required": ["command"],
            },
        },
    },
    # Inline Python execution (see python_exec).
    {
        "type": "function",
        "function": {
            "name": "python",
            "description": "Call a Python interpreter with some Python code that will be ran.",
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {"type": "string", "description": "The Python code to run"},
                },
                "required": ["code"],
            },
        },
    },
]
# ---------------------------------------------------------------------------
# Test helpers
# ---------------------------------------------------------------------------
def get_client() -> tuple[OpenAI, str]:
    """Create an OpenAI client and discover the served model name.

    Falls back to API_KEY (the server-side key loaded from .env) when
    OPENAI_API_KEY is unset, then returns (client, first served model id).
    """
    # Map the server-side key name onto what the OpenAI SDK expects.
    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = os.environ.get("API_KEY", "local-dev-key")
    # The SDK picks up OPENAI_BASE_URL & OPENAI_API_KEY from the environment.
    client = OpenAI()
    served = next(iter(client.models.list()))
    print(f"Using model = {served.id}")
    return client, served.id
def _extract_content(message) -> tuple[str | None, str | None]:
"""Extract (content, reasoning) from a chat completion message.
With --reasoning-parser, the model's thinking goes into the 'reasoning'
field and the final answer into 'content'. When thinking is disabled
the model may still route everything through reasoning, so we handle both.
"""
content = message.content
raw = message.model_extra or {}
reasoning = raw.get("reasoning") or raw.get("reasoning_content")
return content, reasoning
def load_image_as_data_url(path: str) -> str:
    """Read a local image file and return it as a base64 data URL.

    The MIME type is guessed from the file extension, defaulting to
    image/png for unknown extensions.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    img_path = Path(path)
    if not img_path.exists():
        raise FileNotFoundError(f"Image not found: {img_path}")
    mime_by_ext = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "gif": "image/gif",
        "webp": "image/webp",
    }
    ext = img_path.suffix.lower().lstrip(".")
    mime = mime_by_ext.get(ext, "image/png")
    encoded = base64.b64encode(img_path.read_bytes()).decode()
    return f"data:{mime};base64,{encoded}"
def smoke_test_vision(client: OpenAI, model_name: str) -> None:
    """Send a vision request using a local qwen.png image (base64 data URL)."""
    bar = "=" * 60
    print(f"\n{bar}\nVISION TEST\n{bar}")
    # qwen.png is expected to live next to this script.
    image_url = load_image_as_data_url(
        str(Path(__file__).resolve().parent / "qwen.png")
    )
    print(f"Loaded qwen.png ({len(image_url)} chars base64)")
    user_content = [
        {"type": "image_url", "image_url": {"url": image_url}},
        {"type": "text", "text": "What do you see in this image? Describe it briefly."},
    ]
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": user_content}],
        max_tokens=32768,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1.5,
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {"enable_thinking": False},
        },
    )
    content, reasoning = _extract_content(response.choices[0].message)
    print(f"Response: {content or reasoning}")
def smoke_test_reasoning(client: OpenAI, model_name: str) -> None:
    """Exercise thinking (reasoning) mode with a small counting puzzle."""
    bar = "=" * 60
    print(f"\n{bar}\nREASONING (THINKING) TEST\n{bar}")
    prompt = "How many r's are in the word 'strawberry'? Think step by step."
    # Thinking-mode sampling parameters.
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=32768,
        temperature=1.0,
        top_p=0.95,
        presence_penalty=1.5,
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {"enable_thinking": True},
        },
    )
    content, reasoning = _extract_content(response.choices[0].message)
    if reasoning:
        # Show only the first 500 chars of the (possibly long) thinking.
        preview = reasoning[:500] + ("..." if len(reasoning) > 500 else "")
        print(f"Thinking ({len(reasoning)} chars): {preview}")
    print(f"Answer: {content or '(empty β check reasoning above)'}")
def smoke_test_nonthinking(client: OpenAI, model_name: str) -> None:
    """Exercise instruct (non-thinking) mode with a short factual prompt."""
    bar = "=" * 60
    print(f"\n{bar}\nNON-THINKING (INSTRUCT) TEST\n{bar}")
    prompt = "Explain in 2-3 sentences what a MoE (Mixture of Experts) model is."
    # Non-thinking-mode sampling parameters.
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=512,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1.5,
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {"enable_thinking": False},
        },
    )
    content, reasoning = _extract_content(response.choices[0].message)
    print(f"Response: {content or reasoning}")
def smoke_test_tools(
    client: OpenAI,
    model_name: str,
    temperature: float = 0.6,
    top_p: float = 0.95,
    top_k: int = 20,
    min_p: float = 0.0,
    repetition_penalty: float = 1.0,
) -> None:
    """Run a multi-turn tool-calling loop.

    Asks the model to compute (12 + 8) * 3 with the calculator tools,
    executes each requested tool locally via MAP_FN, feeds results back,
    and repeats until the model answers without requesting another tool.
    top_k / min_p / repetition_penalty are vLLM extensions sent through
    extra_body.
    """
    print("\n" + "=" * 60)
    print("TOOL-CALLING TEST")
    print("=" * 60)
    messages = [
        {
            "role": "user",
            "content": "What is (12 + 8) * 3? Use the calculator tools to compute step by step.",
        }
    ]
    has_tool_calls = True
    round_num = 0
    while has_tool_calls:
        round_num += 1
        print(f"\n--- Round {round_num} ---")
        print(f"Messages ({len(messages)}): {json.dumps(messages[-1], indent=2, default=str)}")
        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            tools=tools,
            tool_choice="auto",
            extra_body={
                "top_k": top_k,
                "min_p": min_p,
                "repetition_penalty": repetition_penalty,
            },
        )
        choice = response.choices[0].message
        tool_calls = choice.tool_calls or []
        content = choice.content or ""
        # Echo the assistant turn (including its tool calls) back into the
        # conversation so the next round has full context.
        tool_calls_dict = [tc.to_dict() for tc in tool_calls] if tool_calls else []
        messages.append({
            "role": "assistant",
            "tool_calls": tool_calls_dict,
            "content": content,
        })
        if content:
            print(f"Assistant: {content}")
        if not tool_calls:
            # No further tool requests: the assistant message just appended
            # is the final answer, so the loop ends.
            has_tool_calls = False
        else:
            # Execute each requested tool locally and append its result as
            # a "tool" message keyed by tool_call_id.
            for tc in tool_calls:
                fx = tc.function.name
                args = json.loads(tc.function.arguments)
                print(f" Tool call: {fx}({args})")
                out = MAP_FN[fx](**args)
                print(f" Result: {out}")
                messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "name": fx,
                    "content": str(out),
                })
    print("\nFinal answer:", messages[-1].get("content", "(no content)"))
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Entry point: run the smoke test(s) selected on the command line.

    Usage: smoke_test.py [vision|reasoning|nonthinking|tools|all]
    (defaults to "all"). Exits with an error message for an unknown mode
    instead of silently running nothing, as the previous version did.
    """
    valid_modes = {"vision", "reasoning", "nonthinking", "tools", "all"}
    mode = sys.argv[1] if len(sys.argv) > 1 else "all"
    if mode not in valid_modes:
        sys.exit(f"Unknown mode {mode!r}; expected one of {sorted(valid_modes)}")
    client, model_name = get_client()
    if mode in ("vision", "all"):
        smoke_test_vision(client, model_name)
    if mode in ("reasoning", "all"):
        smoke_test_reasoning(client, model_name)
    if mode in ("nonthinking", "all"):
        smoke_test_nonthinking(client, model_name)
    if mode in ("tools", "all"):
        smoke_test_tools(client, model_name)
    print("\nSmoke test complete!")


if __name__ == "__main__":
    main()