issue with vllm running

#12
by aathi1324 - opened

I was able to run the model, but the output is not coming out correctly. I suspect something is wrong with the input tensor.

log
(EngineCore_DP0 pid=35123) /workspace/user/.venv/lib/python3.12/site-packages/vllm/model_executor/layers/fla/ops/utils.py:113: UserWarning: Input tensor shape suggests potential format mismatch: seq_len (16) < num_heads (64). This may indicate the inputs were passed in head-first format [B, H, T, ...] when head_first=False was specified. Please verify your input tensor format matches the expected shape [B, T, H, ...].
(EngineCore_DP0 pid=35123) return fn(*contiguous_args, **contiguous_kwargs)
(EngineCore_DP0 pid=35123) /workspace/user/.venv/lib/python3.12/site-packages/vllm/model_executor/layers/fla/ops/utils.py:113: UserWarning: Input tensor shape suggests potential format mismatch: seq_len (16) < num_heads (64). This may indicate the inputs were passed in head-first format [B, H, T, ...] when head_first=False was specified. Please verify your input tensor format matches the expected shape [B, T, H, ...].
(EngineCore_DP0 pid=35123) return fn(*contiguous_args, **contiguous_kwargs)

command
vllm serve Qwen/Qwen3.5-122B-A10B-FP8 --port 8070 --gpu-memory-utilization 0.95 --served-model-name qwen --mm-encoder-tp-mode data --allowed-local-media-path /workspace --attention-backend FLASH_ATTN

and I've followed all the instructions you gave.

running in vllm nightly

(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293]
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293] β–ˆ β–ˆ β–ˆβ–„ β–„β–ˆ
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293] β–„β–„ β–„β–ˆ β–ˆ β–ˆ β–ˆ β–€β–„β–€ β–ˆ version 0.16.0rc2.dev496+g4a9c07a0a
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293] β–ˆβ–„β–ˆβ–€ β–ˆ β–ˆ β–ˆ β–ˆ model Qwen/Qwen3.5-122B-A10B-FP8
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293] β–€β–€ β–€β–€β–€β–€β–€ β–€β–€β–€β–€β–€ β–€ β–€
(APIServer pid=34901) INFO 02-26 07:50:11 [utils.py:293]

output

root@9e309957d443:/workspace/user# curl http://localhost:8070/v1/chat/completions
-H "Content-Type: application/json"
-d '{
"model": "qwen",
"messages": [{"role": "user", "content": "Say hello in one word."}],
"max_tokens": 20,
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20
}'
{"id":"chatcmpl-b8456ddecf863d54","object":"chat.completion","created":1772092311,"model":"qwen","choices":[{"index":0,"message":{"role":"assistant","content":"do\n\n\n\n","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning":null},"logprobs":null,"finish_reason":"stop","stop_reason":null,"token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":16,"total_tokens":20,"completion_tokens":4,"prompt_tokens_details":null},"prompt_logprobs":null,"prompt_token_ids":null,"kv_transfer_params":null}

Looking for help from you .

Create a file Makefile:

# Load .env if present (API_KEY, HF_TOKEN, etc.)
-include .env

IMAGE                 ?= vllm/vllm-openai:qwen3_5
API_KEY               ?= local-dev-key
GPU_MEM_UTIL          ?= 0.92
GPU_NO                ?= 0
# Lazy (=) on purpose: nvidia-smi is only forked when a target actually
# expands GPU_UUID (i.e. `make start`), so `make stop`/`make logs` work
# even on a box without the NVIDIA tools installed.
GPU_UUID              = $(shell nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader,nounits -i $(GPU_NO))
# No embedded quotes: the value contains no spaces, and the recipe passes it
# directly to `docker --gpus`.
GPU_DEVICE            ?= device=$(GPU_UUID)
HF_HOME               ?= /var/lib/docker/container_volumes/hf_models
VLLM_CACHE_ROOT       ?= $(HF_HOME)/vllm_cache
CONTAINER             ?= qwen3.5
PORT                  ?= 8909
MODEL                 ?= Qwen/Qwen3.5-122B-A10B-FP8
SERVED_NAME           ?= qwen3.5-122B-A10B-FP8
SHM_SIZE              ?= 32g
MAX_SEQS              ?= 1
BATCH_TOKS            ?= 4176
DTYPE                 ?= bfloat16
# No embedded quotes here either: the recipe already writes "$(ATTN_BACKEND)",
# so quoting the value as well would be redundant double-quoting.
ATTN_BACKEND          ?= FLASH_ATTN
TP                    ?= 1
SEED                  ?= 42
DATA_DIR              ?= /var/lib/docker/projects/sample_data
EAGER_ENABLED         ?= 0

# RoPE scaling (opt-in via ROPE_ENABLED=1)
ROPE_ENABLED          ?= 0
ROPE_SCALING_FACTOR   ?= 2.0
ROPE_SCALING_JSON     = {"text_config": {"rope_parameters": {"mrope_interleaved": true, "mrope_section": [11, 11, 10], "rope_type": "linear", "rope_theta": 10000000, "partial_rotary_factor": 0.25, "factor": $(ROPE_SCALING_FACTOR), "original_max_position_embeddings": 262144}}}
# Scaled context length when RoPE scaling is on; the trailing "/ 1" makes bc
# truncate the float product (e.g. 2.0 * 262144) down to an integer.
MAX_LEN               ?= $(if $(filter 1,$(ROPE_ENABLED)),$(shell echo "$(ROPE_SCALING_FACTOR) * 262144 / 1" | bc),262144)

.PHONY: start stop logs status test

# Launch the vLLM OpenAI-compatible server in Docker (detached).
# Flow: remove any stale container of the same name, then `docker run -d` with:
#   - the GPU pinned by UUID via $(GPU_DEVICE); --ipc=host plus a large shm
#     for the engine's worker processes
#   - HF cache and vLLM compile cache mounted at identical host/container paths
#   - $(DATA_DIR) mounted read-only at /sample_data and whitelisted via
#     --allowed-local-media-path so local media can be referenced in requests
#   - everything after $(IMAGE) is the `vllm serve`-style argument list
# $(if ...) conditionals append --enforce-eager / --hf-overrides only when the
# corresponding *_ENABLED knob is 1.
# NOTE(review): recipe lines must begin with a hard TAB in the real Makefile;
# if this was pasted with spaces, make fails with "missing separator".
start:
    docker rm -f $(CONTAINER) >/dev/null 2>&1 || true
    docker run -d --name $(CONTAINER) \
      --label "host.pwd=$$PWD" \
      --gpus $(GPU_DEVICE) --ipc=host --shm-size $(SHM_SIZE) \
      --ulimit memlock=-1 --ulimit stack=67108864 \
      --restart unless-stopped \
      -p $(PORT):8000 \
      -e HF_HOME="$(HF_HOME)" \
      -e HF_TOKEN \
      -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
      -e TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1 \
      -e VLLM_CACHE_ROOT="$(VLLM_CACHE_ROOT)" \
      -v $(HF_HOME):$(HF_HOME) \
      -v $(VLLM_CACHE_ROOT):$(VLLM_CACHE_ROOT) \
      -v $(DATA_DIR):/sample_data:ro \
      $(IMAGE) \
        --model "$(MODEL)" \
        --served-model-name "$(SERVED_NAME)" \
        --host 0.0.0.0 --port 8000 \
        --api-key "$(API_KEY)" \
        --dtype "$(DTYPE)" \
        --gpu-memory-utilization "$(GPU_MEM_UTIL)" \
        --max-model-len "$(MAX_LEN)" \
        --enable-chunked-prefill \
        --max-num-seqs "$(MAX_SEQS)" \
        --max-num-batched-tokens "$(BATCH_TOKS)" \
        --attention-backend "$(ATTN_BACKEND)" \
        --mm-processor-kwargs '{"size": {"longest_edge": 234881024, "shortest_edge": 4096}}' \
        --enable-auto-tool-choice \
        --reasoning-parser deepseek_r1 \
        --tool-call-parser qwen3_coder \
        --enable-prefix-caching \
        $(if $(filter 1,$(EAGER_ENABLED)),--enforce-eager) \
        -tp $(TP) \
        --seed $(SEED) \
        --allowed-local-media-path /sample_data \
        --media-io-kwargs '{"video":{"num_frames":-1}}' \
        $(if $(filter 1,$(ROPE_ENABLED)),--hf-overrides '$(ROPE_SCALING_JSON)')

# Stop and remove the container.  The `-` prefix makes make ignore the error
# when the container does not exist, so `make stop` is idempotent (matching
# the best-effort `docker rm -f ... || true` cleanup in `start`).
stop:
	-docker rm -f $(CONTAINER)

# Follow the container's stdout/stderr (Ctrl-C detaches; server keeps running).
logs:
    docker logs -f $(CONTAINER)

# Show container state and probe the /v1/models endpoint.
# Fix: the old pipeline ran `head -c 1200 | jq`, which hands jq a *truncated*
# JSON document whenever the payload exceeds 1200 bytes — jq then fails to
# parse (masked by `|| true`).  Pretty-print first, truncate afterwards.
status:
	@echo "== $(CONTAINER) =="
	@docker ps --filter "name=$(CONTAINER)"
	@echo
	@curl -s -H "Authorization: Bearer $(API_KEY)" http://localhost:$(PORT)/v1/models | jq . | head -c 1200 || true
	@echo

# Run the Python smoke test against the running server.
# NOTE(review): `uvx --from openai python -m smoke_test` assumes uvx can expose
# a `python` command from the openai package — confirm this works with your uv
# version; the `uv run smoke_test.py` fallback after `||` covers the usual case.
test:
    OPENAI_BASE_URL=http://127.0.0.1:$(PORT)/v1 OPENAI_API_KEY=$(API_KEY) uvx --from openai python -m smoke_test || \
    OPENAI_BASE_URL=http://127.0.0.1:$(PORT)/v1 OPENAI_API_KEY=$(API_KEY) uv run smoke_test.py

Set the correct HF_HOME and PORT for your environment etc.

This solution assumes Docker + NVIDIA Container Toolkit installed on Linux.

Then run: make start -> starts the container, loads image, loads model, starts inference server perfectly
Then run: make logs -> follows the logs
Then run: make stop -> stops the server

Should work out of the box. Please tell me if something is wrong. I'm still working on RoPE.

You can verify every model feature locally by running: uv run smoke_test.py all

Save as smoke_test.py

# /// script
# requires-python = ">=3.10"
# dependencies = ["openai"]
# ///
"""Qwen3.5 smoke test β€” vision + tool-calling via OpenAI-compatible API.

Usage:
    uv run smoke_test.py [vision|tools|reasoning|all]

Environment (loaded from .env, or set manually):
    OPENAI_BASE_URL   e.g. http://127.0.0.1:8909/v1
    OPENAI_API_KEY    e.g. local-dev-key
"""

import base64
import json
import os
import random
import subprocess
import sys
from pathlib import Path

from openai import OpenAI


def _load_dotenv(path: Path = Path(__file__).parent / ".env") -> None:
    """Minimal .env loader β€” no external deps."""
    if not path.is_file():
        return
    for line in path.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        key, _, val = line.partition("=")
        key, val = key.strip(), val.strip()
        if key:
            os.environ.setdefault(key, val)

_load_dotenv()

# ---------------------------------------------------------------------------
# Tool implementations
# ---------------------------------------------------------------------------

def add_number(a: str, b: str) -> float:
    """Add two numbers supplied as strings; returns a float."""
    total = float(a) + float(b)
    return total

def multiply_number(a: str, b: str) -> float:
    """Multiply two numbers supplied as strings; returns a float."""
    product = float(a) * float(b)
    return product

def substract_number(a: str, b: str) -> float:
    """Subtract b from a; both arrive as strings, result is a float."""
    difference = float(a) - float(b)
    return difference

def write_a_story() -> str:
    """Return one of a fixed set of story openers, chosen uniformly at random."""
    openers = (
        "A long time ago in a galaxy far far away...",
        "There were 2 friends who loved sloths and code...",
        "The world was ending because every sloth evolved to have superhuman intelligence...",
        "Unbeknownst to one friend, the other accidentally coded a program to evolve sloths...",
    )
    return random.choice(openers)

def terminal(command: str) -> str:
    """Run `command` through the shell and return its stdout.

    Commands mentioning a blocked program (rm, sudo, dd, chmod) are refused.
    Fix: the original used substring matching (`kw in command`), which rejected
    harmless commands such as `echo add` ("dd") or `git rm-check` ("rm").
    A word-boundary regex only matches the blocked names as whole words; note
    this is still a best-effort guard for a local smoke test, not a sandbox.

    Returns either the command's stdout, the refusal message, or a
    "Command failed: ..." string carrying stderr on non-zero exit.
    """
    blocked = ["rm", "sudo", "dd", "chmod"]
    if re.search(r"\b(rm|sudo|dd|chmod)\b", command):
        msg = f"Cannot execute commands containing {blocked} since they are dangerous"
        print(msg)
        return msg
    print(f"Executing terminal command `{command}`")
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, shell=True, check=True,
        )
        return result.stdout
    except subprocess.CalledProcessError as e:
        return f"Command failed: {e.stderr}"

def python_exec(code: str) -> str:
    """Execute `code` in a fresh namespace and return the namespace as a string.

    The injected __builtins__ entry is dropped so only user-defined names
    appear in the result.
    """
    namespace: dict = {}
    exec(code, namespace)  # noqa: S102 — local smoke test only
    namespace.pop("__builtins__", None)
    return str(namespace)

# Dispatch table mapping tool names (as advertised to the server in the tool
# schemas) to their local implementations.  Keys must stay in sync with the
# "name" fields of the schema list.
MAP_FN = {
    "add_number": add_number,
    "multiply_number": multiply_number,
    "substract_number": substract_number,
    "write_a_story": write_a_story,
    "terminal": terminal,
    "python": python_exec,
}

# ---------------------------------------------------------------------------
# Tool schemas (OpenAI function-calling format)
# ---------------------------------------------------------------------------

# Tool schemas advertised to the model.  Each entry's "name" must have a
# matching implementation in MAP_FN; "parameters" is a JSON-Schema object.
tools = [
    # --- Calculator tools: all take two numbers as strings "a" and "b". ---
    {
        "type": "function",
        "function": {
            "name": "add_number",
            "description": "Add two numbers.",
            "parameters": {
                "type": "object",
                "properties": {
                    "a": {"type": "string", "description": "The first number."},
                    "b": {"type": "string", "description": "The second number."},
                },
                "required": ["a", "b"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "multiply_number",
            "description": "Multiply two numbers.",
            "parameters": {
                "type": "object",
                "properties": {
                    "a": {"type": "string", "description": "The first number."},
                    "b": {"type": "string", "description": "The second number."},
                },
                "required": ["a", "b"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "substract_number",
            "description": "Substract two numbers.",
            "parameters": {
                "type": "object",
                "properties": {
                    "a": {"type": "string", "description": "The first number."},
                    "b": {"type": "string", "description": "The second number."},
                },
                "required": ["a", "b"],
            },
        },
    },
    # --- No-argument tool. ---
    {
        "type": "function",
        "function": {
            "name": "write_a_story",
            "description": "Writes a random story.",
            "parameters": {"type": "object", "properties": {}, "required": []},
        },
    },
    # --- Code/command execution tools (guarded locally). ---
    {
        "type": "function",
        "function": {
            "name": "terminal",
            "description": "Perform operations from the terminal.",
            "parameters": {
                "type": "object",
                "properties": {
                    "command": {
                        "type": "string",
                        "description": "The command you wish to launch, e.g. `ls`, `echo hello`, ...",
                    },
                },
                "required": ["command"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "python",
            "description": "Call a Python interpreter with some Python code that will be ran.",
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {"type": "string", "description": "The Python code to run"},
                },
                "required": ["code"],
            },
        },
    },
]

# ---------------------------------------------------------------------------
# Test helpers
# ---------------------------------------------------------------------------

def get_client() -> tuple[OpenAI, str]:
    """Create OpenAI client and discover the served model name.

    The server-side API_KEY (from .env) is mapped onto OPENAI_API_KEY when
    the latter is unset, then the first model id from /v1/models is used.
    """
    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = os.environ.get("API_KEY", "local-dev-key")
    # The client picks up OPENAI_BASE_URL and OPENAI_API_KEY from the env.
    client = OpenAI()
    served = client.models.list()
    model_name = next(iter(served)).id
    print(f"Using model = {model_name}")
    return client, model_name


def _extract_content(message) -> tuple[str | None, str | None]:
    """Extract (content, reasoning) from a chat completion message.

    With --reasoning-parser, the model's thinking goes into the 'reasoning'
    field and the final answer into 'content'.  When thinking is disabled
    the model may still route everything through reasoning, so we handle both.
    """
    content = message.content
    raw = message.model_extra or {}
    reasoning = raw.get("reasoning") or raw.get("reasoning_content")
    return content, reasoning


def load_image_as_data_url(path: str) -> str:
    """Read a local image and return it as a base64 `data:` URL.

    Raises FileNotFoundError when `path` does not exist.  The MIME type is
    guessed from the file extension, defaulting to image/png.
    """
    image_file = Path(path)
    if not image_file.exists():
        raise FileNotFoundError(f"Image not found: {image_file}")
    mime_by_ext = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "gif": "image/gif",
        "webp": "image/webp",
    }
    ext = image_file.suffix.lower().lstrip(".")
    mime = mime_by_ext.get(ext, "image/png")
    encoded = base64.b64encode(image_file.read_bytes()).decode()
    return f"data:{mime};base64,{encoded}"


def smoke_test_vision(client: OpenAI, model_name: str) -> None:
    """Send a vision request with a local qwen.png image (base64-encoded)."""
    banner = "=" * 60
    print("\n" + banner)
    print("VISION TEST")
    print(banner)

    # The test image is expected next to this script.
    here = Path(__file__).resolve().parent
    image_url = load_image_as_data_url(str(here / "qwen.png"))
    print(f"Loaded qwen.png ({len(image_url)} chars base64)")

    request_messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                },
                {"type": "text", "text": "What do you see in this image? Describe it briefly."},
            ],
        }
    ]

    # Non-thinking sampling parameters; top_k is a vLLM extension (extra_body).
    completion = client.chat.completions.create(
        model=model_name,
        messages=request_messages,
        max_tokens=32768,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1.5,
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {"enable_thinking": False},
        },
    )
    content, reasoning = _extract_content(completion.choices[0].message)
    print(f"Response: {content or reasoning}")


def smoke_test_reasoning(client: OpenAI, model_name: str) -> None:
    """Test thinking (reasoning) mode with a math/logic problem."""
    banner = "=" * 60
    print("\n" + banner)
    print("REASONING (THINKING) TEST")
    print(banner)

    prompt = [
        {
            "role": "user",
            "content": "How many r's are in the word 'strawberry'? Think step by step.",
        }
    ]

    # Thinking-mode sampling parameters; top_k rides in extra_body.
    completion = client.chat.completions.create(
        model=model_name,
        messages=prompt,
        max_tokens=32768,
        temperature=1.0,
        top_p=0.95,
        presence_penalty=1.5,
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {"enable_thinking": True},
        },
    )
    content, reasoning = _extract_content(completion.choices[0].message)
    if reasoning:
        # Show at most the first 500 chars of the chain of thought.
        preview = reasoning[:500] + ("..." if len(reasoning) > 500 else "")
        print(f"Thinking ({len(reasoning)} chars): {preview}")
    print(f"Answer: {content or '(empty — check reasoning above)'}")


def smoke_test_nonthinking(client: OpenAI, model_name: str) -> None:
    """Test instruct (non-thinking) mode."""
    banner = "=" * 60
    print("\n" + banner)
    print("NON-THINKING (INSTRUCT) TEST")
    print(banner)

    prompt = [
        {
            "role": "user",
            "content": "Explain in 2-3 sentences what a MoE (Mixture of Experts) model is.",
        }
    ]

    # Instruct-mode sampling parameters; thinking disabled via template kwargs.
    completion = client.chat.completions.create(
        model=model_name,
        messages=prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1.5,
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {"enable_thinking": False},
        },
    )
    content, reasoning = _extract_content(completion.choices[0].message)
    print(f"Response: {content or reasoning}")


def smoke_test_tools(
    client: OpenAI,
    model_name: str,
    temperature: float = 0.6,
    top_p: float = 0.95,
    top_k: int = 20,
    min_p: float = 0.0,
    repetition_penalty: float = 1.0,
) -> None:
    """Run a multi-turn tool-calling loop.

    Repeatedly sends the conversation to the server; whenever the model
    replies with tool calls, executes them locally via MAP_FN, appends the
    results as role="tool" messages, and loops until the model answers
    without requesting any tool.

    Args:
        client: OpenAI-compatible client pointed at the vLLM server.
        model_name: Served model id to query.
        temperature, top_p: Standard sampling parameters.
        top_k, min_p, repetition_penalty: vLLM-specific sampling extensions,
            passed through ``extra_body``.
    """
    print("\n" + "=" * 60)
    print("TOOL-CALLING TEST")
    print("=" * 60)

    messages = [
        {
            "role": "user",
            "content": "What is (12 + 8) * 3? Use the calculator tools to compute step by step.",
        }
    ]

    has_tool_calls = True
    round_num = 0
    while has_tool_calls:
        round_num += 1
        print(f"\n--- Round {round_num} ---")
        # Only the latest message is printed to keep the log readable.
        print(f"Messages ({len(messages)}): {json.dumps(messages[-1], indent=2, default=str)}")

        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            tools=tools,
            tool_choice="auto",
            extra_body={
                "top_k": top_k,
                "min_p": min_p,
                "repetition_penalty": repetition_penalty,
            },
        )

        choice = response.choices[0].message
        tool_calls = choice.tool_calls or []
        content = choice.content or ""

        # Echo the assistant turn (including its tool calls) back into the
        # history so the next round sees the full conversation state.
        tool_calls_dict = [tc.to_dict() for tc in tool_calls] if tool_calls else []
        messages.append({
            "role": "assistant",
            "tool_calls": tool_calls_dict,
            "content": content,
        })

        if content:
            print(f"Assistant: {content}")

        if not tool_calls:
            # No tool requests means the model produced its final answer.
            has_tool_calls = False
        else:
            # Execute each requested tool and append its result, keyed by
            # tool_call_id so the server can pair results with requests.
            for tc in tool_calls:
                fx = tc.function.name
                args = json.loads(tc.function.arguments)
                print(f"  Tool call: {fx}({args})")
                out = MAP_FN[fx](**args)
                print(f"  Result: {out}")
                messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "name": fx,
                    "content": str(out),
                })

    print("\nFinal answer:", messages[-1].get("content", "(no content)"))


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """Entry point: run the smoke test(s) selected by the first CLI argument."""
    mode = sys.argv[1] if len(sys.argv) > 1 else "all"
    client, model_name = get_client()

    # Ordered dispatch table; "all" runs every test in this order.
    suites = {
        "vision": smoke_test_vision,
        "reasoning": smoke_test_reasoning,
        "nonthinking": smoke_test_nonthinking,
        "tools": smoke_test_tools,
    }
    for suite_name, run in suites.items():
        if mode in (suite_name, "all"):
            run(client, model_name)

    print("\nSmoke test complete!")


if __name__ == "__main__":
    main()

Sign up or log in to comment