The same wrong results regardless of input images
#1
by tund6789 - opened
Hi team, could you please take a look at this:
- Issue: FP8-quantized Llava-Onevision, Qwen2-VL models produce identical outputs for all input images (e.g., "blue" for red/blue/green images), while non-quantized models work correctly. FP8-quantized Qwen2.5-VL models work well.
- Reproduction: Load `nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic` or `nm-testing/Qwen2-VL-7B-Instruct-FP8-dynamic` with vLLM, test with different colored images (red/blue/green) - all produce identical outputs. `RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-dynamic` answers correctly.
- Expected: Different outputs for different images (works with non-quantized)
- Actual: All images produce "blue" (identical outputs)
- Environment: vLLM 0.11.2, llmcompressor 0.8.1, NVIDIA L40S (CC 8.9)
- References:
- Failing: https://huggingface.co/nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic , https://huggingface.co/nm-testing/Qwen2-VL-7B-Instruct-FP8-dynamic
- Working: https://huggingface.co/RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-dynamic
Code: Qwen2-VL-based models
#!/usr/bin/env python3
"""
Test Neural Magic's nm-testing FP8 vision-language model.

Model: nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic
This model was quantized using the same approach as your script - WITHOUT calibration data.
Reference: https://huggingface.co/nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic
"""
import torch
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor


def _build_prompt(processor, prompt_text):
    """Render the chat template for a single-image user turn.

    The image itself is NOT embedded in the template; vLLM receives it
    separately through ``multi_modal_data``, so the content list only
    carries an ``{"type": "image"}`` placeholder.
    """
    conversation = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text},
        ],
    }]
    return processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )


def _report(outputs_list):
    """Summarize whether the per-image answers actually differ."""
    unique_outputs = len(set(outputs_list))
    if unique_outputs == 1:
        print("\n❌ PROBLEM: All outputs are IDENTICAL")
    elif unique_outputs == len(outputs_list):
        print("\n✅ SUCCESS: All outputs are DIFFERENT")
    else:
        print(f"\n⚠️ PARTIAL: {unique_outputs}/{len(outputs_list)} unique outputs")


def main():
    """Load the FP8 model and probe it with three solid-color images."""
    print("="*80)
    print("TESTING NM-TESTING FP8 LLAVA-ONEVISION MODEL")
    print("="*80)

    model_path = "nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic"
    print(f"\nModel: {model_path}")
    print("\nLoading model with vLLM...")

    try:
        # Load model with vLLM - auto-detect quantization from config.
        # enforce_eager=True and enable_prefix_caching=False rule out
        # CUDA-graph / prefix-cache artifacts while diagnosing the
        # identical-output bug.
        llm = LLM(
            model=model_path,
            gpu_memory_utilization=0.6,
            max_num_seqs=1,
            max_model_len=4096,
            enable_prefix_caching=False,
            enforce_eager=True,
            trust_remote_code=True,
            limit_mm_per_prompt={"image": 1},
        )
        print("✓ Model loaded successfully\n")

        # Processor is only needed for the chat template, not tokenization.
        processor = AutoProcessor.from_pretrained(model_path)

        # Solid-color images are trivially distinguishable inputs: identical
        # answers for all three means the image features are being ignored.
        test_cases = [
            ("RED image", Image.new("RGB", (384, 384), color=(255, 0, 0))),
            ("BLUE image", Image.new("RGB", (384, 384), color=(0, 0, 255))),
            ("GREEN image", Image.new("RGB", (384, 384), color=(0, 255, 0))),
        ]
        prompt_text = "What color is this image? Answer in one word."

        print("Testing with 3 different colored images:")
        print("-" * 80)

        # temperature=0.0 selects greedy decoding, so runs are deterministic.
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=30,
        )

        outputs_list = []
        for test_name, img in test_cases:
            vllm_input = {
                "prompt": _build_prompt(processor, prompt_text),
                "multi_modal_data": {"image": img},
            }
            outputs = llm.generate([vllm_input], sampling_params, use_tqdm=False)
            # Keep only the first line of the answer for compact comparison.
            output_text = outputs[0].outputs[0].text.strip().split('\n')[0]
            outputs_list.append(output_text)
            print(f" {test_name:15s} -> {output_text}")

        print("-" * 80)
        _report(outputs_list)
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic script and we
        # always want the closing banner plus a full traceback on failure.
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + "="*80)
    print("TEST COMPLETE")
    print("="*80)
    print("\nReference: https://huggingface.co/nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic")


if __name__ == "__main__":
    main()
Code: Qwen2.5-VL model
#!/usr/bin/env python3
"""
Test RedHat's professionally quantized FP8 vision-language model
to see if FP8 can work at all with vLLM for multimodal models.
"""
import torch
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor


def _build_prompt(processor, prompt_text, img):
    """Render the chat template for a single-image user turn.

    NOTE(review): unlike the llava-onevision script, this one embeds the
    PIL image object in the template content as well as passing it via
    ``multi_modal_data`` below — presumably the Qwen2.5-VL template only
    uses it as a placeholder, but confirm the image is not processed twice.
    """
    conversation = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": prompt_text},
        ],
    }]
    return processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )


def _report(outputs_list):
    """Summarize whether the per-image answers actually differ."""
    unique_outputs = len(set(outputs_list))
    if unique_outputs == 1:
        print("\n❌ PROBLEM: All outputs are IDENTICAL (FP8 doesn't work with vLLM!)")
    elif unique_outputs == len(outputs_list):
        print("\n✅ SUCCESS: All outputs are DIFFERENT (FP8 CAN work with vLLM!)")
        print("\n💡 This means the issue is with how your model was quantized,")
        print("   not with vLLM's FP8 support for vision models.")
    else:
        print(f"\n⚠️ PARTIAL: {unique_outputs}/{len(outputs_list)} unique outputs")


def main():
    """Load the FP8 model and probe it with three solid-color images."""
    print("="*80)
    print("TESTING REDHAT FP8 VISION-LANGUAGE MODEL")
    print("="*80)

    model_path = "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-dynamic"
    print(f"\nModel: {model_path}")
    print("Loading model with vLLM...")

    try:
        # Load model with vLLM - auto-detect quantization from config.
        # enforce_eager=True and enable_prefix_caching=False rule out
        # CUDA-graph / prefix-cache artifacts while diagnosing the bug.
        llm = LLM(
            model=model_path,
            gpu_memory_utilization=0.6,
            max_num_seqs=1,
            max_model_len=4096,
            enable_prefix_caching=False,
            enforce_eager=True,
            trust_remote_code=True,
            limit_mm_per_prompt={"image": 1},
        )
        print("✓ Model loaded successfully\n")

        # Processor is only needed for the chat template, not tokenization.
        processor = AutoProcessor.from_pretrained(model_path)

        # Solid-color images are trivially distinguishable inputs: identical
        # answers for all three means the image features are being ignored.
        test_cases = [
            ("RED image", Image.new("RGB", (384, 384), color=(255, 0, 0))),
            ("BLUE image", Image.new("RGB", (384, 384), color=(0, 0, 255))),
            ("GREEN image", Image.new("RGB", (384, 384), color=(0, 255, 0))),
        ]
        prompt_text = "What color is this image? Answer in one word."

        print("Testing with 3 different colored images:")
        print("-" * 80)

        # temperature=0.0 selects greedy decoding, so runs are deterministic.
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=30,
        )

        outputs_list = []
        for test_name, img in test_cases:
            vllm_input = {
                "prompt": _build_prompt(processor, prompt_text, img),
                "multi_modal_data": {"image": img},
            }
            outputs = llm.generate([vllm_input], sampling_params, use_tqdm=False)
            # Keep only the first line of the answer for compact comparison.
            output_text = outputs[0].outputs[0].text.strip().split('\n')[0]
            outputs_list.append(output_text)
            print(f" {test_name:15s} -> {output_text}")

        print("-" * 80)
        _report(outputs_list)
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic script and we
        # always want the closing banner plus a full traceback on failure.
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + "="*80)
    print("TEST COMPLETE")
    print("="*80)


if __name__ == "__main__":
    main()