The same wrong results regardless of input images
#1
by tund6789 - opened
Hi team, could you please take a look at this:
- Issue: FP8-quantized Llava-Onevision, Qwen2-VL models produce identical outputs for all input images (e.g., "blue" for red/blue/green images), while non-quantized models work correctly. FP8-quantized Qwen2.5-VL models work well.
- Reproduction: Load `nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic` or `nm-testing/Qwen2-VL-7B-Instruct-FP8-dynamic` with vLLM, test with different colored images (red/blue/green) - all produce identical outputs. `RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-dynamic` answers correctly.
- Expected: Different outputs for different images (works with non-quantized)
- Actual: All images produce "blue" (identical outputs)
- Environment: vLLM 0.11.2, llmcompressor 0.8.1, NVIDIA L40S (CC 8.9)
- References:
- Failing: https://huggingface.co/nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic , https://huggingface.co/nm-testing/Qwen2-VL-7B-Instruct-FP8-dynamic
- Working: https://huggingface.co/RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-dynamic
Code: Qwen2-VL-based models
#!/usr/bin/env python3
"""
Test Neural Magic's nm-testing FP8 vision-language model.

Model: nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic
This model was quantized using the same approach as your script - WITHOUT calibration data.
Reference: https://huggingface.co/nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic
"""
import torch
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor


def _build_prompt(processor, prompt_text):
    """Render the chat template for a single-image user turn.

    The image itself is NOT embedded in the template; vLLM receives it
    separately through ``multi_modal_data``, so the content list only
    carries an ``{"type": "image"}`` placeholder.
    """
    conversation = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text},
        ],
    }]
    return processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )


def _report(outputs_list):
    """Summarize whether the per-image answers actually differ."""
    unique_outputs = len(set(outputs_list))
    if unique_outputs == 1:
        print("\n❌ PROBLEM: All outputs are IDENTICAL")
    elif unique_outputs == len(outputs_list):
        print("\n✅ SUCCESS: All outputs are DIFFERENT")
    else:
        print(f"\n⚠️ PARTIAL: {unique_outputs}/{len(outputs_list)} unique outputs")


def main():
    """Load the FP8 model and probe it with three solid-color images."""
    print("="*80)
    print("TESTING NM-TESTING FP8 LLAVA-ONEVISION MODEL")
    print("="*80)

    model_path = "nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic"
    print(f"\nModel: {model_path}")
    print("\nLoading model with vLLM...")

    try:
        # Load model with vLLM - auto-detect quantization from config.
        # enforce_eager=True and enable_prefix_caching=False rule out
        # CUDA-graph / prefix-cache artifacts while diagnosing the
        # identical-output bug.
        llm = LLM(
            model=model_path,
            gpu_memory_utilization=0.6,
            max_num_seqs=1,
            max_model_len=4096,
            enable_prefix_caching=False,
            enforce_eager=True,
            trust_remote_code=True,
            limit_mm_per_prompt={"image": 1},
        )
        print("✓ Model loaded successfully\n")

        # Processor is only needed for the chat template, not tokenization.
        processor = AutoProcessor.from_pretrained(model_path)

        # Solid-color images are trivially distinguishable inputs: identical
        # answers for all three means the image features are being ignored.
        test_cases = [
            ("RED image", Image.new("RGB", (384, 384), color=(255, 0, 0))),
            ("BLUE image", Image.new("RGB", (384, 384), color=(0, 0, 255))),
            ("GREEN image", Image.new("RGB", (384, 384), color=(0, 255, 0))),
        ]
        prompt_text = "What color is this image? Answer in one word."

        print("Testing with 3 different colored images:")
        print("-" * 80)

        # temperature=0.0 selects greedy decoding, so runs are deterministic.
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=30,
        )

        outputs_list = []
        for test_name, img in test_cases:
            vllm_input = {
                "prompt": _build_prompt(processor, prompt_text),
                "multi_modal_data": {"image": img},
            }
            outputs = llm.generate([vllm_input], sampling_params, use_tqdm=False)
            # Keep only the first line of the answer for compact comparison.
            output_text = outputs[0].outputs[0].text.strip().split('\n')[0]
            outputs_list.append(output_text)
            print(f" {test_name:15s} -> {output_text}")

        print("-" * 80)
        _report(outputs_list)
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic script and we
        # always want the closing banner plus a full traceback on failure.
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + "="*80)
    print("TEST COMPLETE")
    print("="*80)
    print("\nReference: https://huggingface.co/nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic")


if __name__ == "__main__":
    main()
Code: Qwen2.5-VL model
#!/usr/bin/env python3
"""
Test RedHat's professionally quantized FP8 vision-language model
to see if FP8 can work at all with vLLM for multimodal models.
"""
import torch
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor


def _build_prompt(processor, prompt_text, img):
    """Render the chat template for a single-image user turn.

    NOTE(review): unlike the llava-onevision script, this one embeds the
    PIL image object in the template content as well as passing it via
    ``multi_modal_data`` below — presumably the Qwen2.5-VL template only
    uses it as a placeholder, but confirm the image is not processed twice.
    """
    conversation = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": prompt_text},
        ],
    }]
    return processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )


def _report(outputs_list):
    """Summarize whether the per-image answers actually differ."""
    unique_outputs = len(set(outputs_list))
    if unique_outputs == 1:
        print("\n❌ PROBLEM: All outputs are IDENTICAL (FP8 doesn't work with vLLM!)")
    elif unique_outputs == len(outputs_list):
        print("\n✅ SUCCESS: All outputs are DIFFERENT (FP8 CAN work with vLLM!)")
        print("\n💡 This means the issue is with how your model was quantized,")
        print("   not with vLLM's FP8 support for vision models.")
    else:
        print(f"\n⚠️ PARTIAL: {unique_outputs}/{len(outputs_list)} unique outputs")


def main():
    """Load the FP8 model and probe it with three solid-color images."""
    print("="*80)
    print("TESTING REDHAT FP8 VISION-LANGUAGE MODEL")
    print("="*80)

    model_path = "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-dynamic"
    print(f"\nModel: {model_path}")
    print("Loading model with vLLM...")

    try:
        # Load model with vLLM - auto-detect quantization from config.
        # enforce_eager=True and enable_prefix_caching=False rule out
        # CUDA-graph / prefix-cache artifacts while diagnosing the bug.
        llm = LLM(
            model=model_path,
            gpu_memory_utilization=0.6,
            max_num_seqs=1,
            max_model_len=4096,
            enable_prefix_caching=False,
            enforce_eager=True,
            trust_remote_code=True,
            limit_mm_per_prompt={"image": 1},
        )
        print("✓ Model loaded successfully\n")

        # Processor is only needed for the chat template, not tokenization.
        processor = AutoProcessor.from_pretrained(model_path)

        # Solid-color images are trivially distinguishable inputs: identical
        # answers for all three means the image features are being ignored.
        test_cases = [
            ("RED image", Image.new("RGB", (384, 384), color=(255, 0, 0))),
            ("BLUE image", Image.new("RGB", (384, 384), color=(0, 0, 255))),
            ("GREEN image", Image.new("RGB", (384, 384), color=(0, 255, 0))),
        ]
        prompt_text = "What color is this image? Answer in one word."

        print("Testing with 3 different colored images:")
        print("-" * 80)

        # temperature=0.0 selects greedy decoding, so runs are deterministic.
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=30,
        )

        outputs_list = []
        for test_name, img in test_cases:
            vllm_input = {
                "prompt": _build_prompt(processor, prompt_text, img),
                "multi_modal_data": {"image": img},
            }
            outputs = llm.generate([vllm_input], sampling_params, use_tqdm=False)
            # Keep only the first line of the answer for compact comparison.
            output_text = outputs[0].outputs[0].text.strip().split('\n')[0]
            outputs_list.append(output_text)
            print(f" {test_name:15s} -> {output_text}")

        print("-" * 80)
        _report(outputs_list)
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic script and we
        # always want the closing banner plus a full traceback on failure.
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + "="*80)
    print("TEST COMPLETE")
    print("="*80)


if __name__ == "__main__":
    main()