The same, wrong results regardless of the input images

#1
by tund6789 - opened

Hi team, could you please take a look at this:

Code: Qwen2-VL-based models

#!/usr/bin/env python3
"""
Test Neural Magic's nm-testing FP8 vision-language model.

Model: nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic
This model was quantized using the same approach as your script - WITHOUT calibration data.
Reference: https://huggingface.co/nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic

The script feeds three solid-color images with an identical text prompt and
checks whether the model's answers actually differ per image. Identical
answers indicate the vision inputs are being ignored (or collapsed) somewhere
in the FP8 pipeline.
"""

import torch
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor

print("="*80)
print("TESTING NM-TESTING FP8 LLAVA-ONEVISION MODEL")
print("="*80)

model_path = "nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic"

print(f"\nModel: {model_path}")
print("\nLoading model with vLLM...")

try:
    # Load model with vLLM - quantization scheme is auto-detected from the
    # model's config.json. enforce_eager + no prefix caching keeps the run
    # deterministic and rules out CUDA-graph / cache interference.
    llm = LLM(
        model=model_path,
        gpu_memory_utilization=0.6,
        max_num_seqs=1,
        max_model_len=4096,
        enable_prefix_caching=False,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
    )

    print("✓ Model loaded successfully\n")

    # Processor is only needed for its chat template here; vLLM handles the
    # actual image preprocessing internally.
    processor = AutoProcessor.from_pretrained(model_path)

    # Create test images - clearly distinguishable solid colors so a working
    # vision path cannot plausibly give the same answer for all three.
    test_cases = [
        ("RED image", Image.new("RGB", (384, 384), color=(255, 0, 0))),
        ("BLUE image", Image.new("RGB", (384, 384), color=(0, 0, 255))),
        ("GREEN image", Image.new("RGB", (384, 384), color=(0, 255, 0))),
    ]

    prompt_text = "What color is this image? Answer in one word."

    # The conversation contains only an image *placeholder* plus the fixed
    # question, so the rendered prompt is identical for every test image.
    # Build it once instead of re-running the chat template per iteration.
    conversation = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text},
        ],
    }]
    text_prompt = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False
    )

    print("Testing with 3 different colored images:")
    print("-" * 80)

    outputs_list = []

    # temperature=0.0 -> greedy decoding, so any output variation must come
    # from the image input, not sampling noise.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=30,
    )

    for test_name, img in test_cases:
        # The image pixels go to vLLM via multi_modal_data; the prompt only
        # carries the placeholder token(s) inserted by the chat template.
        vllm_input = {
            "prompt": text_prompt,
            "multi_modal_data": {"image": img},
        }

        outputs = llm.generate([vllm_input], sampling_params, use_tqdm=False)
        # Keep only the first line of the answer for a compact comparison.
        output_text = outputs[0].outputs[0].text.strip().split('\n')[0]

        outputs_list.append(output_text)
        print(f"  {test_name:15s} -> {output_text}")

    print("-" * 80)

    # Check if outputs are different
    unique_outputs = len(set(outputs_list))
    if unique_outputs == 1:
        print("\n❌ PROBLEM: All outputs are IDENTICAL")
    elif unique_outputs == len(outputs_list):
        print("\n✅ SUCCESS: All outputs are DIFFERENT")
    else:
        print(f"\n⚠️  PARTIAL: {unique_outputs}/{len(outputs_list)} unique outputs")

except Exception as e:
    # Broad catch is deliberate: this is a standalone diagnostic script and we
    # always want the banner below plus a full traceback on any failure.
    print(f"\n❌ ERROR: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "="*80)
print("TEST COMPLETE")
print("="*80)
print("\nReference: https://huggingface.co/nm-testing/llava-onevision-qwen2-7b-ov-hf-FP8-dynamic")

Code: Qwen2.5-VL model

#!/usr/bin/env python3
"""
Test RedHat's professionally quantized FP8 vision-language model
to see if FP8 can work at all with vLLM for multimodal models.

The script feeds three solid-color images with an identical text prompt and
checks whether the model's answers actually differ per image. Identical
answers indicate the vision inputs are being ignored (or collapsed) somewhere
in the FP8 pipeline.
"""

import torch
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor

print("="*80)
print("TESTING REDHAT FP8 VISION-LANGUAGE MODEL")
print("="*80)

model_path = "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-dynamic"

print(f"\nModel: {model_path}")
print("Loading model with vLLM...")

try:
    # Load model with vLLM - quantization scheme is auto-detected from the
    # model's config.json. enforce_eager + no prefix caching keeps the run
    # deterministic and rules out CUDA-graph / cache interference.
    llm = LLM(
        model=model_path,
        gpu_memory_utilization=0.6,
        max_num_seqs=1,
        max_model_len=4096,
        enable_prefix_caching=False,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
    )

    print("✓ Model loaded successfully\n")

    # Processor is only needed for its chat template here; vLLM handles the
    # actual image preprocessing internally.
    processor = AutoProcessor.from_pretrained(model_path)

    # Create test images - clearly distinguishable solid colors so a working
    # vision path cannot plausibly give the same answer for all three.
    test_cases = [
        ("RED image", Image.new("RGB", (384, 384), color=(255, 0, 0))),
        ("BLUE image", Image.new("RGB", (384, 384), color=(0, 0, 255))),
        ("GREEN image", Image.new("RGB", (384, 384), color=(0, 255, 0))),
    ]

    prompt_text = "What color is this image? Answer in one word."

    # NOTE(review): the original passed the PIL image object inside the
    # conversation ({"type": "image", "image": img}) *and* via
    # multi_modal_data. For vLLM the chat template only needs a placeholder;
    # the pixels are delivered through multi_modal_data. Using a bare
    # placeholder matches the companion LLaVA-OneVision script and makes the
    # rendered prompt loop-invariant, so it is built once here.
    conversation = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text},
        ],
    }]
    text_prompt = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False
    )

    print("Testing with 3 different colored images:")
    print("-" * 80)

    outputs_list = []

    # temperature=0.0 -> greedy decoding, so any output variation must come
    # from the image input, not sampling noise.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=30,
    )

    for test_name, img in test_cases:
        # The image pixels go to vLLM via multi_modal_data; the prompt only
        # carries the placeholder token(s) inserted by the chat template.
        vllm_input = {
            "prompt": text_prompt,
            "multi_modal_data": {"image": img},
        }

        outputs = llm.generate([vllm_input], sampling_params, use_tqdm=False)
        # Keep only the first line of the answer for a compact comparison.
        output_text = outputs[0].outputs[0].text.strip().split('\n')[0]

        outputs_list.append(output_text)
        print(f"  {test_name:15s} -> {output_text}")

    print("-" * 80)

    # Check if outputs are different
    unique_outputs = len(set(outputs_list))
    if unique_outputs == 1:
        print("\n❌ PROBLEM: All outputs are IDENTICAL (FP8 doesn't work with vLLM!)")
    elif unique_outputs == len(outputs_list):
        print("\n✅ SUCCESS: All outputs are DIFFERENT (FP8 CAN work with vLLM!)")
        print("\n💡 This means the issue is with how your model was quantized,")
        print("   not with vLLM's FP8 support for vision models.")
    else:
        print(f"\n⚠️  PARTIAL: {unique_outputs}/{len(outputs_list)} unique outputs")

except Exception as e:
    # Broad catch is deliberate: this is a standalone diagnostic script and we
    # always want the banner below plus a full traceback on any failure.
    print(f"\n❌ ERROR: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "="*80)
print("TEST COMPLETE")
print("="*80)

Sign up or log in to comment