Problem with Structured Outputs Vllm main MM

#2
by jart25 - opened

The Script:

import base64
import fitz
import json
import requests
import sys
import time
from pathlib import Path
from PIL import Image
from io import BytesIO
from typing import List, Optional, Literal
from pydantic import BaseModel
from qwen_vl_utils import smart_resize

# Default rendering resolution for PDF pages (the `dpi` default of pdf_to_images).
# NOTE(review): the discussion this script comes from reports the extraction
# problem was resolved after raising this to 200 — confirm before changing.
TARGET_DPI = 150
# pdf_to_images computes its zoom factor as dpi / PYMUPDF_BASE_DPI.
PYMUPDF_BASE_DPI = 72
# Default multiple that pad_to_multiple rounds image dimensions up to.
PATCH_MULTIPLE = 32
# Default longest-side limit, in pixels, enforced by clamp_resolution.
MAX_SIDE_PX = 3000

# Chat-completions endpoint that call_vllm posts the payload to.
VLLM_URL = "http://192.168.1.75/v1/chat/completions"
# Value sent as the "model" field of the request payload in extract_invoice.
MODEL = "QWEN35"

# Issuer of the invoice, with a self-reported confidence level.
# Documented with comments rather than a docstring because pydantic copies a
# model's docstring into the "description" of the generated JSON schema, which
# would change the schema sent to the server.
# Every field is required-but-nullable (no default), so the generated schema
# lists all of them under "required".
class Vendor(BaseModel):
    name: Optional[str]
    address: Optional[str]
    cif_nif: Optional[str]  # presumably the Spanish CIF/NIF tax identifier — confirm
    confidence: Literal["confident", "partial", "ambiguous"]

# Recipient of the invoice; same field layout as Vendor.
# Documented with comments rather than a docstring because pydantic copies a
# model's docstring into the "description" of the generated JSON schema.
class Client(BaseModel):
    name: Optional[str]
    address: Optional[str]
    cif_nif: Optional[str]  # presumably the Spanish CIF/NIF tax identifier — confirm
    confidence: Literal["confident", "partial", "ambiguous"]

# Dates found on the invoice, kept as strings (no date parsing is done here).
# Comments are used instead of a docstring so the generated JSON schema is
# not given an extra "description".
class Dates(BaseModel):
    issue_date: Optional[str]
    due_date: Optional[str]

# Invoice-level monetary totals; each value is nullable when not found.
# Comments are used instead of a docstring so the generated JSON schema is
# not given an extra "description".
class Amounts(BaseModel):
    subtotal: Optional[float]
    total_discount: Optional[float]
    tax_rate: Optional[float]
    tax_amount: Optional[float]
    total: Optional[float]
    currency: Optional[str]

# One row of the invoice's item table. Only `description` and `confidence`
# are non-nullable; the numeric columns may be null.
# Comments are used instead of a docstring so the generated JSON schema is
# not given an extra "description".
class LineItem(BaseModel):
    description: str
    quantity: Optional[float]
    unit_price: Optional[float]
    discount_percentage: Optional[float]
    discount_amount: Optional[float]
    total: Optional[float]
    confidence: Literal["confident", "partial", "ambiguous"]

# Payment details printed on the invoice.
# Comments are used instead of a docstring so the generated JSON schema is
# not given an extra "description".
class PaymentInfo(BaseModel):
    iban: Optional[str]
    payment_method: Optional[str]

# Report entry naming a field the model could not extract cleanly and why.
# Comments are used instead of a docstring so the generated JSON schema is
# not given an extra "description".
class FieldWithIssue(BaseModel):
    field_name: str
    issue_type: Literal["not_found", "partial", "conflicting", "ambiguous"]
    notes: Optional[str]

# Top-level response model; extract_invoice sends its JSON schema as the
# structured-output `response_format`.
# Comments are used instead of a docstring because pydantic copies a model's
# docstring into the "description" of the generated JSON schema, which would
# change the schema sent to the server.
class InvoiceExtraction(BaseModel):
    extraction_status: Literal["success", "partial", "failed"]
    failure_reason: Optional[str]
    invoice_number: Optional[str]
    vendor: Optional[Vendor]
    client: Optional[Client]
    dates: Optional[Dates]
    amounts: Optional[Amounts]
    line_items: Optional[List[LineItem]]
    payment_info: Optional[PaymentInfo]
    # Required and non-nullable: an empty list means no issues were reported.
    fields_with_issues: List[FieldWithIssue]

# System message sent with every request by extract_invoice. The text is part
# of the program's runtime behavior (including the leading and trailing
# newline inside the triple quotes), so edit it deliberately.
SYSTEM_PROMPT = """
You are an expert invoice extraction system. Your sole task is to extract information EXPLICITLY visible in the document.

STRICT RULES:

  1. LINE ITEM SEPARATION: Extract EVERY SINGLE ROW as a separate line item object. NEVER merge distinct products, services, or descriptions into a single line item. If a row only contains a description, extract it as a separate line item with null amounts.
  2. NUMBER PARSING: Spanish/European formats use ',' for decimals and '.' for thousands.
    • A quantity of '1,000' is ALMOST ALWAYS 1 unit (1.0), NOT one thousand.
    • A price of '59,0000' is 59.0.
    • An amount of '1.260,00' is 1260.0.
    • Convert these strictly to standard JSON numbers (floats).
  3. MATHEMATICAL VALIDATION: Verify that (Quantity * Unit Price) - Discount = Total for EACH line.
  4. DISCOUNT HANDLING:
    • 'discount_percentage': Use this ONLY if there is a '%' sign or the column header explicitly indicates a percentage.
    • 'discount_amount': Use this ONLY if it is a direct monetary deduction or absolute value.

Respond ONLY with valid JSON conforming to the provided schema.
"""

# Text part that build_image_content appends after all page images in the
# user message.
USER_PROMPT_PREFIX = """
Analyze the provided invoice images (which may span multiple pages) and extract all fields according to the strict JSON schema.
"""

def pad_to_multiple(img: Image.Image, multiple: int = PATCH_MULTIPLE) -> Image.Image:
    """Pad *img* with white so both dimensions are multiples of *multiple*.

    The original pixels stay anchored at the top-left corner; padding is added
    on the right and bottom edges only.

    Args:
        img: Source image.
        multiple: Positive integer both output dimensions must be divisible by.

    Returns:
        *img* itself when it already has conforming dimensions, otherwise a
        new RGB image with *img* pasted at (0, 0) on a white canvas.

    Raises:
        ValueError: If *multiple* is not positive.
    """
    # A zero multiple would divide by zero and a negative one would silently
    # produce a canvas smaller than the image, so reject both up front.
    if multiple <= 0:
        raise ValueError(f"multiple must be a positive integer, got {multiple!r}")

    w, h = img.size
    # Round each dimension up to the next multiple (ceiling division).
    new_w = ((w + multiple - 1) // multiple) * multiple
    new_h = ((h + multiple - 1) // multiple) * multiple
    if new_w == w and new_h == h:
        return img

    canvas = Image.new("RGB", (new_w, new_h), (255, 255, 255))
    canvas.paste(img, (0, 0))
    return canvas

def clamp_resolution(img: Image.Image, max_side: int = MAX_SIDE_PX) -> Image.Image:
    """Downscale *img* so its longest side does not exceed *max_side*.

    The aspect ratio is preserved. Images already within the limit are
    returned unchanged (the same object).

    Args:
        img: Source image.
        max_side: Maximum allowed length, in pixels, of the longer side.

    Returns:
        *img* itself, or a LANCZOS-resampled copy scaled to fit.
    """
    w, h = img.size
    longest = max(w, h)
    if longest <= max_side:
        return img

    scale = max_side / longest
    # Truncation can reach 0 on extremely elongated images, which would make
    # resize() fail; keep every dimension at least one pixel.
    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
    return img.resize(new_size, Image.Resampling.LANCZOS)

def img_to_b64(img: Image.Image) -> str:
    """Encode *img* as PNG and return the bytes as a base64 text string.

    Args:
        img: Image to serialize.

    Returns:
        Base64 representation of the PNG-encoded image, without any
        ``data:`` URL prefix.
    """
    buf = BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")

def pdf_to_images(pdf_path: Path, dpi: int = TARGET_DPI) -> list[Image.Image]:
    """Render every page of a PDF to an RGB PIL image.

    Each page is rasterized at *dpi* and then passed through
    ``clamp_resolution`` so no side exceeds that function's default limit.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution; converted to a zoom factor relative to
            ``PYMUPDF_BASE_DPI``.

    Returns:
        One image per page, in document order.
    """
    zoom = dpi / PYMUPDF_BASE_DPI
    mat = fitz.Matrix(zoom, zoom)
    pages = []
    # The context manager closes the document even if rendering a page raises;
    # the previous explicit close() was skipped on any exception.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            pages.append(clamp_resolution(img))
    return pages

def build_image_content(pages: list[Image.Image]) -> list[dict]:
    """Build the multimodal ``content`` list for the user chat message.

    Each page is resized with ``smart_resize``, padded, PNG-encoded and
    embedded as a base64 ``data:`` URL; the textual instruction
    ``USER_PROMPT_PREFIX`` is appended after all images.

    Args:
        pages: Page images in document order.

    Returns:
        A list of ``image_url`` parts (one per page) followed by a single
        ``text`` part.
    """
    # NOTE(review): this uses a factor of 28 while the module-level
    # PATCH_MULTIPLE is 32 — confirm which one matches the served model.
    factor = 28
    min_pixels = 512 * factor * factor
    max_pixels = 4608 * factor * factor

    content = []
    for img in pages:
        w, h = img.size
        # smart_resize takes and returns (height, width), in that order.
        new_h, new_w = smart_resize(
            h, w, min_pixels=min_pixels, max_pixels=max_pixels, factor=factor
        )
        resized_img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
        # Presumably a no-op safety net, since smart_resize is expected to
        # return multiples of `factor` already — confirm.
        padded_img = pad_to_multiple(resized_img, multiple=factor)

        content.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img_to_b64(padded_img)}"},
            }
        )

    content.append({"type": "text", "text": USER_PROMPT_PREFIX})
    return content

def call_vllm(
    payload: dict,
    timeout: Optional[tuple[float, float]] = (10.0, 1800.0),
) -> dict:
    """POST *payload* to the vLLM chat-completions endpoint and return the JSON reply.

    Args:
        payload: Request body, serialized as JSON.
        timeout: ``(connect, read)`` timeout in seconds handed to
            ``requests.post``. The read limit is generous because long
            generations arrive in a single non-streamed response. Pass
            ``None`` to wait indefinitely (the previous behavior).

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the connection or the response exceeds *timeout*.
    """
    # NOTE(review): fixed 2-second pause before every request; the reason is
    # not visible here (possibly crude rate limiting) — confirm it is needed.
    time.sleep(2)
    response = requests.post(
        VLLM_URL,
        json=payload,
        headers={"Content-Type": "application/json"},
        # Without a timeout, an unreachable or stalled server blocks forever.
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def extract_invoice(pdf_path: str) -> dict:
    """Extract structured invoice data from a PDF via the vLLM server.

    The PDF is rendered to page images, sent together with the system and
    user prompts, and the reply is constrained to the JSON schema of
    ``InvoiceExtraction`` through ``response_format``.

    Args:
        pdf_path: Path of the invoice PDF.

    Returns:
        The model's reply parsed from JSON. It is not validated against
        ``InvoiceExtraction`` here.

    Raises:
        ValueError: If the reply carries no message content, or if the
            content is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    pages = pdf_to_images(Path(pdf_path))
    content = build_image_content(pages)

    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": content},
        ],
        # Sampling parameters.
        "temperature": 0.6,
        "top_k": 40,
        "min_p": 0.2,
        "top_p": 1.0,
        "presence_penalty": 1.5,
        "repetition_penalty": 1.0,
        "max_tokens": 21800,
        "chat_template_kwargs": {"enable_thinking": False},
        # Structured output: constrain the reply to the extraction schema.
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "invoice_extraction",
                "schema": InvoiceExtraction.model_json_schema(),
                "strict": True,
            },
        },
    }

    response_data = call_vllm(payload)
    choice = response_data["choices"][0]

    content_str = choice["message"].get("content")
    if content_str is None:
        # Include finish_reason so the failure is diagnosable; the former
        # message was just the string "None".
        raise ValueError(
            "vLLM response contained no message content "
            f"(finish_reason={choice.get('finish_reason')!r})"
        )

    return json.loads(content_str)

# Script entry point: extract the invoice given as the first CLI argument and
# print the result as pretty-printed JSON. The dunder underscores had been
# stripped from the guard, which made it fail with NameError on `name`.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Say why we are exiting instead of failing silently.
        print(f"Usage: {sys.argv[0]} <invoice.pdf>", file=sys.stderr)
        sys.exit(1)

    result = extract_invoice(sys.argv[1])
    print(json.dumps(result, indent=2, ensure_ascii=False))

Answer (or fabricated data):

{
"extraction_status": "success",
"failure_reason": null,
"invoice_number": null,
"vendor": null,
"client": null,
"dates": {
"issue_date": null,
"due_date": null
},
"amounts": {
"subtotal": null,
"total_discount": null,
"tax_rate": null,
"tax_amount": null,
"total": null,
"currency": null
},
"line_items": [],
"payment_info": null,
"fields_with_issues": []
}

Hello, I'm investigating why the quantized model in vLLM isn't generating structured outputs, unlike the non-quantized model.

This seems to happen with all types of quantization, including GPTQ, LlmCompressor, and FP8.

This model is also affected when processing an invoice: it either doesn't see the data or doesn't pay enough attention to it.

I'd appreciate any help investigating this. Thank you!

Solved by ignoring more layers and setting the DPI to 200:
https://github.com/vllm-project/llm-compressor/pull/2383

jart25 changed discussion status to closed

Sign up or log in to comment