Quant AWQ RDNA3 ROCM

#1
by jart25 - opened
This comment has been hidden (marked as Resolved)
jart25 changed discussion title from Would you be so kind as to publish the quantification script? to Removed
cyankiwi org

I might be able to help. Please message me on Linkedin or send me an email to ton@cyan.kiwi.

I would say that I am groping in the dark

"""AWQ W4A16 quantization of Qwen3-VL-30B-A3B-Instruct with llm-compressor.

Calibrates on 256 flickr30k image/caption samples, applies an AWQ W4A16
recipe to the language-model Linear layers, and saves the compressed
checkpoint plus the processor to OUTPUT_DIR.
"""
import base64
from io import BytesIO

import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-AWQ-W4A16"

# `torch_dtype` is deprecated in recent transformers (the run log itself
# warned "torch_dtype is deprecated! Use dtype instead!"), so pass `dtype`.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype="auto",
    device_map="auto",
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Calibration dataset configuration.
DATASET_ID = "lmms-lab/flickr30k"
NUM_CALIBRATION_SAMPLES = 256
DATASET_SPLIT = f"test[:{NUM_CALIBRATION_SAMPLES}]"
MAX_SEQUENCE_LENGTH = 1024

ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42)


def preprocess_and_tokenize(example):
    """Build a single-sample vision-chat prompt from *example* and tokenize it.

    The image is embedded in the chat message as a base64 data URI (for the
    chat template) and also passed to the processor as a PIL image.
    """
    buffered = BytesIO()
    example["image"].save(buffered, format="PNG")
    encoded_image = base64.b64encode(buffered.getvalue())
    base64_image = f"data:image;base64,{encoded_image.decode('utf-8')}"
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": base64_image},
                {"type": "text", "text": "What does the image show?"},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[text],
        images=[example["image"]],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
    )
    return inputs


ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)


def data_collator(batch):
    """Collate a single-sample batch into tensors (oneshot calibrates one at a time)."""
    assert len(batch) == 1
    return {key: torch.tensor(value) for key, value in batch[0].items()}


# AWQ W4A16 recipe: quantize all Linear layers, skipping the LM head, the
# vision tower, and the MoE router gates.
# NOTE(review): the original pattern "re:.visual." looks like markdown ate
# the asterisks; reconstructed as "re:.*visual.*" to match the sibling
# patterns — confirm against the llm-compressor Qwen3-VL example.
recipe = AWQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["re:.*lm_head", "re:.*visual.*", "re:.*mlp.gate$"],
    duo_scaling=False,
)

oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
    pipeline="sequential",
)

# Re-dispatch for generation, then save the compressed weights + processor.
dispatch_for_generation(model)
model.save_pretrained(OUTPUT_DIR, save_compressed=True)
processor.save_pretrained(OUTPUT_DIR)

torch_dtype is deprecated! Use dtype instead!
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00, 4.65it/s]
2025-10-16T12:09:14.733269+0000 | reset | INFO - Compression lifecycle reset
2025-10-16T12:09:14.737609+0000 | _create_default_logger | INFO - Logging all LLM Compressor modifier-level logs to sparse_logs/16-10-2025_12.09.14.log
2025-10-16T12:09:14.737815+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-10-16T12:09:14.788883+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-10-16T12:09:14.788969+0000 | from_modifiers | WARNING - Calibration pipeline is set to sequential, but it is recommended to use datafree
2025-10-16T12:09:35.017106+0000 | trace_subgraphs | WARNING - Expected 75 subgraphs, but only traced 49. This is likely due to having wrapped code which calls sequential targets
Updating global scales: 100%|████████████████████████████████████████████████████████████████████████████████████| 192/192 [00:00<00:00, 808540.53it/s]
Fusing global scales: 1169it [00:00, 880909.34it/s]
Calibrating weights: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 192/192 [00:27<00:00, 6.93it/s]
Preparing cache: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:16<00:00, 15.70it/s]
(1/49): Calibrating: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:04<00:00, 59.40it/s]
(1/49): Propagating: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:03<00:00, 69.35it/s]
(2/49): Calibrating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 260.54it/s]
(2/49): Propagating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 341.61it/s]
(3/49): Calibrating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 271.08it/s]
(3/49): Propagating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 348.00it/s]
(4/49): Calibrating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 282.09it/s]
(4/49): Propagating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 347.01it/s]
(5/49): Calibrating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 262.11it/s]
(5/49): Propagating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 357.37it/s]
(6/49): Calibrating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 283.99it/s]
(6/49): Propagating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 361.42it/s]
(7/49): Calibrating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 280.62it/s]
(7/49): Propagating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 359.52it/s]
(8/49): Calibrating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 286.46it/s]
(8/49): Propagating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 362.94it/s]
(9/49): Calibrating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 268.71it/s]
(9/49): Propagating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 360.49it/s]
(10/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 295.27it/s]
(10/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 360.02it/s]
(11/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 303.81it/s]
(11/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 360.14it/s]
(12/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 350.78it/s]
(12/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 364.75it/s]
(13/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 344.97it/s]
(13/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 327.40it/s]
(14/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 307.35it/s]
(14/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 355.65it/s]
(15/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 354.26it/s]
(15/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.65it/s]
(16/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 324.12it/s]
(16/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 367.76it/s]
(17/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 351.65it/s]
(17/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.24it/s]
(18/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 353.66it/s]
(18/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 367.58it/s]
(19/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 353.13it/s]
(19/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 366.84it/s]
(20/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 352.47it/s]
(20/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 366.62it/s]
(21/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 351.11it/s]
(21/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 370.72it/s]
(22/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 312.41it/s]
(22/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 365.74it/s]
(23/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 288.45it/s]
(23/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 369.66it/s]
(24/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 326.71it/s]
(24/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.57it/s]
(25/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 355.48it/s]
(25/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 367.46it/s]
(26/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 355.06it/s]
(26/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 369.51it/s]
(27/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 353.45it/s]
(27/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 369.11it/s]
(28/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 351.48it/s]
(28/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 367.96it/s]
(29/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 354.60it/s]
(29/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.02it/s]
(30/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 354.56it/s]
(30/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 367.14it/s]
(31/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 357.48it/s]
(31/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.65it/s]
(32/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 351.97it/s]
(32/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 366.59it/s]
(33/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 342.61it/s]
(33/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 333.54it/s]
(34/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 286.90it/s]
(34/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.60it/s]
(35/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 354.16it/s]
(35/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 366.14it/s]
(36/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 354.43it/s]
(36/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 367.49it/s]
(37/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 353.02it/s]
(37/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 367.66it/s]
(38/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 356.91it/s]
(38/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.72it/s]
(39/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 357.11it/s]
(39/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 369.87it/s]
(40/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 356.26it/s]
(40/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 369.18it/s]
(41/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 356.04it/s]
(41/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.14it/s]
(42/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 356.28it/s]
(42/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 369.64it/s]
(43/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 353.64it/s]
(43/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 366.01it/s]
(44/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 352.57it/s]
(44/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 368.09it/s]
(45/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 353.54it/s]
(45/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 330.32it/s]
(46/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 356.85it/s]
(46/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 365.30it/s]
(47/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 353.93it/s]
(47/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 365.84it/s]
(48/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 303.85it/s]
(48/49): Propagating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 361.00it/s]
(49/49): Calibrating: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 837.08it/s]
(49/49): Propagating: 100%|████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 1332.42it/s]
2025-10-16T12:11:52.365129+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers
2025-10-16T12:11:52.545009+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.
Compressing model: 192it [00:02, 72.72it/s]
2025-10-16T12:12:39.489964+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.
Compressing model: 192it [00:00, 799.81it/s]

I can calibrate, but the model is the same size after compression:

Many thanks!

This model also fails; the only one that works is https://huggingface.co/QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ :(

cyankiwi org

Since llmcompressor only works on nn.Linear, we have to convert the experts to Linear layers. There is a script in llmcompressor repo with the following usage :

"""Prepare Qwen3-VL MoE for llm-compressor calibration.

llm-compressor only quantizes nn.Linear modules, so the fused MoE expert
modules must first be rewritten as plain Linear layers.
"""
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
from llmcompressor.modeling import replace_modules_for_calibration

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
# `torch_dtype` is deprecated in recent transformers; `dtype` is the
# replacement keyword (see the deprecation warning in the run log).
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Rewrite the MoE experts as Linear layers so they become quantizable targets.
model = replace_modules_for_calibration(model)

Then you should be able to quantize the model normally using llmcompressor.

Also, does my model work on your local environment, if it does not, may I know the error code?

Thanks,

Ton

Many thanks for the tip!
I'm using ROCM with RDNA3

This path uses the GPTQ Marlin repack kernel, which is unavailable on ROCm:

llm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] Traceback (most recent call last):
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 597, in worker_main
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] worker = WorkerProc(*args, **kwargs)
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 456, in init
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] self.worker.load_model()
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 229, in load_model
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] self.model_runner.load_model(eep_scale_up=eep_scale_up)
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2880, in load_model
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] self.model = model_loader.load_model(
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 56, in load_model
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] process_weights_after_loading(model, model_config, target_device)
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 117, in process_weights_after_loading
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] quant_method.process_weights_after_loading(module)
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 1571, in process_weights_after_loading
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] marlin_w13_qweight = ops.gptq_marlin_moe_repack(
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py", line 1169, in gptq_marlin_moe_repack
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] output[e] = torch.ops._C.gptq_marlin_repack(
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1353, in getattr
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] raise AttributeError(
vllm1-1 | (Worker_TP1 pid=232) ERROR 10-16 15:55:16 [multiproc_executor.py:623] AttributeError: '_OpNamespace' '_C' object has no attribute 'gptq_marlin_repack'
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] WorkerProc failed to start.
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] Traceback (most recent call last):
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 597, in worker_main
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] worker = WorkerProc(*args, **kwargs)
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 456, in init
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] self.worker.load_model()
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 229, in load_model
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] self.model_runner.load_model(eep_scale_up=eep_scale_up)
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2880, in load_model
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] self.model = model_loader.load_model(
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 56, in load_model
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] process_weights_after_loading(model, model_config, target_device)
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 117, in process_weights_after_loading
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] quant_method.process_weights_after_loading(module)
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 1571, in process_weights_after_loading
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] marlin_w13_qweight = ops.gptq_marlin_moe_repack(
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py", line 1169, in gptq_marlin_moe_repack
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] output[e] = torch.ops._C.gptq_marlin_repack(
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1353, in getattr
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] raise AttributeError(
vllm1-1 | (Worker_TP2 pid=233) ERROR 10-16 15:55:16 [multiproc_executor.py:623] AttributeError: '_OpNamespace' '_C' object has no attribute 'gptq_marlin_repack'
vllm1-1 | (Worker_TP1 pid=232) INFO 10-16 15:55:16 [multiproc_executor.py:584] Parent process exited, terminating worker
vllm1-1 | (Worker_TP2 pid=233) INFO 10-16 15:55:16 [multiproc_executor.py:584] Parent process exited, terminating worker
vllm1-1 | (Worker_TP0 pid=231) INFO 10-16 15:55:16 [multiproc_executor.py:584] Parent process exited, terminating worker
vllm1-1 | (Worker_TP3 pid=234) INFO 10-16 15:55:16 [multiproc_executor.py:584] Parent process exited, terminating worker
vllm1-1 | [rank0]:[W1016 15:55:17.434645512 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] EngineCore failed to start.
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] Traceback (most recent call last):
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 783, in run_engine_core
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] engine_core = EngineCoreProc(*args, **kwargs)
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 555, in init
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] super().init(
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 105, in init
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] self.model_executor = executor_class(vllm_config)
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in init
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] self._init_executor()
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 113, in _init_executor
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] self.workers = WorkerProc.wait_for_ready(unready_workers)
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 535, in wait_for_ready
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] raise e from None
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-16 15:55:17 [core.py:792] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
vllm1-1 | (EngineCore_DP0 pid=159) Process EngineCore_DP0:
vllm1-1 | (EngineCore_DP0 pid=159) Traceback (most recent call last):
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
vllm1-1 | (EngineCore_DP0 pid=159) self.run()
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
vllm1-1 | (EngineCore_DP0 pid=159) self._target(*self._args, **self._kwargs)
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 796, in run_engine_core
vllm1-1 | (EngineCore_DP0 pid=159) raise e
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 783, in run_engine_core
vllm1-1 | (EngineCore_DP0 pid=159) engine_core = EngineCoreProc(*args, **kwargs)
vllm1-1 | (EngineCore_DP0 pid=159) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 555, in init
vllm1-1 | (EngineCore_DP0 pid=159) super().init(
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 105, in init
vllm1-1 | (EngineCore_DP0 pid=159) self.model_executor = executor_class(vllm_config)
vllm1-1 | (EngineCore_DP0 pid=159) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in init
vllm1-1 | (EngineCore_DP0 pid=159) self._init_executor()
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 113, in _init_executor
vllm1-1 | (EngineCore_DP0 pid=159) self.workers = WorkerProc.wait_for_ready(unready_workers)
vllm1-1 | (EngineCore_DP0 pid=159) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 535, in wait_for_ready
vllm1-1 | (EngineCore_DP0 pid=159) raise e from None
vllm1-1 | (EngineCore_DP0 pid=159) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
vllm1-1 | (APIServer pid=1) Traceback (most recent call last):
vllm1-1 | (APIServer pid=1) File "/usr/local/bin/vllm", line 7, in
vllm1-1 | (APIServer pid=1) sys.exit(main())
vllm1-1 | (APIServer pid=1) ^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 73, in main
vllm1-1 | (APIServer pid=1) args.dispatch_function(args)
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 62, in cmd
vllm1-1 | (APIServer pid=1) uvloop.run(run_server(args))
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 109, in run
vllm1-1 | (APIServer pid=1) return __asyncio.run(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
vllm1-1 | (APIServer pid=1) return runner.run(main)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
vllm1-1 | (APIServer pid=1) return self._loop.run_until_complete(task)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 61, in wrapper
vllm1-1 | (APIServer pid=1) return await main
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1917, in run_server
vllm1-1 | (APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1933, in run_server_worker
vllm1-1 | (APIServer pid=1) async with build_async_engine_client(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in aenter
vllm1-1 | (APIServer pid=1) return await anext(self.gen)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 191, in build_async_engine_client
vllm1-1 | (APIServer pid=1) async with build_async_engine_client_from_engine_args(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in aenter
vllm1-1 | (APIServer pid=1) return await anext(self.gen)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 238, in build_async_engine_client_from_engine_args
vllm1-1 | (APIServer pid=1) async_llm = AsyncLLM.from_vllm_config(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/utils/init.py", line 1336, in inner
vllm1-1 | (APIServer pid=1) return fn(*args, **kwargs)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 208, in from_vllm_config
vllm1-1 | (APIServer pid=1) return cls(
vllm1-1 | (APIServer pid=1) ^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 130, in init
vllm1-1 | (APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 121, in make_async_mp_client
vllm1-1 | (APIServer pid=1) return AsyncMPClient(*client_args)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 807, in init
vllm1-1 | (APIServer pid=1) super().init(
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 468, in init
vllm1-1 | (APIServer pid=1) with launch_core_engines(vllm_config, executor_class, log_stats) as (
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in exit
vllm1-1 | (APIServer pid=1) next(self.gen)
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 816, in launch_core_engines
vllm1-1 | (APIServer pid=1) wait_for_engine_startup(
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 873, in wait_for_engine_startup
vllm1-1 | (APIServer pid=1) raise RuntimeError(
vllm1-1 | (APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
vllm1-1 | /usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown

cyankiwi org

Hi @jart25 , thank you for trying my model.

QuantTrio's AWQ model was likely quantized with AutoAWQ, while mine was quantized with llm-compressor/compressed-tensors; vLLM loads the two formats differently and they have different compatibility.

As QuantTrio's already worked in your case, I would recommend sticking with those. But if you're in the mood for tinkering and trying my model, I would recommend the flag --quantization compressed-tensors or try deleting the compiled ops (or simply clone a fresh repo) and installing vLLM from source.

@cpatonn
I changed this code to force vLLM to use a compressed-tensors path supported by my RDNA3 GPU, and your model now works :)
https://www.diffchecker.com/WASgcXiJ/

Many thanks for your tip!

what quantization script did you use?

jart25 changed discussion title from Removed to Quant AWQ RDNA3 ROCM

@cpatonn
import base64
from io import BytesIO
import torch
from datasets import load_dataset
from transformers import Qwen3VLMoeForConditionalGeneration, AutoProcessor
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modeling import replace_modules_for_calibration

# Model to quantize and the directory the compressed checkpoint is saved to.
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-AWQ-W8A16"

model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # `torch_dtype` is deprecated in recent transformers (the run log in this
    # thread prints "torch_dtype is deprecated! Use dtype instead!").
    dtype=torch.bfloat16,
    device_map=None,  # load on CPU; the oneshot pipeline moves layers itself
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# llm-compressor helper that prepares the model's modules for calibration
# (presumably swaps the MoE blocks for traceable equivalents — see its docs).
model = replace_modules_for_calibration(model)

# Calibration data: a small slice of flickr30k image/caption examples.
DATASET_ID = "lmms-lab/flickr30k"
NUM_CALIBRATION_SAMPLES = 256
DATASET_SPLIT = f"test[:{NUM_CALIBRATION_SAMPLES}]"
MAX_SEQUENCE_LENGTH = 1024

ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42)

def preprocess_and_tokenize(example):
    """Turn one flickr30k example into processor inputs for calibration.

    The image is embedded as a base64 data URI inside the chat message (the
    chat template only needs a placeholder there), while the raw PIL image is
    passed to the processor separately for the actual pixel features.

    Note: as pasted in the thread the body had lost its indentation
    (markdown mangling); restored here.
    """
    buffered = BytesIO()
    example["image"].save(buffered, format="PNG")
    encoded_image = base64.b64encode(buffered.getvalue())
    base64_image = f"data:image;base64,{encoded_image.decode('utf-8')}"
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": base64_image},
            {"type": "text", "text": "What does the image show?"},
        ],
    }]
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = processor(
        text=[text],
        images=[example["image"]],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
    )
    return inputs

# Replace raw columns with tokenized/processed model inputs.
ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)

def data_collator(batch):
    """Collate a single calibration sample into a dict of tensors.

    Oneshot calibration feeds one example at a time; batching more would
    require padding logic that this collator deliberately omits.
    """
    assert len(batch) == 1
    return {key: torch.tensor(value) for key, value in batch[0].items()}

recipe = AWQModifier(
    targets="Linear",
    # NOTE(review): "re:.visual." as pasted looks markdown-mangled (paired
    # "*"s eaten) and matches nothing under re.match; restored to
    # "re:.*visual.*". Confirm against the resolved ignore list in the
    # saved quantization_config.
    ignore=["re:.*lm_head", "re:.*visual.*", "re:.*mlp.gate$"],
    duo_scaling=False,
    # scheme="W8A16" left AWQ's _group_size unset, crashing smoothing with
    # `weight.view(-1, None)` (see the traceback later in this thread).
    # Spell out a grouped int8 weight-only scheme explicitly instead — this
    # is the configuration the thread found to work.
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "input_activations": None,
            "output_activations": None,
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "strategy": "group",
                "group_size": 32,
                "observer": "mse",
            },
        },
    },
)

oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
    pipeline="sequential",
)

# Write the compressed checkpoint plus processor files next to the script.
model.save_pretrained(OUTPUT_DIR, save_compressed=True)
processor.save_pretrained(OUTPUT_DIR)

I would say it is because of the limitation of W4A16

root@890168747ffb:~# python3 te.py
torch_dtype is deprecated! Use dtype instead!
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 216.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 977/977 [02:20<00:00, 6.96it/s]
2025-10-17T06:07:43.966545+0000 | reset | INFO - Compression lifecycle reset
2025-10-17T06:07:44.000362+0000 | _create_default_logger | INFO - Logging all LLM Compressor modifier-level logs to sparse_logs/17-10-2025_06.07.43.log
2025-10-17T06:07:44.000611+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-10-17T06:07:44.875922+0000 | on_initialize | INFO - No AWQModifier.mappings provided, inferring from model...
2025-10-17T06:07:44.876164+0000 | get_layer_mappings_from_architecture | INFO - Architecture Qwen3VLMoeForConditionalGeneration not found in mappings. Using default mappings: [AWQMapping(smooth_layer='re:.*input_layernorm$', balance_layers=['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']), AWQMapping(smooth_layer='re:.*v_proj$', balance_layers=['re:.*o_proj$']), AWQMapping(smooth_layer='re:.*post_attention_layernorm$', balance_layers=['re:.*gate_proj$', 're:.*up_proj$']), AWQMapping(smooth_layer='re:.*up_proj$', balance_layers=['re:.*down_proj$'])]
Resolving mapping 1/4 (0 skipped): : 48it [00:00, 240.29it/s]
Resolving mapping 2/4 (47 skipped): : 48it [00:00, 889.88it/s]
Resolving mapping 3/4 (0 skipped): : 48it [00:00, 264.32it/s]
Resolving mapping 4/4 (0 skipped): : 6144it [00:00, 28896.98it/s]
2025-10-17T06:07:45.524784+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-10-17T06:07:55.297844+0000 | trace_subgraphs | WARNING - Expected 75 subgraphs, but only traced 49. This is likely due to having wrapped code which calls sequential targets
Preparing cache: 50%|███████████████████████████████████████████████▏ | 127/256 [00:11<00:05, 23.78it/s]Preparing cache: 59%|████████████████████████████████████████████████████████ | 151/256 [00:12<00:03, 28.16it/s]Preparing cache: 60%|█████████████████████████████████████████████████████████▏ | 154/256 [00:12<00:04, 24.60it/s]Preparing cache: 62%|███████████████████████████████████████████████████████████ | 159/256 [00:12<00:04, 22.11it/s]Preparing cache: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:17<00:00, 14.99it/s]
(1/49): Calibrating: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 256/256 [01:14<00:00, 3.45it/s]
Smoothing: 0%| | 0/130 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/root/te.py", line 77, in
oneshot(
File "/root/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 330, in oneshot
one_shot()
File "/root/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 158, in call
self.apply_recipe_modifiers(
File "/root/llm-compressor/src/llmcompressor/entrypoints/oneshot.py", line 201, in apply_recipe_modifiers
pipeline(
File "/root/llm-compressor/src/llmcompressor/pipelines/sequential/pipeline.py", line 127, in call
LifecycleCallbacks.sequential_epoch_end()
File "/root/llm-compressor/src/llmcompressor/core/session_functions.py", line 161, in sequential_epoch_end
return cls.event(EventType.SEQUENTIAL_EPOCH_END, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/llm-compressor/src/llmcompressor/core/session_functions.py", line 85, in event
return active_session().event(event_type, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/llm-compressor/src/llmcompressor/core/session.py", line 187, in event
mod_data = self._lifecycle.event(
^^^^^^^^^^^^^^^^^^^^^^
File "/root/llm-compressor/src/llmcompressor/core/lifecycle.py", line 204, in event
data = mod.update_event(state=self.state, event=event, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/llm-compressor/src/llmcompressor/modifiers/modifier.py", line 123, in update_event
self.on_event(state, event, **kwargs)
File "/root/llm-compressor/src/llmcompressor/modifiers/awq/base.py", line 252, in on_event
self._apply_smoothing(state.model)
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/llm-compressor/src/llmcompressor/modifiers/awq/base.py", line 463, in _apply_smoothing
weight = weight.view(-1, self._group_size)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: view(): argument 'size' failed to unpack the object at pos 2 with error "type must be tuple of ints,but got NoneType"

Quantization progress — this version works:

import base64
from io import BytesIO
import torch
from datasets import load_dataset
from transformers import Qwen3VLMoeForConditionalGeneration, AutoProcessor
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modeling import replace_modules_for_calibration

# Model to quantize and the directory the compressed checkpoint is saved to.
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-AWQ-W8A16"

model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # `torch_dtype` is deprecated in recent transformers (the run log in this
    # thread prints "torch_dtype is deprecated! Use dtype instead!").
    dtype=torch.bfloat16,
    device_map=None,  # load on CPU; the oneshot pipeline moves layers itself
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# llm-compressor helper that prepares the model's modules for calibration
# (presumably swaps the MoE blocks for traceable equivalents — see its docs).
model = replace_modules_for_calibration(model)

# Calibration data: a small slice of flickr30k image/caption examples.
DATASET_ID = "lmms-lab/flickr30k"
NUM_CALIBRATION_SAMPLES = 256
DATASET_SPLIT = f"test[:{NUM_CALIBRATION_SAMPLES}]"
MAX_SEQUENCE_LENGTH = 1024

ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42)

def preprocess_and_tokenize(example):
    """Turn one flickr30k example into processor inputs for calibration.

    The image is embedded as a base64 data URI inside the chat message (the
    chat template only needs a placeholder there), while the raw PIL image is
    passed to the processor separately for the actual pixel features.

    Note: as pasted in the thread the body had lost its indentation
    (markdown mangling); restored here.
    """
    buffered = BytesIO()
    example["image"].save(buffered, format="PNG")
    encoded_image = base64.b64encode(buffered.getvalue())
    base64_image = f"data:image;base64,{encoded_image.decode('utf-8')}"
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": base64_image},
            {"type": "text", "text": "What does the image show?"},
        ],
    }]
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = processor(
        text=[text],
        images=[example["image"]],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
    )
    return inputs

# Replace raw columns with tokenized/processed model inputs.
ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)

def data_collator(batch):
    """Collate a single calibration sample into a dict of tensors.

    Oneshot calibration feeds one example at a time; batching more would
    require padding logic that this collator deliberately omits.
    """
    assert len(batch) == 1
    return {key: torch.tensor(value) for key, value in batch[0].items()}

recipe = AWQModifier(
    # NOTE(review): "re:.visual." as pasted looks markdown-mangled (paired
    # "*"s eaten) and matches nothing under re.match; restored to
    # "re:.*visual.*". Confirm against the resolved ignore list in the
    # saved quantization_config.
    ignore=["re:.*lm_head", "re:.*visual.*", "re:.*mlp.gate$"],
    duo_scaling=False,
    # Explicit grouped int8 weight-only scheme: spelling out group_size=32
    # avoids the `weight.view(-1, None)` smoothing crash that the preset
    # scheme="W8A16" produced earlier in this thread.
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "input_activations": None,
            "output_activations": None,
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "strategy": "group",
                "group_size": 32,
                "observer": "mse",
            },
        },
    },
)

oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
    pipeline="sequential",
)

# Write the compressed checkpoint plus processor files next to the script.
model.save_pretrained(OUTPUT_DIR, save_compressed=True)
processor.save_pretrained(OUTPUT_DIR)

@cpatonn the quant_config
{
"architectures": [
"Qwen3VLMoeForConditionalGeneration"
],
"dtype": "bfloat16",
"image_token_id": 151655,
"model_type": "qwen3_vl_moe",
"quantization_config": {
"config_groups": {
"group_0": {
"format": "pack-quantized",
"input_activations": null,
"output_activations": null,
"targets": [
"Linear"
],
"weights": {
"actorder": null,
"block_structure": null,
"dynamic": false,
"group_size": 32,
"num_bits": 8,
"observer": "mse",
"observer_kwargs": {},
"strategy": "group",
"symmetric": true,
"type": "int"
}
}
},
"format": "pack-quantized",
"global_compression_ratio": null,
"ignore": [
"model.visual.blocks.0.attn.qkv",
"model.visual.blocks.0.attn.proj",
"model.visual.blocks.0.mlp.linear_fc1",
"model.visual.blocks.0.mlp.linear_fc2",
"model.visual.blocks.1.attn.qkv",
"model.visual.blocks.1.attn.proj",
"model.visual.blocks.1.mlp.linear_fc1",
"model.visual.blocks.1.mlp.linear_fc2",
"model.visual.blocks.2.attn.qkv",
"model.visual.blocks.2.attn.proj",
"model.visual.blocks.2.mlp.linear_fc1",
"model.visual.blocks.2.mlp.linear_fc2",
"model.visual.blocks.3.attn.qkv",
"model.visual.blocks.3.attn.proj",
"model.visual.blocks.3.mlp.linear_fc1",
"model.visual.blocks.3.mlp.linear_fc2",
"model.visual.blocks.4.attn.qkv",
"model.visual.blocks.4.attn.proj",
"model.visual.blocks.4.mlp.linear_fc1",
"model.visual.blocks.4.mlp.linear_fc2",
"model.visual.blocks.5.attn.qkv",
"model.visual.blocks.5.attn.proj",
"model.visual.blocks.5.mlp.linear_fc1",
"model.visual.blocks.5.mlp.linear_fc2",
"model.visual.blocks.6.attn.qkv",
"model.visual.blocks.6.attn.proj",
"model.visual.blocks.6.mlp.linear_fc1",
"model.visual.blocks.6.mlp.linear_fc2",
"model.visual.blocks.7.attn.qkv",
"model.visual.blocks.7.attn.proj",
"model.visual.blocks.7.mlp.linear_fc1",
"model.visual.blocks.7.mlp.linear_fc2",
"model.visual.blocks.8.attn.qkv",
"model.visual.blocks.8.attn.proj",
"model.visual.blocks.8.mlp.linear_fc1",
"model.visual.blocks.8.mlp.linear_fc2",
"model.visual.blocks.9.attn.qkv",
"model.visual.blocks.9.attn.proj",
"model.visual.blocks.9.mlp.linear_fc1",
"model.visual.blocks.9.mlp.linear_fc2",
"model.visual.blocks.10.attn.qkv",
"model.visual.blocks.10.attn.proj",
"model.visual.blocks.10.mlp.linear_fc1",
"model.visual.blocks.10.mlp.linear_fc2",
"model.visual.blocks.11.attn.qkv",
"model.visual.blocks.11.attn.proj",
"model.visual.blocks.11.mlp.linear_fc1",
"model.visual.blocks.11.mlp.linear_fc2",
"model.visual.blocks.12.attn.qkv",
"model.visual.blocks.12.attn.proj",
"model.visual.blocks.12.mlp.linear_fc1",
"model.visual.blocks.12.mlp.linear_fc2",
"model.visual.blocks.13.attn.qkv",
"model.visual.blocks.13.attn.proj",
"model.visual.blocks.13.mlp.linear_fc1",
"model.visual.blocks.13.mlp.linear_fc2",
"model.visual.blocks.14.attn.qkv",
"model.visual.blocks.14.attn.proj",
"model.visual.blocks.14.mlp.linear_fc1",
"model.visual.blocks.14.mlp.linear_fc2",
"model.visual.blocks.15.attn.qkv",
"model.visual.blocks.15.attn.proj",
"model.visual.blocks.15.mlp.linear_fc1",
"model.visual.blocks.15.mlp.linear_fc2",
"model.visual.blocks.16.attn.qkv",
"model.visual.blocks.16.attn.proj",
"model.visual.blocks.16.mlp.linear_fc1",
"model.visual.blocks.16.mlp.linear_fc2",
"model.visual.blocks.17.attn.qkv",
"model.visual.blocks.17.attn.proj",
"model.visual.blocks.17.mlp.linear_fc1",
"model.visual.blocks.17.mlp.linear_fc2",
"model.visual.blocks.18.attn.qkv",
"model.visual.blocks.18.attn.proj",
"model.visual.blocks.18.mlp.linear_fc1",
"model.visual.blocks.18.mlp.linear_fc2",
"model.visual.blocks.19.attn.qkv",
"model.visual.blocks.19.attn.proj",
"model.visual.blocks.19.mlp.linear_fc1",
"model.visual.blocks.19.mlp.linear_fc2",
"model.visual.blocks.20.attn.qkv",
"model.visual.blocks.20.attn.proj",
"model.visual.blocks.20.mlp.linear_fc1",
"model.visual.blocks.20.mlp.linear_fc2",
"model.visual.blocks.21.attn.qkv",
"model.visual.blocks.21.attn.proj",
"model.visual.blocks.21.mlp.linear_fc1",
"model.visual.blocks.21.mlp.linear_fc2",
"model.visual.blocks.22.attn.qkv",
"model.visual.blocks.22.attn.proj",
"model.visual.blocks.22.mlp.linear_fc1",
"model.visual.blocks.22.mlp.linear_fc2",
"model.visual.blocks.23.attn.qkv",
"model.visual.blocks.23.attn.proj",
"model.visual.blocks.23.mlp.linear_fc1",
"model.visual.blocks.23.mlp.linear_fc2",
"model.visual.blocks.24.attn.qkv",
"model.visual.blocks.24.attn.proj",
"model.visual.blocks.24.mlp.linear_fc1",
"model.visual.blocks.24.mlp.linear_fc2",
"model.visual.blocks.25.attn.qkv",
"model.visual.blocks.25.attn.proj",
"model.visual.blocks.25.mlp.linear_fc1",
"model.visual.blocks.25.mlp.linear_fc2",
"model.visual.blocks.26.attn.qkv",
"model.visual.blocks.26.attn.proj",
"model.visual.blocks.26.mlp.linear_fc1",
"model.visual.blocks.26.mlp.linear_fc2",
"model.visual.merger.linear_fc1",
"model.visual.merger.linear_fc2",
"model.visual.deepstack_merger_list.0.linear_fc1",
"model.visual.deepstack_merger_list.0.linear_fc2",
"model.visual.deepstack_merger_list.1.linear_fc1",
"model.visual.deepstack_merger_list.1.linear_fc2",
"model.visual.deepstack_merger_list.2.linear_fc1",
"model.visual.deepstack_merger_list.2.linear_fc2",
"model.language_model.layers.0.mlp.gate",
"model.language_model.layers.1.mlp.gate",
"model.language_model.layers.2.mlp.gate",
"model.language_model.layers.3.mlp.gate",
"model.language_model.layers.4.mlp.gate",
"model.language_model.layers.5.mlp.gate",
"model.language_model.layers.6.mlp.gate",
"model.language_model.layers.7.mlp.gate",
"model.language_model.layers.8.mlp.gate",
"model.language_model.layers.9.mlp.gate",
"model.language_model.layers.10.mlp.gate",
"model.language_model.layers.11.mlp.gate",
"model.language_model.layers.12.mlp.gate",
"model.language_model.layers.13.mlp.gate",
"model.language_model.layers.14.mlp.gate",
"model.language_model.layers.15.mlp.gate",
"model.language_model.layers.16.mlp.gate",
"model.language_model.layers.17.mlp.gate",
"model.language_model.layers.18.mlp.gate",
"model.language_model.layers.19.mlp.gate",
"model.language_model.layers.20.mlp.gate",
"model.language_model.layers.21.mlp.gate",
"model.language_model.layers.22.mlp.gate",
"model.language_model.layers.23.mlp.gate",
"model.language_model.layers.24.mlp.gate",
"model.language_model.layers.25.mlp.gate",
"model.language_model.layers.26.mlp.gate",
"model.language_model.layers.27.mlp.gate",
"model.language_model.layers.28.mlp.gate",
"model.language_model.layers.29.mlp.gate",
"model.language_model.layers.30.mlp.gate",
"model.language_model.layers.31.mlp.gate",
"model.language_model.layers.32.mlp.gate",
"model.language_model.layers.33.mlp.gate",
"model.language_model.layers.34.mlp.gate",
"model.language_model.layers.35.mlp.gate",
"model.language_model.layers.36.mlp.gate",
"model.language_model.layers.37.mlp.gate",
"model.language_model.layers.38.mlp.gate",
"model.language_model.layers.39.mlp.gate",
"model.language_model.layers.40.mlp.gate",
"model.language_model.layers.41.mlp.gate",
"model.language_model.layers.42.mlp.gate",
"model.language_model.layers.43.mlp.gate",
"model.language_model.layers.44.mlp.gate",
"model.language_model.layers.45.mlp.gate",
"model.language_model.layers.46.mlp.gate",
"model.language_model.layers.47.mlp.gate",
"lm_head"
],
"kv_cache_scheme": null,
"quant_method": "compressed-tensors",
"quantization_status": "compressed",
"sparsity_config": {},
"transform_config": {},
"version": "0.12.3.a20251013"
},
"text_config": {
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"decoder_sparse_step": 1,
"dtype": "bfloat16",
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 6144,
"max_position_embeddings": 262144,
"mlp_only_layers": [],
"model_type": "qwen3_vl_moe_text",
"moe_intermediate_size": 768,
"norm_topk_prob": true,
"num_attention_heads": 32,
"num_experts": 128,
"num_experts_per_tok": 8,
"num_hidden_layers": 48,
"num_key_value_heads": 4,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"mrope_interleaved": true,
"mrope_section": [
24,
20,
20
],
"rope_type": "default"
},
"rope_theta": 5000000,
"router_aux_loss_coef": 0.001,
"use_cache": true,
"vocab_size": 151936
},
"tie_word_embeddings": false,
"transformers_version": "4.57.0",
"video_token_id": 151656,
"vision_config": {
"deepstack_visual_indexes": [
8,
16,
24
],
"depth": 27,
"dtype": "bfloat16",
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"in_channels": 3,
"initializer_range": 0.02,
"intermediate_size": 4304,
"model_type": "qwen3_vl_moe",
"num_heads": 16,
"num_position_embeddings": 2304,
"out_hidden_size": 2048,
"patch_size": 16,
"spatial_merge_size": 2,
"temporal_patch_size": 2
},
"vision_end_token_id": 151653,
"vision_start_token_id": 151652
}

the script:

import base64
from io import BytesIO
import torch
from datasets import load_dataset
from transformers import Qwen3VLMoeForConditionalGeneration, AutoProcessor
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.utils import dispatch_for_generation

# Model to quantize and the directory the compressed checkpoint is saved to.
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
# NOTE(review): the "-sym-false" suffix is misleading — the recipe below in
# this script uses symmetric=True. Consider renaming or flipping the flag.
OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-AWQ-W8A16-mse-sym-false"

model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # `torch_dtype` is deprecated in recent transformers (the run log in this
    # thread prints "torch_dtype is deprecated! Use dtype instead!").
    dtype=torch.bfloat16,
    device_map=None,  # load on CPU; the oneshot pipeline moves layers itself
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# llm-compressor helper that prepares the model's modules for calibration
# (presumably swaps the MoE blocks for traceable equivalents — see its docs).
model = replace_modules_for_calibration(model)

# Calibration data: a small slice of flickr30k image/caption examples.
DATASET_ID = "lmms-lab/flickr30k"
NUM_CALIBRATION_SAMPLES = 256
DATASET_SPLIT = f"test[:{NUM_CALIBRATION_SAMPLES}]"
MAX_SEQUENCE_LENGTH = 1024

ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42)

def preprocess_and_tokenize(example):
    """Turn one flickr30k example into processor inputs for calibration.

    The image is embedded as a base64 data URI inside the chat message (the
    chat template only needs a placeholder there), while the raw PIL image is
    passed to the processor separately for the actual pixel features.

    Note: as pasted in the thread the body had lost its indentation
    (markdown mangling); restored here.
    """
    buffered = BytesIO()
    example["image"].save(buffered, format="PNG")
    encoded_image = base64.b64encode(buffered.getvalue())
    base64_image = f"data:image;base64,{encoded_image.decode('utf-8')}"
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": base64_image},
            {"type": "text", "text": "What does the image show?"},
        ],
    }]
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = processor(
        text=[text],
        images=[example["image"]],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
    )
    return inputs

# Replace raw columns with tokenized/processed model inputs.
ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)

def data_collator(batch):
assert len(batch) == 1
return {key: torch.tensor(value) for key, value in batch[0].items()}

recipe = AWQModifier(
ignore=["re:.*lm_head", "re:.visual.", "re:.*mlp.gate$"],
duo_scaling=True,
config_groups={
"group_0": {
"targets": ["Linear"],
"input_activations": None,
"output_activations": None,
"weights": {
"actorder": None,
"block_structure": None,
"dynamic": False,
"group_size": 32,
"num_bits": 8,
"observer": "mse",
"observer_kwargs": {},
"strategy": "group",
"symmetric": True,
"type": "int"
}
}
}
)

oneshot(
model=model,
processor=processor,
recipe=recipe,
dataset=ds,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
data_collator=data_collator,
pipeline="sequential",
)

model.save_pretrained(OUTPUT_DIR, save_compressed=True)
processor.save_pretrained(OUTPUT_DIR)

The error:

vllm1-1 | (Worker_TP3 pid=234) INFO 10-17 16:33:43 [compressed_tensors_moe.py:146] Using CompressedTensorsWNA16MoEMethod
Loading safetensors checkpoint shards: 0% 0/8 [00:00<?, ?it/s](Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] WorkerProc failed to start.
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] Traceback (most recent call last):
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 597, in worker_main
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] worker = WorkerProc(*args, **kwargs)
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 456, in init
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] self.worker.load_model()
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 229, in load_model
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] self.model_runner.load_model(eep_scale_up=eep_scale_up)
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2880, in load_model
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] self.model = model_loader.load_model(
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] self.load_weights(model, model_config)
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/default_loader.py", line 300, in load_weights
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] loaded_weights = model.load_weights(
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1771, in load_weights
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 318, in load_weights
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] autoloaded_weights = set(self._load_module("", self.module, weights))
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 272, in _load_module
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] yield from self._load_module(
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 245, in _load_module
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] loaded_params = module_load_weights(weights)
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 755, in load_weights
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] return loader.load_weights(weights)
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 318, in load_weights
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] autoloaded_weights = set(self._load_module("", self.module, weights))
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 272, in _load_module
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] yield from self._load_module(
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 245, in _load_module
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] loaded_params = module_load_weights(weights)
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl_moe.py", line 272, in load_weights
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] param = params_dict[name_mapped]
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] ~~~~~~~~~~~^^^^^^^^^^^^^
vllm1-1 | (Worker_TP0 pid=231) ERROR 10-17 16:33:43 [multiproc_executor.py:623] KeyError: 'layers.19.mlp.experts.w2_weight_zero_point'
Loading safetensors checkpoint shards: 0% 0/8 [00:00<?, ?it/s]
vllm1-1 | (Worker_TP2 pid=233) INFO 10-17 16:33:43 [multiproc_executor.py:584] Parent process exited, terminating worker
vllm1-1 | (Worker_TP0 pid=231) INFO 10-17 16:33:43 [multiproc_executor.py:584] Parent process exited, terminating worker
vllm1-1 | (Worker_TP1 pid=232) INFO 10-17 16:33:43 [multiproc_executor.py:584] Parent process exited, terminating worker
vllm1-1 | (Worker_TP3 pid=234) INFO 10-17 16:33:43 [multiproc_executor.py:584] Parent process exited, terminating worker
vllm1-1 | [rank0]:[W1017 16:33:43.229670474 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] EngineCore failed to start.
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] Traceback (most recent call last):
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 783, in run_engine_core
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] engine_core = EngineCoreProc(*args, **kwargs)
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 555, in __init__
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] super().__init__(
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 105, in __init__
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] self.model_executor = executor_class(vllm_config)
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in __init__
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] self._init_executor()
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 113, in _init_executor
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] self.workers = WorkerProc.wait_for_ready(unready_workers)
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 535, in wait_for_ready
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] raise e from None
vllm1-1 | (EngineCore_DP0 pid=159) ERROR 10-17 16:33:44 [core.py:792] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
vllm1-1 | (EngineCore_DP0 pid=159) Process EngineCore_DP0:
vllm1-1 | (EngineCore_DP0 pid=159) Traceback (most recent call last):
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
vllm1-1 | (EngineCore_DP0 pid=159) self.run()
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
vllm1-1 | (EngineCore_DP0 pid=159) self._target(*self._args, **self._kwargs)
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 796, in run_engine_core
vllm1-1 | (EngineCore_DP0 pid=159) raise e
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 783, in run_engine_core
vllm1-1 | (EngineCore_DP0 pid=159) engine_core = EngineCoreProc(*args, **kwargs)
vllm1-1 | (EngineCore_DP0 pid=159) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 555, in __init__
vllm1-1 | (EngineCore_DP0 pid=159) super().__init__(
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 105, in __init__
vllm1-1 | (EngineCore_DP0 pid=159) self.model_executor = executor_class(vllm_config)
vllm1-1 | (EngineCore_DP0 pid=159) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in __init__
vllm1-1 | (EngineCore_DP0 pid=159) self._init_executor()
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 113, in _init_executor
vllm1-1 | (EngineCore_DP0 pid=159) self.workers = WorkerProc.wait_for_ready(unready_workers)
vllm1-1 | (EngineCore_DP0 pid=159) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (EngineCore_DP0 pid=159) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 535, in wait_for_ready
vllm1-1 | (EngineCore_DP0 pid=159) raise e from None
vllm1-1 | (EngineCore_DP0 pid=159) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
vllm1-1 | (APIServer pid=1) Traceback (most recent call last):
vllm1-1 | (APIServer pid=1) File "/usr/local/bin/vllm", line 7, in
vllm1-1 | (APIServer pid=1) sys.exit(main())
vllm1-1 | (APIServer pid=1) ^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 73, in main
vllm1-1 | (APIServer pid=1) args.dispatch_function(args)
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 62, in cmd
vllm1-1 | (APIServer pid=1) uvloop.run(run_server(args))
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 109, in run
vllm1-1 | (APIServer pid=1) return __asyncio.run(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
vllm1-1 | (APIServer pid=1) return runner.run(main)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
vllm1-1 | (APIServer pid=1) return self._loop.run_until_complete(task)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 61, in wrapper
vllm1-1 | (APIServer pid=1) return await main
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1917, in run_server
vllm1-1 | (APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1933, in run_server_worker
vllm1-1 | (APIServer pid=1) async with build_async_engine_client(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in aenter
vllm1-1 | (APIServer pid=1) return await anext(self.gen)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 191, in build_async_engine_client
vllm1-1 | (APIServer pid=1) async with build_async_engine_client_from_engine_args(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in aenter
vllm1-1 | (APIServer pid=1) return await anext(self.gen)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 238, in build_async_engine_client_from_engine_args
vllm1-1 | (APIServer pid=1) async_llm = AsyncLLM.from_vllm_config(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/utils/init.py", line 1336, in inner
vllm1-1 | (APIServer pid=1) return fn(*args, **kwargs)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 208, in from_vllm_config
vllm1-1 | (APIServer pid=1) return cls(
vllm1-1 | (APIServer pid=1) ^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 130, in init
vllm1-1 | (APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client(
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 121, in make_async_mp_client
vllm1-1 | (APIServer pid=1) return AsyncMPClient(*client_args)
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 807, in init
vllm1-1 | (APIServer pid=1) super().init(
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 468, in init
vllm1-1 | (APIServer pid=1) with launch_core_engines(vllm_config, executor_class, log_stats) as (
vllm1-1 | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm1-1 | (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in exit
vllm1-1 | (APIServer pid=1) next(self.gen)
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 816, in launch_core_engines
vllm1-1 | (APIServer pid=1) wait_for_engine_startup(
vllm1-1 | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 873, in wait_for_engine_startup
vllm1-1 | (APIServer pid=1) raise RuntimeError(
vllm1-1 | (APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}

import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: Requires a minimum of transformers 4.57.0

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load model on CPU; oneshot moves layers to GPU sequentially.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=None,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# Replace fused MoE modules with per-expert Linears so AWQ can calibrate them.
model = replace_modules_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess_function(example):
    """Wrap each chat message's text content and tokenize the conversation."""
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
    """Collate one sample; cast pixel_values to bfloat16, everything else as-is."""
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Configure AWQ quantization with smoothing and balancing.
recipe = AWQModifier(
    # BUGFIX: restored the `.*` wildcards and the visual patterns that the
    # markdown rendering split across lines / stripped asterisks from.
    ignore=[
        're:.*embed_tokens',
        're:.*input_layernorm$',
        're:.*mlp[.]gate$',
        're:.*post_attention_layernorm$',
        're:.*norm$',
        're:model[.]visual.*',
        're:visual.*',
        'lm_head',
    ],
    mappings=[
        {
            "smooth_layer": "re:.*input_layernorm$",
            "balance_layers": ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$'],
        },
        {
            "smooth_layer": "re:.*v_proj$",
            "balance_layers": ['re:.*o_proj$'],
        },
        {
            "smooth_layer": "re:.*post_attention_layernorm$",
            "balance_layers": ['re:.*gate_proj$', 're:.*up_proj$'],
        },
        {
            "smooth_layer": "re:.*up_proj$",
            "balance_layers": ['re:.*down_proj$'],
        },
    ],
    duo_scaling=True,
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "group_size": 32,
                "strategy": "group",
                "block_structure": None,
                "dynamic": False,
                "actorder": None,
                "observer": "mse",
                "observer_kwargs": {},
            },
            "input_activations": None,
            "output_activations": None,
            "format": None,
        },
    },
)

# Apply AWQ quantization.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)

print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-sym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)

Works!

import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: Requires a minimum of transformers 4.57.0

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load model on CPU; oneshot moves layers to GPU sequentially.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=None,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# Replace fused MoE modules with per-expert Linears so AWQ can calibrate them.
model = replace_modules_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess_function(example):
    """Wrap each chat message's text content and tokenize the conversation."""
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
    """Collate one sample; cast pixel_values to bfloat16, everything else as-is."""
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Configure AWQ quantization with smoothing and balancing.
recipe = AWQModifier(
    # BUGFIX: restored the `.*` wildcards and the visual patterns that the
    # markdown rendering split across lines / stripped asterisks from.
    ignore=[
        're:.*embed_tokens',
        're:.*input_layernorm$',
        're:.*mlp[.]gate$',
        're:.*post_attention_layernorm$',
        're:.*norm$',
        're:model[.]visual.*',
        're:visual.*',
        'lm_head',
    ],
    mappings=[
        {
            "smooth_layer": "re:.*input_layernorm$",
            "balance_layers": ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$'],
        },
        {
            "smooth_layer": "re:.*v_proj$",
            "balance_layers": ['re:.*o_proj$'],
        },
        {
            "smooth_layer": "re:.*post_attention_layernorm$",
            "balance_layers": ['re:.*gate_proj$', 're:.*up_proj$'],
        },
        {
            "smooth_layer": "re:.*up_proj$",
            "balance_layers": ['re:.*down_proj$'],
        },
    ],
    duo_scaling=True,
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "group_size": 32,
                "strategy": "group",
                "block_structure": None,
                "dynamic": False,
                "actorder": None,
                "observer": "mse",
                "observer_kwargs": {},
            },
            "input_activations": None,
            "output_activations": None,
            "format": None,
        },
    },
)

# Apply AWQ quantization.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)

print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-sym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)

Works!

This code led to the following error: ValueError: Found a final incomplete set with matches found for keys: {'re:.*q_proj$', 're:.*v_proj$', 're:.*k_proj$'} but no matches found for keys: {'re:.*input_layernorm$'}
The solution can be found at https://github.com/vllm-project/llm-compressor/issues/2151#issuecomment-3681031998

Sign up or log in to comment