This checkpoint was generated by torchao v0.16.0. See https://github.com/pytorch/ao/pull/3617#issuecomment-3932314241 for details.
- repro:
"""Repro: AWQ-INT4 quantization of google/gemma-3-27b-it with torchao,
followed by a calibration eval and a push of the checkpoint to the Hub."""

import torch
from huggingface_hub import create_repo, get_token
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM
from transformers import AutoModelForImageTextToText, AutoTokenizer, TorchAoConfig

from torchao.prototype.awq import AWQConfig
from torchao.quantization import (
    Int4WeightOnlyConfig,
    ModuleFqnToConfig,
    quantize_,
)
from torchao.quantization.quantize_.common.quantization_step import (
    QuantizationStep,
)
from torchao.quantization.quantize_.workflows import (
    Int4ChooseQParamsAlgorithm,
    Int4PackingFormat,
)

MODEL_ID = "google/gemma-3-27b-it"
USER_ID = "namgyu-youn"

# NOTE: This config requires an H100+ device.
BASE_CONFIG = Int4WeightOnlyConfig(
    group_size=128,
    int4_packing_format=Int4PackingFormat.TILE_PACKED_TO_4D,
    int4_choose_qparams_algorithm=Int4ChooseQParamsAlgorithm.TINYGEMM,
)

# Only the language-model attention/MLP projections are matched; everything
# else (vision tower, embeddings, lm_head) is left unquantized.
LAYER_PATTERNS = [
    r"re:language_model\.model\.layers\..+\.mlp\..+_proj",
    r"re:language_model\.model\.layers\..+\.self_attn\..+_proj",
]


def get_quant_config(linear_config):
    """Return a ModuleFqnToConfig mapping every LAYER_PATTERNS regex to
    *linear_config*."""
    return ModuleFqnToConfig({pat: linear_config for pat in LAYER_PATTERNS})


# --- Load model ---
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, device_map="auto", dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# --- AWQ: prepare + calibrate ---
quantize_(
    model,
    get_quant_config(AWQConfig(BASE_CONFIG, step=QuantizationStep.PREPARE)),
    filter_fn=None,
)
# Short eval pass drives calibration data through the prepared model.
evaluator.simple_evaluate(
    HFLM(pretrained=model, tokenizer=tokenizer),
    tasks=["mmlu_philosophy"],
    limit=30,
    batch_size=1,
)

# --- AWQ: convert ---
quantize_(
    model,
    get_quant_config(AWQConfig(BASE_CONFIG, step=QuantizationStep.CONVERT)),
    filter_fn=None,
)

# Record the loading-time quantization config so `from_pretrained` can
# reconstruct the quantized weights from the serialized checkpoint.
quant_config = AWQConfig(BASE_CONFIG, step=QuantizationStep.PREPARE_FOR_LOADING)
model.config.quantization_config = TorchAoConfig(quant_config)

# --- Push to Hub ---
save_to = f"{USER_ID}/{MODEL_ID.split('/')[-1]}-AWQ-INT4"
token = get_token()
create_repo(save_to, token=token, exist_ok=True)
model.push_to_hub(save_to, token=token, safe_serialization=False)
tokenizer.push_to_hub(save_to, token=token)
- repro (benchmark):
# NOTE: lm-eval (v0.4.11) and vLLM (v0.15.1) fail to reproduce the expected results.
# Accuracy: run GSM8K (10 samples, chat template) on the published checkpoint.
lm_eval --model hf \
--model_args pretrained=namgyu-youn/gemma-3-27b-it-AWQ-INT4,dtype=float16 \
--tasks gsm8k \
--limit 10 \
--apply_chat_template \
--batch_size auto
# Throughput: vLLM generation benchmark on the same checkpoint,
# 10 prompts of 256 input / 256 output tokens, eager mode.
vllm bench throughput \
--input-len 256 \
--output-len 256 \
--model namgyu-youn/gemma-3-27b-it-AWQ-INT4 \
--num-prompts 10 \
--enforce-eager
- benchmark result:
# vLLM failure log
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/models/siglip.py", line 541, in <listcomp>
(EngineCore_DP0 pid=7511) SiglipEncoderLayer(
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/models/siglip.py", line 495, in __init__
(EngineCore_DP0 pid=7511) self.mlp = SiglipMLP(
(EngineCore_DP0 pid=7511) ^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/models/siglip.py", line 460, in __init__
(EngineCore_DP0 pid=7511) self.fc2 = RowParallelLinear(
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/linear.py", line 1366, in __init__
(EngineCore_DP0 pid=7511) self.quant_method.create_weights(
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/torchao.py", line 333, in create_weights
(EngineCore_DP0 pid=7511) weight = torchao_quantize_param_data(
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/torchao.py", line 299, in torchao_quantize_param_data
(EngineCore_DP0 pid=7511) quantize_(dummy_linear, torchao_config)
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/torchao/quantization/quant_api.py", line 496, in quantize_
(EngineCore_DP0 pid=7511) _replace_with_custom_fn_if_matches_filter(
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/torchao/quantization/quant_api.py", line 214, in _replace_with_custom_fn_if_matches_filter
(EngineCore_DP0 pid=7511) new_child = _replace_with_custom_fn_if_matches_filter(
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/torchao/quantization/quant_api.py", line 209, in _replace_with_custom_fn_if_matches_filter
(EngineCore_DP0 pid=7511) model = replacement_fn(model, *extra_args)
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/torchao/prototype/awq/api.py", line 108, in _awq_transform
(EngineCore_DP0 pid=7511) assert isinstance(qw, SupportsActivationPreScaling), (
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) AssertionError: weight must support activation scaling through implementing `SupportsActivationPreScaling`
[rank0]:[W223 19:13:05.047849326 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Traceback (most recent call last):
File "/workspace/.venv/bin/vllm", line 10, in <module>
sys.exit(main())
^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/entrypoints/cli/main.py", line 73, in main
args.dispatch_function(args)
File "/workspace/.venv/lib/python3.11/site-packages/vllm/entrypoints/cli/benchmark/throughput.py", line 21, in cmd
main(args)
File "/workspace/.venv/lib/python3.11/site-packages/vllm/benchmarks/throughput.py", line 868, in main
elapsed_time, request_outputs = run_vllm(
^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/benchmarks/throughput.py", line 55, in run_vllm
llm = LLM(**dataclasses.asdict(engine_args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 334, in __init__
self.llm_engine = LLMEngine.from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py", line 172, in from_engine_args
return cls(
^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py", line 106, in __init__
self.engine_core = EngineCoreClient.make_client(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 94, in make_client
return SyncMPClient(vllm_config, executor_class, log_stats)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 647, in __init__
super().__init__(
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 479, in __init__
with launch_core_engines(vllm_config, executor_class, log_stats) as (
File "/.uv/python_install/cpython-3.11.14-linux-x86_64-gnu/lib/python3.11/contextlib.py", line 144, in __exit__
next(self.gen)
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/utils.py", line 933, in launch_core_engines
wait_for_engine_startup(
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/utils.py", line 992, in wait_for_engine_startup
raise RuntimeError(
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
# lm-eval log (model loads and runs, but accuracy unexpectedly collapses to 0 — root cause unknown)
(workspace) (main) root@C.31934002:/workspace/.hf_home/hub$ lm_eval --model hf \
--model_args pretrained=namgyu-youn/gemma-3-27b-it-AWQ-INT4,dtype=float16 \
--tasks gsm8k \
--limit 10 \
--apply_chat_template \
--batch_size auto
2026-02-23:19:01:59 WARNING [config.evaluate_config:281] --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.
2026-02-23:19:01:59 INFO [config.evaluate_config:301] Using default fewshot_as_multiturn=True.
2026-02-23:19:02:01 INFO [_cli.run:376] Selected Tasks: ['gsm8k']
2026-02-23:19:02:01 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2026-02-23:19:02:01 INFO [evaluator:236] Initializing hf model, with arguments: {'pretrained': 'namgyu-youn/gemma-3-27b-it-AWQ-INT4', 'dtype': 'float16'}
2026-02-23:19:02:02 INFO [models.huggingface:161] Using device 'cuda:0'
2026-02-23:19:02:03 INFO [models.huggingface:423] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
/workspace/.venv/lib/python3.11/site-packages/transformers/quantizers/auto.py:239: UserWarning: You passed `quantization_config` or equivalent parameters to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` from the model will be used.
warnings.warn(warning_msg)
pytorch_model.bin.index.json: 127kB [00:00, 143MB/s]
pytorch_model-00002-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:06<00:00, 16.2MB/s]
pytorch_model-00008-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:07<00:00, 16.1MB/s]
pytorch_model-00001-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.85G/4.85G [05:07<00:00, 15.8MB/s]
pytorch_model-00006-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:07<00:00, 16.1MB/s]
pytorch_model-00005-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:09<00:00, 16.0MB/s]
pytorch_model-00004-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:12<00:00, 15.8MB/s]
pytorch_model-00010-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [02:08<00:00, 38.4MB/s]
pytorch_model-00009-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [02:10<00:00, 38.0MB/s]
pytorch_model-00003-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [07:17<00:00, 11.3MB/s]
pytorch_model-00012-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 462M/462M [02:06<00:00, 3.65MB/s]
pytorch_model-00011-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [02:09<00:00, 38.1MB/s]
pytorch_model-00007-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [07:36<00:00, 10.9MB/s]
Fetching 12 files: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 12/12 [07:36<00:00, 38.05s/it]
Loading checkpoint shards: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 12/12 [00:05<00:00, 2.13it/s]
2026-02-23:19:09:48 INFO [tasks:700] Selected tasks:ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [07:36<00:00, 733kB/s]
2026-02-23:19:09:48 INFO [tasks:691] Task: gsm8k (gsm8k/gsm8k.yaml)ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 4.90G/4.95G [02:09<00:00, 271MB/s]
2026-02-23:19:09:48 INFO [evaluator:314] gsm8k: Using gen_kwargs: {'until': ['Question:', '</s>', '<|im_end|>'], 'do_sample': False, 'temperature': 0.0}ββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [02:09<00:00, 289MB/s]
2026-02-23:19:09:48 WARNING [evaluator:490] Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details.
2026-02-23:19:09:48 INFO [api.task:311] Building contexts for gsm8k on rank 0...
100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 10/10 [00:00<00:00, 601.30it/s]
2026-02-23:19:09:48 INFO [evaluator:584] Running generate_until requests
Running generate_until requests: 0%| | 0/10 [00:00<?, ?it/s]
Passed argument batch_size = auto. Detecting largest batch size
Determined Largest batch size: 11
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Running generate_until requests: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 10/10 [02:17<00:00, 13.79s/it]
fatal: not a git repository (or any of the parent directories): .git
2026-02-23:19:12:07 INFO [loggers.evaluation_tracker:316] Output path not provided, skipping saving results aggregated
hf ({'pretrained': 'namgyu-youn/gemma-3-27b-it-AWQ-INT4', 'dtype': 'float16'}), gen_kwargs: ({}), limit: 10.0, num_fewshot: None, batch_size: auto
|Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr|
|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|β | 0|Β± | 0|
| | |strict-match | 5|exact_match|β | 0|Β± | 0|
- Downloads last month
- 11
Inference Providers NEW
This model isn't deployed by any Inference Provider. π Ask for provider support