This checkpoint was generated by torchao v0.16.0. See https://github.com/pytorch/ao/pull/3617#issuecomment-3932314241 for details.
- repro:
"""Repro: AWQ-INT4 quantization of google/gemma-3-27b-it with torchao,
followed by a calibration eval and a push of the checkpoint to the Hub."""

import torch
from huggingface_hub import create_repo, get_token
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM
from transformers import AutoModelForImageTextToText, AutoTokenizer, TorchAoConfig

from torchao.prototype.awq import AWQConfig
from torchao.quantization import (
    Int4WeightOnlyConfig,
    ModuleFqnToConfig,
    quantize_,
)
from torchao.quantization.quantize_.common.quantization_step import (
    QuantizationStep,
)
from torchao.quantization.quantize_.workflows import (
    Int4ChooseQParamsAlgorithm,
    Int4PackingFormat,
)

MODEL_ID = "google/gemma-3-27b-it"
USER_ID = "namgyu-youn"

# NOTE: This config requires an H100+ device.
BASE_CONFIG = Int4WeightOnlyConfig(
    group_size=128,
    int4_packing_format=Int4PackingFormat.TILE_PACKED_TO_4D,
    int4_choose_qparams_algorithm=Int4ChooseQParamsAlgorithm.TINYGEMM,
)

# Only the language-model attention/MLP projections are matched; everything
# else (vision tower, embeddings, lm_head) is left unquantized.
LAYER_PATTERNS = [
    r"re:language_model\.model\.layers\..+\.mlp\..+_proj",
    r"re:language_model\.model\.layers\..+\.self_attn\..+_proj",
]


def get_quant_config(linear_config):
    """Return a ModuleFqnToConfig mapping every LAYER_PATTERNS regex to
    *linear_config*."""
    return ModuleFqnToConfig({pat: linear_config for pat in LAYER_PATTERNS})


# --- Load model ---
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, device_map="auto", dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# --- AWQ: prepare + calibrate ---
quantize_(
    model,
    get_quant_config(AWQConfig(BASE_CONFIG, step=QuantizationStep.PREPARE)),
    filter_fn=None,
)
# Short eval pass drives calibration data through the prepared model.
evaluator.simple_evaluate(
    HFLM(pretrained=model, tokenizer=tokenizer),
    tasks=["mmlu_philosophy"],
    limit=30,
    batch_size=1,
)

# --- AWQ: convert ---
quantize_(
    model,
    get_quant_config(AWQConfig(BASE_CONFIG, step=QuantizationStep.CONVERT)),
    filter_fn=None,
)

# Record the loading-time quantization config so `from_pretrained` can
# reconstruct the quantized weights from the serialized checkpoint.
quant_config = AWQConfig(BASE_CONFIG, step=QuantizationStep.PREPARE_FOR_LOADING)
model.config.quantization_config = TorchAoConfig(quant_config)

# --- Push to Hub ---
save_to = f"{USER_ID}/{MODEL_ID.split('/')[-1]}-AWQ-INT4"
token = get_token()
create_repo(save_to, token=token, exist_ok=True)
model.push_to_hub(save_to, token=token, safe_serialization=False)
tokenizer.push_to_hub(save_to, token=token)
- repro (benchmark):
# NOTE: lm-eval (v0.4.11) and vLLM (v0.15.1) fail to reproduce the expected results.
# Accuracy: run GSM8K (10 samples, chat template) on the published checkpoint.
lm_eval --model hf \
--model_args pretrained=namgyu-youn/gemma-3-27b-it-AWQ-INT4,dtype=float16 \
--tasks gsm8k \
--limit 10 \
--apply_chat_template \
--batch_size auto
# Throughput: vLLM generation benchmark on the same checkpoint,
# 10 prompts of 256 input / 256 output tokens, eager mode.
vllm bench throughput \
--input-len 256 \
--output-len 256 \
--model namgyu-youn/gemma-3-27b-it-AWQ-INT4 \
--num-prompts 10 \
--enforce-eager
- benchmark result:
# vLLM failure log
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/models/siglip.py", line 541, in <listcomp>
(EngineCore_DP0 pid=7511) SiglipEncoderLayer(
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/models/siglip.py", line 495, in __init__
(EngineCore_DP0 pid=7511) self.mlp = SiglipMLP(
(EngineCore_DP0 pid=7511) ^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/models/siglip.py", line 460, in __init__
(EngineCore_DP0 pid=7511) self.fc2 = RowParallelLinear(
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/linear.py", line 1366, in __init__
(EngineCore_DP0 pid=7511) self.quant_method.create_weights(
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/torchao.py", line 333, in create_weights
(EngineCore_DP0 pid=7511) weight = torchao_quantize_param_data(
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/torchao.py", line 299, in torchao_quantize_param_data
(EngineCore_DP0 pid=7511) quantize_(dummy_linear, torchao_config)
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/torchao/quantization/quant_api.py", line 496, in quantize_
(EngineCore_DP0 pid=7511) _replace_with_custom_fn_if_matches_filter(
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/torchao/quantization/quant_api.py", line 214, in _replace_with_custom_fn_if_matches_filter
(EngineCore_DP0 pid=7511) new_child = _replace_with_custom_fn_if_matches_filter(
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/torchao/quantization/quant_api.py", line 209, in _replace_with_custom_fn_if_matches_filter
(EngineCore_DP0 pid=7511) model = replacement_fn(model, *extra_args)
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) File "/workspace/.venv/lib/python3.11/site-packages/torchao/prototype/awq/api.py", line 108, in _awq_transform
(EngineCore_DP0 pid=7511) assert isinstance(qw, SupportsActivationPreScaling), (
(EngineCore_DP0 pid=7511) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=7511) AssertionError: weight must support activation scaling through implementing `SupportsActivationPreScaling`
[rank0]:[W223 19:13:05.047849326 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Traceback (most recent call last):
File "/workspace/.venv/bin/vllm", line 10, in <module>
sys.exit(main())
^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/entrypoints/cli/main.py", line 73, in main
args.dispatch_function(args)
File "/workspace/.venv/lib/python3.11/site-packages/vllm/entrypoints/cli/benchmark/throughput.py", line 21, in cmd
main(args)
File "/workspace/.venv/lib/python3.11/site-packages/vllm/benchmarks/throughput.py", line 868, in main
elapsed_time, request_outputs = run_vllm(
^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/benchmarks/throughput.py", line 55, in run_vllm
llm = LLM(**dataclasses.asdict(engine_args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 334, in __init__
self.llm_engine = LLMEngine.from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py", line 172, in from_engine_args
return cls(
^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py", line 106, in __init__
self.engine_core = EngineCoreClient.make_client(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 94, in make_client
return SyncMPClient(vllm_config, executor_class, log_stats)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 647, in __init__
super().__init__(
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 479, in __init__
with launch_core_engines(vllm_config, executor_class, log_stats) as (
File "/.uv/python_install/cpython-3.11.14-linux-x86_64-gnu/lib/python3.11/contextlib.py", line 144, in __exit__
next(self.gen)
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/utils.py", line 933, in launch_core_engines
wait_for_engine_startup(
File "/workspace/.venv/lib/python3.11/site-packages/vllm/v1/engine/utils.py", line 992, in wait_for_engine_startup
raise RuntimeError(
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
# lm-eval log (model loads and runs, but accuracy unexpectedly collapses to 0 — root cause unknown)
(workspace) (main) root@C.31934002:/workspace/.hf_home/hub$ lm_eval --model hf \
--model_args pretrained=namgyu-youn/gemma-3-27b-it-AWQ-INT4,dtype=float16 \
--tasks gsm8k \
--limit 10 \
--apply_chat_template \
--batch_size auto
2026-02-23:19:01:59 WARNING [config.evaluate_config:281] --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.
2026-02-23:19:01:59 INFO [config.evaluate_config:301] Using default fewshot_as_multiturn=True.
2026-02-23:19:02:01 INFO [_cli.run:376] Selected Tasks: ['gsm8k']
2026-02-23:19:02:01 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2026-02-23:19:02:01 INFO [evaluator:236] Initializing hf model, with arguments: {'pretrained': 'namgyu-youn/gemma-3-27b-it-AWQ-INT4', 'dtype': 'float16'}
2026-02-23:19:02:02 INFO [models.huggingface:161] Using device 'cuda:0'
2026-02-23:19:02:03 INFO [models.huggingface:423] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
/workspace/.venv/lib/python3.11/site-packages/transformers/quantizers/auto.py:239: UserWarning: You passed `quantization_config` or equivalent parameters to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` from the model will be used.
warnings.warn(warning_msg)
pytorch_model.bin.index.json: 127kB [00:00, 143MB/s]
pytorch_model-00002-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:06<00:00, 16.2MB/s]
pytorch_model-00008-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:07<00:00, 16.1MB/s]
pytorch_model-00001-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.85G/4.85G [05:07<00:00, 15.8MB/s]
pytorch_model-00006-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:07<00:00, 16.1MB/s]
pytorch_model-00005-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:09<00:00, 16.0MB/s]
pytorch_model-00004-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [05:12<00:00, 15.8MB/s]
pytorch_model-00010-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [02:08<00:00, 38.4MB/s]
pytorch_model-00009-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [02:10<00:00, 38.0MB/s]
pytorch_model-00003-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [07:17<00:00, 11.3MB/s]
pytorch_model-00012-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 462M/462M [02:06<00:00, 3.65MB/s]
pytorch_model-00011-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [02:09<00:00, 38.1MB/s]
pytorch_model-00007-of-00012.bin: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [07:36<00:00, 10.9MB/s]
Fetching 12 files: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 12/12 [07:36<00:00, 38.05s/it]
Loading checkpoint shards: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 12/12 [00:05<00:00, 2.13it/s]
2026-02-23:19:09:48 INFO [tasks:700] Selected tasks:ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [07:36<00:00, 733kB/s]
2026-02-23:19:09:48 INFO [tasks:691] Task: gsm8k (gsm8k/gsm8k.yaml)ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 4.90G/4.95G [02:09<00:00, 271MB/s]
2026-02-23:19:09:48 INFO [evaluator:314] gsm8k: Using gen_kwargs: {'until': ['Question:', '</s>', '<|im_end|>'], 'do_sample': False, 'temperature': 0.0}ββββββββββββββββββββββββββββββββββββββββββββ| 4.95G/4.95G [02:09<00:00, 289MB/s]
2026-02-23:19:09:48 WARNING [evaluator:490] Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details.
2026-02-23:19:09:48 INFO [api.task:311] Building contexts for gsm8k on rank 0...
100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 10/10 [00:00<00:00, 601.30it/s]
2026-02-23:19:09:48 INFO [evaluator:584] Running generate_until requests
Running generate_until requests: 0%| | 0/10 [00:00<?, ?it/s]
Passed argument batch_size = auto. Detecting largest batch size
Determined Largest batch size: 11
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Running generate_until requests: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 10/10 [02:17<00:00, 13.79s/it]
fatal: not a git repository (or any of the parent directories): .git
2026-02-23:19:12:07 INFO [loggers.evaluation_tracker:316] Output path not provided, skipping saving results aggregated
hf ({'pretrained': 'namgyu-youn/gemma-3-27b-it-AWQ-INT4', 'dtype': 'float16'}), gen_kwargs: ({}), limit: 10.0, num_fewshot: None, batch_size: auto
|Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr|
|-----|------:|----------------|-----:|-----------|---|----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|β | 0|Β± | 0|
| | |strict-match | 5|exact_match|β | 0|Β± | 0|
- Downloads last month
- 11
Inference Providers NEW
This model isn't deployed by any Inference Provider. π Ask for provider support