Quantized EXAONE4
Collection
Quantized checkpoints for EXAONE4 series • 5 items • Updated
Generated by GPTQModel (https://github.com/ModelCloud/GPTQModel).
from gptqmodel import GPTQModel
model = GPTQModel.from_quantized("namgyu-youn/EXAONE-4.0-1.2B-GPTQ-W4A16", device="cuda:0")
from vllm import LLM
llm = LLM(model="namgyu-youn/EXAONE-4.0-1.2B-GPTQ-W4A16", dtype="float16")
repro:
MODEL="namgyu-youn/EXAONE-4.0-1.2B-GPTQ-W4A16-EoRA"
lm_eval --model vllm \
--model_args pretrained=${MODEL},dtype=float16,gpu_memory_utilization=0.85,enable_thinking=False,max_gen_toks=2048 \
--tasks gsm8k \
--limit 512 \
--output_path results \
--apply_chat_template \
--batch_size auto
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.6621|± |0.0209|
| | |strict-match | 5|exact_match|↑ |0.6562|± |0.0210|
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.6758|± |0.0207|
| | |strict-match | 5|exact_match|↑ |0.6680|± |0.0208|
repro:
from vllm import LLM, SamplingParams
# NOTE: This module should not be fixed because it's related to submission server
def run_vllm(model_path: str):
    """Run vLLM inference on the given model

    Args:
        model_path: Path to the model or HuggingFace model ID
    """
    # One single-turn chat conversation to send through the engine.
    conversations = [
        [{"role": "user", "content": "Explain how wonderful you are"}],
    ]
    # Greedy decoding (temperature 0.0, top_p 1.0), capped at 256 new tokens.
    params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=256)
    engine = LLM(model=model_path)
    for result in engine.chat(conversations, params):
        print(result.outputs[0].text)
if __name__ == "__main__":
    # Pick exactly one model id. The quantized checkpoint is kept here as a
    # comment for comparison runs; previously both were assigned and the
    # first was dead code (immediately overwritten).
    # MODEL = "namgyu-youn/EXAONE-4.0-1.2B-GPTQ-W4A16"
    MODEL = "LGAI-EXAONE/EXAONE-4.0-1.2B"
    run_vllm(MODEL)
Total num prompt tokens: 102400 Total num output tokens: 12800
Throughput: 10.36 requests/s, 11940.46 total tokens/s, 1326.72 output tokens/s
Throughput: 10.35 requests/s, 11919.77 total tokens/s, 1324.42 output tokens/s
Base model
LGAI-EXAONE/EXAONE-4.0-1.2B