This is the quantized version of the Qwen3-1.7B model in ONNX format. It was quantized to int4 using Olive, a model optimization tool developed by Microsoft. For inference we will use onnxruntime-genai.
Usage guide:
Installation:
```
%pip install olive-ai -q
%pip install transformers onnxruntime-genai optimum optimum[onnxruntime] -q
```
Quantization:
```
!olive auto-opt \
    --model_name_or_path Qwen/Qwen3-1.7B \
    --output_path /kaggle/working/qwen3_1_5b_onnx \
    --device cpu \
    --provider CPUExecutionProvider \
    --precision int4 \
    --use_model_builder \
    --use_ort_genai \
    --log_level 1
```
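Before moving on, it can be worth checking that the export produced the files onnxruntime-genai needs (notably genai_config.json and the chat_template.jinja used below). A minimal sketch; the exact file list depends on the Olive version:

```python
import os

# List the exported model directory. Expect an .onnx graph plus
# genai_config.json, tokenizer files, and chat_template.jinja;
# exact names may vary across Olive releases.
for name in sorted(os.listdir("/kaggle/working/qwen3_1_5b_onnx")):
    print(name)
```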
Hugging Face login:

```python
from huggingface_hub import login

login('huggingface token')  # replace the placeholder with your actual token
```
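To avoid hardcoding the token in a notebook, you can read it from an environment variable instead (HF_TOKEN here is an assumed variable name; use whatever your environment provides):

```python
import os
from huggingface_hub import login

# Assumes the token was exported as HF_TOKEN beforehand,
# e.g. via Kaggle secrets or `export HF_TOKEN=...`.
login(os.environ["HF_TOKEN"])
```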
Inference:
```python
import os
import time

from jinja2 import Template
import onnxruntime_genai as og

model_path = "/kaggle/working/qwen3_1_5b_onnx"

model = og.Model(model_path)
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

while True:
    text = input("Input: ")
    if not text:
        print("Error, input cannot be empty")
        continue

    messages = [{"role": "user", "content": text}]

    # Render the model's own chat template so the prompt uses the
    # special tokens Qwen3 expects.
    jinja_path = os.path.join(model_path, "chat_template.jinja")
    with open(jinja_path, "r", encoding="utf-8") as f:
        template_str = f.read()
    template = Template(template_str)
    rendered_chat = template.render(messages=messages, tools=[])
    # If your template is broken, fall back to a minimal prompt:
    # rendered_chat = f"<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"

    input_ids = tokenizer.encode(rendered_chat)

    params = og.GeneratorParams(model)
    params.set_search_options(
        do_sample=True,
        max_length=2048,
        top_p=0.9,
        top_k=40,
        temperature=0.8,
        repetition_penalty=1.0,
    )

    generator = og.Generator(model, params)
    generator.append_tokens(input_ids)

    started_timestamp = time.time()
    first_token_timestamp = None
    new_tokens = []

    try:
        # Stream tokens to stdout as they are generated.
        while not generator.is_done():
            generator.generate_next_token()
            tokens = generator.get_next_tokens()
            if tokens:
                if first_token_timestamp is None:
                    first_token_timestamp = time.time()
                for t in tokens:
                    new_tokens.append(t)
                    print(tokenizer_stream.decode(t), end='', flush=True)
    except KeyboardInterrupt:
        print("\n-- Aborted --")
        break

    print("\n")

    if first_token_timestamp:
        prompt_time = first_token_timestamp - started_timestamp
        run_time = time.time() - first_token_timestamp
        print(
            f"Prompt length: {len(input_ids)}, New tokens: {len(new_tokens)}, "
            f"Time to first: {prompt_time:.2f}s, "
            f"Prompt tokens/sec: {len(input_ids)/prompt_time:.2f}, "
            f"New tokens/sec: {len(new_tokens)/run_time:.2f}"
        )
```
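If you want reproducible outputs, for example to compare the int4 model against the original, sampling can be disabled; a minimal variant of the search options above:

```python
# Greedy decoding: with do_sample=False the generator always picks the
# most likely next token, so repeated runs give identical output.
params.set_search_options(
    do_sample=False,
    max_length=2048,
)
```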