import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from auto_round import AutoRound

model_name_or_path = "."
output_dir = "./Qwen3.6-27B-INT8-autoround"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# Keep these modules in 16-bit precision: embeddings, linear attention, the
# vision tower, the MTP head, and the output projection are excluded from
# quantization.
ignore_keywords = [
    "embed_tokens",
    "linear_attn",
    # "self_attn",
    "visual",
    "mtp",
    "lm_head",
]
layer_config = {}
for keyword in ignore_keywords:
    layer_config[keyword] = {"bits": 16}

# Build the calibration set: collect up to 512 pile-10k samples that are at
# least `seqlen` tokens long; shorter texts are skipped.
hf_dataset = load_dataset("NeelNanda/pile-10k", split="train")
seqlen = 2048
tokens_list = []
max_samples = 512
for item in hf_dataset:
    text = item["text"]
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=seqlen,
        return_tensors="pt",
    )
    input_ids = tokenized["input_ids"]
    # Truncation caps the length at `seqlen`, so this keeps only samples
    # that fill the full sequence length.
    if input_ids.shape[-1] >= seqlen:
        tokens_list.append(input_ids)
    if len(tokens_list) >= max_samples:
        break

# W8A16 symmetric quantization with per-channel scales (group_size=-1);
# layers matched by `layer_config` stay at 16 bits. Tuning runs on GPUs 0
# and 1 for 1000 optimization steps.
ar = AutoRound(
    model=model_name_or_path,
    tokenizer=tokenizer,
    scheme="W8A16",
    enable_torch_compile=True,
    group_size=-1,
    sym=True,
    layer_config=layer_config,
    dataset=tokens_list,
    device_map="0,1",
    batch_size=8,
    seqlen=seqlen,
    iters=1000,
)
ar.quantize_and_save(output_dir, format="auto_round")
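
# --- Optional smoke test of the quantized checkpoint. A minimal sketch,
# not part of the original recipe: it assumes the export above succeeded
# and that auto-round's transformers integration is installed. Importing
# AutoRoundConfig registers the auto_round format with transformers so
# from_pretrained can load the checkpoint.
from transformers import AutoModelForCausalLM
from auto_round import AutoRoundConfig  # enables loading of auto_round-format checkpoints

model = AutoModelForCausalLM.from_pretrained(
    output_dir, device_map="auto", trust_remote_code=True
)
inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))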