Made with llm-compressor:
import re
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor import oneshot
import os, json
from datasets import load_dataset, get_dataset_split_names
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.utils import dispatch_for_generation
# Source model to quantize and the directory for the compressed checkpoint.
MODEL_ID = "huihui-ai/Huihui-Qwen3-Next-80B-A3B-Instruct-abliterated"
OUTPUT_DIR = "./Huihui-Qwen3-Next-80B-A3B-Instruct-abliterated-ExpertsOnly-nvfp4"
# Calibration settings, all overridable via environment variables.
CAL_DATASET = os.environ.get("CAL_DATASET", "HuggingFaceH4/ultrachat_200k")
CAL_SPLIT_ENV = os.environ.get("CAL_SPLIT")  # explicit split override; auto-detected below when unset
NUM_SAMPLES = int(os.environ.get("NUM_CAL_SAMPLES", "1024"))
MAX_SEQ_LEN = int(os.environ.get("MAX_SEQ_LEN", "4096"))
# Tokenizer is needed to render chat-style samples and tokenize calibration text.
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, )
# 1. Target only expert MLP linear layers via regex.
# Adjust the regex if the actual module names differ; you can inspect with a tiny
# script that loads the model once and prints .named_modules() to confirm.
# Matches e.g. "model.layers.3.mlp.experts.17.up_proj" — per-expert projections only.
expert_pattern = r".*mlp\.experts\.\d+\.(gate_proj|up_proj|down_proj)$"
recipe = [
    QuantizationModifier(
        scheme="nvfp4",
        # Regex target: only expert MLP linear layers
        targets=[f"re:{expert_pattern}"],
        # Never quantize the output head.
        ignore=["lm_head"],
    )
]
def _choose_split(splits):
    """Pick a calibration split from *splits*.

    Preference order: 'train' if present, else the first split whose name
    starts with 'train', else the first available split.
    """
    if "train" in splits:
        # BUGFIX: the original assigned the literal "train_sft" here, which
        # crashes for any dataset that actually exposes a plain "train" split.
        return "train"
    train_like = [s for s in splits if s.startswith("train")]
    return train_like[0] if train_like else splits[0]

if CAL_SPLIT_ENV:
    # Explicit override from the CAL_SPLIT environment variable wins.
    split = CAL_SPLIT_ENV
else:
    try:
        splits = get_dataset_split_names(CAL_DATASET)
    except Exception as e:
        # Chain the original error so the root cause stays visible.
        raise RuntimeError(f"Could not list splits for {CAL_DATASET}: {e}") from e
    split = _choose_split(splits)
print(f"[INFO] Using dataset {CAL_DATASET} split '{split}'")
print(f"[INFO] Preparing {NUM_SAMPLES} calibration samples @ max_len={MAX_SEQ_LEN}")
# Slice only the first NUM_SAMPLES examples server-side, then shuffle deterministically.
raw = load_dataset(CAL_DATASET, split=f"{split}[:{NUM_SAMPLES}]").shuffle(seed=42)
def to_text(ex):
    """Normalize one raw dataset example into a ``{"text": str}`` record.

    Chat-style examples (carrying a "messages" list) are rendered through
    the tokenizer's chat template; otherwise the first recognized plain-text
    field is used, falling back to the example's string representation.
    """
    if "messages" in ex:
        rendered = tok.apply_chat_template(ex["messages"], tokenize=False)
        return {"text": rendered}
    field = next((k for k in ("text", "content", "raw") if k in ex), None)
    return {"text": ex[field] if field is not None else str(ex)}
# Map every raw example to a unified "text" column.
ds_text = raw.map(to_text)
def tok_fn(sample):
    """Tokenize one text sample, truncated to MAX_SEQ_LEN tokens (no padding)."""
    return tok(
        sample["text"],
        padding=False,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        add_special_tokens=False,
    )

# Tokenize and drop the original text columns, keeping only model inputs.
ds_tok = ds_text.map(tok_fn, remove_columns=ds_text.column_names)
# 2. Run oneshot calibration over the tokenized dataset.
#    NOTE(review): the original comments here described a data-free run
#    (dataset=None, num_calibration_samples=0, MoE/QAT calibration disabled),
#    but the call below actually passes the tokenized calibration set and a
#    nonzero sample count — comments updated to match the code.
quantized_model = oneshot(
    model=MODEL_ID,
    precision="bf16",
    trust_remote_code_model=True,
    recipe=recipe,
    output_dir=OUTPUT_DIR,
    save_compressed=True,
    dataset=ds_tok,  # pre-tokenized calibration samples prepared above
    max_seq_length=MAX_SEQ_LEN,
    num_calibration_samples=NUM_SAMPLES,
)
Run with vLLM 11.2 + CUDA 13. The following environment variables must be set for vLLM:
VLLM_USE_FLASHINFER_MOE_FP4=1
VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE=248434496
and the following command-line options in addition to the standard ones:
--enable-auto-tool-choice
--tool-call-parser hermes
--compilation-config '{"cudagraph_mode": "PIECEWISE"}'
- Downloads last month
- 40
Inference Providers NEW
This model isn't deployed by any Inference Provider. Ask for provider support.
Model tree for coughmedicine/Huihui-Qwen3-Next-80B-A3B-Instruct-abliterated-nvfp4
Base model
Qwen/Qwen3-Next-80B-A3B-Instruct