# Made with llm-compressor:

import re
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor import oneshot

import os, json
from datasets import load_dataset, get_dataset_split_names
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.utils import dispatch_for_generation


# ---- Configuration (environment-variable overridable) ----
MODEL_ID = "huihui-ai/Huihui-Qwen3-Next-80B-A3B-Instruct-abliterated"
OUTPUT_DIR = "./Huihui-Qwen3-Next-80B-A3B-Instruct-abliterated-ExpertsOnly-nvfp4"
CAL_DATASET = os.environ.get("CAL_DATASET", "HuggingFaceH4/ultrachat_200k")
CAL_SPLIT_ENV = os.environ.get("CAL_SPLIT")  # explicit split override, optional
NUM_SAMPLES = int(os.environ.get("NUM_CAL_SAMPLES", "1024"))
MAX_SEQ_LEN = int(os.environ.get("MAX_SEQ_LEN", "4096"))

# Fast tokenizer; used for chat templating and calibration tokenization below.
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)


# 1. Quantization recipe: target ONLY the expert MLP linear layers
#    (gate/up/down projections inside each routed expert), matched by regex.
#    If the layer names differ for this architecture, load the model once
#    and print .named_modules() to confirm the pattern.
expert_regex = r".*mlp\.experts\.\d+\.(gate_proj|up_proj|down_proj)$"

recipe = [
    QuantizationModifier(
        scheme="nvfp4",
        targets=["re:" + expert_regex],  # regex-prefixed target spec
        ignore=["lm_head"],
    )
]

def _pick_split(splits):
    """Choose a calibration split from *splits*.

    Preference order: exactly 'train', then the first split whose name
    starts with 'train' (e.g. 'train_sft' for ultrachat_200k), then the
    first split listed.
    """
    if "train" in splits:
        # BUG FIX: this branch previously returned the hard-coded name
        # 'train_sft', which crashes for any dataset that actually has a
        # 'train' split and contradicts the stated preference order.
        return "train"
    train_like = [s for s in splits if s.startswith("train")]
    return train_like[0] if train_like else splits[0]

if CAL_SPLIT_ENV:
    # Explicit CAL_SPLIT override wins unconditionally.
    split = CAL_SPLIT_ENV
else:
    try:
        splits = get_dataset_split_names(CAL_DATASET)
    except Exception as e:
        # Chain the cause so the original hub/network error stays visible.
        raise RuntimeError(f"Could not list splits for {CAL_DATASET}: {e}") from e
    split = _pick_split(splits)
print(f"[INFO] Using dataset {CAL_DATASET} split '{split}'")

print(f"[INFO] Preparing {NUM_SAMPLES} calibration samples @ max_len={MAX_SEQ_LEN}")
# NOTE: slicing before .shuffle() fixes the sample SET to the first
# NUM_SAMPLES rows; the shuffle only randomizes their order (deterministic
# via the fixed seed).
raw = load_dataset(CAL_DATASET, split=f"{split}[:{NUM_SAMPLES}]").shuffle(seed=42)

def to_text(ex):
    """Normalize one raw dataset row into ``{"text": <str>}``.

    Chat-style rows (containing a "messages" list) are rendered through the
    tokenizer's chat template; otherwise the first plain-text field found
    among "text"/"content"/"raw" is used, falling back to the row's string
    representation.
    """
    if "messages" in ex:
        # Chat-style sample: render the conversation without tokenizing.
        return {"text": tok.apply_chat_template(ex["messages"], tokenize=False)}
    found = next((k for k in ("text", "content", "raw") if k in ex), None)
    return {"text": ex[found] if found is not None else str(ex)}

# Normalize every calibration row to a single "text" field via to_text.
ds_text = raw.map(to_text)

def tok_fn(sample):
    """Tokenize one normalized sample's "text" field.

    Truncates to MAX_SEQ_LEN; no padding and no special tokens are added.
    """
    return tok(
        sample["text"],
        padding=False,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        add_special_tokens=False,
    )

# Tokenize the calibration set; drop the original columns so only the
# tokenizer's outputs remain.
ds_tok = ds_text.map(tok_fn, remove_columns=ds_text.column_names)



# 2. Run oneshot calibration + quantization.
#    NOTE(review): an earlier comment here claimed "data-free" mode with
#    dataset=None and num_calibration_samples=0 — that contradicted the
#    code, which passes the tokenized calibration set. Corrected:
#    - model is loaded from MODEL_ID (bf16, trusting remote model code)
#    - the recipe quantizes only the expert MLP linears (see regex above)
#    - ds_tok supplies NUM_SAMPLES pre-tokenized calibration samples
#    - the compressed model is written to OUTPUT_DIR
quantized_model = oneshot(
    model=MODEL_ID,
    precision="bf16",
    trust_remote_code_model=True,
    recipe=recipe,
    output_dir=OUTPUT_DIR,
    save_compressed=True,
    dataset=ds_tok,  # pre-tokenized calibration dataset
    max_seq_length=MAX_SEQ_LEN,
    num_calibration_samples=NUM_SAMPLES,

)

# Run with vLLM 0.11.2 (or newer) and CUDA 13. Set the following
# environment variables for vLLM:
#
#   VLLM_USE_FLASHINFER_MOE_FP4=1
#   VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE=248434496
#
# and pass the following command-line options in addition to the
# standard ones:
#
#   --enable-auto-tool-choice \
#   --tool-call-parser hermes \
#   --compilation-config '{"cudagraph_mode": "PIECEWISE"}'
Downloads last month
40
Inference Providers NEW
This model isn't deployed by any Inference Provider. ๐Ÿ™‹ Ask for provider support

Model tree for coughmedicine/Huihui-Qwen3-Next-80B-A3B-Instruct-abliterated-nvfp4