"""
GLM-4.6 AWQ Quantization Script

Quantizes GLM-4.6 (357B MoE) to 4-bit AWQ for efficient inference with vLLM.

Requirements:
- 1× GPU with 48GB+ VRAM (a single GPU is sufficient; additional GPUs provide no speedup)
- 768GB+ system RAM (DDR4/DDR5)
- 300GB+ swap space (it will be actively used)
- PyTorch with CUDA support
- llm-compressor
- transformers
- datasets

Hardware Notes:
- Multi-GPU provides NO quantization speedup (the process is RAM-bound, not GPU-bound)
- The full BF16 model (~714GB) is offloaded to system RAM/swap
- Quantized using: 1× RTX PRO 6000 Blackwell Max-Q (96GB) + 768GB RAM
- Quantization time: ~5 hours (calibration, smoothing, compression, and saving)

Usage:
    python quantize_glm46_awq.py --model zai-org/GLM-4.6 --output ./GLM-4.6-AWQ

Advanced options:
    python quantize_glm46_awq.py \
        --model zai-org/GLM-4.6 \
        --output ./GLM-4.6-AWQ \
        --device-map sequential \
        --max-cpu-memory 750GiB \
        --cal-samples 512
"""
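# Preflight note: the full BF16 checkpoint (~714 GB) is materialized in system RAM
# and swap during quantization. A rough capacity check before launching (hypothetical
# snippet, requires the optional psutil package) could look like:
#
#   import psutil
#   total_bytes = psutil.virtual_memory().total + psutil.swap_memory().total
#   assert total_bytes > 900 * 1024**3, "GLM-4.6 AWQ needs roughly 1 TiB of RAM + swap"
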
import os
import argparse
import json
import shutil
import pathlib
from typing import List

import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier


def add_no_split(cfg: AutoConfig, classes: List[str]) -> AutoConfig:
    """Prevent splitting specific module classes across devices."""
    ns = set(getattr(cfg, "no_split_module_classes", []) or [])
    ns.update(classes)
    cfg.no_split_module_classes = list(ns)
    return cfg


def compute_batch_size(seq_len: int, target_tokens: int) -> int:
    """Calculate the batch size that achieves the target tokens per calibration step.

    E.g., with the defaults seq_len=2048 and target_tokens=131072 this yields 64.
    """
    return max(1, target_tokens // seq_len)


def clone_and_fix_index(src_dir: str) -> str:
    """
    Clone the model directory and drop an empty-string key from weight_map if present.
    This prevents device_map='auto' errors with some sharded checkpoints.
    """
    src = pathlib.Path(src_dir)
    dst = src.parent / (src.name + "_fixed_index")
    if dst.exists():
        shutil.rmtree(dst)
    shutil.copytree(src, dst)

    candidates = ["model.safetensors.index.json", "pytorch_model.bin.index.json"]
    found = None
    for c in candidates:
        p = dst / c
        if p.exists():
            found = p
            break
    if not found:
        return str(dst)

    with open(found, "r") as f:
        idx = json.load(f)
    wm = idx.get("weight_map", {})
    if "" in wm:
        del wm[""]
        idx["weight_map"] = wm
        with open(found, "w") as f:
            json.dump(idx, f)
    return str(dst)


def main():
    parser = argparse.ArgumentParser(description="Quantize GLM-4.6 to 4-bit AWQ")
    parser.add_argument("--model", required=True, help="Path or HF ID of GLM-4.6 model (e.g., zai-org/GLM-4.6)")
    parser.add_argument("--output", required=True, help="Output directory for quantized model")
    parser.add_argument("--cal-samples", type=int, default=512, help="Number of calibration samples (default: 512)")
    parser.add_argument("--cal-seq-len", type=int, default=2048, help="Calibration sequence length (default: 2048)")
    parser.add_argument("--batch-tokens", type=int, default=131072, help="Tokens per calibration step (default: 131072)")
    parser.add_argument("--dataset", default="neuralmagic/LLM_compression_calibration", help="Calibration dataset")
    parser.add_argument("--dataset-split", default="train", help="Dataset split to use")
    parser.add_argument("--device-map", choices=["auto", "sequential"], default="auto",
                        help="Device placement strategy: 'auto' (recommended) or 'sequential' (fallback, more robust)")
    parser.add_argument("--max-memory-per-gpu", type=str, default="92GiB",
                        help="Max memory per GPU (default: 92GiB for 96GB GPUs)")
    parser.add_argument("--max-cpu-memory", type=str, default="500GiB",
                        help="Max CPU memory for offloading (default: 500GiB)")
    args = parser.parse_args()

    # Keep tokenizer workers quiet and reduce CUDA allocator fragmentation.
    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
    os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:512")

    # Restrict to a single GPU unless overridden (must be set before the first CUDA call).
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

    # Enable TF32 matmuls for faster calibration on Ampere and newer GPUs.
    try:
        torch.backends.cuda.matmul.fp32_precision = "tf32"
        torch.backends.cudnn.conv.fp32_precision = "tf32"
    except Exception:
        # Older PyTorch releases expose the equivalent switches as allow_tf32.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    # Limit CPU threads used for the offloaded tensor work.
    torch.set_num_threads(8)

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. This script requires GPU(s).")

    num_gpus = torch.cuda.device_count()
    print(f"✓ Found {num_gpus} CUDA device(s)")
    print(f"✓ Using GPU 0 for quantization (CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', 'all')})")
    print("\nNote: Multi-GPU provides NO speedup for quantization - the process is RAM-bound.")
    print("      The full BF16 model (~714GB) will be offloaded to system RAM/swap.")

    print(f"Loading config from: {args.model}")
    cfg = AutoConfig.from_pretrained(args.model, trust_remote_code=True)

    # Keep merged/fused linear modules on a single device when sharding.
    cfg = add_no_split(cfg, ["MergedColumnParallelLinear"])

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True, use_fast=True)

    print(f"Loading model weights from: {args.model}")
    load_dir = args.model

    # device_map='auto' can fail on checkpoints whose weight_map contains an empty key,
    # so load from a sanitized copy of the index in that mode.
    if args.device_map == "auto":
        try:
            load_dir = clone_and_fix_index(args.model)
        except Exception as e:
            print(f"Index sanitization skipped: {e}")

    # Memory budget for dispatch: fill the GPUs first, offload the remainder to CPU RAM.
    max_mem = {i: args.max_memory_per_gpu for i in range(num_gpus)}
    max_mem["cpu"] = args.max_cpu_memory

    try:
        model = AutoModelForCausalLM.from_pretrained(
            load_dir,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            device_map=args.device_map,
            config=cfg,
            max_memory=max_mem,
            offload_folder=None,
            offload_state_dict=False,
        )
    except KeyError as e:
        if args.device_map == "auto":
            print(f"Auto device_map failed with {e}; falling back to sequential...")
            model = AutoModelForCausalLM.from_pretrained(
                load_dir,
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                device_map="sequential",
                config=cfg,
                max_memory=max_mem,
            )
        else:
            raise

    print("✓ Model loaded successfully")

    print("\nGPU Memory Usage:")
    for i in range(num_gpus):
        allocated = torch.cuda.memory_allocated(i) / 1e9
        peak = torch.cuda.max_memory_allocated(i) / 1e9
        print(f"  GPU {i}: {allocated:.2f} GB allocated / {peak:.2f} GB peak")

    print(f"\nLoading calibration dataset: {args.dataset}")
    ds = load_dataset(args.dataset, split=args.dataset_split)
    ds = ds.shuffle(seed=42).select(range(args.cal_samples))
    print(f"✓ Selected {len(ds)} calibration samples")

    seq_len = args.cal_seq_len
    batch_size = compute_batch_size(seq_len, args.batch_tokens)
    print(f"Calibration config: seq_len={seq_len}, batch_size={batch_size}")

    # Keep sensitive modules in full precision: embeddings, lm_head, norms,
    # the MoE router and shared experts, and the first three decoder layers.
    ignore_patterns = [
        "lm_head",
        "model.embed_tokens",
        "re:.*input_layernorm$",
        "re:.*post_attention_layernorm$",
        "model.norm",
        "re:.*q_norm$",
        "re:.*k_norm$",
        "re:.*shared_experts.*",
        "re:.*mlp\\.gate\\.weight$",
        "re:.*mlp\\.gate\\..*bias$",
        "re:model.layers.[0-2]\\.",
    ]

    # Quantize all attention and MLP/expert projection layers.
    targets = [
        "re:.*gate_proj.*",
        "re:.*up_proj.*",
        "re:.*down_proj.*",
        "re:.*k_proj.*",
        "re:.*q_proj.*",
        "re:.*v_proj.*",
        "re:.*o_proj.*",
    ]

    # AWQ recipe: 4-bit symmetric weight-only quantization with group size 128.
    recipe = [
        AWQModifier(
            ignore=ignore_patterns,
            config_groups={
                "group_0": {
                    "targets": targets,
                    "weights": {
                        "num_bits": 4,
                        "type": "int",
                        "symmetric": True,
                        "group_size": 128,
                        "strategy": "group",
                        "dynamic": False,
                    },
                    "input_activations": None,
                    "output_activations": None,
                    "format": None,
                }
            },
        )
    ]

    print("\n" + "=" * 80)
    print("Starting AWQ quantization...")
    print("=" * 80)

    with torch.inference_mode():
        oneshot_args = {
            "model": model,
            "dataset": ds,
            "recipe": recipe,
            "max_seq_length": seq_len,
            "num_calibration_samples": len(ds),
        }

        # Pass batch_size only if this llm-compressor version's oneshot() supports it.
        try:
            from inspect import signature
            if "batch_size" in signature(oneshot).parameters:
                oneshot_args["batch_size"] = batch_size
        except Exception:
            pass

        oneshot(**oneshot_args)

    print("\n✓ AWQ quantization completed successfully")

    print(f"\nSaving quantized model to: {args.output}")
    os.makedirs(args.output, exist_ok=True)

    # save_compressed=True writes the weights in compressed-tensors format for vLLM.
    model.save_pretrained(args.output, save_compressed=True)
    tokenizer.save_pretrained(args.output)

    print("\n" + "=" * 80)
    print("QUANTIZATION COMPLETE")
    print("=" * 80)
    print(f"Quantized model saved to: {args.output}")
    print("\nModel size on disk: ~176 GB (39 safetensors files)")
    print("\nTo use with vLLM:")
    print(f"  vllm serve {args.output} \\")
    print("    --tensor-parallel-size 4 \\")
    print("    --enable-expert-parallel \\")
    print("    --trust-remote-code")
    print("=" * 80)


if __name__ == "__main__":
    main()
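
# Optional post-run check: a minimal sketch of loading the quantized output with
# vLLM's Python API, mirroring the serve command printed above. The parallelism
# settings are illustrative and depend on the target hardware.
#
#   from vllm import LLM, SamplingParams
#
#   llm = LLM(
#       model="./GLM-4.6-AWQ",
#       tensor_parallel_size=4,
#       enable_expert_parallel=True,
#       trust_remote_code=True,
#   )
#   out = llm.generate(["Hello"], SamplingParams(max_tokens=32))
#   print(out[0].outputs[0].text)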