Initial release: DramaBox DiT INT8 selective quantization

dac920b verified 9 days ago

3.59 kB

	"""Load pre-quantized INT8 DramaBox DiT weights from safetensors.

	Quantized layers are stored as:
	{layer_name}.weight.__int_data (INT8 tensor)
	{layer_name}.weight.__scale (BF16 per-channel scale)

	Non-quantized layers are stored as plain BF16 tensors.

	Usage:
	from load_int8 import load_int8_dit
	load_int8_dit(tts._velocity_model, "dramabox-dit-int8-selective.safetensors")
	"""
	import json
	import logging
	import os

	import torch
	from safetensors.torch import load_file

	# Configure the root logger when this module is imported: INFO level and
	# "<timestamp> <level> <message>" formatted records.
	# NOTE(review): this is a module-level side effect on the importing
	# application's logging setup — confirm that is intended for a library module.
	logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


	def load_int8_dit(
	    model: torch.nn.Module,
	    safetensors_path: str,
	    config_path: str | None = None,
	    device: str = "cuda",
	) -> torch.nn.Module:
	    """Replace model weights with pre-quantized INT8 weights from safetensors.

	    Quantized layers are discovered from the checkpoint itself: every key
	    ending in ``.weight.__int_data`` marks a quantized layer, and its weight is
	    rebuilt as ``int_data * scale`` in BF16 (so torchao is not needed for
	    loading). All other parameters are copied from the checkpoint entry with
	    the same name. Model parameters with no checkpoint entry are left as-is.

	    For runtime INT8 inference (keeping weights in INT8 and dequantizing during
	    matmul), use the torchao approach instead — see README.md Option 1.

	    Args:
	        model: Module whose parameters are overwritten in place.
	        safetensors_path: Path to the INT8 safetensors checkpoint.
	        config_path: Accepted for API compatibility; this function does not
	            read any config file because quantized layers are identified from
	            the tensor names in the checkpoint.
	        device: Device the checkpoint tensors are loaded onto.

	    Returns:
	        The same ``model`` instance, with weights replaced.

	    Raises:
	        KeyError: If a quantized layer used by the model has ``__int_data``
	            but no matching ``__scale`` tensor in the checkpoint.
	    """
	    tensors = load_file(safetensors_path, device=device)
	    logging.info("Loaded %d tensors from %s", len(tensors), safetensors_path)

	    int_data_suffix = ".weight.__int_data"
	    scale_suffix = ".weight.__scale"

	    quantized_names = {
	        key[: -len(int_data_suffix)] for key in tensors if key.endswith(int_data_suffix)
	    }

	    updates: dict[str, torch.Tensor] = {}
	    dequantized_layers: set[str] = set()
	    skipped = 0

	    for key in model.state_dict():
	        # "a.b.weight" -> ("a.b", "weight"); a key without a dot -> ("", key).
	        layer_name, _, param_name = key.rpartition(".")

	        if param_name == "weight" and layer_name in quantized_names:
	            scale_key = f"{layer_name}{scale_suffix}"
	            if scale_key not in tensors:
	                raise KeyError(
	                    f"missing scale tensor {scale_key!r} for quantized layer {layer_name!r}"
	                )
	            int_data = tensors[f"{layer_name}{int_data_suffix}"]
	            # One scale per output channel (dim 0): reshape so it broadcasts
	            # over every remaining weight dimension, not only 2-D weights.
	            scale = tensors[scale_key].float().reshape(-1, *([1] * (int_data.dim() - 1)))
	            updates[key] = (int_data.float() * scale).to(torch.bfloat16)
	            dequantized_layers.add(layer_name)
	        elif key in tensors:
	            updates[key] = tensors[key]
	        else:
	            skipped += 1

	    # Only the replaced entries are passed; strict=False leaves every
	    # parameter that has no checkpoint entry untouched.
	    model.load_state_dict(updates, strict=False)

	    unmatched = sorted(quantized_names - dequantized_layers)
	    if unmatched:
	        logging.warning(
	            "%d quantized layers in checkpoint have no matching model weight (e.g. %s)",
	            len(unmatched),
	            ", ".join(unmatched[:5]),
	        )

	    logging.info(
	        "Loaded %d params (%d dequantized from INT8), skipped %d",
	        len(updates),
	        len(dequantized_layers),
	        skipped,
	    )
	    return model


	def load_int8_dit_torchao(
	    model: torch.nn.Module,
	    safetensors_path: str,
	    device: str = "cuda",
	    config_path: str | None = None,
	) -> torch.nn.Module:
	    """Load INT8 weights and apply torchao quantization for runtime INT8 matmul.

	    This keeps weights in INT8 during inference (lower VRAM) but requires torchao.

	    The weights are first loaded through ``load_int8_dit`` and then the
	    ``torch.nn.Linear`` modules named in the config's ``quantized_layers`` list
	    are quantized with torchao's ``Int8WeightOnlyConfig``.
	    NOTE(review): torchao computes its own scales from the loaded weights, so
	    they may not be bit-identical to the ``__scale`` tensors in the
	    checkpoint — confirm this is acceptable.

	    Args:
	        model: Module whose weights are replaced and then quantized in place.
	        safetensors_path: Path to the INT8 safetensors checkpoint.
	        device: Device the checkpoint tensors are loaded onto.
	        config_path: JSON file containing a ``quantized_layers`` list of module
	            names. Defaults to ``config.json`` next to ``safetensors_path``.

	    Returns:
	        The same ``model`` instance.

	    Raises:
	        ImportError: If torchao is not installed.
	        FileNotFoundError: If the config file does not exist.
	        KeyError: If the config has no ``quantized_layers`` entry.
	    """
	    from torchao.quantization import Int8WeightOnlyConfig, quantize_

	    if config_path is None:
	        config_path = os.path.join(os.path.dirname(safetensors_path), "config.json")

	    # Read the config before touching the model so a missing or malformed
	    # config fails without leaving the model half-modified.
	    with open(config_path, encoding="utf-8") as f:
	        config = json.load(f)
	    quantized_set = set(config["quantized_layers"])

	    load_int8_dit(model, safetensors_path, config_path=config_path, device=device)

	    applied: set[str] = set()

	    def filter_fn(mod, fqn):
	        selected = isinstance(mod, torch.nn.Linear) and fqn in quantized_set
	        if selected:
	            applied.add(fqn)
	        return selected

	    quantize_(model, Int8WeightOnlyConfig(), filter_fn=filter_fn)

	    not_applied = sorted(quantized_set - applied)
	    if not_applied:
	        logging.warning(
	            "%d layers listed in %s were not quantized (no matching Linear module, e.g. %s)",
	            len(not_applied),
	            config_path,
	            ", ".join(not_applied[:5]),
	        )

	    # Report the modules actually quantized, not the length of the config list.
	    logging.info("Applied torchao INT8 to %d layers", len(applied))
	    return model