license: mit
Example
GGUF load example — the Q8 variant needs about 24 GB of VRAM.
import gc

import torch
from accelerate import init_empty_weights
from diffusers import GGUFQuantizationConfig
from diffusers.models.model_loading_utils import load_model_dict_into_meta
from diffusers.quantizers.gguf import GGUFQuantizer
from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM
# Fixed: the path literals used Unicode smart quotes, which are a Python syntax error.
llm_path = "./BitDance-14B-64x-text-encode-gguf-Q4_K_S.gguf"  # or the Q8 file
model_path = "shallowdream204/BitDance-14B-64x"

# Build the model skeleton on the meta device so no real weights are allocated yet.
llm_config = Qwen3Config.from_pretrained(model_path)
with init_empty_weights():
    llm_model = Qwen3ForCausalLM(llm_config)

# Pre-quantized GGUF weights will be kept quantized and dequantized to bf16 on the fly.
g_config = GGUFQuantizationConfig(compute_dtype=torch.bfloat16)
hf_quantizer = GGUFQuantizer(quantization_config=g_config)
hf_quantizer.pre_quantized = True

# NOTE(review): load_gguf_checkpoint_vl is defined further down in this file;
# move its definition above this line when running this as a top-to-bottom script,
# otherwise this call raises NameError.
model_state_dict = load_gguf_checkpoint_vl(llm_path)
gc.collect()

# Swap the meta-device placeholders for the (still quantized) GGUF tensors.
hf_quantizer._process_model_before_weight_loading(
    llm_model,
    device_map=None,
    state_dict=model_state_dict,
)
load_model_dict_into_meta(
    llm_model,
    model_state_dict,
    hf_quantizer=hf_quantizer,
    device_map=None,
    dtype=torch.bfloat16,
)
hf_quantizer._process_model_after_weight_loading(llm_model)

# Free the raw state dict before switching the model to inference mode.
del model_state_dict
gc.collect()

llm_model.eval().to(torch.bfloat16)
print("Text encoder loaded successfully")
def load_gguf_checkpoint_vl(gguf_checkpoint_path):
    """Read a GGUF checkpoint into a name -> tensor state dict.

    Tensors stored in a GGML quantization format are wrapped in
    ``GGUFParameter`` (carrying their quant type) so the diffusers GGUF
    quantizer can dequantize them lazily; F16/F32 tensors are returned as
    plain ``torch.Tensor``.

    Args:
        gguf_checkpoint_path: Path to the ``.gguf`` file.

    Returns:
        dict: tensor name -> ``torch.Tensor`` or ``GGUFParameter``.

    Raises:
        ImportError: if torch or gguf>=0.10.0 is not installed.
        ValueError: if the file contains an unsupported quantization type.
    """
    from diffusers.utils import is_gguf_available, is_torch_available

    # Guard clause: fail fast if the optional dependencies are missing.
    if not (is_gguf_available() and is_torch_available()):
        raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.")

    import gguf
    from gguf import GGUFReader
    from diffusers.quantizers.gguf.utils import SUPPORTED_GGUF_QUANT_TYPES, GGUFParameter

    reader = GGUFReader(gguf_checkpoint_path)
    parsed_parameters = {}
    for tensor in reader.tensors:
        name = tensor.name
        quant_type = tensor.tensor_type
        # F32/F16 are torch-native dtypes; no GGUFParameter wrapper needed.
        is_gguf_quant = quant_type not in (gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16)
        if is_gguf_quant and quant_type not in SUPPORTED_GGUF_QUANT_TYPES:
            # Fixed: the comprehension variable shadowed the builtin `type`.
            _supported_quants_str = "\n".join(str(qt) for qt in SUPPORTED_GGUF_QUANT_TYPES)
            raise ValueError(
                (
                    f"{name} has a quantization type: {str(quant_type)} which is unsupported."
                    "\n\nCurrently the following quantization types are supported: \n\n"
                    f"{_supported_quants_str}"
                    "\n\nTo request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers"
                )
            )
        # Copy out of the reader's memory-mapped buffer so the tensor owns its data.
        weights = torch.from_numpy(tensor.data.copy())
        parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights

    # Drop the mmap-backed reader promptly to release the file mapping.
    del reader
    gc.collect()
    return parsed_parameters
- Downloads last month
- 17
Hardware compatibility
Log In to add your hardware
4-bit
8-bit
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support
