license: mit

Example

Loading the GGUF text-encoder checkpoint. The Q8 quantization needs about 24 GB of VRAM.


import gc

import torch
from accelerate import init_empty_weights
from diffusers import GGUFQuantizationConfig
from diffusers.models.model_loading_utils import load_model_dict_into_meta
from diffusers.quantizers.gguf import GGUFQuantizer
from transformers import AutoTokenizer, Qwen3ForCausalLM, Qwen3Config


# Paths: the GGUF text-encoder checkpoint and the base model repo.
# NOTE: the original used Unicode smart quotes here, which is a SyntaxError.
llm_path = "./BitDance-14B-64x-text-encode-gguf-Q4_K_S.gguf"  # or the Q8 variant
model_path = "shallowdream204/BitDance-14B-64x"

# Build the model skeleton on the meta device so no real weights are
# allocated until the GGUF state dict is loaded in.
llm_config = Qwen3Config.from_pretrained(model_path)
with init_empty_weights():
    llm_model = Qwen3ForCausalLM(llm_config)

# GGUF-quantized tensors are dequantized to bfloat16 at compute time.
g_config = GGUFQuantizationConfig(compute_dtype=torch.bfloat16)
hf_quantizer = GGUFQuantizer(quantization_config=g_config)
hf_quantizer.pre_quantized = True

# NOTE(review): load_gguf_checkpoint_vl is defined further down in this
# file; its definition must come before this call when run as a script.
model_state_dict = load_gguf_checkpoint_vl(llm_path)

gc.collect()
hf_quantizer._process_model_before_weight_loading(
    llm_model,
    device_map=None,
    state_dict=model_state_dict,
)

load_model_dict_into_meta(
    llm_model,
    model_state_dict,
    hf_quantizer=hf_quantizer,
    device_map=None,
    dtype=torch.bfloat16,
)

hf_quantizer._process_model_after_weight_loading(llm_model)

# Release the raw state dict before casting the model to keep peak
# memory down.
del model_state_dict
gc.collect()
llm_model.eval().to(torch.bfloat16)

print("Text encoder loaded successfully")


def load_gguf_checkpoint_vl(gguf_checkpoint_path):
    """Load a GGUF checkpoint into a dict of PyTorch tensors.

    Tensors stored in a GGUF block-quantized format are wrapped in
    ``GGUFParameter`` (retaining their quant type) so the quantizer can
    dequantize them later; plain F16/F32 tensors are returned as regular
    ``torch.Tensor`` objects.

    Args:
        gguf_checkpoint_path: Path to the ``.gguf`` file.

    Returns:
        dict mapping tensor name to ``torch.Tensor`` / ``GGUFParameter``.

    Raises:
        ImportError: If ``gguf`` or ``torch`` is unavailable.
        ValueError: If a tensor uses a quantization type diffusers does
            not support.
    """
    from diffusers.utils import is_gguf_available, is_torch_available

    if is_gguf_available() and is_torch_available():
        import gguf
        from gguf import GGUFReader
        from diffusers.quantizers.gguf.utils import SUPPORTED_GGUF_QUANT_TYPES, GGUFParameter
    else:
        raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.")

    reader = GGUFReader(gguf_checkpoint_path)
    parsed_parameters = {}

    for tensor in reader.tensors:
        name = tensor.name
        quant_type = tensor.tensor_type

        # F32/F16 are natively supported torch dtypes; everything else is
        # a GGUF quantized format that needs the GGUFParameter wrapper.
        is_gguf_quant = quant_type not in (gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16)
        if is_gguf_quant and quant_type not in SUPPORTED_GGUF_QUANT_TYPES:
            # `q` instead of the original `type`, which shadowed the builtin.
            _supported_quants_str = "\n".join(str(q) for q in SUPPORTED_GGUF_QUANT_TYPES)
            raise ValueError(
                (
                    f"{name} has a quantization type: {str(quant_type)} which is unsupported."
                    "\n\nCurrently the following quantization types are supported: \n\n"
                    f"{_supported_quants_str}"
                    "\n\nTo request support for this quantization type please open an issue here: https://github.com/huggingface/diffusers"
                )
            )
        # Copy out of the reader's (mmap-backed) numpy array so the tensor
        # owns its memory and survives the reader being dropped below.
        weights = torch.from_numpy(tensor.data.copy())
        parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights

    # Drop the reader (and its file mapping) before returning.
    del reader
    gc.collect()
    return parsed_parameters
Downloads last month
17
GGUF
Model size
15B params
Architecture
qwen
Hardware compatibility
Log In to add your hardware

4-bit

8-bit

Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support