dsa
AI & ML interests
Recent Activity
Organizations
split
Example of how to run inference on this with optimum (optimum[onnx-runtime])?
code
How to use this model
import torch
from transformers import Mistral3ForConditionalGeneration
from diffusers import Flux2Pipeline, Flux2Transformer2DModel

# 4-bit (bnb nf4) checkpoint of FLUX.2-dev. Components are loaded onto CPU
# first and moved to the GPU on demand so the model fits in limited VRAM.
repo_id = "diffusers/FLUX.2-dev-bnb-4bit"
device = "cuda:0"
torch_dtype = torch.bfloat16

transformer = Flux2Transformer2DModel.from_pretrained(
    repo_id, subfolder="transformer", torch_dtype=torch_dtype, device_map="cpu"
)
text_encoder = Mistral3ForConditionalGeneration.from_pretrained(
    repo_id, subfolder="text_encoder", dtype=torch_dtype, device_map="cpu"
)
pipe = Flux2Pipeline.from_pretrained(
    repo_id, transformer=transformer, text_encoder=text_encoder, torch_dtype=torch_dtype
)

# NOTE(review): enable_model_cpu_offload() keeps one *whole* component on the
# GPU at a time; the Mistral-3 text encoder alone overflows ~14-16 GiB cards
# (see the CUDA OOM traceback below, raised inside the text-encoder forward).
# Sequential offload moves individual submodules to the GPU one by one —
# slower per step, but with a much smaller peak-VRAM footprint.
pipe.enable_sequential_cpu_offload()

prompt = "Realistic macro photograph of a hermit crab using a soda can as its shell, partially emerging from the can, captured with sharp detail and natural colors, on a sunlit beach with soft shadows and a shallow depth of field, with blurred ocean waves in the background. The can has the text BFL Diffusers on it and it has a color gradient that start with #FF5733 at the top and transitions to #33FF57 at the bottom."

image = pipe(
    prompt=prompt,
    # Seeded generator on the CUDA device for reproducible sampling.
    generator=torch.Generator(device=device).manual_seed(42),
    num_inference_steps=50,  # 28 is a good trade-off
    guidance_scale=4,
).images[0]
image.save("flux2_t2i_nf4.png")
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py:206: UserWarning: The local_dir_use_symlinks argument is deprecated and ignored in hf_hub_download. Downloading to a local directory does not use symlinks anymore.
warnings.warn(
Download complete:  0.00/0.00 [00:00<?, ?B/s]Fetching 2 files: 100% 2/2 [00:00<00:00, 98.80it/s]Loading checkpoint shards: 100% 2/2 [00:01<00:00,  2.03it/s]Download complete:  0.00/0.00 [00:00<?, ?B/s]Fetching 4 files: 100% 4/4 [00:00<00:00, 167.57it/s]Loading weights: 100% 585/585 [00:02<00:00, 310.91it/s, Materializing param=model.vision_tower.transformer.layers.23.ffn_norm.weight]The tied weights mapping and config for this model specifies to tie model.language_model.embed_tokens.weight to lm_head.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with tie_word_embeddings=False to silence this warning
Loading pipeline components...: 100% 5/5 [00:03<00:00,  1.14it/s]---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
/tmp/ipykernel_18947/863729753.py in <cell line: 0>()
22 prompt = "Realistic macro photograph of a hermit crab using a soda can as its shell, partially emerging from the can, captured with sharp detail and natural colors, on a sunlit beach with soft shadows and a shallow depth of field, with blurred ocean waves in the background. The can has the text BFL Diffusers on it and it has a color gradient that start with #FF5733 at the top and transitions to #33FF57 at the bottom."
23
---> 24 image = pipe(
25 prompt=prompt,
26 generator=torch.Generator(device=device).manual_seed(42),
36 frames/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
122 # pyrefly: ignore [bad-context-manager]
123 with ctx_factory():
--> 124 return func(*args, **kwargs)
125
126 return decorate_context
/usr/local/lib/python3.12/dist-packages/diffusers/pipelines/flux2/pipeline_flux2.py in call(self, image, prompt, height, width, num_inference_steps, sigmas, guidance_scale, num_images_per_prompt, generator, latents, prompt_embeds, output_type, return_dict, attention_kwargs, callback_on_step_end, callback_on_step_end_tensor_inputs, max_sequence_length, text_encoder_out_layers, caption_upsample_temperature)
869 prompt, images=image, temperature=caption_upsample_temperature, device=device
870 )
--> 871 prompt_embeds, text_ids = self.encode_prompt(
872 prompt=prompt,
873 prompt_embeds=prompt_embeds,
/usr/local/lib/python3.12/dist-packages/diffusers/pipelines/flux2/pipeline_flux2.py in encode_prompt(self, prompt, device, num_images_per_prompt, prompt_embeds, max_sequence_length, text_encoder_out_layers)
586
587 if prompt_embeds is None:
--> 588 prompt_embeds = self._get_mistral_3_small_prompt_embeds(
589 text_encoder=self.text_encoder,
590 tokenizer=self.tokenizer,
/usr/local/lib/python3.12/dist-packages/diffusers/pipelines/flux2/pipeline_flux2.py in _get_mistral_3_small_prompt_embeds(text_encoder, tokenizer, prompt, dtype, device, max_sequence_length, system_message, hidden_states_layers)
337
338 # Forward pass through the model
--> 339 output = text_encoder(
340 input_ids=input_ids,
341 attention_mask=attention_mask,
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1774 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1775 else:
-> 1776 return self._call_impl(*args, **kwargs)
1777
1778 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1785 or _global_backward_pre_hooks or _global_backward_hooks
1786 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1787 return forward_call(*args, **kwargs)
1788
1789 result = None
/usr/local/lib/python3.12/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
190 output = module._old_forward(*args, **kwargs)
191 else:
--> 192 output = module._old_forward(*args, **kwargs)
193 return module._hf_hook.post_forward(module, output)
194
/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py in wrapper(self, *args, **kwargs)
1000 outputs = func(self, *args, **kwargs)
1001 else:
-> 1002 outputs = func(self, *args, **kwargs)
1003 except TypeError as original_exception:
1004 # If we get a TypeError, it's possible that the model is not receiving the recordable kwargs correctly.
/usr/local/lib/python3.12/dist-packages/transformers/models/mistral3/modeling_mistral3.py in forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, logits_to_keep, image_sizes, **kwargs)
444 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
445
--> 446 outputs = self.model(
447 input_ids=input_ids,
448 pixel_values=pixel_values,
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1774 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1775 else:
-> 1776 return self._call_impl(*args, **kwargs)
1777
1778 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1785 or _global_backward_pre_hooks or _global_backward_hooks
1786 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1787 return forward_call(*args, **kwargs)
1788
1789 result = None
/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py in wrapper(self, *args, **kwargs)
1000 outputs = func(self, *args, **kwargs)
1001 else:
-> 1002 outputs = func(self, *args, **kwargs)
1003 except TypeError as original_exception:
1004 # If we get a TypeError, it's possible that the model is not receiving the recordable kwargs correctly.
/usr/local/lib/python3.12/dist-packages/transformers/models/mistral3/modeling_mistral3.py in forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, vision_feature_layer, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, image_sizes, **kwargs)
323 inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
324
--> 325 outputs = self.language_model(
326 attention_mask=attention_mask,
327 position_ids=position_ids,
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1774 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1775 else:
-> 1776 return self._call_impl(*args, **kwargs)
1777
1778 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1785 or _global_backward_pre_hooks or _global_backward_hooks
1786 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1787 return forward_call(*args, **kwargs)
1788
1789 result = None
/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py in wrapper(self, *args, **kwargs)
1000 outputs = func(self, *args, **kwargs)
1001 else:
-> 1002 outputs = func(self, *args, **kwargs)
1003 except TypeError as original_exception:
1004 # If we get a TypeError, it's possible that the model is not receiving the recordable kwargs correctly.
/usr/local/lib/python3.12/dist-packages/transformers/models/mistral/modeling_mistral.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, cache_position, **kwargs)
395
396 for decoder_layer in self.layers[: self.config.num_hidden_layers]:
--> 397 hidden_states = decoder_layer(
398 hidden_states,
399 attention_mask=causal_mask,
/usr/local/lib/python3.12/dist-packages/transformers/modeling_layers.py in call(self, *args, **kwargs)
91
92 return self._gradient_checkpointing_func(partial(super().call, **kwargs), *args)
---> 93 return super().call(*args, **kwargs)
94
95
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1774 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1775 else:
-> 1776 return self._call_impl(*args, **kwargs)
1777
1778 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1785 or _global_backward_pre_hooks or _global_backward_hooks
1786 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1787 return forward_call(*args, **kwargs)
1788
1789 result = None
/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py in wrapped_forward(*args, **kwargs)
953 if key == "hidden_states" and len(collected_outputs[key]) == 0:
954 collected_outputs[key] += (args[0],)
--> 955 output = orig_forward(*args, **kwargs)
956 if not isinstance(output, tuple):
957 collected_outputs[key] += (output,)
/usr/local/lib/python3.12/dist-packages/transformers/models/mistral/modeling_mistral.py in forward(self, hidden_states, attention_mask, position_ids, past_key_values, use_cache, cache_position, position_embeddings, **kwargs)
228 hidden_states = self.input_layernorm(hidden_states)
229 # Self Attention
--> 230 hidden_states, _ = self.self_attn(
231 hidden_states=hidden_states,
232 attention_mask=attention_mask,
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1774 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1775 else:
-> 1776 return self._call_impl(*args, **kwargs)
1777
1778 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1785 or _global_backward_pre_hooks or _global_backward_hooks
1786 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1787 return forward_call(*args, **kwargs)
1788
1789 result = None
/usr/local/lib/python3.12/dist-packages/transformers/models/mistral/modeling_mistral.py in forward(self, hidden_states, position_embeddings, attention_mask, past_key_values, cache_position, **kwargs)
151 hidden_shape = (*input_shape, -1, self.head_dim)
152
--> 153 query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
154 key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
155 value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1774 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1775 else:
-> 1776 return self._call_impl(*args, **kwargs)
1777
1778 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1785 or _global_backward_pre_hooks or _global_backward_hooks
1786 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1787 return forward_call(*args, **kwargs)
1788
1789 result = None
/usr/local/lib/python3.12/dist-packages/bitsandbytes/nn/modules.py in forward(self, x)
554 weight = self.weight if getattr(quant_state, "packing_format_for_cpu", False) else self.weight.t()
555
--> 556 return bnb.matmul_4bit(x, weight, bias=bias, quant_state=quant_state).to(inp_dtype)
557
558
/usr/local/lib/python3.12/dist-packages/bitsandbytes/autograd/_functions.py in matmul_4bit(A, B, quant_state, out, bias)
399 return out
400 else:
--> 401 return MatMul4Bit.apply(A, B, out, bias, quant_state)
/usr/local/lib/python3.12/dist-packages/torch/autograd/function.py in apply(cls, *args, **kwargs)
581 # See NOTE: [functorch vjp and autograd interaction]
582 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 583 return super().apply(*args, **kwargs) # type: ignore[misc]
584
585 if not is_setup_ctx_defined:
/usr/local/lib/python3.12/dist-packages/bitsandbytes/autograd/_functions.py in forward(ctx, A, B, out, bias, quant_state)
313 # 1. Dequantize
314 # 2. MatmulnN
--> 315 output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
316
317 # 3. Save state
/usr/local/lib/python3.12/dist-packages/bitsandbytes/functional.py in dequantize_4bit(A, quant_state, absmax, out, blocksize, quant_type)
1048 )
1049 else:
-> 1050 out = torch.ops.bitsandbytes.dequantize_4bit.default(
1051 A,
1052 absmax,
/usr/local/lib/python3.12/dist-packages/torch/_ops.py in call(self, *args, **kwargs)
817 # that are named "self". This way, all the aten ops can be called by kwargs.
818 def call(self, /, *args: _P.args, **kwargs: _P.kwargs) -> _T:
--> 819 return self._op(*args, **kwargs)
820
821 # Use positional-only argument to avoid naming collision with aten ops arguments
/usr/local/lib/python3.12/dist-packages/torch/_compile.py in inner(*args, **kwargs)
52 fn.__dynamo_disable = disable_fn # type: ignore[attr-defined]
53
---> 54 return disable_fn(*args, **kwargs)
55
56 return inner
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py in _fn(*args, **kwargs)
1179 ):
1180 return fn(*args, **kwargs)
-> 1181 return fn(*args, **kwargs)
1182 finally:
1183 set_eval_frame(None)
/usr/local/lib/python3.12/dist-packages/torch/library.py in func_no_dynamo(*args, **kwargs)
740 @torch ._disable_dynamo
741 def func_no_dynamo(*args, **kwargs):
--> 742 return func(*args, **kwargs)
743
744 for key in keys:
/usr/local/lib/python3.12/dist-packages/bitsandbytes/backends/cuda/ops.py in _(A, absmax, blocksize, quant_type, shape, dtype)
361 dtype: torch.dtype,
362 ) -> torch.Tensor:
--> 363 out = torch.empty(shape, dtype=dtype, device=A.device)
364 _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
365 return out
OutOfMemoryError: CUDA out of memory. Tried to allocate 40.00 MiB. GPU 0 has a total capacity of 14.56 GiB of which 17.81 MiB is free. Including non-PyTorch memory, this process has 14.54 GiB memory in use. Of the allocated memory 14.40 GiB is allocated by PyTorch, and 15.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running on 2 GPUs
How can I run FLUX.2 across two GPUs? Could you share example code?