Upload folder using huggingface_hub
Browse files- .gitattributes +4 -0
- README.md +314 -3
- assets/architecture.png +3 -0
- assets/edit_example.png +3 -0
- assets/llada_logo.png +0 -0
- assets/performance.png +3 -0
- assets/understanding_example.png +0 -0
- config.json +48 -0
- configuration_llada2uni_moe.py +122 -0
- decoder-turbo/config.json +32 -0
- decoder-turbo/decoder_model.safetensors +3 -0
- decoder/config.json +32 -0
- decoder/decoder_model.safetensors +3 -0
- image_tokenizer/config.json +61 -0
- image_tokenizer/image_tokenizer.safetensors +3 -0
- image_tokenizer/preprocessor_config.json +16 -0
- image_tokenizer/sigvq_embedding.pt +3 -0
- model-00001-of-00013.safetensors +3 -0
- model-00002-of-00013.safetensors +3 -0
- model-00003-of-00013.safetensors +3 -0
- model-00004-of-00013.safetensors +3 -0
- model-00005-of-00013.safetensors +3 -0
- model-00006-of-00013.safetensors +3 -0
- model-00007-of-00013.safetensors +3 -0
- model-00008-of-00013.safetensors +3 -0
- model-00009-of-00013.safetensors +3 -0
- model-00010-of-00013.safetensors +3 -0
- model-00011-of-00013.safetensors +3 -0
- model-00012-of-00013.safetensors +3 -0
- model-00013-of-00013.safetensors +3 -0
- model.safetensors.index.json +0 -0
- modeling_llada2uni_moe.py +0 -0
- special_tokens_map.json +37 -0
- tokenizer.json +3 -0
- tokenizer_config.json +0 -0
- vae/config.json +38 -0
- vae/diffusion_pytorch_model.safetensors +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/architecture.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/edit_example.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/performance.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,314 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
tags:
|
| 6 |
+
- multimodal
|
| 7 |
+
- image-generation
|
| 8 |
+
- image-understanding
|
| 9 |
+
- image-editing
|
| 10 |
+
- diffusion
|
| 11 |
+
- moe
|
| 12 |
+
- text-to-image
|
| 13 |
+
library_name: transformers
|
| 14 |
+
pipeline_tag: image-text-to-text
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
<p align="center">
|
| 18 |
+
<img src="./assets/llada_logo.png" width="20%"/>
|
| 19 |
+
</p>
|
| 20 |
+
<div align="center">
|
| 21 |
+
<h1> LLaDA2.0-Uni: Unifying Multimodal Understanding and Generation with Diffusion Large Language Model </h1>
|
| 22 |
+
|
| 23 |
+
[[📑 Technical Report ]()]   [[🌐 Github ](https://github.com/inclusionAI/LLaDA2.0-Uni)]
|
| 24 |
+
|
| 25 |
+
<b>AGI Research Center, Inclusion AI </b>
|
| 26 |
+
</div>
|
| 27 |
+
|
| 28 |
+
## Model Capabilities
|
| 29 |
+
|
| 30 |
+
**LLaDA2.0-Uni** is a unified diffusion Large Language Model (dLLM) based on Mixture-of-Experts (MoE) that seamlessly integrates multimodal understanding and generation within a single model. It supports:
|
| 31 |
+
|
| 32 |
+
- 🖼️ **Text-to-Image Generation** — high-fidelity image synthesis with optional thinking/reasoning.
|
| 33 |
+
- 🔍 **Image Understanding** — visual question answering, image captioning, document understanding, etc.
|
| 34 |
+
- ✏️ **Image Editing** — instruction-based editing with single or multi-reference support.
|
| 35 |
+
- 🎨 **Interleaved Generation and Reasoning** — preliminary support for interleaved generation, unlocking advanced interleaved reasoning.
|
| 36 |
+
- ⚡ **Sprint Acceleration** — KV cache reuse and adaptive unmasking for faster inference.
|
| 37 |
+
|
| 38 |
+
## Model Architecture
|
| 39 |
+
|
| 40 |
+
<img src="./assets/architecture.png" width="100%"/>
|
| 41 |
+
|
| 42 |
+
- **Unified dLLM-MoE Backbone**: Unifies multimodal understanding and generation into a simple Mask Token Prediction paradigm.
|
| 43 |
+
- **Discrete Semantic Tokenizer**: Utilizes SigLIP-VQ to convert visual inputs into discrete semantic tokens, significantly enhancing multimodal understanding.
|
| 44 |
+
- **Efficient Diffusion Decoder**: Pairs discrete tokens with a specialized diffusion decoder for high-fidelity generation, enabling rapid 8-step inference via distillation.
|
| 45 |
+
|
| 46 |
+
## Evaluation Results
|
| 47 |
+
|
| 48 |
+
<img src="./assets/performance.png" width="100%"/>
|
| 49 |
+
|
| 50 |
+
## Quick Start
|
| 51 |
+
|
| 52 |
+
> **Note:** Full installation instructions and CLI scripts are available in the [GitHub repository](https://github.com/inclusionAI/LLaDA2-Uni).
|
| 53 |
+
|
| 54 |
+
### ⚙️ Installation
|
| 55 |
+
|
| 56 |
+
#### 1. Create a conda environment
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
git clone https://github.com/inclusionAI/LLaDA2-Uni && cd LLaDA2-Uni
|
| 60 |
+
conda create -n llada2_uni python=3.10 -y
|
| 61 |
+
conda activate llada2_uni
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
#### 2. Install PyTorch (CUDA 12.4)
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
#### 3. Install Flash Attention 2 (required for efficient inference)
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
pip install flash-attn --no-build-isolation
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
#### 4. Install remaining dependencies
|
| 77 |
+
|
| 78 |
+
```bash
|
| 79 |
+
pip install -r requirements.txt
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
### 🌟 Text-to-Image Generation
|
| 83 |
+
|
| 84 |
+
```python
|
| 85 |
+
import torch
|
| 86 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 87 |
+
from decoder import decode_vq_tokens
|
| 88 |
+
|
| 89 |
+
model_path = "inclusionAI/LLaDA2.0-Uni"
|
| 90 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
| 91 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 92 |
+
model_path, device_map="cuda", torch_dtype="bfloat16", trust_remote_code=True
|
| 93 |
+
).eval()
|
| 94 |
+
model.tokenizer = tokenizer
|
| 95 |
+
|
| 96 |
+
# Generate image tokens
|
| 97 |
+
result = model.generate_image(
|
| 98 |
+
"A modern Scandinavian kitchen with white cabinetry, marble countertops, and a single orchid on the island. A Nordic woman with sleek blonde ponytail, wearing an oversized sweater and dainty silver necklaces, stirs a matcha bowl with a bamboo whisk, eyes sparkling with quiet joy. Shot with 50mm, f/2.5, diffused window light, cool white balance, low saturation, clean skin retouch. Mood: serene, wholesome, hygge.",
|
| 99 |
+
image_h=1024, image_w=1024,
|
| 100 |
+
steps=8, cfg_scale=2.0,
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
# Decode to PIL image (default: 50-step ODE)
|
| 104 |
+
image = decode_vq_tokens(result["token_ids"], result["h"], result["w"], model_path, "cuda")
|
| 105 |
+
image.save("output.png")
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
> [!Note]
|
| 109 |
+
> 💡 **Faster decoding** — Use the **decoder-turbo** (distilled decoder) for **~10× faster** image decoding (8 steps instead of 50) with minimal quality loss:
|
| 110 |
+
> ```python
|
| 111 |
+
> image = decode_vq_tokens(
|
| 112 |
+
> result["token_ids"], result["h"], result["w"], model_path, "cuda",
|
| 113 |
+
> num_steps=8, decode_mode="decoder-turbo",
|
| 114 |
+
> )
|
| 115 |
+
> ```
|
| 116 |
+
|
| 117 |
+
### 🌟 Text-to-Image Generation with Thinking
|
| 118 |
+
|
| 119 |
+
```python
|
| 120 |
+
import torch
|
| 121 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 122 |
+
from decoder import decode_vq_tokens
|
| 123 |
+
|
| 124 |
+
model_path = "inclusionAI/LLaDA2.0-Uni"
|
| 125 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
| 126 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 127 |
+
model_path, device_map="cuda", torch_dtype="bfloat16", trust_remote_code=True
|
| 128 |
+
).eval()
|
| 129 |
+
model.tokenizer = tokenizer
|
| 130 |
+
|
| 131 |
+
# Generate image tokens with thinking process
|
| 132 |
+
result = model.generate_image(
|
| 133 |
+
"A fox with thick, dense, fluffy fur in a winter setting, possibly surrounded by snow.",
|
| 134 |
+
image_h=1024, image_w=1024,
|
| 135 |
+
mode="thinking",
|
| 136 |
+
steps=8, cfg_scale=2.0,
|
| 137 |
+
thinking_steps=32, thinking_gen_length=4096,
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
# Print thinking trace
|
| 141 |
+
print("Thinking:", result["thinking"])
|
| 142 |
+
|
| 143 |
+
# Decode to PIL image
|
| 144 |
+
image = decode_vq_tokens(result["token_ids"], result["h"], result["w"], model_path, "cuda", num_steps=8, decode_mode="decoder-turbo",)
|
| 145 |
+
image.save("output_thinking.png")
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
### 🌟 Image Understanding
|
| 149 |
+
|
| 150 |
+
```python
|
| 151 |
+
import torch
|
| 152 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 153 |
+
from encoder.image_tokenizer import ImageTokenizer
|
| 154 |
+
from decoder.smart_img_process import smart_resize_images
|
| 155 |
+
|
| 156 |
+
model_path = "inclusionAI/LLaDA2.0-Uni"
|
| 157 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
| 158 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 159 |
+
model_path, device_map="cuda", torch_dtype="bfloat16", trust_remote_code=True
|
| 160 |
+
).eval()
|
| 161 |
+
model.tokenizer = tokenizer
|
| 162 |
+
|
| 163 |
+
# Encode image to discrete tokens
|
| 164 |
+
image_tokenizer = ImageTokenizer(model_path=model_path, device="cuda")
|
| 165 |
+
pil_image = smart_resize_images(["./assets/understanding_example.png"])[0]
|
| 166 |
+
info = image_tokenizer.encode_with_info(pil_image)
|
| 167 |
+
image_tokens = [x + model.config.image_token_offset for x in info["token_ids"]]
|
| 168 |
+
_, h, w = info["grid_thw"]
|
| 169 |
+
|
| 170 |
+
# Understand the image
|
| 171 |
+
response = model.understand_image(
|
| 172 |
+
image_tokens, h, w,
|
| 173 |
+
question="Describe this image in detail.",
|
| 174 |
+
steps=32, gen_length=2048,
|
| 175 |
+
)
|
| 176 |
+
print(response)
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
### 🌟 Image Editing
|
| 180 |
+
|
| 181 |
+
```python
|
| 182 |
+
import torch
|
| 183 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 184 |
+
from encoder.image_tokenizer import ImageTokenizer
|
| 185 |
+
from decoder.utils import generate_crop_size_list, var_center_crop
|
| 186 |
+
from decoder import decode_vq_tokens
|
| 187 |
+
from PIL import Image
|
| 188 |
+
|
| 189 |
+
model_path = "inclusionAI/LLaDA2.0-Uni"
|
| 190 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
| 191 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 192 |
+
model_path, device_map="cuda", torch_dtype="bfloat16", trust_remote_code=True
|
| 193 |
+
).eval()
|
| 194 |
+
model.tokenizer = tokenizer
|
| 195 |
+
|
| 196 |
+
# Encode source image
|
| 197 |
+
image_tokenizer = ImageTokenizer(model_path=model_path, device="cuda")
|
| 198 |
+
crop_size_list = generate_crop_size_list((512 // 32) ** 2, 32)
|
| 199 |
+
pil_image = var_center_crop(Image.open("./assets/edit_example.png").convert("RGB"), crop_size_list=crop_size_list)
|
| 200 |
+
info = image_tokenizer.encode_with_info(pil_image)
|
| 201 |
+
image_tokens = [x + model.config.image_token_offset for x in info["token_ids"]]
|
| 202 |
+
_, h, w = info["grid_thw"]
|
| 203 |
+
|
| 204 |
+
# Edit the image
|
| 205 |
+
result = model.edit_image(
|
| 206 |
+
image_tokens, h, w,
|
| 207 |
+
instruction="Change the background to a beach.",
|
| 208 |
+
steps=8, cfg_text_scale=4.0,
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
# Decode to PIL image
|
| 212 |
+
edited_image = decode_vq_tokens(result["token_ids"], result["h"], result["w"], model_path, "cuda", num_steps=8, decode_mode="decoder-turbo",)
|
| 213 |
+
edited_image.save("edited.png")
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
### 🌟 SPRINT Acceleration
|
| 217 |
+
|
| 218 |
+
SPRINT accelerates inference by combining **KV cache reuse**, **adaptive unmasking**, and **threshold-based batch acceptance**:
|
| 219 |
+
|
| 220 |
+
- **KV Cache Reuse & Pruning**: The prefix KV cache is computed once during warmup steps, then optionally pruned by importance scores (blending KV attention importance with token confidence). Subsequent denoising steps reuse the cached prefix, significantly reducing computation. Per-modality keep ratios (`image_keep_ratio`, `text_keep_ratio`) allow fine-grained control — e.g., retaining all image/text tokens for quality while still benefiting from cache reuse.
|
| 221 |
+
- **Adaptive Unmasking**: Instead of unmasking a fixed number of tokens per step, Sprint dynamically decides how many tokens to reveal based on model confidence. At each step, it computes confidence scores (via strategies like `low_confidence`, `top_k_margin`, or `neg_entropy`) and unmasks the top-k most confident tokens, where k is set adaptively to `ceil(remaining_masked / steps_left)`. This allows easy positions to be resolved quickly while concentrating compute on harder tokens.
|
| 222 |
+
- **Batch Acceptance**: On top of adaptive scheduling, all tokens whose probability exceeds `threshold` are accepted in batch, further reducing the number of denoising iterations needed.
|
| 223 |
+
|
| 224 |
+
**Image Understanding** with Sprint:
|
| 225 |
+
|
| 226 |
+
```python
|
| 227 |
+
response = model.understand_image(
|
| 228 |
+
image_tokens, h, w,
|
| 229 |
+
question="Describe this image in detail.",
|
| 230 |
+
steps=32, gen_length=4096,
|
| 231 |
+
use_sprint=True,
|
| 232 |
+
threshold=0.93,
|
| 233 |
+
keep_ratio=0.5,
|
| 234 |
+
cache_warmup_steps=1,
|
| 235 |
+
image_keep_ratio=1.0,
|
| 236 |
+
text_keep_ratio=1.0,
|
| 237 |
+
)
|
| 238 |
+
```
|
| 239 |
+
|
| 240 |
+
**Text-to-Image** with Sprint:
|
| 241 |
+
|
| 242 |
+
```python
|
| 243 |
+
result = model.generate_image(
|
| 244 |
+
"A modern Scandinavian kitchen with white cabinetry, marble countertops, and a single orchid on the island. A Nordic woman with sleek blonde ponytail, wearing an oversized sweater and dainty silver necklaces, stirs a matcha bowl with a bamboo whisk, eyes sparkling with quiet joy. Shot with 50mm, f/2.5, diffused window light, cool white balance, low saturation, clean skin retouch. Mood: serene, wholesome, hygge.",
|
| 245 |
+
image_h=1024, image_w=1024,
|
| 246 |
+
cfg_scale=2.0,
|
| 247 |
+
use_sprint=True,
|
| 248 |
+
block_length=32,
|
| 249 |
+
steps=8,
|
| 250 |
+
keep_ratio=0.5,
|
| 251 |
+
cache_warmup_steps=1,
|
| 252 |
+
)
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
> [!Note]
|
| 256 |
+
> Sprint is supported for Simple CFG and no-CFG modes. When using Editing CFG (three-way guidance with `cfg_text_scale` / `cfg_image_scale`), Sprint is disabled automatically and generation falls back to the baseline (non-Sprint) decoding path.
|
| 257 |
+
|
| 258 |
+
## Repository Structure
|
| 259 |
+
|
| 260 |
+
```
|
| 261 |
+
LLaDA2.0-Uni/
|
| 262 |
+
├── config.json # Model configuration
|
| 263 |
+
├── modeling_llada2uni_moe.py # Model implementation (trust_remote_code)
|
| 264 |
+
├── configuration_llada2uni_moe.py # Config class
|
| 265 |
+
├── tokenizer.json # Tokenizer
|
| 266 |
+
├── model-00001-of-00013.safetensors # MoE backbone weights (sharded, bf16)
|
| 267 |
+
├── ...
|
| 268 |
+
├── model-00013-of-00013.safetensors
|
| 269 |
+
├── model.safetensors.index.json
|
| 270 |
+
├── image_tokenizer/
|
| 271 |
+
│ ├── config.json
|
| 272 |
+
│ ├── image_tokenizer.safetensors # SigLIP-VQ encoder
|
| 273 |
+
│ ├── sigvq_embedding.pt # SigVQ embedding + projector
|
| 274 |
+
│ └── preprocessor_config.json
|
| 275 |
+
├── decoder/
|
| 276 |
+
│ ├── config.json
|
| 277 |
+
│ └── decoder_model.safetensors # Diffusion decoder (bf16, 12GB)
|
| 278 |
+
├── decoder-turbo/
|
| 279 |
+
│ ├── config.json
|
| 280 |
+
│ └── decoder_model.safetensors # Distilled few-step decoder (bf16, 12GB)
|
| 281 |
+
└── vae/
|
| 282 |
+
├── config.json
|
| 283 |
+
└── diffusion_pytorch_model.safetensors
|
| 284 |
+
```
|
| 285 |
+
|
| 286 |
+
## Hardware Requirements
|
| 287 |
+
|
| 288 |
+
| Component | GPU Memory |
|
| 289 |
+
|---|---|
|
| 290 |
+
| MoE Backbone (bf16, 16B total) | ~32 GB |
|
| 291 |
+
| Diffusion Decoder (bf16, 6.2B) | ~12 GB |
|
| 292 |
+
| VAE + SigVQ + Tokenizer | ~3 GB |
|
| 293 |
+
| **Total (generation/editing)** | **~47 GB** |
|
| 294 |
+
| **Total (understanding only)** | **~35 GB** |
|
| 295 |
+
|
| 296 |
+
> 💡 While only ~1B parameters are activated per token during inference, all 16B MoE parameters must be loaded into memory. The diffusion decoder is only needed for image generation/editing and is released afterwards.
|
| 297 |
+
|
| 298 |
+
## 🚀 SGLang Support (Coming Soon)
|
| 299 |
+
|
| 300 |
+
We are working on integrating [SGLang](https://github.com/sgl-project/sglang) for high-throughput serving and optimized inference. Stay tuned!
|
| 301 |
+
|
| 302 |
+
## ⚠️ License
|
| 303 |
+
|
| 304 |
+
This project is licensed under the terms of the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
|
| 305 |
+
|
| 306 |
+
## 📖 BibTeX
|
| 307 |
+
|
| 308 |
+
```bibtex
|
| 309 |
+
@article{LLaDA2Uni,
|
| 310 |
+
title = {LLaDA2.0-Uni: Unifying Multimodal Understanding and Generation with Diffusion Large Language Model},
|
| 311 |
+
author = {Inclusion AI},
|
| 312 |
+
year = {2026}
|
| 313 |
+
}
|
| 314 |
+
```
|
assets/architecture.png
ADDED
|
Git LFS Details
|
assets/edit_example.png
ADDED
|
Git LFS Details
|
assets/llada_logo.png
ADDED
|
assets/performance.png
ADDED
|
Git LFS Details
|
assets/understanding_example.png
ADDED
|
config.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LLaDA2MoeModelLM"
|
| 4 |
+
],
|
| 5 |
+
"auto_map": {
|
| 6 |
+
"AutoConfig": "configuration_llada2uni_moe.LLaDA2MoeConfig",
|
| 7 |
+
"AutoModel": "modeling_llada2uni_moe.LLaDA2MoeModelLM",
|
| 8 |
+
"AutoModelForCausalLM": "modeling_llada2uni_moe.LLaDA2MoeModelLM"
|
| 9 |
+
},
|
| 10 |
+
"model_type": "llada2_moe",
|
| 11 |
+
"torch_dtype": "bfloat16",
|
| 12 |
+
"transformers_version": "4.51.0",
|
| 13 |
+
"vocab_size": 173568,
|
| 14 |
+
"hidden_size": 2048,
|
| 15 |
+
"intermediate_size": 5120,
|
| 16 |
+
"num_hidden_layers": 20,
|
| 17 |
+
"num_attention_heads": 16,
|
| 18 |
+
"num_key_value_heads": 4,
|
| 19 |
+
"head_dim": 128,
|
| 20 |
+
"hidden_act": "silu",
|
| 21 |
+
"use_qkv_bias": false,
|
| 22 |
+
"use_qk_norm": true,
|
| 23 |
+
"use_bias": false,
|
| 24 |
+
"rms_norm_eps": 1e-06,
|
| 25 |
+
"attention_dropout": 0.0,
|
| 26 |
+
"initializer_range": 0.02,
|
| 27 |
+
"max_position_embeddings": 8192,
|
| 28 |
+
"rope_theta": 600000,
|
| 29 |
+
"rope_parameters": {
|
| 30 |
+
"rope_type": "default",
|
| 31 |
+
"rope_theta": 600000,
|
| 32 |
+
"partial_rotary_factor": 0.5
|
| 33 |
+
},
|
| 34 |
+
"partial_rotary_factor": 0.5,
|
| 35 |
+
"use_cache": false,
|
| 36 |
+
"sliding_window": null,
|
| 37 |
+
"pad_token_id": 156892,
|
| 38 |
+
"num_experts": 256,
|
| 39 |
+
"num_shared_experts": 1,
|
| 40 |
+
"num_experts_per_tok": 8,
|
| 41 |
+
"n_group": 8,
|
| 42 |
+
"topk_group": 4,
|
| 43 |
+
"routed_scaling_factor": 2.5,
|
| 44 |
+
"moe_intermediate_size": 512,
|
| 45 |
+
"first_k_dense_replace": 1,
|
| 46 |
+
"output_router_logits": false,
|
| 47 |
+
"image_token_offset": 157184
|
| 48 |
+
}
|
configuration_llada2uni_moe.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Antgroup and The HuggingFace Inc. team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
"""LLaDA2 MoE model configuration."""
|
| 15 |
+
|
| 16 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class LLaDA2MoeConfig(PretrainedConfig):
    r"""
    Hyper-parameter container for the LLaDA2 MoE model (discrete-token multimodal LLM).

    Only the LLM backbone is described here. Images are represented as discrete VQ
    tokens in the extended vocabulary, so no vision-encoder configuration is required.

    Every constructor argument is stored on the instance under the same name, with
    the following exceptions:

    - ``head_dim``: when falsy (``None`` or ``0``) it is replaced by
      ``hidden_size // num_attention_heads``.
    - ``rope_parameters``: when ``None`` a dict with the keys ``rope_type``
      (``"default"``), ``rope_theta`` and ``partial_rotary_factor`` is built from
      the corresponding arguments; a caller-supplied dict is stored unchanged.
    - ``pad_token_id``, ``tie_word_embeddings`` and any extra ``**kwargs`` are not
      assigned here; they are forwarded to ``PretrainedConfig.__init__``.

    ``image_token_offset`` is the amount by which VQ codebook indices are shifted
    inside the vocabulary.

    ```python
    >>> from configuration_llada2uni_moe import LLaDA2MoeConfig
    >>> config = LLaDA2MoeConfig()
    ```
    """

    model_type = "llada2_moe"

    def __init__(
        self,
        vocab_size=30592,
        hidden_size=1024,
        intermediate_size=None,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=0,
        head_dim=None,
        hidden_act="silu",
        use_qkv_bias=False,
        use_qk_norm=False,
        use_bias=True,
        rms_norm_eps=1e-05,
        tie_word_embeddings=False,
        attention_dropout=0.1,
        initializer_range=0.02,
        max_position_embeddings=16384,
        rope_theta=10000.0,
        rope_parameters=None,
        partial_rotary_factor=0.5,
        use_cache=True,
        sliding_window=None,
        pad_token_id=126081,
        # Image
        image_token_offset=157184,
        # MoE
        num_experts=16,
        num_shared_experts=0,
        num_experts_per_tok=2,
        n_group=8,
        topk_group=4,
        routed_scaling_factor=2.5,
        moe_intermediate_size=None,
        first_k_dense_replace=0,
        output_router_logits=False,
        **kwargs,
    ):
        # --- Vocabulary and layer sizes -------------------------------------
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers

        # --- Attention ------------------------------------------------------
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        # A falsy head_dim (None or 0) means "derive it from the hidden size".
        self.head_dim = head_dim if head_dim else hidden_size // num_attention_heads
        self.use_qkv_bias = use_qkv_bias
        self.use_qk_norm = use_qk_norm
        self.attention_dropout = attention_dropout
        self.sliding_window = sliding_window

        # --- Activations, normalisation, initialisation ---------------------
        self.hidden_act = hidden_act
        self.use_bias = use_bias
        self.rms_norm_eps = rms_norm_eps
        self.initializer_range = initializer_range

        # --- Positions and caching ------------------------------------------
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.partial_rotary_factor = partial_rotary_factor
        self.use_cache = use_cache

        # VQ codebook indices are shifted by this amount in the vocabulary.
        self.image_token_offset = image_token_offset

        # RoPE parameters dict (used by LLaDA2MoeRotaryEmbedding). An explicit
        # dict from the caller is kept as-is; otherwise one is assembled from
        # the scalar arguments above.
        self.rope_parameters = (
            rope_parameters
            if rope_parameters is not None
            else {
                "rope_type": "default",
                "rope_theta": rope_theta,
                "partial_rotary_factor": partial_rotary_factor,
            }
        )

        # --- Mixture-of-Experts ---------------------------------------------
        self.num_experts = num_experts
        self.num_shared_experts = num_shared_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.n_group = n_group
        self.topk_group = topk_group
        self.routed_scaling_factor = routed_scaling_factor
        self.moe_intermediate_size = moe_intermediate_size
        self.first_k_dense_replace = first_k_dense_replace
        self.output_router_logits = output_router_logits

        # The base class handles the generic options and any unknown kwargs;
        # it is invoked last, after all model-specific attributes are set.
        super().__init__(
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
__all__ = ["LLaDA2MoeConfig"]
|
decoder-turbo/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "ZImageTransformer2DModel",
|
| 3 |
+
"_diffusers_version": "0.37.0.dev0",
|
| 4 |
+
"all_f_patch_size": [
|
| 5 |
+
1
|
| 6 |
+
],
|
| 7 |
+
"all_patch_size": [
|
| 8 |
+
2
|
| 9 |
+
],
|
| 10 |
+
"axes_dims": [
|
| 11 |
+
32,
|
| 12 |
+
48,
|
| 13 |
+
48
|
| 14 |
+
],
|
| 15 |
+
"axes_lens": [
|
| 16 |
+
1536,
|
| 17 |
+
512,
|
| 18 |
+
512
|
| 19 |
+
],
|
| 20 |
+
"cap_feat_dim": 2560,
|
| 21 |
+
"dim": 3840,
|
| 22 |
+
"in_channels": 16,
|
| 23 |
+
"n_heads": 30,
|
| 24 |
+
"n_kv_heads": 30,
|
| 25 |
+
"n_layers": 30,
|
| 26 |
+
"n_refiner_layers": 2,
|
| 27 |
+
"norm_eps": 1e-05,
|
| 28 |
+
"qk_norm": true,
|
| 29 |
+
"rope_theta": 256.0,
|
| 30 |
+
"siglip_feat_dim": null,
|
| 31 |
+
"t_scale": 1000.0
|
| 32 |
+
}
|
decoder-turbo/decoder_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76007e00703e5289b6a64c1c0f44dc26ed0616e0f99018c64b79ef201f8ff248
|
| 3 |
+
size 12321673696
|
decoder/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "ZImageTransformer2DModel",
|
| 3 |
+
"_diffusers_version": "0.37.0.dev0",
|
| 4 |
+
"all_f_patch_size": [
|
| 5 |
+
1
|
| 6 |
+
],
|
| 7 |
+
"all_patch_size": [
|
| 8 |
+
2
|
| 9 |
+
],
|
| 10 |
+
"axes_dims": [
|
| 11 |
+
32,
|
| 12 |
+
48,
|
| 13 |
+
48
|
| 14 |
+
],
|
| 15 |
+
"axes_lens": [
|
| 16 |
+
1536,
|
| 17 |
+
512,
|
| 18 |
+
512
|
| 19 |
+
],
|
| 20 |
+
"cap_feat_dim": 2560,
|
| 21 |
+
"dim": 3840,
|
| 22 |
+
"in_channels": 16,
|
| 23 |
+
"n_heads": 30,
|
| 24 |
+
"n_kv_heads": 30,
|
| 25 |
+
"n_layers": 30,
|
| 26 |
+
"n_refiner_layers": 2,
|
| 27 |
+
"norm_eps": 1e-05,
|
| 28 |
+
"qk_norm": true,
|
| 29 |
+
"rope_theta": 256.0,
|
| 30 |
+
"siglip_feat_dim": null,
|
| 31 |
+
"t_scale": 1000.0
|
| 32 |
+
}
|
decoder/decoder_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b4538abc88dc41ecbdced5b032a5f0ac1f0780f96b36256d37bb7a105930ae8f
|
| 3 |
+
size 12321673696
|
image_tokenizer/config.json
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GlmImageForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"image_start_token_id": 16384,
|
| 6 |
+
"image_end_token_id": 16385,
|
| 7 |
+
"image_token_id": 167855,
|
| 8 |
+
"model_type": "glm_image",
|
| 9 |
+
"text_config": {
|
| 10 |
+
"attention_dropout": 0.0,
|
| 11 |
+
"eos_token_id": 16385,
|
| 12 |
+
"pad_token_id": 167841,
|
| 13 |
+
"hidden_act": "silu",
|
| 14 |
+
"hidden_size": 4096,
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": 13696,
|
| 17 |
+
"max_position_embeddings": 131072,
|
| 18 |
+
"model_type": "glm_image_text",
|
| 19 |
+
"num_attention_heads": 32,
|
| 20 |
+
"num_hidden_layers": 40,
|
| 21 |
+
"num_key_value_heads": 2,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"dtype": "bfloat16",
|
| 24 |
+
"rope_parameters": {
|
| 25 |
+
"rope_theta": 10000,
|
| 26 |
+
"rope_type": "default",
|
| 27 |
+
"mrope_section": [
|
| 28 |
+
8,
|
| 29 |
+
12,
|
| 30 |
+
12
|
| 31 |
+
],
|
| 32 |
+
"partial_rotary_factor": 0.5
|
| 33 |
+
},
|
| 34 |
+
"use_cache": true,
|
| 35 |
+
"vision_vocab_size": 16512,
|
| 36 |
+
"vocab_size": 168064
|
| 37 |
+
},
|
| 38 |
+
"transformers_version": "5.0.0.dev0",
|
| 39 |
+
"vision_config": {
|
| 40 |
+
"attention_bias": true,
|
| 41 |
+
"attention_dropout": 0.0,
|
| 42 |
+
"depth": 40,
|
| 43 |
+
"hidden_act": "gelu",
|
| 44 |
+
"hidden_size": 1536,
|
| 45 |
+
"image_size": 2048,
|
| 46 |
+
"in_channels": 3,
|
| 47 |
+
"intermediate_size": 6144,
|
| 48 |
+
"layer_norm_eps": 1e-06,
|
| 49 |
+
"model_type": "glm_image_vision",
|
| 50 |
+
"num_heads": 16,
|
| 51 |
+
"patch_size": 16
|
| 52 |
+
},
|
| 53 |
+
"vq_config": {
|
| 54 |
+
"embed_dim": 2048,
|
| 55 |
+
"in_channels": 3,
|
| 56 |
+
"initializer_range": 0.02,
|
| 57 |
+
"latent_channels": 1536,
|
| 58 |
+
"model_type": "glm_image_vqmodel",
|
| 59 |
+
"num_embeddings": 16384
|
| 60 |
+
}
|
| 61 |
+
}
|
image_tokenizer/image_tokenizer.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e0a11a82ad221ac1f3b917abfce31ffaaec3571200ae7ee5318a223ff2eedc49
|
| 3 |
+
size 2398968416
|
image_tokenizer/preprocessor_config.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"min_pixels": 262144,
|
| 3 |
+
"max_pixels": 4194304,
|
| 4 |
+
"do_rescale": true,
|
| 5 |
+
"do_normalize": true,
|
| 6 |
+
"do_resize": false,
|
| 7 |
+
"patch_size": 16,
|
| 8 |
+
"temporal_patch_size": 1,
|
| 9 |
+
"merge_size": 1,
|
| 10 |
+
"image_mean": [0.5, 0.5, 0.5],
|
| 11 |
+
"image_std": [0.5, 0.5, 0.5],
|
| 12 |
+
"image_processor_type": "GlmImageImageProcessor",
|
| 13 |
+
"processor_class": "GlmImageProcessor",
|
| 14 |
+
"resample": 3,
|
| 15 |
+
"rescale_factor": 0.00392156862745098
|
| 16 |
+
}
|
image_tokenizer/sigvq_embedding.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f3689458a14ee04088def752dfaf8fe391910a1a61511decd9ce87fbdbe981b
|
| 3 |
+
size 402688449
|
model-00001-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9de425701ab2118e3554f63a274ac98962c4efa4d95a05c8d81d409a45ffacd6
|
| 3 |
+
size 5369025312
|
model-00002-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f55a735671a05fcc5a8983181de16b0008e3bf8766273d10e490a9524a15df5
|
| 3 |
+
size 5369025664
|
model-00003-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26f578d62d76cecfd83d2ac28cb42618d6ea6ff37b6a28317ae54f1fb53ac284
|
| 3 |
+
size 5369025576
|
model-00004-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4fe3bacdc83f196430f374db78c5ca647fdf61e3ae8657fd8d1de7155429a97
|
| 3 |
+
size 5369027896
|
model-00005-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:960858047af7d66f0c38c2d4fb76b4a80c65d5b4e04b8f55c5e3b7704ecda466
|
| 3 |
+
size 5369027904
|
model-00006-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c16ab6dbd22b8273211d956cc0ead6aed6239a14457f9a071604d6ec25cf9c3f
|
| 3 |
+
size 3821234896
|
model-00007-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cbcd0ef62f99aa2025fdc47d59ecb771e415162730628f74133dce4c533d1708
|
| 3 |
+
size 63992360
|
model-00008-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43e3be5de1c7b2b6f95f3cf31d842e9888115b2754775973979b33134738f2ee
|
| 3 |
+
size 824220328
|
model-00009-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:455ef640a86513be99ee39d38164dc17908c51bae27cafe50601bfaf46f069fa
|
| 3 |
+
size 124811128
|
model-00010-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b839b13466a6f65b1d8121ed21a80b02273b0f70cdf74b08de73a439c11c306e
|
| 3 |
+
size 84965488
|
model-00011-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a39c7db1ca265dcefd0c7167dc1782fe8e782aef204bc94643518df1579cce7
|
| 3 |
+
size 79725952
|
model-00012-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2f905aa76429a6ac3b995b0a32019bf1f18df64d6b36106dafe7f770e1f7849
|
| 3 |
+
size 84967960
|
model-00013-of-00013.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:541420db214a20bb6d107256b48b1ae139cfa2376df12decaaf745d981360d3a
|
| 3 |
+
size 718288984
|
model.safetensors.index.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
modeling_llada2uni_moe.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<|startoftext|>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "[CLS]",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "<|endoftext|>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<|mask|>",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<|endoftext|>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
}
|
| 37 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2197aeddaf09785316673451ca6fb86dcfcfdb108972a3145d106b8fa4c927e6
|
| 3 |
+
size 15297062
|
tokenizer_config.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
vae/config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "AutoencoderKL",
|
| 3 |
+
"_diffusers_version": "0.30.0.dev0",
|
| 4 |
+
"_name_or_path": "../checkpoints/flux-dev",
|
| 5 |
+
"act_fn": "silu",
|
| 6 |
+
"block_out_channels": [
|
| 7 |
+
128,
|
| 8 |
+
256,
|
| 9 |
+
512,
|
| 10 |
+
512
|
| 11 |
+
],
|
| 12 |
+
"down_block_types": [
|
| 13 |
+
"DownEncoderBlock2D",
|
| 14 |
+
"DownEncoderBlock2D",
|
| 15 |
+
"DownEncoderBlock2D",
|
| 16 |
+
"DownEncoderBlock2D"
|
| 17 |
+
],
|
| 18 |
+
"force_upcast": true,
|
| 19 |
+
"in_channels": 3,
|
| 20 |
+
"latent_channels": 16,
|
| 21 |
+
"latents_mean": null,
|
| 22 |
+
"latents_std": null,
|
| 23 |
+
"layers_per_block": 2,
|
| 24 |
+
"mid_block_add_attention": true,
|
| 25 |
+
"norm_num_groups": 32,
|
| 26 |
+
"out_channels": 3,
|
| 27 |
+
"sample_size": 1024,
|
| 28 |
+
"scaling_factor": 0.3611,
|
| 29 |
+
"shift_factor": 0.1159,
|
| 30 |
+
"up_block_types": [
|
| 31 |
+
"UpDecoderBlock2D",
|
| 32 |
+
"UpDecoderBlock2D",
|
| 33 |
+
"UpDecoderBlock2D",
|
| 34 |
+
"UpDecoderBlock2D"
|
| 35 |
+
],
|
| 36 |
+
"use_post_quant_conv": false,
|
| 37 |
+
"use_quant_conv": false
|
| 38 |
+
}
|
vae/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5b59a26851551b67ae1fe58d32e76486e1e812def4696a4bea97f16604d40a3
|
| 3 |
+
size 167666902
|