JSYuuu committed
Commit 84163e3 · verified · 1 Parent(s): aefca3d

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
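
This change adds a Git LFS rule for JSON files; the new line is exactly what running `git lfs track "*.json"` in the repo would write to .gitattributes.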
README.md ADDED
@@ -0,0 +1,93 @@
+ ---
+ library_name: diffusers
+ license: apache-2.0
+ pipeline_tag: any-to-any
+ ---
+
+ # ThinkGen: Generalized Thinking for Visual Generation
+
+ ThinkGen is the first think-driven visual generation framework that explicitly leverages Multimodal Large Language Models' (MLLMs) Chain-of-Thought (CoT) reasoning across diverse generation scenarios. ThinkGen employs a decoupled architecture comprising a pretrained MLLM and a Diffusion Transformer (DiT): the MLLM generates tailored instructions based on user intent, and the DiT produces high-quality images guided by these instructions.
+
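+ Conceptually, generation proceeds in two decoupled stages. The sketch below is illustrative pseudocode with hypothetical names, not the actual API (see Sample Usage below for real calls):
+
+ ```python
+ # Illustrative only: the real interface is ThinkGen_Chat (see Sample Usage)
+ instruction = mllm.chain_of_thought(user_prompt)  # reason about intent, rewrite the prompt
+ image = dit.generate(instruction)                 # render an image from the instruction
+ ```
+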
+ - **Paper:** [ThinkGen: Generalized Thinking for Visual Generation](https://huggingface.co/papers/2512.23568)
+ - **Code:** [GitHub Repository](https://github.com/jiaosiyuu/ThinkGen)
+
+ **Authors**: Siyu Jiao, Yiheng Lin, Yujie Zhong, Qi She, Wei Zhou, Xiaohan Lan, Zilong Huang, Fei Yu, Yingchen Yu, Yunqing Zhao, Yao Zhao, Yunchao Wei.
+
+ ## 🚀 Quick Start
+
+ ### 🛠️ Environment Setup
+
+ ```bash
+ # 1. Clone the repo
+ git clone https://github.com/jiaosiyuu/ThinkGen.git
+ cd ThinkGen
+
+ # 2. (Optional) Create a clean Python environment
+ conda create -n thinkgen python=3.11
+ conda activate thinkgen
+
+ # 3. Install dependencies
+ pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
+ pip install -r req.txt
+
+ # ThinkGen runs even without flash-attn, though we recommend installing it for best performance.
+ pip install --no-cache-dir flash-attn==2.7.4.post1 --no-build-isolation
+ ```
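+
+ A quick sanity check of the environment (an illustrative snippet, not part of the official setup):
+
+ ```python
+ import torch
+
+ print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
+ try:
+     import flash_attn  # optional; ThinkGen falls back to default attention without it
+     print("flash-attn:", flash_attn.__version__)
+ except ImportError:
+     print("flash-attn not installed; using default attention")
+ ```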
+
+ ### 💻 Sample Usage
+
+ ```python
+ from ThinkGen.model import ThinkGen_Chat
+
+ model_path = "JSYuuu/ThinkGen-stage3"
+
+ chat_model = ThinkGen_Chat(
+     model_path=model_path,
+     dtype='bf16',
+     height=1024,
+     width=1024
+ )
+
+ # 1. Image Generation
+ messages = [
+     {"type": "text", "value": "A young woman wearing a straw hat, standing in a golden wheat field."}
+ ]
+ results = chat_model.generate_image(messages)
+ results.images[0].save("result.png")
+
+ # 2. Image Generation with Thinking (CoT)
+ # This enables the MLLM's CoT reasoning for generation
+ results_think = chat_model.generate_image(messages, think=True)
+ print(f"cot & rewrite prompt:\n{results_think.prompt_cot}")
+ results_think.images[0].save("result_think.png")
+
+ # 3. Image Understanding
+ messages_und = [
+     {"type": "image", "value": "images/teaser.png"},
+     {"type": "text", "value": "Describe this image"}
+ ]
+ response = chat_model.generate_text(messages_und)
+ print(response)
+ ```
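+
+ By analogy with the understanding example, editing-style generation would presumably combine an image and a text instruction in the same message list. The call below is a hedged sketch, not a documented API; check the GitHub repository for the exact supported schema:
+
+ ```python
+ # Hypothetical editing-style call, assuming generate_image accepts mixed inputs
+ messages_edit = [
+     {"type": "image", "value": "images/teaser.png"},
+     {"type": "text", "value": "Replace the background with a snowy mountain at dusk."}
+ ]
+ results_edit = chat_model.generate_image(messages_edit, think=True)
+ results_edit.images[0].save("result_edit.png")
+ ```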
+
+ ## Acknowledgments
+ This work builds upon the following great open-source projects:
+ * **OmniGen2:** https://github.com/VectorSpaceLab/OmniGen2
+ * **Qwen3VL:** https://github.com/QwenLM/Qwen3-VL
+ * **EasyR1:** https://github.com/hiyouga/EasyR1
+ * **Flow-GRPO:** https://github.com/yifan123/flow_grpo
+
+ ## Citation
+ ```bibtex
+ @article{jiao2025thinkgen,
+   title={ThinkGen: Generalized Thinking for Visual Generation},
+   author={Jiao, Siyu and Lin, Yiheng and Zhong, Yujie and She, Qi and Zhou, Wei and Lan, Xiaohan and Huang, Zilong and Yu, Fei and Yu, Yingchen and Zhao, Yunqing and Zhao, Yao and Wei, Yunchao},
+   journal={arXiv preprint arXiv:2512.23568},
+   year={2025}
+ }
+ ```
+
+ ## License
+ This work is licensed under the Apache 2.0 license.
mllm/config.json ADDED
@@ -0,0 +1,66 @@
+ {
+   "architectures": [
+     "Qwen3VLForConditionalGeneration"
+   ],
+   "dtype": "float32",
+   "eos_token_id": 151645,
+   "image_token_id": 151655,
+   "model_type": "qwen3_vl",
+   "pad_token_id": 151643,
+   "text_config": {
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "bos_token_id": 151643,
+     "dtype": "float32",
+     "eos_token_id": 151645,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "hidden_size": 4096,
+     "initializer_range": 0.02,
+     "intermediate_size": 12288,
+     "max_position_embeddings": 262144,
+     "model_type": "qwen3_vl_text",
+     "num_attention_heads": 32,
+     "num_hidden_layers": 36,
+     "num_key_value_heads": 8,
+     "rms_norm_eps": 1e-06,
+     "rope_scaling": {
+       "mrope_interleaved": true,
+       "mrope_section": [
+         24,
+         20,
+         20
+       ],
+       "rope_type": "default"
+     },
+     "rope_theta": 5000000,
+     "use_cache": true,
+     "vocab_size": 151936
+   },
+   "tie_word_embeddings": false,
+   "transformers_version": "4.57.1",
+   "video_token_id": 151656,
+   "vision_config": {
+     "deepstack_visual_indexes": [
+       8,
+       16,
+       24
+     ],
+     "depth": 27,
+     "dtype": "float32",
+     "hidden_act": "gelu_pytorch_tanh",
+     "hidden_size": 1152,
+     "in_channels": 3,
+     "initializer_range": 0.02,
+     "intermediate_size": 4304,
+     "model_type": "qwen3_vl",
+     "num_heads": 16,
+     "num_position_embeddings": 2304,
+     "out_hidden_size": 4096,
+     "patch_size": 16,
+     "spatial_merge_size": 2,
+     "temporal_patch_size": 2
+   },
+   "vision_end_token_id": 151653,
+   "vision_start_token_id": 151652
+ }
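
For reference, the configuration above can be inspected programmatically. A minimal sketch, assuming `transformers` >= 4.57 is installed and the Hub is reachable:

```python
from transformers import AutoConfig

# Load the Qwen3-VL config from the mllm/ subfolder of this repo
config = AutoConfig.from_pretrained("JSYuuu/ThinkGen-stage3", subfolder="mllm")
print(config.model_type)                     # qwen3_vl
print(config.text_config.num_hidden_layers)  # 36
print(config.vision_config.depth)            # 27
```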
mllm/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "pad_token_id": 151643,
+   "transformers_version": "4.57.1"
+ }
mllm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mllm/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a2ad68570fe788a0bbf03ed07c1d32cea83884fa328c587a5ef97d797cf2e91
+ size 4902275944
mllm/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5192d62fbc367626d743551b6a91461bfae305db6cf71eaeade89598d21e4f7d
+ size 4915962496
mllm/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4931c8b5e85666292daa65153726b20030f54d81d8f51d732a367b9e051e5fbc
+ size 4999831048
mllm/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef14a367ae345934318ea7b06bf404b97c3a312418f40dd5f92d8296af96de13
+ size 2716270024
mllm/model.safetensors.index.json ADDED
@@ -0,0 +1,757 @@
+ {
+   "metadata": {
+     "total_size": 17534247392
+   },
+   "weight_map": {
+     "lm_head.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.35.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.language_model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.language_model.norm.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.0.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.1.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.10.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.11.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.12.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.13.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.14.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.15.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.16.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.17.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.attn.proj.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.attn.qkv.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.norm1.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.norm2.bias": "model-00004-of-00004.safetensors",
+     "model.visual.blocks.19.norm2.weight": "model-00004-of-00004.safetensors",
549
+ "model.visual.blocks.2.attn.proj.bias": "model-00004-of-00004.safetensors",
550
+ "model.visual.blocks.2.attn.proj.weight": "model-00004-of-00004.safetensors",
551
+ "model.visual.blocks.2.attn.qkv.bias": "model-00004-of-00004.safetensors",
552
+ "model.visual.blocks.2.attn.qkv.weight": "model-00004-of-00004.safetensors",
553
+ "model.visual.blocks.2.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
554
+ "model.visual.blocks.2.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
555
+ "model.visual.blocks.2.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
556
+ "model.visual.blocks.2.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
557
+ "model.visual.blocks.2.norm1.bias": "model-00004-of-00004.safetensors",
558
+ "model.visual.blocks.2.norm1.weight": "model-00004-of-00004.safetensors",
559
+ "model.visual.blocks.2.norm2.bias": "model-00004-of-00004.safetensors",
560
+ "model.visual.blocks.2.norm2.weight": "model-00004-of-00004.safetensors",
561
+ "model.visual.blocks.20.attn.proj.bias": "model-00004-of-00004.safetensors",
562
+ "model.visual.blocks.20.attn.proj.weight": "model-00004-of-00004.safetensors",
563
+ "model.visual.blocks.20.attn.qkv.bias": "model-00004-of-00004.safetensors",
564
+ "model.visual.blocks.20.attn.qkv.weight": "model-00004-of-00004.safetensors",
565
+ "model.visual.blocks.20.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
566
+ "model.visual.blocks.20.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
567
+ "model.visual.blocks.20.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
568
+ "model.visual.blocks.20.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
569
+ "model.visual.blocks.20.norm1.bias": "model-00004-of-00004.safetensors",
570
+ "model.visual.blocks.20.norm1.weight": "model-00004-of-00004.safetensors",
571
+ "model.visual.blocks.20.norm2.bias": "model-00004-of-00004.safetensors",
572
+ "model.visual.blocks.20.norm2.weight": "model-00004-of-00004.safetensors",
573
+ "model.visual.blocks.21.attn.proj.bias": "model-00004-of-00004.safetensors",
574
+ "model.visual.blocks.21.attn.proj.weight": "model-00004-of-00004.safetensors",
575
+ "model.visual.blocks.21.attn.qkv.bias": "model-00004-of-00004.safetensors",
576
+ "model.visual.blocks.21.attn.qkv.weight": "model-00004-of-00004.safetensors",
577
+ "model.visual.blocks.21.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
578
+ "model.visual.blocks.21.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
579
+ "model.visual.blocks.21.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
580
+ "model.visual.blocks.21.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
581
+ "model.visual.blocks.21.norm1.bias": "model-00004-of-00004.safetensors",
582
+ "model.visual.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
583
+ "model.visual.blocks.21.norm2.bias": "model-00004-of-00004.safetensors",
584
+ "model.visual.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
585
+ "model.visual.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
586
+ "model.visual.blocks.22.attn.proj.weight": "model-00004-of-00004.safetensors",
587
+ "model.visual.blocks.22.attn.qkv.bias": "model-00004-of-00004.safetensors",
588
+ "model.visual.blocks.22.attn.qkv.weight": "model-00004-of-00004.safetensors",
589
+ "model.visual.blocks.22.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
590
+ "model.visual.blocks.22.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
591
+ "model.visual.blocks.22.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
592
+ "model.visual.blocks.22.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
593
+ "model.visual.blocks.22.norm1.bias": "model-00004-of-00004.safetensors",
594
+ "model.visual.blocks.22.norm1.weight": "model-00004-of-00004.safetensors",
595
+ "model.visual.blocks.22.norm2.bias": "model-00004-of-00004.safetensors",
596
+ "model.visual.blocks.22.norm2.weight": "model-00004-of-00004.safetensors",
597
+ "model.visual.blocks.23.attn.proj.bias": "model-00004-of-00004.safetensors",
598
+ "model.visual.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors",
599
+ "model.visual.blocks.23.attn.qkv.bias": "model-00004-of-00004.safetensors",
600
+ "model.visual.blocks.23.attn.qkv.weight": "model-00004-of-00004.safetensors",
601
+ "model.visual.blocks.23.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
602
+ "model.visual.blocks.23.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
603
+ "model.visual.blocks.23.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
604
+ "model.visual.blocks.23.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
605
+ "model.visual.blocks.23.norm1.bias": "model-00004-of-00004.safetensors",
606
+ "model.visual.blocks.23.norm1.weight": "model-00004-of-00004.safetensors",
607
+ "model.visual.blocks.23.norm2.bias": "model-00004-of-00004.safetensors",
608
+ "model.visual.blocks.23.norm2.weight": "model-00004-of-00004.safetensors",
609
+ "model.visual.blocks.24.attn.proj.bias": "model-00004-of-00004.safetensors",
610
+ "model.visual.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors",
611
+ "model.visual.blocks.24.attn.qkv.bias": "model-00004-of-00004.safetensors",
612
+ "model.visual.blocks.24.attn.qkv.weight": "model-00004-of-00004.safetensors",
613
+ "model.visual.blocks.24.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
614
+ "model.visual.blocks.24.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
615
+ "model.visual.blocks.24.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
616
+ "model.visual.blocks.24.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
617
+ "model.visual.blocks.24.norm1.bias": "model-00004-of-00004.safetensors",
618
+ "model.visual.blocks.24.norm1.weight": "model-00004-of-00004.safetensors",
619
+ "model.visual.blocks.24.norm2.bias": "model-00004-of-00004.safetensors",
620
+ "model.visual.blocks.24.norm2.weight": "model-00004-of-00004.safetensors",
621
+ "model.visual.blocks.25.attn.proj.bias": "model-00004-of-00004.safetensors",
622
+ "model.visual.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors",
623
+ "model.visual.blocks.25.attn.qkv.bias": "model-00004-of-00004.safetensors",
624
+ "model.visual.blocks.25.attn.qkv.weight": "model-00004-of-00004.safetensors",
625
+ "model.visual.blocks.25.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
626
+ "model.visual.blocks.25.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
627
+ "model.visual.blocks.25.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
628
+ "model.visual.blocks.25.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
629
+ "model.visual.blocks.25.norm1.bias": "model-00004-of-00004.safetensors",
630
+ "model.visual.blocks.25.norm1.weight": "model-00004-of-00004.safetensors",
631
+ "model.visual.blocks.25.norm2.bias": "model-00004-of-00004.safetensors",
632
+ "model.visual.blocks.25.norm2.weight": "model-00004-of-00004.safetensors",
633
+ "model.visual.blocks.26.attn.proj.bias": "model-00004-of-00004.safetensors",
634
+ "model.visual.blocks.26.attn.proj.weight": "model-00004-of-00004.safetensors",
635
+ "model.visual.blocks.26.attn.qkv.bias": "model-00004-of-00004.safetensors",
636
+ "model.visual.blocks.26.attn.qkv.weight": "model-00004-of-00004.safetensors",
637
+ "model.visual.blocks.26.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
638
+ "model.visual.blocks.26.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
639
+ "model.visual.blocks.26.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
640
+ "model.visual.blocks.26.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
641
+ "model.visual.blocks.26.norm1.bias": "model-00004-of-00004.safetensors",
642
+ "model.visual.blocks.26.norm1.weight": "model-00004-of-00004.safetensors",
643
+ "model.visual.blocks.26.norm2.bias": "model-00004-of-00004.safetensors",
644
+ "model.visual.blocks.26.norm2.weight": "model-00004-of-00004.safetensors",
645
+ "model.visual.blocks.3.attn.proj.bias": "model-00004-of-00004.safetensors",
646
+ "model.visual.blocks.3.attn.proj.weight": "model-00004-of-00004.safetensors",
647
+ "model.visual.blocks.3.attn.qkv.bias": "model-00004-of-00004.safetensors",
648
+ "model.visual.blocks.3.attn.qkv.weight": "model-00004-of-00004.safetensors",
649
+ "model.visual.blocks.3.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
650
+ "model.visual.blocks.3.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
651
+ "model.visual.blocks.3.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
652
+ "model.visual.blocks.3.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
653
+ "model.visual.blocks.3.norm1.bias": "model-00004-of-00004.safetensors",
654
+ "model.visual.blocks.3.norm1.weight": "model-00004-of-00004.safetensors",
655
+ "model.visual.blocks.3.norm2.bias": "model-00004-of-00004.safetensors",
656
+ "model.visual.blocks.3.norm2.weight": "model-00004-of-00004.safetensors",
657
+ "model.visual.blocks.4.attn.proj.bias": "model-00004-of-00004.safetensors",
658
+ "model.visual.blocks.4.attn.proj.weight": "model-00004-of-00004.safetensors",
659
+ "model.visual.blocks.4.attn.qkv.bias": "model-00004-of-00004.safetensors",
660
+ "model.visual.blocks.4.attn.qkv.weight": "model-00004-of-00004.safetensors",
661
+ "model.visual.blocks.4.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
662
+ "model.visual.blocks.4.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
663
+ "model.visual.blocks.4.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
664
+ "model.visual.blocks.4.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
665
+ "model.visual.blocks.4.norm1.bias": "model-00004-of-00004.safetensors",
666
+ "model.visual.blocks.4.norm1.weight": "model-00004-of-00004.safetensors",
667
+ "model.visual.blocks.4.norm2.bias": "model-00004-of-00004.safetensors",
668
+ "model.visual.blocks.4.norm2.weight": "model-00004-of-00004.safetensors",
669
+ "model.visual.blocks.5.attn.proj.bias": "model-00004-of-00004.safetensors",
670
+ "model.visual.blocks.5.attn.proj.weight": "model-00004-of-00004.safetensors",
671
+ "model.visual.blocks.5.attn.qkv.bias": "model-00004-of-00004.safetensors",
672
+ "model.visual.blocks.5.attn.qkv.weight": "model-00004-of-00004.safetensors",
673
+ "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
674
+ "model.visual.blocks.5.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
675
+ "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
676
+ "model.visual.blocks.5.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
677
+ "model.visual.blocks.5.norm1.bias": "model-00004-of-00004.safetensors",
678
+ "model.visual.blocks.5.norm1.weight": "model-00004-of-00004.safetensors",
679
+ "model.visual.blocks.5.norm2.bias": "model-00004-of-00004.safetensors",
680
+ "model.visual.blocks.5.norm2.weight": "model-00004-of-00004.safetensors",
681
+ "model.visual.blocks.6.attn.proj.bias": "model-00004-of-00004.safetensors",
682
+ "model.visual.blocks.6.attn.proj.weight": "model-00004-of-00004.safetensors",
683
+ "model.visual.blocks.6.attn.qkv.bias": "model-00004-of-00004.safetensors",
684
+ "model.visual.blocks.6.attn.qkv.weight": "model-00004-of-00004.safetensors",
685
+ "model.visual.blocks.6.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
686
+ "model.visual.blocks.6.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
687
+ "model.visual.blocks.6.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
688
+ "model.visual.blocks.6.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
689
+ "model.visual.blocks.6.norm1.bias": "model-00004-of-00004.safetensors",
690
+ "model.visual.blocks.6.norm1.weight": "model-00004-of-00004.safetensors",
691
+ "model.visual.blocks.6.norm2.bias": "model-00004-of-00004.safetensors",
692
+ "model.visual.blocks.6.norm2.weight": "model-00004-of-00004.safetensors",
693
+ "model.visual.blocks.7.attn.proj.bias": "model-00004-of-00004.safetensors",
694
+ "model.visual.blocks.7.attn.proj.weight": "model-00004-of-00004.safetensors",
695
+ "model.visual.blocks.7.attn.qkv.bias": "model-00004-of-00004.safetensors",
696
+ "model.visual.blocks.7.attn.qkv.weight": "model-00004-of-00004.safetensors",
697
+ "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
698
+ "model.visual.blocks.7.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
699
+ "model.visual.blocks.7.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
700
+ "model.visual.blocks.7.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
701
+ "model.visual.blocks.7.norm1.bias": "model-00004-of-00004.safetensors",
702
+ "model.visual.blocks.7.norm1.weight": "model-00004-of-00004.safetensors",
703
+ "model.visual.blocks.7.norm2.bias": "model-00004-of-00004.safetensors",
704
+ "model.visual.blocks.7.norm2.weight": "model-00004-of-00004.safetensors",
705
+ "model.visual.blocks.8.attn.proj.bias": "model-00004-of-00004.safetensors",
706
+ "model.visual.blocks.8.attn.proj.weight": "model-00004-of-00004.safetensors",
707
+ "model.visual.blocks.8.attn.qkv.bias": "model-00004-of-00004.safetensors",
708
+ "model.visual.blocks.8.attn.qkv.weight": "model-00004-of-00004.safetensors",
709
+ "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
710
+ "model.visual.blocks.8.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
711
+ "model.visual.blocks.8.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
712
+ "model.visual.blocks.8.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
713
+ "model.visual.blocks.8.norm1.bias": "model-00004-of-00004.safetensors",
714
+ "model.visual.blocks.8.norm1.weight": "model-00004-of-00004.safetensors",
715
+ "model.visual.blocks.8.norm2.bias": "model-00004-of-00004.safetensors",
716
+ "model.visual.blocks.8.norm2.weight": "model-00004-of-00004.safetensors",
717
+ "model.visual.blocks.9.attn.proj.bias": "model-00004-of-00004.safetensors",
718
+ "model.visual.blocks.9.attn.proj.weight": "model-00004-of-00004.safetensors",
719
+ "model.visual.blocks.9.attn.qkv.bias": "model-00004-of-00004.safetensors",
720
+ "model.visual.blocks.9.attn.qkv.weight": "model-00004-of-00004.safetensors",
721
+ "model.visual.blocks.9.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
722
+ "model.visual.blocks.9.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
723
+ "model.visual.blocks.9.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
724
+ "model.visual.blocks.9.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
725
+ "model.visual.blocks.9.norm1.bias": "model-00004-of-00004.safetensors",
726
+ "model.visual.blocks.9.norm1.weight": "model-00004-of-00004.safetensors",
727
+ "model.visual.blocks.9.norm2.bias": "model-00004-of-00004.safetensors",
728
+ "model.visual.blocks.9.norm2.weight": "model-00004-of-00004.safetensors",
729
+ "model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00004-of-00004.safetensors",
730
+ "model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00004-of-00004.safetensors",
731
+ "model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00004-of-00004.safetensors",
732
+ "model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00004-of-00004.safetensors",
733
+ "model.visual.deepstack_merger_list.0.norm.bias": "model-00004-of-00004.safetensors",
734
+ "model.visual.deepstack_merger_list.0.norm.weight": "model-00004-of-00004.safetensors",
735
+ "model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00004-of-00004.safetensors",
736
+ "model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00004-of-00004.safetensors",
737
+ "model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00004-of-00004.safetensors",
738
+ "model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00004-of-00004.safetensors",
739
+ "model.visual.deepstack_merger_list.1.norm.bias": "model-00004-of-00004.safetensors",
740
+ "model.visual.deepstack_merger_list.1.norm.weight": "model-00004-of-00004.safetensors",
741
+ "model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00004-of-00004.safetensors",
742
+ "model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00004-of-00004.safetensors",
743
+ "model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00004-of-00004.safetensors",
744
+ "model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00004-of-00004.safetensors",
745
+ "model.visual.deepstack_merger_list.2.norm.bias": "model-00004-of-00004.safetensors",
746
+ "model.visual.deepstack_merger_list.2.norm.weight": "model-00004-of-00004.safetensors",
747
+ "model.visual.merger.linear_fc1.bias": "model-00004-of-00004.safetensors",
748
+ "model.visual.merger.linear_fc1.weight": "model-00004-of-00004.safetensors",
749
+ "model.visual.merger.linear_fc2.bias": "model-00004-of-00004.safetensors",
750
+ "model.visual.merger.linear_fc2.weight": "model-00004-of-00004.safetensors",
751
+ "model.visual.merger.norm.bias": "model-00004-of-00004.safetensors",
752
+ "model.visual.merger.norm.weight": "model-00004-of-00004.safetensors",
753
+ "model.visual.patch_embed.proj.bias": "model-00004-of-00004.safetensors",
754
+ "model.visual.patch_embed.proj.weight": "model-00004-of-00004.safetensors",
755
+ "model.visual.pos_embed.weight": "model-00004-of-00004.safetensors"
756
+ }
757
+ }
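Every `model.visual.*` tensor above lands in the fourth and last MLLM shard. A minimal sketch for sanity-checking a downloaded checkpoint against an index like this one (the local path is a placeholder, and `metadata` may be absent in some indexes):

```python
import json
from collections import Counter

# Placeholder path to a transformers-style sharded-checkpoint index.
index_path = "model.safetensors.index.json"

with open(index_path) as f:
    index = json.load(f)

# Count how many tensors each shard holds.
per_shard = Counter(index["weight_map"].values())
for shard, n_tensors in sorted(per_shard.items()):
    print(f"{shard}: {n_tensors} tensors")

# The declared byte size of all tensors, if the index carries metadata.
print("total_size:", index.get("metadata", {}).get("total_size"))
```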
model_index.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "_class_name": "ThinkGenPipeline",
+ "_diffusers_version": "0.34.1",
+ "scheduler": [
+ "scheduling_flow_match_euler_discrete",
+ "FlowMatchEulerDiscreteScheduler"
+ ],
+ "transformer": [
+ "transformer_thinkgen",
+ "ThinkGenTransformer2DModel"
+ ],
+ "vae": [
+ "diffusers",
+ "AutoencoderKL"
+ ],
+ "mllm": [
+ "transformers",
+ "Qwen3VLForConditionalGeneration"
+ ],
+ "processor": [
+ "transformers",
+ "Qwen3VLProcessor"
+ ]
+ }
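`model_index.json` declares how the pipeline is assembled: the scheduler and DiT come from local modules in this repo (`scheduling_flow_match_euler_discrete`, `transformer_thinkgen`), while the VAE, MLLM, and processor are stock `diffusers`/`transformers` classes. A sketch that inspects the component map without loading any weights:

```python
import json

with open("model_index.json") as f:
    model_index = json.load(f)

for name, spec in model_index.items():
    if name.startswith("_"):
        continue  # skip _class_name / _diffusers_version metadata
    module, cls = spec
    print(f"{name:12s} -> {module}.{cls}")
```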
processor/added_tokens.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "</think>": 151668,
+ "</tool_call>": 151658,
+ "</tool_response>": 151666,
+ "<think>": 151667,
+ "<tool_call>": 151657,
+ "<tool_response>": 151665,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
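These IDs should agree with the `added_tokens_decoder` table in `processor/tokenizer_config.json` below; note that `<think>`/`</think>` (151667/151668) are registered but not marked special, so they survive decoding. A quick consistency check (the local path layout is an assumption):

```python
from transformers import AutoTokenizer

# Assumes the repo was downloaded locally; "processor" is the subfolder in this commit.
tokenizer = AutoTokenizer.from_pretrained(".", subfolder="processor")

for token in ("<think>", "</think>", "<|image_pad|>", "<|vision_start|>"):
    print(token, tokenizer.convert_tokens_to_ids(token))
# Expected from added_tokens.json: 151667, 151668, 151655, 151652
```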
processor/chat_template.jinja ADDED
@@ -0,0 +1,110 @@
+ {%- set image_count = namespace(value=0) %}
+ {%- set video_count = namespace(value=0) %}
+ {%- macro render_content(content, do_vision_count) %}
+ {%- if content is string %}
+ {{- content }}
+ {%- else %}
+ {%- for item in content %}
+ {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+ {%- if do_vision_count %}
+ {%- set image_count.value = image_count.value + 1 %}
+ {%- endif %}
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+ <|vision_start|><|image_pad|><|vision_end|>
+ {%- elif 'video' in item or item.type == 'video' %}
+ {%- if do_vision_count %}
+ {%- set video_count.value = video_count.value + 1 %}
+ {%- endif %}
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+ <|vision_start|><|video_pad|><|vision_end|>
+ {%- elif 'text' in item %}
+ {{- item.text }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {%- endmacro %}
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- render_content(messages[0].content, false) + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + render_content(messages[0].content, false) + '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+ {%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" %}
+ {%- set content = render_content(message.content, false) %}
+ {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+ {%- set content = render_content(message.content, True) %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is string %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in content %}
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n<think>\n' }}
+ {%- endif %}
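Note that with `add_generation_prompt` this template ends the prompt with `<|im_start|>assistant\n<think>\n`, so decoding starts inside a reasoning block by default; `reasoning_content` of earlier assistant turns is re-wrapped in `<think>` tags only for turns after the last user query. A sketch of rendering it (which of the two shipped templates, this `.jinja` file or the legacy string in `chat_template.json` below, gets picked up depends on the transformers version, so treat this as an assumption):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".", subfolder="processor")

messages = [{"role": "user", "content": "Describe a golden wheat field at sunset."}]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)  # should end with "<|im_start|>assistant\n<think>\n"
```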
processor/chat_template.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are MiMo, an AI assistant developed by Xiaomi.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+ }
processor/preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+ "crop_size": null,
+ "data_format": "channels_first",
+ "default_to_square": true,
+ "device": null,
+ "disable_grouping": null,
+ "do_center_crop": null,
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_pad": null,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "image_processor_type": "Qwen2VLImageProcessorFast",
+ "image_std": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "input_data_format": null,
+ "max_pixels": null,
+ "merge_size": 2,
+ "min_pixels": null,
+ "pad_size": null,
+ "patch_size": 16,
+ "processor_class": "Qwen3VLProcessor",
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "return_tensors": null,
+ "size": {
+ "longest_edge": 16777216,
+ "shortest_edge": 65536
+ },
+ "temporal_patch_size": 2
+ }
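With `patch_size: 16` and `merge_size: 2`, an image yields roughly `(H/16)·(W/16)/4` visual tokens after the 2×2 patch merge; reading `size.shortest_edge`/`size.longest_edge` as pixel-area budgets (256² to 4096²), per the Qwen-VL convention, is an assumption based on the `Qwen2VLImageProcessorFast` defaults. A quick token-count sketch for a 1024×1024 input:

```python
patch_size, merge_size = 16, 2  # from preprocessor_config.json
h = w = 1024

patches = (h // patch_size) * (w // patch_size)  # 64 * 64 = 4096 patches
tokens = patches // (merge_size * merge_size)    # 2x2 merge -> 1024 visual tokens
print(tokens)
```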
processor/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
processor/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
processor/tokenizer_config.json ADDED
@@ -0,0 +1,241 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151665": {
+ "content": "<tool_response>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151666": {
+ "content": "</tool_response>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151667": {
+ "content": "<think>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151668": {
+ "content": "</think>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 262144,
+ "pad_token": "<|endoftext|>",
+ "processor_class": "Qwen3VLProcessor",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null,
+ "use_fast": true
+ }
processor/video_preprocessor_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "crop_size": null,
+ "data_format": "channels_first",
+ "default_to_square": true,
+ "device": null,
+ "do_center_crop": null,
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "do_sample_frames": true,
+ "fps": 2,
+ "image_mean": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "image_std": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "input_data_format": null,
+ "max_frames": 768,
+ "merge_size": 2,
+ "min_frames": 4,
+ "num_frames": null,
+ "pad_size": null,
+ "patch_size": 16,
+ "processor_class": "Qwen3VLProcessor",
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "return_metadata": false,
+ "size": {
+ "longest_edge": 25165824,
+ "shortest_edge": 4096
+ },
+ "temporal_patch_size": 2,
+ "video_metadata": null,
+ "video_processor_type": "Qwen3VLVideoProcessor"
+ }
processor/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
+ "_diffusers_version": "0.34.1",
+ "dynamic_time_shift": true,
+ "num_train_timesteps": 1000
+ }
scheduler/scheduling_flow_match_euler_discrete.py ADDED
@@ -0,0 +1,228 @@
+ # Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from dataclasses import dataclass
+ from typing import List, Optional, Tuple, Union
+
+ import numpy as np
+ import torch
+
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.utils import BaseOutput, logging
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
+
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+ @dataclass
+ class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
+     """
+     Output class for the scheduler's `step` function output.
+
+     Args:
+         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+             Computed sample `(x_{t-1})` of the previous timestep. `prev_sample` should be used as the next
+             model input in the denoising loop.
+     """
+
+     prev_sample: torch.FloatTensor
+
+
+ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+     """
+     Euler scheduler for flow matching.
+
+     This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for
+     the generic methods the library implements for all schedulers, such as loading and saving.
+
+     Args:
+         num_train_timesteps (`int`, defaults to 1000):
+             The number of diffusion steps used to train the model.
+         dynamic_time_shift (`bool`, defaults to `False`):
+             Whether to warp the inference timestep grid based on the number of image tokens (see
+             `set_timesteps`).
+     """
+
+     _compatibles = []
+     order = 1
+
+     @register_to_config
+     def __init__(
+         self,
+         num_train_timesteps: int = 1000,
+         dynamic_time_shift: bool = False
+     ):
+         timesteps = torch.linspace(0, 1, num_train_timesteps + 1, dtype=torch.float32)[:-1]
+
+         self.timesteps = timesteps
+
+         self._step_index = None
+         self._begin_index = None
+
+     @property
+     def step_index(self):
+         """
+         The index counter for the current timestep. It increases by 1 after each scheduler step.
+         """
+         return self._step_index
+
+     @property
+     def begin_index(self):
+         """
+         The index for the first timestep. It should be set from the pipeline with the `set_begin_index` method.
+         """
+         return self._begin_index
+
+     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+     def set_begin_index(self, begin_index: int = 0):
+         """
+         Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
+
+         Args:
+             begin_index (`int`):
+                 The begin index for the scheduler.
+         """
+         self._begin_index = begin_index
+
+     def index_for_timestep(self, timestep, schedule_timesteps=None):
+         if schedule_timesteps is None:
+             schedule_timesteps = self._timesteps
+
+         indices = (schedule_timesteps == timestep).nonzero()
+
+         # The sigma index that is taken for the **very** first `step`
+         # is always the second index (or the last index if there is only 1)
+         # This way we can ensure we don't accidentally skip a sigma in
+         # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+         pos = 1 if len(indices) > 1 else 0
+
+         return indices[pos].item()
+
+     # def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
+     #     return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+
+     def set_timesteps(
+         self,
+         num_inference_steps: int = None,
+         device: Union[str, torch.device] = None,
+         timesteps: Optional[List[float]] = None,
+         num_tokens: Optional[int] = None
+     ):
+         """
+         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+         Args:
+             num_inference_steps (`int`):
+                 The number of diffusion steps used when generating samples with a pre-trained model.
+             device (`str` or `torch.device`, *optional*):
+                 The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
+             timesteps (`List[float]`, *optional*):
+                 Custom timesteps in `[0, 1)`. If given, `num_inference_steps` and the dynamic time shift
+                 are ignored.
+             num_tokens (`int`, *optional*):
+                 The number of image tokens, used to compute the dynamic time shift when the scheduler is
+                 configured with `dynamic_time_shift=True`.
+         """
+
+         if timesteps is None:
+             self.num_inference_steps = num_inference_steps
+             timesteps = np.linspace(0, 1, num_inference_steps + 1, dtype=np.float32)[:-1]
+             if self.config.dynamic_time_shift and num_tokens is not None:
+                 m = np.sqrt(num_tokens) / 40  # when input resolution is 320 * 320, m = 1; when input resolution is 1024 * 1024, m = 3.2
+                 timesteps = timesteps / (m - m * timesteps + timesteps)
+
+         # np.asarray also accepts user-provided Python lists for `timesteps`
+         timesteps = torch.from_numpy(np.asarray(timesteps)).to(dtype=torch.float32, device=device)
+         _timesteps = torch.cat([timesteps, torch.ones(1, device=timesteps.device)])
+
+         self.timesteps = timesteps
+         self._timesteps = _timesteps
+         self._step_index = None
+         self._begin_index = None
+
+     def _init_step_index(self, timestep):
+         if self.begin_index is None:
+             if isinstance(timestep, torch.Tensor):
+                 timestep = timestep.to(self.timesteps.device)
+             self._step_index = self.index_for_timestep(timestep)
+         else:
+             self._step_index = self._begin_index
+
+     def step(
+         self,
+         model_output: torch.FloatTensor,
+         timestep: Union[float, torch.FloatTensor],
+         sample: torch.FloatTensor,
+         generator: Optional[torch.Generator] = None,
+         return_dict: bool = True,
+     ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
+         """
+         Predict the sample from the previous timestep by reversing the SDE. This function propagates the
+         diffusion process from the learned model outputs (here, the predicted velocity).
+
+         Args:
+             model_output (`torch.FloatTensor`):
+                 The direct output from the learned diffusion model.
+             timestep (`float`):
+                 The current discrete timestep in the diffusion chain.
+             sample (`torch.FloatTensor`):
+                 A current instance of a sample created by the diffusion process.
+             generator (`torch.Generator`, *optional*):
+                 A random number generator.
+             return_dict (`bool`):
+                 Whether or not to return a [`FlowMatchEulerDiscreteSchedulerOutput`] or tuple.
+
+         Returns:
+             [`FlowMatchEulerDiscreteSchedulerOutput`] or `tuple`:
+                 If `return_dict` is `True`, [`FlowMatchEulerDiscreteSchedulerOutput`] is returned, otherwise
+                 a tuple is returned where the first element is the sample tensor.
+         """
+
+         if (
+             isinstance(timestep, int)
+             or isinstance(timestep, torch.IntTensor)
+             or isinstance(timestep, torch.LongTensor)
+         ):
+             raise ValueError(
+                 (
+                     "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                     " `FlowMatchEulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                     " one of the `scheduler.timesteps` as a timestep."
+                 ),
+             )
+
+         if self.step_index is None:
+             self._init_step_index(timestep)
+         # Upcast to avoid precision issues when computing prev_sample
+         sample = sample.to(torch.float32)
+         t = self._timesteps[self.step_index]
+         t_next = self._timesteps[self.step_index + 1]
+
+         prev_sample = sample + (t_next - t) * model_output
+
+         # Cast sample back to model compatible dtype
+         prev_sample = prev_sample.to(model_output.dtype)
+
+         # upon completion increase step index by one
+         self._step_index += 1
+
+         if not return_dict:
+             return (prev_sample,)
+
+         return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
+
+     def __len__(self):
+         return self.config.num_train_timesteps
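The only non-standard piece here is `dynamic_time_shift`: `set_timesteps` warps the uniform grid by `t' = t / (m - m·t + t)` with `m = sqrt(num_tokens) / 40`, spending more steps at high noise as resolution grows (per the inline comment, `m = 1` at 320×320 and `m = 3.2` at 1024×1024, which implies `num_tokens` is the latent pixel count). A minimal denoising-loop sketch; the zero velocity is a stand-in for the DiT call:

```python
import torch

# FlowMatchEulerDiscreteScheduler as defined in the file above.
scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, dynamic_time_shift=True)

latents = torch.randn(1, 16, 128, 128)              # in_channels=16; 128x128 latent
num_tokens = latents.shape[-2] * latents.shape[-1]  # 16384 -> m = sqrt(16384)/40 = 3.2

scheduler.set_timesteps(num_inference_steps=50, num_tokens=num_tokens)
for t in scheduler.timesteps:
    velocity = torch.zeros_like(latents)  # placeholder for the transformer's prediction
    latents = scheduler.step(velocity, t, latents).prev_sample
```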
transformer/config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "_class_name": "ThinkGenTransformer2DModel",
+ "_diffusers_version": "0.34.1",
+ "axes_dim_rope": [
+ 40,
+ 40,
+ 40
+ ],
+ "axes_lens": [
+ 10000,
+ 10000,
+ 10000
+ ],
+ "ffn_dim_multiplier": null,
+ "hidden_size": 2520,
+ "in_channels": 16,
+ "multiple_of": 256,
+ "norm_eps": 1e-05,
+ "num_attention_heads": 21,
+ "num_kv_heads": 7,
+ "num_layers": 32,
+ "num_refiner_layers": 2,
+ "out_channels": null,
+ "patch_size": 2,
+ "text_feat_dim": 4096,
+ "timestep_scale": 1000.0
+ }
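These values are internally consistent: `hidden_size / num_attention_heads = 2520 / 21 = 120`, which equals `sum(axes_dim_rope) = 40 + 40 + 40`, so the 3D RoPE axes exactly cover each head, and `num_attention_heads / num_kv_heads = 3` query heads share each KV head (grouped-query attention). A quick check:

```python
hidden_size, num_heads, num_kv_heads = 2520, 21, 7  # from transformer/config.json
axes_dim_rope = [40, 40, 40]

head_dim = hidden_size // num_heads
assert head_dim == sum(axes_dim_rope) == 120  # 3D RoPE spans the full head dim
assert num_heads // num_kv_heads == 3         # GQA: 3 query heads per KV head
```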
transformer/diffusion_pytorch_model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:322d806247df29cd16a1e8d7ddf307844ec13656172bf8e36d80562fcf8fb62f
+ size 9913126464
transformer/diffusion_pytorch_model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a41d01a025b8166aafe6fa1b605176f6be5136fd1f1e018ce426816b591ac33
+ size 6018290672
transformer/diffusion_pytorch_model.safetensors.index.json ADDED
@@ -0,0 +1,591 @@
+ {
+ "metadata": {
+ "total_size": 15931355544
+ },
+ "weight_map": {
+ "context_refiner.0.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.0.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "context_refiner.1.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "image_index_embedding": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.0.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.1.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.10.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.11.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.12.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
+ "layers.13.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
118
+ "layers.13.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
119
+ "layers.13.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
120
+ "layers.13.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
121
+ "layers.13.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
122
+ "layers.13.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
123
+ "layers.14.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
124
+ "layers.14.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
125
+ "layers.14.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
126
+ "layers.14.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
127
+ "layers.14.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
128
+ "layers.14.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
129
+ "layers.14.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
130
+ "layers.14.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
131
+ "layers.14.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
132
+ "layers.14.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
133
+ "layers.14.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
134
+ "layers.14.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
135
+ "layers.14.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
136
+ "layers.14.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
137
+ "layers.14.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
138
+ "layers.15.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
139
+ "layers.15.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
140
+ "layers.15.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
141
+ "layers.15.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
142
+ "layers.15.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
143
+ "layers.15.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
144
+ "layers.15.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
145
+ "layers.15.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
146
+ "layers.15.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
147
+ "layers.15.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
148
+ "layers.15.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
149
+ "layers.15.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
150
+ "layers.15.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
151
+ "layers.15.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
152
+ "layers.15.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
153
+ "layers.16.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
154
+ "layers.16.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
155
+ "layers.16.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
156
+ "layers.16.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
157
+ "layers.16.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
158
+ "layers.16.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
159
+ "layers.16.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
160
+ "layers.16.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
161
+ "layers.16.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
162
+ "layers.16.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
163
+ "layers.16.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
164
+ "layers.16.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
165
+ "layers.16.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
166
+ "layers.16.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
167
+ "layers.16.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
168
+ "layers.17.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
169
+ "layers.17.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
170
+ "layers.17.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
171
+ "layers.17.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
172
+ "layers.17.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
173
+ "layers.17.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
174
+ "layers.17.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
175
+ "layers.17.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
176
+ "layers.17.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
177
+ "layers.17.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
178
+ "layers.17.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
179
+ "layers.17.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
180
+ "layers.17.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
181
+ "layers.17.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
182
+ "layers.17.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
183
+ "layers.18.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
184
+ "layers.18.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
185
+ "layers.18.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
186
+ "layers.18.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
187
+ "layers.18.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
188
+ "layers.18.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
189
+ "layers.18.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
190
+ "layers.18.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
191
+ "layers.18.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
192
+ "layers.18.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
193
+ "layers.18.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
194
+ "layers.18.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
195
+ "layers.18.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
196
+ "layers.18.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
197
+ "layers.18.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
198
+ "layers.19.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
199
+ "layers.19.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
200
+ "layers.19.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
201
+ "layers.19.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
202
+ "layers.19.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
203
+ "layers.19.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
204
+ "layers.19.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
205
+ "layers.19.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
206
+ "layers.19.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
207
+ "layers.19.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
208
+ "layers.19.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
209
+ "layers.19.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
210
+ "layers.19.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
211
+ "layers.19.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
212
+ "layers.19.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
213
+ "layers.2.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
214
+ "layers.2.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
215
+ "layers.2.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
216
+ "layers.2.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
217
+ "layers.2.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
218
+ "layers.2.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
219
+ "layers.2.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
220
+ "layers.2.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
221
+ "layers.2.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
222
+ "layers.2.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
223
+ "layers.2.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
224
+ "layers.2.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
225
+ "layers.2.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
226
+ "layers.2.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
227
+ "layers.2.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
228
+ "layers.20.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
229
+ "layers.20.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
230
+ "layers.20.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
231
+ "layers.20.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
232
+ "layers.20.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
233
+ "layers.20.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
234
+ "layers.20.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
235
+ "layers.20.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
236
+ "layers.20.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
237
+ "layers.20.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
238
+ "layers.20.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
239
+ "layers.20.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
240
+ "layers.20.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
241
+ "layers.20.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
242
+ "layers.20.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
243
+ "layers.21.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
244
+ "layers.21.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
245
+ "layers.21.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
246
+ "layers.21.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
247
+ "layers.21.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
248
+ "layers.21.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
249
+ "layers.21.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
250
+ "layers.21.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
251
+ "layers.21.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
252
+ "layers.21.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
253
+ "layers.21.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
254
+ "layers.21.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
255
+ "layers.21.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
256
+ "layers.21.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
257
+ "layers.21.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
258
+ "layers.22.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
259
+ "layers.22.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
260
+ "layers.22.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
261
+ "layers.22.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
262
+ "layers.22.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
263
+ "layers.22.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
264
+ "layers.22.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
265
+ "layers.22.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
266
+ "layers.22.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
267
+ "layers.22.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
268
+ "layers.22.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
269
+ "layers.22.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
270
+ "layers.22.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
271
+ "layers.22.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
272
+ "layers.22.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
273
+ "layers.23.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
274
+ "layers.23.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
275
+ "layers.23.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
276
+ "layers.23.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
277
+ "layers.23.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
278
+ "layers.23.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
279
+ "layers.23.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
280
+ "layers.23.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
281
+ "layers.23.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
282
+ "layers.23.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
283
+ "layers.23.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
284
+ "layers.23.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
285
+ "layers.23.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
286
+ "layers.23.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
287
+ "layers.23.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
288
+ "layers.24.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
289
+ "layers.24.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
290
+ "layers.24.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
291
+ "layers.24.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
292
+ "layers.24.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
293
+ "layers.24.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
294
+ "layers.24.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
295
+ "layers.24.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
296
+ "layers.24.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
297
+ "layers.24.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
298
+ "layers.24.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
299
+ "layers.24.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
300
+ "layers.24.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
301
+ "layers.24.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
302
+ "layers.24.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
303
+ "layers.25.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
304
+ "layers.25.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
305
+ "layers.25.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
306
+ "layers.25.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
307
+ "layers.25.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
308
+ "layers.25.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
309
+ "layers.25.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
310
+ "layers.25.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
311
+ "layers.25.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
312
+ "layers.25.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
313
+ "layers.25.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
314
+ "layers.25.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
315
+ "layers.25.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
316
+ "layers.25.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
317
+ "layers.25.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
318
+ "layers.26.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
319
+ "layers.26.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
320
+ "layers.26.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
321
+ "layers.26.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
322
+ "layers.26.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
323
+ "layers.26.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
324
+ "layers.26.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
325
+ "layers.26.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
326
+ "layers.26.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
327
+ "layers.26.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
328
+ "layers.26.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
329
+ "layers.26.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
330
+ "layers.26.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
331
+ "layers.26.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
332
+ "layers.26.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
333
+ "layers.27.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
334
+ "layers.27.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
335
+ "layers.27.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
336
+ "layers.27.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
337
+ "layers.27.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
338
+ "layers.27.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
339
+ "layers.27.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
340
+ "layers.27.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
341
+ "layers.27.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
342
+ "layers.27.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
343
+ "layers.27.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
344
+ "layers.27.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
345
+ "layers.27.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
346
+ "layers.27.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
347
+ "layers.27.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
348
+ "layers.28.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
349
+ "layers.28.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
350
+ "layers.28.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
351
+ "layers.28.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
352
+ "layers.28.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
353
+ "layers.28.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
354
+ "layers.28.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
355
+ "layers.28.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
356
+ "layers.28.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
357
+ "layers.28.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
358
+ "layers.28.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
359
+ "layers.28.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
360
+ "layers.28.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
361
+ "layers.28.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
362
+ "layers.28.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
363
+ "layers.29.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
364
+ "layers.29.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
365
+ "layers.29.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
366
+ "layers.29.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
367
+ "layers.29.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
368
+ "layers.29.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
369
+ "layers.29.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
370
+ "layers.29.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
371
+ "layers.29.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
372
+ "layers.29.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
373
+ "layers.29.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
374
+ "layers.29.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
375
+ "layers.29.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
376
+ "layers.29.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
377
+ "layers.29.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
378
+ "layers.3.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
379
+ "layers.3.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
380
+ "layers.3.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
381
+ "layers.3.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
382
+ "layers.3.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
383
+ "layers.3.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
384
+ "layers.3.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
385
+ "layers.3.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
386
+ "layers.3.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
387
+ "layers.3.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
388
+ "layers.3.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
389
+ "layers.3.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
390
+ "layers.3.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
391
+ "layers.3.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
392
+ "layers.3.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
393
+ "layers.30.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
394
+ "layers.30.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
395
+ "layers.30.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
396
+ "layers.30.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
397
+ "layers.30.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
398
+ "layers.30.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
399
+ "layers.30.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
400
+ "layers.30.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
401
+ "layers.30.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
402
+ "layers.30.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
403
+ "layers.30.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
404
+ "layers.30.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
405
+ "layers.30.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
406
+ "layers.30.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
407
+ "layers.30.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
408
+ "layers.31.attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
409
+ "layers.31.attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
410
+ "layers.31.attn.to_k.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
411
+ "layers.31.attn.to_out.0.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
412
+ "layers.31.attn.to_q.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
413
+ "layers.31.attn.to_v.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
414
+ "layers.31.feed_forward.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
415
+ "layers.31.feed_forward.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
416
+ "layers.31.feed_forward.linear_3.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
417
+ "layers.31.ffn_norm1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
418
+ "layers.31.ffn_norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
419
+ "layers.31.norm1.linear.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
420
+ "layers.31.norm1.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
421
+ "layers.31.norm1.norm.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
422
+ "layers.31.norm2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
423
+ "layers.4.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
424
+ "layers.4.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
425
+ "layers.4.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
426
+ "layers.4.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
427
+ "layers.4.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
428
+ "layers.4.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
429
+ "layers.4.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
430
+ "layers.4.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
431
+ "layers.4.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
432
+ "layers.4.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
433
+ "layers.4.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
434
+ "layers.4.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
435
+ "layers.4.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
436
+ "layers.4.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
437
+ "layers.4.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
438
+ "layers.5.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
439
+ "layers.5.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
440
+ "layers.5.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
441
+ "layers.5.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
442
+ "layers.5.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
443
+ "layers.5.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
444
+ "layers.5.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
445
+ "layers.5.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
446
+ "layers.5.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
447
+ "layers.5.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
448
+ "layers.5.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
449
+ "layers.5.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
450
+ "layers.5.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
451
+ "layers.5.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
452
+ "layers.5.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
453
+ "layers.6.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
454
+ "layers.6.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
455
+ "layers.6.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
456
+ "layers.6.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
457
+ "layers.6.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
458
+ "layers.6.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
459
+ "layers.6.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
460
+ "layers.6.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
461
+ "layers.6.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
462
+ "layers.6.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
463
+ "layers.6.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
464
+ "layers.6.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
465
+ "layers.6.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
466
+ "layers.6.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
467
+ "layers.6.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
468
+ "layers.7.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
469
+ "layers.7.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
470
+ "layers.7.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
471
+ "layers.7.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
472
+ "layers.7.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
473
+ "layers.7.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
474
+ "layers.7.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
475
+ "layers.7.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
476
+ "layers.7.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
477
+ "layers.7.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
478
+ "layers.7.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
479
+ "layers.7.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
480
+ "layers.7.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
481
+ "layers.7.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
482
+ "layers.7.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
483
+ "layers.8.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
484
+ "layers.8.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
485
+ "layers.8.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
486
+ "layers.8.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
487
+ "layers.8.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
488
+ "layers.8.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
489
+ "layers.8.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
490
+ "layers.8.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
491
+ "layers.8.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
492
+ "layers.8.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
493
+ "layers.8.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
494
+ "layers.8.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
495
+ "layers.8.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
496
+ "layers.8.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
497
+ "layers.8.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
498
+ "layers.9.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
499
+ "layers.9.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
500
+ "layers.9.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
501
+ "layers.9.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
502
+ "layers.9.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
503
+ "layers.9.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
504
+ "layers.9.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
505
+ "layers.9.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
506
+ "layers.9.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
507
+ "layers.9.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
508
+ "layers.9.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
509
+ "layers.9.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
510
+ "layers.9.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
511
+ "layers.9.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
512
+ "layers.9.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
513
+ "noise_refiner.0.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
514
+ "noise_refiner.0.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
515
+ "noise_refiner.0.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
516
+ "noise_refiner.0.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
517
+ "noise_refiner.0.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
518
+ "noise_refiner.0.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
519
+ "noise_refiner.0.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
520
+ "noise_refiner.0.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
521
+ "noise_refiner.0.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
522
+ "noise_refiner.0.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
523
+ "noise_refiner.0.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
524
+ "noise_refiner.0.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
525
+ "noise_refiner.0.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
526
+ "noise_refiner.0.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
527
+ "noise_refiner.0.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
528
+ "noise_refiner.1.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
529
+ "noise_refiner.1.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
530
+ "noise_refiner.1.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
531
+ "noise_refiner.1.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
532
+ "noise_refiner.1.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
533
+ "noise_refiner.1.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
534
+ "noise_refiner.1.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
535
+ "noise_refiner.1.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
536
+ "noise_refiner.1.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
537
+ "noise_refiner.1.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
538
+ "noise_refiner.1.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
539
+ "noise_refiner.1.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
540
+ "noise_refiner.1.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
541
+ "noise_refiner.1.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
542
+ "noise_refiner.1.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
543
+ "norm_out.linear_1.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
544
+ "norm_out.linear_1.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
545
+ "norm_out.linear_2.bias": "diffusion_pytorch_model-00002-of-00002.safetensors",
546
+ "norm_out.linear_2.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
547
+ "prepad_embed": "diffusion_pytorch_model-00001-of-00002.safetensors",
548
+ "prepad_mask": "diffusion_pytorch_model-00001-of-00002.safetensors",
549
+ "ref_image_patch_embedder.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
550
+ "ref_image_patch_embedder.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
551
+ "ref_image_refiner.0.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
552
+ "ref_image_refiner.0.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
553
+ "ref_image_refiner.0.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
554
+ "ref_image_refiner.0.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
555
+ "ref_image_refiner.0.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
556
+ "ref_image_refiner.0.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
557
+ "ref_image_refiner.0.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
558
+ "ref_image_refiner.0.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
559
+ "ref_image_refiner.0.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
560
+ "ref_image_refiner.0.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
561
+ "ref_image_refiner.0.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
562
+ "ref_image_refiner.0.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
563
+ "ref_image_refiner.0.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
564
+ "ref_image_refiner.0.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
565
+ "ref_image_refiner.0.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
566
+ "ref_image_refiner.1.attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
567
+ "ref_image_refiner.1.attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
568
+ "ref_image_refiner.1.attn.to_k.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
569
+ "ref_image_refiner.1.attn.to_out.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
570
+ "ref_image_refiner.1.attn.to_q.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
571
+ "ref_image_refiner.1.attn.to_v.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
572
+ "ref_image_refiner.1.feed_forward.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
573
+ "ref_image_refiner.1.feed_forward.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
574
+ "ref_image_refiner.1.feed_forward.linear_3.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
575
+ "ref_image_refiner.1.ffn_norm1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
576
+ "ref_image_refiner.1.ffn_norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
577
+ "ref_image_refiner.1.norm1.linear.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
578
+ "ref_image_refiner.1.norm1.linear.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
579
+ "ref_image_refiner.1.norm1.norm.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
580
+ "ref_image_refiner.1.norm2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
581
+ "time_caption_embed.caption_embedder.0.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
582
+ "time_caption_embed.caption_embedder.1.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
583
+ "time_caption_embed.caption_embedder.1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
584
+ "time_caption_embed.timestep_embedder.linear_1.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
585
+ "time_caption_embed.timestep_embedder.linear_1.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
586
+ "time_caption_embed.timestep_embedder.linear_2.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
587
+ "time_caption_embed.timestep_embedder.linear_2.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
588
+ "x_embedder.bias": "diffusion_pytorch_model-00001-of-00002.safetensors",
589
+ "x_embedder.weight": "diffusion_pytorch_model-00001-of-00002.safetensors"
590
+ }
591
+ }
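The block above is a standard sharded-safetensors index: its `weight_map` assigns every parameter name to the shard file that stores it, so a loader only opens the shards it actually needs. `DiffusionPipeline.from_pretrained` resolves this automatically; the sketch below shows the same lookup done by hand (the local directory path is a placeholder, and the index filename is inferred from the shard names listed above):

```python
import json
from safetensors import safe_open

# Placeholder path to a local checkout of this repo's transformer/ folder.
ckpt_dir = "ThinkGen-stage3/transformer"

with open(f"{ckpt_dir}/diffusion_pytorch_model.safetensors.index.json") as f:
    index = json.load(f)
weight_map = index["weight_map"]  # parameter name -> shard filename, as listed above

# Load a single tensor from whichever shard the index points to.
name = "layers.17.ffn_norm1.weight"  # stored in shard 00002 according to the map
with safe_open(f"{ckpt_dir}/{weight_map[name]}", framework="pt", device="cpu") as shard:
    tensor = shard.get_tensor(name)
print(name, tuple(tensor.shape))
```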
transformer/transformer_thinkgen.py ADDED
@@ -0,0 +1,2457 @@
+import warnings
+import itertools
+from typing import Any, Dict, List, Optional, Tuple, Union
+from dataclasses import dataclass
+import math
+import numpy as np
+
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from einops import rearrange, repeat
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import PeftAdapterMixin
+from diffusers.loaders.single_file_model import FromOriginalModelMixin
+from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.models.attention_processor import Attention
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.embeddings import get_1d_rotary_pos_embed
+from diffusers.models.activations import get_activation
+from diffusers.models.embeddings import Timesteps
+
+
+import importlib.util
+import sys
+
+# The importlib metadata API lives in a different place depending on the Python version.
+if sys.version_info < (3, 8):
+    import importlib_metadata
+else:
+    import importlib.metadata as importlib_metadata
+
+def _is_package_available(pkg_name: str):
+    # Check both that the package can be found and that its metadata resolves.
+    pkg_exists = importlib.util.find_spec(pkg_name) is not None
+    pkg_version = "N/A"
+
+    if pkg_exists:
+        try:
+            pkg_version = importlib_metadata.version(pkg_name)
+        except (ImportError, importlib_metadata.PackageNotFoundError):
+            pkg_exists = False
+
+    return pkg_exists, pkg_version
+
+_triton_available, _triton_version = _is_package_available("triton")
+_flash_attn_available, _flash_attn_version = _is_package_available("flash_attn")
+
+def is_triton_available():
+    return _triton_available
+
+def is_flash_attn_available():
+    return _flash_attn_available
+
+if is_flash_attn_available():
+    from flash_attn import flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
+else:
+    warnings.warn("Cannot import flash_attn; install flash_attn to use Flash2Varlen attention for better performance")
+
+
+if is_triton_available():
+    # from ...ops.triton.layer_norm import RMSNorm
+    import triton
+    import triton.language as tl
+
+
+from typing import Callable
+
+
+def custom_amp_decorator(dec: Callable, cuda_amp_deprecated: bool):
+    # Wrap torch's custom_fwd/custom_bwd so the same call works on both AMP APIs.
+    def decorator(*args, **kwargs):
+        if cuda_amp_deprecated:
+            kwargs["device_type"] = "cuda"
+        return dec(*args, **kwargs)
+    return decorator
+
+
+if hasattr(torch.amp, "custom_fwd"):  # type: ignore[attr-defined]
+    deprecated = True
+    from torch.amp import custom_fwd, custom_bwd  # type: ignore[attr-defined]
+else:
+    deprecated = False
+    from torch.cuda.amp import custom_fwd, custom_bwd
+
+custom_fwd = custom_amp_decorator(custom_fwd, deprecated)
+custom_bwd = custom_amp_decorator(custom_bwd, deprecated)
+
+
+def triton_autotune_configs():
+    # Return configs with a valid warp count for the current device
+    configs = []
+    # Maximum threads per block is architecture-dependent in theory, but in practice it is 1024 on all current devices
+    max_threads_per_block = 1024
+    # Default to warp size 32 if not defined by the device
+    warp_size = getattr(torch.cuda.get_device_properties(torch.cuda.current_device()), "warp_size", 32)
+    # Autotune over warp counts that are powers of 2 and do not exceed the threads-per-block limit
+    warp_count = 1
+    while warp_count * warp_size <= max_threads_per_block:
+        configs.append(triton.Config({}, num_warps=warp_count))
+        warp_count *= 2
+    return configs
+
+@triton.autotune(
+    configs=triton_autotune_configs(),
+    key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
+)
+# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
+# @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
+@triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
+@triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
+@triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
+@triton.jit
+def _layer_norm_fwd_1pass_kernel(
+    X,  # pointer to the input
+    Y,  # pointer to the output
+    W,  # pointer to the weights
+    B,  # pointer to the biases
+    RESIDUAL,  # pointer to the residual
+    X1,
+    W1,
+    B1,
+    Y1,
+    RESIDUAL_OUT,  # pointer to the residual output
+    ROWSCALE,
+    SEEDS,  # dropout seeds for each row
+    DROPOUT_MASK,
+    Mean,  # pointer to the mean
+    Rstd,  # pointer to the 1/std
+    stride_x_row,  # how much to increase the pointer when moving by 1 row
+    stride_y_row,
+    stride_res_row,
+    stride_res_out_row,
+    stride_x1_row,
+    stride_y1_row,
+    M,  # number of rows in X
+    N,  # number of columns in X
+    eps,  # epsilon to avoid division by zero
+    dropout_p,  # dropout probability
+    zero_centered_weight,  # if true, add 1.0 to the weight
+    IS_RMS_NORM: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    HAS_RESIDUAL: tl.constexpr,
+    STORE_RESIDUAL_OUT: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_DROPOUT: tl.constexpr,
+    STORE_DROPOUT_MASK: tl.constexpr,
+    HAS_ROWSCALE: tl.constexpr,
+    HAS_X1: tl.constexpr,
+    HAS_W1: tl.constexpr,
+    HAS_B1: tl.constexpr,
+):
+    # Map the program id to the row of X and Y it should compute.
+    row = tl.program_id(0)
+    X += row * stride_x_row
+    Y += row * stride_y_row
+    if HAS_RESIDUAL:
+        RESIDUAL += row * stride_res_row
+    if STORE_RESIDUAL_OUT:
+        RESIDUAL_OUT += row * stride_res_out_row
+    if HAS_X1:
+        X1 += row * stride_x1_row
+    if HAS_W1:
+        Y1 += row * stride_y1_row
+    # Compute mean and variance
+    cols = tl.arange(0, BLOCK_N)
+    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
+    if HAS_ROWSCALE:
+        rowscale = tl.load(ROWSCALE + row).to(tl.float32)
+        x *= rowscale
+    if HAS_DROPOUT:
+        # Compute dropout mask
+        # 7 rounds is good enough, and reduces register pressure
+        keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
+        x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
+        if STORE_DROPOUT_MASK:
+            tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
+    if HAS_X1:
+        x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
+        if HAS_ROWSCALE:
+            rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
+            x1 *= rowscale
+        if HAS_DROPOUT:
+            # Compute dropout mask
+            # 7 rounds is good enough, and reduces register pressure
+            keep_mask = (
+                tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
+            )
+            x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
+            if STORE_DROPOUT_MASK:
+                tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
+        x += x1
+    if HAS_RESIDUAL:
+        residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
+        x += residual
+    if STORE_RESIDUAL_OUT:
+        tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
+    if not IS_RMS_NORM:
+        mean = tl.sum(x, axis=0) / N
+        tl.store(Mean + row, mean)
+        xbar = tl.where(cols < N, x - mean, 0.0)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    else:
+        xbar = tl.where(cols < N, x, 0.0)
207
+ var = tl.sum(xbar * xbar, axis=0) / N
208
+ rstd = 1 / tl.sqrt(var + eps)
209
+ tl.store(Rstd + row, rstd)
210
+ # Normalize and apply linear transformation
211
+ mask = cols < N
212
+ w = tl.load(W + cols, mask=mask).to(tl.float32)
213
+ if zero_centered_weight:
214
+ w += 1.0
215
+ if HAS_BIAS:
216
+ b = tl.load(B + cols, mask=mask).to(tl.float32)
217
+ x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
218
+ y = x_hat * w + b if HAS_BIAS else x_hat * w
219
+ # Write output
220
+ tl.store(Y + cols, y, mask=mask)
221
+ if HAS_W1:
222
+ w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
223
+ if zero_centered_weight:
224
+ w1 += 1.0
225
+ if HAS_B1:
226
+ b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
227
+ y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
228
+ tl.store(Y1 + cols, y1, mask=mask)
229
+
230
+
231
+ def _layer_norm_fwd(
232
+ x,
233
+ weight,
234
+ bias,
235
+ eps,
236
+ residual=None,
237
+ x1=None,
238
+ weight1=None,
239
+ bias1=None,
240
+ dropout_p=0.0,
241
+ rowscale=None,
242
+ out_dtype=None,
243
+ residual_dtype=None,
244
+ zero_centered_weight=False,
245
+ is_rms_norm=False,
246
+ return_dropout_mask=False,
247
+ out=None,
248
+ residual_out=None
249
+ ):
250
+ if residual is not None:
251
+ residual_dtype = residual.dtype
252
+ M, N = x.shape
253
+ assert x.stride(-1) == 1
254
+ if residual is not None:
255
+ assert residual.stride(-1) == 1
256
+ assert residual.shape == (M, N)
257
+ assert weight.shape == (N,)
258
+ assert weight.stride(-1) == 1
259
+ if bias is not None:
260
+ assert bias.stride(-1) == 1
261
+ assert bias.shape == (N,)
262
+ if x1 is not None:
263
+ assert x1.shape == x.shape
264
+ assert rowscale is None
265
+ assert x1.stride(-1) == 1
266
+ if weight1 is not None:
267
+ assert weight1.shape == (N,)
268
+ assert weight1.stride(-1) == 1
269
+ if bias1 is not None:
270
+ assert bias1.shape == (N,)
271
+ assert bias1.stride(-1) == 1
272
+ if rowscale is not None:
273
+ assert rowscale.is_contiguous()
274
+ assert rowscale.shape == (M,)
275
+ # allocate output
276
+ if out is None:
277
+ out = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
278
+ else:
279
+ assert out.shape == x.shape
280
+ assert out.stride(-1) == 1
281
+ if weight1 is not None:
282
+ y1 = torch.empty_like(out)
283
+ assert y1.stride(-1) == 1
284
+ else:
285
+ y1 = None
286
+ if (
287
+ residual is not None
288
+ or (residual_dtype is not None and residual_dtype != x.dtype)
289
+ or dropout_p > 0.0
290
+ or rowscale is not None
291
+ or x1 is not None
292
+ ):
293
+ if residual_out is None:
294
+ residual_out = torch.empty(
295
+ M, N, device=x.device, dtype=residual_dtype if residual_dtype is not None else x.dtype
296
+ )
297
+ else:
298
+ assert residual_out.shape == x.shape
299
+ assert residual_out.stride(-1) == 1
300
+ else:
301
+ residual_out = None
302
+ mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None
303
+ rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
304
+ if dropout_p > 0.0:
305
+ seeds = torch.randint(
306
+ 2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64
307
+ )
308
+ else:
309
+ seeds = None
310
+ if return_dropout_mask and dropout_p > 0.0:
311
+ dropout_mask = torch.empty(M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool)
312
+ else:
313
+ dropout_mask = None
314
+ # Less than 64KB per feature: enqueue fused kernel
315
+ MAX_FUSED_SIZE = 65536 // x.element_size()
316
+ BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
317
+ if N > BLOCK_N:
318
+ raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
319
+ with torch.cuda.device(x.device.index):
320
+ _layer_norm_fwd_1pass_kernel[(M,)](
321
+ x,
322
+ out,
323
+ weight,
324
+ bias,
325
+ residual,
326
+ x1,
327
+ weight1,
328
+ bias1,
329
+ y1,
330
+ residual_out,
331
+ rowscale,
332
+ seeds,
333
+ dropout_mask,
334
+ mean,
335
+ rstd,
336
+ x.stride(0),
337
+ out.stride(0),
338
+ residual.stride(0) if residual is not None else 0,
339
+ residual_out.stride(0) if residual_out is not None else 0,
340
+ x1.stride(0) if x1 is not None else 0,
341
+ y1.stride(0) if y1 is not None else 0,
342
+ M,
343
+ N,
344
+ eps,
345
+ dropout_p,
346
+ zero_centered_weight,
347
+ is_rms_norm,
348
+ BLOCK_N,
349
+ residual is not None,
350
+ residual_out is not None,
351
+ bias is not None,
352
+ dropout_p > 0.0,
353
+ dropout_mask is not None,
354
+ rowscale is not None,
355
+ )
356
+ # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
357
+ if dropout_mask is not None and x1 is not None:
358
+ dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
359
+ else:
360
+ dropout_mask1 = None
361
+ return (
362
+ out,
363
+ y1,
364
+ mean,
365
+ rstd,
366
+ residual_out if residual_out is not None else x,
367
+ seeds,
368
+ dropout_mask,
369
+ dropout_mask1,
370
+ )
371
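+ # Minimal usage sketch (assumed shapes/devices, not part of the public API):
+ # x = torch.randn(4, 1024, device="cuda", dtype=torch.float16)
+ # w = torch.ones(1024, device="cuda", dtype=torch.float16)
+ # y, y1, mean, rstd, res, seeds, m0, m1 = _layer_norm_fwd(x, w, None, 1e-6, is_rms_norm=True)
+ # (mean is None on the RMSNorm path; res aliases x when no residual/dropout/rowscale is used)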
+
372
+ @triton.autotune(
373
+ configs=triton_autotune_configs(),
374
+ key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS", "HAS_DROPOUT"],
375
+ )
376
+ # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
377
+ # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
378
+ # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
379
+ @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
380
+ @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
381
+ @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
382
+ @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
383
+ @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
384
+ @triton.jit
385
+ def _layer_norm_bwd_kernel(
386
+ X, # pointer to the input
387
+ W, # pointer to the weights
388
+ B, # pointer to the biases
389
+ Y, # pointer to the output to be recomputed
390
+ DY, # pointer to the output gradient
391
+ DX, # pointer to the input gradient
392
+ DW, # pointer to the partial sum of weights gradient
393
+ DB, # pointer to the partial sum of biases gradient
394
+ DRESIDUAL,
395
+ W1,
396
+ DY1,
397
+ DX1,
398
+ DW1,
399
+ DB1,
400
+ DRESIDUAL_IN,
401
+ ROWSCALE,
402
+ SEEDS,
403
+ Mean, # pointer to the mean
404
+ Rstd, # pointer to the 1/std
405
+ stride_x_row, # how much to increase the pointer when moving by 1 row
406
+ stride_y_row,
407
+ stride_dy_row,
408
+ stride_dx_row,
409
+ stride_dres_row,
410
+ stride_dy1_row,
411
+ stride_dx1_row,
412
+ stride_dres_in_row,
413
+ M, # number of rows in X
414
+ N, # number of columns in X
415
+ eps, # epsilon to avoid division by zero
416
+ dropout_p,
417
+ zero_centered_weight,
418
+ rows_per_program,
419
+ IS_RMS_NORM: tl.constexpr,
420
+ BLOCK_N: tl.constexpr,
421
+ HAS_DRESIDUAL: tl.constexpr,
422
+ STORE_DRESIDUAL: tl.constexpr,
423
+ HAS_BIAS: tl.constexpr,
424
+ HAS_DROPOUT: tl.constexpr,
425
+ HAS_ROWSCALE: tl.constexpr,
426
+ HAS_DY1: tl.constexpr,
427
+ HAS_DX1: tl.constexpr,
428
+ HAS_B1: tl.constexpr,
429
+ RECOMPUTE_OUTPUT: tl.constexpr,
430
+ ):
431
+ # Map the program id to the elements of X, DX, and DY it should compute.
432
+ row_block_id = tl.program_id(0)
433
+ row_start = row_block_id * rows_per_program
434
+ # Do not early exit if row_start >= M, because we need to write DW and DB
435
+ cols = tl.arange(0, BLOCK_N)
436
+ mask = cols < N
437
+ X += row_start * stride_x_row
438
+ if HAS_DRESIDUAL:
439
+ DRESIDUAL += row_start * stride_dres_row
440
+ if STORE_DRESIDUAL:
441
+ DRESIDUAL_IN += row_start * stride_dres_in_row
442
+ DY += row_start * stride_dy_row
443
+ DX += row_start * stride_dx_row
444
+ if HAS_DY1:
445
+ DY1 += row_start * stride_dy1_row
446
+ if HAS_DX1:
447
+ DX1 += row_start * stride_dx1_row
448
+ if RECOMPUTE_OUTPUT:
449
+ Y += row_start * stride_y_row
450
+ w = tl.load(W + cols, mask=mask).to(tl.float32)
451
+ if zero_centered_weight:
452
+ w += 1.0
453
+ if RECOMPUTE_OUTPUT and HAS_BIAS:
454
+ b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
455
+ if HAS_DY1:
456
+ w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
457
+ if zero_centered_weight:
458
+ w1 += 1.0
459
+ dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
460
+ if HAS_BIAS:
461
+ db = tl.zeros((BLOCK_N,), dtype=tl.float32)
462
+ if HAS_DY1:
463
+ dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
464
+ if HAS_B1:
465
+ db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
466
+ row_end = min((row_block_id + 1) * rows_per_program, M)
467
+ for row in range(row_start, row_end):
468
+ # Load data to SRAM
469
+ x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
470
+ dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
471
+ if HAS_DY1:
472
+ dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
473
+ if not IS_RMS_NORM:
474
+ mean = tl.load(Mean + row)
475
+ rstd = tl.load(Rstd + row)
476
+ # Compute dx
477
+ xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
478
+ xhat = tl.where(mask, xhat, 0.0)
479
+ if RECOMPUTE_OUTPUT:
480
+ y = xhat * w + b if HAS_BIAS else xhat * w
481
+ tl.store(Y + cols, y, mask=mask)
482
+ wdy = w * dy
483
+ dw += dy * xhat
484
+ if HAS_BIAS:
485
+ db += dy
486
+ if HAS_DY1:
487
+ wdy += w1 * dy1
488
+ dw1 += dy1 * xhat
489
+ if HAS_B1:
490
+ db1 += dy1
491
+ if not IS_RMS_NORM:
492
+ c1 = tl.sum(xhat * wdy, axis=0) / N
493
+ c2 = tl.sum(wdy, axis=0) / N
494
+ dx = (wdy - (xhat * c1 + c2)) * rstd
495
+ else:
496
+ c1 = tl.sum(xhat * wdy, axis=0) / N
497
+ dx = (wdy - xhat * c1) * rstd
498
+ if HAS_DRESIDUAL:
499
+ dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
500
+ dx += dres
501
+ # Write dx
502
+ if STORE_DRESIDUAL:
503
+ tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
504
+ if HAS_DX1:
505
+ if HAS_DROPOUT:
506
+ keep_mask = (
507
+ tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
508
+ )
509
+ dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
510
+ else:
511
+ dx1 = dx
512
+ tl.store(DX1 + cols, dx1, mask=mask)
513
+ if HAS_DROPOUT:
514
+ keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
515
+ dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
516
+ if HAS_ROWSCALE:
517
+ rowscale = tl.load(ROWSCALE + row).to(tl.float32)
518
+ dx *= rowscale
519
+ tl.store(DX + cols, dx, mask=mask)
520
+
521
+ X += stride_x_row
522
+ if HAS_DRESIDUAL:
523
+ DRESIDUAL += stride_dres_row
524
+ if STORE_DRESIDUAL:
525
+ DRESIDUAL_IN += stride_dres_in_row
526
+ if RECOMPUTE_OUTPUT:
527
+ Y += stride_y_row
528
+ DY += stride_dy_row
529
+ DX += stride_dx_row
530
+ if HAS_DY1:
531
+ DY1 += stride_dy1_row
532
+ if HAS_DX1:
533
+ DX1 += stride_dx1_row
534
+ tl.store(DW + row_block_id * N + cols, dw, mask=mask)
535
+ if HAS_BIAS:
536
+ tl.store(DB + row_block_id * N + cols, db, mask=mask)
537
+ if HAS_DY1:
538
+ tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
539
+ if HAS_B1:
540
+ tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)
541
+
542
+
543
+ def _layer_norm_bwd(
544
+ dy,
545
+ x,
546
+ weight,
547
+ bias,
548
+ eps,
549
+ mean,
550
+ rstd,
551
+ dresidual=None,
552
+ dy1=None,
553
+ weight1=None,
554
+ bias1=None,
555
+ seeds=None,
556
+ dropout_p=0.0,
557
+ rowscale=None,
558
+ has_residual=False,
559
+ has_x1=False,
560
+ zero_centered_weight=False,
561
+ is_rms_norm=False,
562
+ x_dtype=None,
563
+ recompute_output=False,
564
+ ):
565
+ M, N = x.shape
566
+ assert x.stride(-1) == 1
567
+ assert dy.stride(-1) == 1
568
+ assert dy.shape == (M, N)
569
+ if dresidual is not None:
570
+ assert dresidual.stride(-1) == 1
571
+ assert dresidual.shape == (M, N)
572
+ assert weight.shape == (N,)
573
+ assert weight.stride(-1) == 1
574
+ if bias is not None:
575
+ assert bias.stride(-1) == 1
576
+ assert bias.shape == (N,)
577
+ if dy1 is not None:
578
+ assert weight1 is not None
579
+ assert dy1.shape == dy.shape
580
+ assert dy1.stride(-1) == 1
581
+ if weight1 is not None:
582
+ assert weight1.shape == (N,)
583
+ assert weight1.stride(-1) == 1
584
+ if bias1 is not None:
585
+ assert bias1.shape == (N,)
586
+ assert bias1.stride(-1) == 1
587
+ if seeds is not None:
588
+ assert seeds.is_contiguous()
589
+ assert seeds.shape == (M if not has_x1 else M * 2,)
590
+ if rowscale is not None:
591
+ assert rowscale.is_contiguous()
592
+ assert rowscale.shape == (M,)
593
+ # allocate output
594
+ dx = (
595
+ torch.empty_like(x)
596
+ if x_dtype is None
597
+ else torch.empty(M, N, dtype=x_dtype, device=x.device)
598
+ )
599
+ dresidual_in = (
600
+ torch.empty_like(x)
601
+ if has_residual
602
+ and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1)
603
+ else None
604
+ )
605
+ dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
606
+ y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None
607
+ if recompute_output:
608
+ assert weight1 is None, "recompute_output is not supported with parallel LayerNorm"
609
+
610
+ # Less than 64KB per feature: enqueue fused kernel
611
+ MAX_FUSED_SIZE = 65536 // x.element_size()
612
+ BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
613
+ if N > BLOCK_N:
614
+ raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
615
+ # Increasing the multiple (e.g. 8) will allow more thread blocks to be launched and hide the
616
+ # latency of the gmem reads/writes, but will increase the time of summing up dw / db.
617
+ sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count * 8
618
+ _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
619
+ _db = (
620
+ torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
621
+ if bias is not None
622
+ else None
623
+ )
624
+ _dw1 = torch.empty_like(_dw) if weight1 is not None else None
625
+ _db1 = torch.empty_like(_db) if bias1 is not None else None
626
+ rows_per_program = math.ceil(M / sm_count)
627
+ grid = (sm_count,)
628
+ with torch.cuda.device(x.device.index):
629
+ _layer_norm_bwd_kernel[grid](
630
+ x,
631
+ weight,
632
+ bias,
633
+ y,
634
+ dy,
635
+ dx,
636
+ _dw,
637
+ _db,
638
+ dresidual,
639
+ weight1,
640
+ dy1,
641
+ dx1,
642
+ _dw1,
643
+ _db1,
644
+ dresidual_in,
645
+ rowscale,
646
+ seeds,
647
+ mean,
648
+ rstd,
649
+ x.stride(0),
650
+ 0 if not recompute_output else y.stride(0),
651
+ dy.stride(0),
652
+ dx.stride(0),
653
+ dresidual.stride(0) if dresidual is not None else 0,
654
+ dy1.stride(0) if dy1 is not None else 0,
655
+ dx1.stride(0) if dx1 is not None else 0,
656
+ dresidual_in.stride(0) if dresidual_in is not None else 0,
657
+ M,
658
+ N,
659
+ eps,
660
+ dropout_p,
661
+ zero_centered_weight,
662
+ rows_per_program,
663
+ is_rms_norm,
664
+ BLOCK_N,
665
+ dresidual is not None,
666
+ dresidual_in is not None,
667
+ bias is not None,
668
+ dropout_p > 0.0,
669
+ )
670
+ dw = _dw.sum(0).to(weight.dtype)
671
+ db = _db.sum(0).to(bias.dtype) if bias is not None else None
672
+ dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
673
+ db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
674
+ # Don't need to compute dresidual_in separately in this case
675
+ if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
676
+ dresidual_in = dx
677
+ if has_x1 and dropout_p == 0.0:
678
+ dx1 = dx
679
+ return (
680
+ (dx, dw, db, dresidual_in, dx1, dw1, db1)
681
+ if not recompute_output
682
+ else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
683
+ )
684
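+ # Note: the per-block partial gradients _dw/_db are float32 buffers of shape (sm_count, N) that
+ # are reduced with a single .sum(0); e.g. on an 80-SM GPU the 8x multiplier gives a (640, N) buffer.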
+
685
+ class LayerNormFn(torch.autograd.Function):
686
+ @staticmethod
687
+ def forward(
688
+ ctx,
689
+ x,
690
+ weight,
691
+ bias,
692
+ residual=None,
693
+ x1=None,
694
+ weight1=None,
695
+ bias1=None,
696
+ eps=1e-6,
697
+ dropout_p=0.0,
698
+ rowscale=None,
699
+ prenorm=False,
700
+ residual_in_fp32=False,
701
+ zero_centered_weight=False,
702
+ is_rms_norm=False,
703
+ return_dropout_mask=False,
704
+ out=None,
705
+ residual_out=None
706
+ ):
707
+ x_shape_og = x.shape
708
+ # Check for zero sequence length
709
+ if x.numel() == 0:
710
+ ctx.zero_seq_length = True
711
+ # Only save minimal required tensors for backward
712
+ # ctx.save_for_backward(weight, bias, weight1, bias1)
713
+ ctx.x_shape_og = x_shape_og
714
+ ctx.weight_shape = weight.shape
715
+ ctx.weight_dtype = weight.dtype
716
+ ctx.weight_device = weight.device
717
+
718
+ ctx.has_bias = bias is not None
719
+ ctx.bias_shape = bias.shape if bias is not None else None
720
+ ctx.bias_dtype = bias.dtype if bias is not None else None
721
+ ctx.bias_device = bias.device if bias is not None else None
722
+
723
+ ctx.has_weight1 = weight1 is not None
724
+ ctx.weight1_shape = weight1.shape if weight1 is not None else None
725
+ ctx.weight1_dtype = weight1.dtype if weight1 is not None else None
726
+ ctx.weight1_device = weight1.device if weight1 is not None else None
727
+
728
+ ctx.has_bias1 = bias1 is not None
729
+ ctx.bias1_shape = bias1.shape if bias1 is not None else None
730
+ ctx.bias1_dtype = bias1.dtype if bias1 is not None else None
731
+ ctx.bias1_device = bias1.device if bias1 is not None else None
732
+
733
+ ctx.has_residual = residual is not None
734
+ ctx.has_x1 = x1 is not None
735
+ ctx.dropout_p = dropout_p
736
+
737
+ # Handle output tensors with correct dtype
738
+ y = x # Preserve input tensor properties
739
+ y1 = torch.empty_like(x) if x1 is not None else None
740
+
741
+ # Only create residual_out if prenorm is True
742
+ residual_out = torch.empty(x.shape,
743
+ dtype=torch.float32 if residual_in_fp32 else x.dtype,
744
+ device=x.device) if prenorm else None
745
+
746
+ # Handle dropout masks
747
+ dropout_mask = None
748
+ dropout_mask1 = None
749
+ if return_dropout_mask:
750
+ dropout_mask = torch.empty_like(x, dtype=torch.uint8)
751
+ if x1 is not None:
752
+ dropout_mask1 = torch.empty_like(x, dtype=torch.uint8)
753
+
754
+ # Return based on configuration
755
+ if not return_dropout_mask:
756
+ if weight1 is None:
757
+ return y if not prenorm else (y, residual_out)
758
+ else:
759
+ return (y, y1) if not prenorm else (y, y1, residual_out)
760
+ else:
761
+ if weight1 is None:
762
+ return ((y, dropout_mask, dropout_mask1) if not prenorm
763
+ else (y, residual_out, dropout_mask, dropout_mask1))
764
+ else:
765
+ return ((y, y1, dropout_mask, dropout_mask1) if not prenorm
766
+ else (y, y1, residual_out, dropout_mask, dropout_mask1))
767
+
768
+ ctx.zero_seq_length = False
769
+ # reshape input data into 2D tensor
770
+ x = x.reshape(-1, x.shape[-1])
771
+ if x.stride(-1) != 1:
772
+ x = x.contiguous()
773
+ if residual is not None:
774
+ assert residual.shape == x_shape_og
775
+ residual = residual.reshape(-1, residual.shape[-1])
776
+ if residual.stride(-1) != 1:
777
+ residual = residual.contiguous()
778
+ if x1 is not None:
779
+ assert x1.shape == x_shape_og
780
+ assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
781
+ x1 = x1.reshape(-1, x1.shape[-1])
782
+ if x1.stride(-1) != 1:
783
+ x1 = x1.contiguous()
784
+ weight = weight.contiguous()
785
+ if bias is not None:
786
+ bias = bias.contiguous()
787
+ if weight1 is not None:
788
+ weight1 = weight1.contiguous()
789
+ if bias1 is not None:
790
+ bias1 = bias1.contiguous()
791
+ if rowscale is not None:
792
+ rowscale = rowscale.reshape(-1).contiguous()
793
+ residual_dtype = (
794
+ residual.dtype
795
+ if residual is not None
796
+ else (torch.float32 if residual_in_fp32 else None)
797
+ )
798
+ if out is not None:
799
+ out = out.reshape(-1, out.shape[-1])
800
+ if residual_out is not None:
801
+ residual_out = residual_out.reshape(-1, residual_out.shape[-1])
802
+ y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
803
+ x,
804
+ weight,
805
+ bias,
806
+ eps,
807
+ residual,
808
+ x1,
809
+ weight1,
810
+ bias1,
811
+ dropout_p=dropout_p,
812
+ rowscale=rowscale,
813
+ residual_dtype=residual_dtype,
814
+ zero_centered_weight=zero_centered_weight,
815
+ is_rms_norm=is_rms_norm,
816
+ return_dropout_mask=return_dropout_mask,
817
+ out=out,
818
+ residual_out=residual_out
819
+ )
820
+ ctx.save_for_backward(
821
+ residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
822
+ )
823
+ ctx.x_shape_og = x_shape_og
824
+ ctx.eps = eps
825
+ ctx.dropout_p = dropout_p
826
+ ctx.is_rms_norm = is_rms_norm
827
+ ctx.has_residual = residual is not None
828
+ ctx.has_x1 = x1 is not None
829
+ ctx.prenorm = prenorm
830
+ ctx.x_dtype = x.dtype
831
+ ctx.zero_centered_weight = zero_centered_weight
832
+ y = y.reshape(x_shape_og)
833
+ y1 = y1.reshape(x_shape_og) if y1 is not None else None
834
+ residual_out = residual_out.reshape(x_shape_og) if residual_out is not None else None
835
+ dropout_mask = dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
836
+ dropout_mask1 = dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
837
+ if not return_dropout_mask:
838
+ if weight1 is None:
839
+ return y if not prenorm else (y, residual_out)
840
+ else:
841
+ return (y, y1) if not prenorm else (y, y1, residual_out)
842
+ else:
843
+ if weight1 is None:
844
+ return (
845
+ (y, dropout_mask, dropout_mask1)
846
+ if not prenorm
847
+ else (y, residual_out, dropout_mask, dropout_mask1)
848
+ )
849
+ else:
850
+ return (
851
+ (y, y1, dropout_mask, dropout_mask1)
852
+ if not prenorm
853
+ else (y, y1, residual_out, dropout_mask, dropout_mask1)
854
+ )
855
+
856
+ @staticmethod
857
+ def backward(ctx, dy, *args):
858
+ if ctx.zero_seq_length:
859
+ return (
860
+ torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device),
861
+ torch.zeros(ctx.weight_shape, dtype=ctx.weight_dtype, device=ctx.weight_device),
862
+ torch.zeros(ctx.bias_shape, dtype=ctx.bias_dtype, device=ctx.bias_device) if ctx.has_bias else None,
863
+ torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device) if ctx.has_residual else None,
864
+ torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device) if ctx.has_x1 and ctx.dropout_p > 0.0 else None,
865
+ torch.zeros(ctx.weight1_shape, dtype=ctx.weight1_dtype, device=ctx.weight1_device) if ctx.has_weight1 else None,
866
+ torch.zeros(ctx.bias1_shape, dtype=ctx.bias1_dtype, device=ctx.bias1_device) if ctx.has_bias1 else None,
867
+ None,
868
+ None,
869
+ None,
870
+ None,
871
+ None,
872
+ None,
873
+ None,
874
+ None,
875
+ None,
876
+ None,
877
+ )
878
+
879
+ x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
880
+ dy = dy.reshape(-1, dy.shape[-1])
881
+ if dy.stride(-1) != 1:
882
+ dy = dy.contiguous()
883
+ assert dy.shape == x.shape
884
+ if weight1 is not None:
885
+ dy1, args = args[0], args[1:]
886
+ dy1 = dy1.reshape(-1, dy1.shape[-1])
887
+ if dy1.stride(-1) != 1:
888
+ dy1 = dy1.contiguous()
889
+ assert dy1.shape == x.shape
890
+ else:
891
+ dy1 = None
892
+ if ctx.prenorm:
893
+ dresidual = args[0]
894
+ dresidual = dresidual.reshape(-1, dresidual.shape[-1])
895
+ if dresidual.stride(-1) != 1:
896
+ dresidual = dresidual.contiguous()
897
+ assert dresidual.shape == x.shape
898
+ else:
899
+ dresidual = None
900
+
901
+ dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
902
+ dy,
903
+ x,
904
+ weight,
905
+ bias,
906
+ ctx.eps,
907
+ mean,
908
+ rstd,
909
+ dresidual,
910
+ dy1,
911
+ weight1,
912
+ bias1,
913
+ seeds,
914
+ ctx.dropout_p,
915
+ rowscale,
916
+ ctx.has_residual,
917
+ ctx.has_x1,
918
+ ctx.zero_centered_weight,
919
+ ctx.is_rms_norm,
920
+ x_dtype=ctx.x_dtype,
921
+ )
922
+ return (
923
+ dx.reshape(ctx.x_shape_og),
924
+ dw,
925
+ db,
926
+ dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
927
+ dx1.reshape(ctx.x_shape_og) if dx1 is not None else None,
928
+ dw1,
929
+ db1,
930
+ None,
931
+ None,
932
+ None,
933
+ None,
934
+ None,
935
+ None,
936
+ None,
937
+ None,
938
+ None,
939
+ None,
940
+ )
941
+
942
+ def rms_norm_fn(
943
+ x,
944
+ weight,
945
+ bias,
946
+ residual=None,
947
+ x1=None,
948
+ weight1=None,
949
+ bias1=None,
950
+ eps=1e-6,
951
+ dropout_p=0.0,
952
+ rowscale=None,
953
+ prenorm=False,
954
+ residual_in_fp32=False,
955
+ zero_centered_weight=False,
956
+ return_dropout_mask=False,
957
+ out=None,
958
+ residual_out=None
959
+ ):
960
+ return LayerNormFn.apply(
961
+ x,
962
+ weight,
963
+ bias,
964
+ residual,
965
+ x1,
966
+ weight1,
967
+ bias1,
968
+ eps,
969
+ dropout_p,
970
+ rowscale,
971
+ prenorm,
972
+ residual_in_fp32,
973
+ zero_centered_weight,
974
+ True,  # is_rms_norm
975
+ return_dropout_mask,
976
+ out,
977
+ residual_out
978
+ )
979
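+ # Usage sketch (assumed tensors): fused RMSNorm with a prenorm residual stream, e.g.
+ # y, res = rms_norm_fn(x, weight, None, residual=res, prenorm=True, residual_in_fp32=True)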
+
980
+ class RMSNorm(torch.nn.Module):
981
+ def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, zero_centered_weight=False,
982
+ device=None, dtype=None):
983
+ factory_kwargs = {"device": device, "dtype": dtype}
984
+ super().__init__()
985
+ self.eps = eps
986
+ if dropout_p > 0.0:
987
+ self.drop = torch.nn.Dropout(dropout_p)
988
+ else:
989
+ self.drop = None
990
+ self.zero_centered_weight = zero_centered_weight
991
+ self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
992
+ self.register_parameter("bias", None)
993
+ self.reset_parameters()
994
+
995
+ def reset_parameters(self):
996
+ if not self.zero_centered_weight:
997
+ torch.nn.init.ones_(self.weight)
998
+ else:
999
+ torch.nn.init.zeros_(self.weight)
1000
+
1001
+ def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
1002
+ return rms_norm_fn(
1003
+ x,
1004
+ self.weight,
1005
+ self.bias,
1006
+ residual=residual,
1007
+ eps=self.eps,
1008
+ dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
1009
+ prenorm=prenorm,
1010
+ residual_in_fp32=residual_in_fp32,
1011
+ zero_centered_weight=self.zero_centered_weight,
1012
+ )
1013
+ else:
1014
+ from torch.nn import RMSNorm
1015
+ warnings.warn("Cannot import triton, install triton to use fused RMSNorm for better performance")
1016
+
1017
+ def swiglu(x, y):
1018
+ return F.silu(x.float(), inplace=False).to(x.dtype) * y
1019
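+ # swiglu(a, b) computes SiLU(a) * b, with the SiLU evaluated in float32 for numerical stability
+ # and cast back to the input dtype; this is the gating used by LuminaFeedForward below.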
+
1020
+ logger = logging.get_logger(__name__)
1021
+
1022
+ @dataclass
1023
+ class TeaCacheParams:
1024
+ previous_residual: Optional[torch.Tensor] = None
1025
+ previous_modulated_inp: Optional[torch.Tensor] = None
1026
+ accumulated_rel_l1_distance: float = 0
1027
+ is_first_or_last_step: bool = False
1028
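+ # TeaCache bookkeeping (sketch of intent): the cached residual and modulated input from the
+ # previous step allow a denoising step to be skipped when the accumulated relative L1 change
+ # stays below a threshold, except on the first/last step.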
+
1029
+
1030
+ class TimestepEmbedding(nn.Module):
1031
+ def __init__(
1032
+ self,
1033
+ in_channels: int,
1034
+ time_embed_dim: int,
1035
+ act_fn: str = "silu",
1036
+ out_dim: int = None,
1037
+ post_act_fn: Optional[str] = None,
1038
+ cond_proj_dim=None,
1039
+ sample_proj_bias=True,
1040
+ ):
1041
+ super().__init__()
1042
+
1043
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
1044
+
1045
+ if cond_proj_dim is not None:
1046
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
1047
+ else:
1048
+ self.cond_proj = None
1049
+
1050
+ self.act = get_activation(act_fn)
1051
+
1052
+ if out_dim is not None:
1053
+ time_embed_dim_out = out_dim
1054
+ else:
1055
+ time_embed_dim_out = time_embed_dim
1056
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
1057
+
1058
+ if post_act_fn is None:
1059
+ self.post_act = None
1060
+ else:
1061
+ self.post_act = get_activation(post_act_fn)
1062
+
1063
+ self.initialize_weights()
1064
+
1065
+ def initialize_weights(self):
1066
+ nn.init.normal_(self.linear_1.weight, std=0.02)
1067
+ nn.init.zeros_(self.linear_1.bias)
1068
+ nn.init.normal_(self.linear_2.weight, std=0.02)
1069
+ nn.init.zeros_(self.linear_2.bias)
1070
+
1071
+ def forward(self, sample, condition=None):
1072
+ if condition is not None:
1073
+ sample = sample + self.cond_proj(condition)
1074
+ sample = self.linear_1(sample)
1075
+
1076
+ if self.act is not None:
1077
+ sample = self.act(sample)
1078
+
1079
+ sample = self.linear_2(sample)
1080
+
1081
+ if self.post_act is not None:
1082
+ sample = self.post_act(sample)
1083
+ return sample
1084
+
1085
+ def apply_rotary_emb(
1086
+ x: torch.Tensor,
1087
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
1088
+ use_real: bool = True,
1089
+ use_real_unbind_dim: int = -1,
1090
+ ) -> torch.Tensor:
1091
+ """
1092
+ Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
1093
+ to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
1094
+ reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
1095
+ tensors contain rotary embeddings and are returned as real tensors.
1096
+
1097
+ Args:
1098
+ x (`torch.Tensor`):
1099
+ Query or key tensor of shape [B, H, S, D] to which rotary embeddings are applied.
1100
+ freqs_cis (`Union[torch.Tensor, Tuple[torch.Tensor]]`): Precomputed frequency tensors for the complex exponentials; a (cos, sin) pair of shapes ([S, D], [S, D]) when `use_real` is True.
1101
+
1102
+ Returns:
1103
+ torch.Tensor: The query or key tensor with rotary embeddings applied, returned as a real tensor.
1104
+ """
1105
+ if use_real:
1106
+ cos, sin = freqs_cis # [S, D]
1107
+ cos = cos[None, None]
1108
+ sin = sin[None, None]
1109
+ cos, sin = cos.to(x.device), sin.to(x.device)
1110
+
1111
+ if use_real_unbind_dim == -1:
1112
+ x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
1113
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
1114
+ elif use_real_unbind_dim == -2:
1115
+ x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2]
1116
+ x_rotated = torch.cat([-x_imag, x_real], dim=-1)
1117
+ else:
1118
+ raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
1119
+
1120
+ out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
1121
+
1122
+ return out
1123
+ else:
1124
+ # used for lumina
1125
+ # x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
1126
+ x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], x.shape[-1] // 2, 2))
1127
+ freqs_cis = freqs_cis.unsqueeze(2)
1128
+ x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
1129
+
1130
+ return x_out.type_as(x)
1131
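+ # Sanity-check sketch (hypothetical tensors): a zero-angle rotation is the identity, e.g.
+ # q = torch.randn(1, 8, 16, 64); cos, sin = torch.ones(16, 64), torch.zeros(16, 64)
+ # torch.allclose(apply_rotary_emb(q, (cos, sin)), q)  # True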
+
1132
+ class ThinkGenRotaryPosEmbed(nn.Module):
1133
+ def __init__(self, theta: int,
1134
+ axes_dim: Tuple[int, int, int],
1135
+ axes_lens: Tuple[int, int, int] = (300, 512, 512),
1136
+ patch_size: int = 2):
1137
+ super().__init__()
1138
+ self.theta = theta
1139
+ self.axes_dim = axes_dim
1140
+ self.axes_lens = axes_lens
1141
+ self.patch_size = patch_size
1142
+
1143
+ @staticmethod
1144
+ def get_freqs_cis(axes_dim: Tuple[int, int, int],
1145
+ axes_lens: Tuple[int, int, int],
1146
+ theta: int) -> List[torch.Tensor]:
1147
+ freqs_cis = []
1148
+ freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
1149
+ for i, (d, e) in enumerate(zip(axes_dim, axes_lens)):
1150
+ emb = get_1d_rotary_pos_embed(d, e, theta=theta, freqs_dtype=freqs_dtype)
1151
+ freqs_cis.append(emb)
1152
+ return freqs_cis
1153
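+ # One 1-D RoPE table is precomputed per axis (e.g. text/time, height, width) with lengths
+ # axes_lens; float64 frequencies are used except on MPS, which lacks float64 support.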
+
1154
+ def _get_freqs_cis(self, freqs_cis, ids: torch.Tensor) -> torch.Tensor:
1155
+ device = ids.device
1156
+ if ids.device.type == "mps":
1157
+ ids = ids.to("cpu")
1158
+
1159
+ result = []
1160
+ for i in range(len(self.axes_dim)):
1161
+ freqs = freqs_cis[i].to(ids.device)
1162
+ index = ids[:, :, i : i + 1].repeat(1, 1, freqs.shape[-1]).to(torch.int64)
1163
+ result.append(torch.gather(freqs.unsqueeze(0).repeat(index.shape[0], 1, 1), dim=1, index=index))
1164
+ return torch.cat(result, dim=-1).to(device)
1165
+
1166
+ def forward(
1167
+ self,
1168
+ freqs_cis,
1169
+ attention_mask,
1170
+ l_effective_ref_img_len,
1171
+ l_effective_img_len,
1172
+ ref_img_sizes,
1173
+ img_sizes,
1174
+ device
1175
+ ):
1176
+ batch_size = len(attention_mask)
1177
+ p = self.patch_size
1178
+
1179
+ encoder_seq_len = attention_mask.shape[1]
1180
+ l_effective_cap_len = attention_mask.sum(dim=1).tolist()
1181
+
1182
+ seq_lengths = [cap_len + sum(ref_img_len) + img_len for cap_len, ref_img_len, img_len in zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len)]
1183
+
1184
+ max_seq_len = max(seq_lengths)
1185
+ max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
1186
+ max_img_len = max(l_effective_img_len)
1187
+
1188
+ # Create position IDs
1189
+ position_ids = torch.zeros(batch_size, max_seq_len, 3, dtype=torch.int32, device=device)
1190
+
1191
+ for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
1192
+ # add text position ids
1193
+ position_ids[i, :cap_seq_len] = repeat(torch.arange(cap_seq_len, dtype=torch.int32, device=device), "l -> l 3")
1194
+
1195
+ pe_shift = cap_seq_len
1196
+ pe_shift_len = cap_seq_len
1197
+
1198
+ if ref_img_sizes[i] is not None:
1199
+ for ref_img_size, ref_img_len in zip(ref_img_sizes[i], l_effective_ref_img_len[i]):
1200
+ H, W = ref_img_size
1201
+ ref_H_tokens, ref_W_tokens = H // p, W // p
1202
+ assert ref_H_tokens * ref_W_tokens == ref_img_len
1203
+ # add image position ids
1204
+
1205
+ row_ids = repeat(torch.arange(ref_H_tokens, dtype=torch.int32, device=device), "h -> h w", w=ref_W_tokens).flatten()
1206
+ col_ids = repeat(torch.arange(ref_W_tokens, dtype=torch.int32, device=device), "w -> h w", h=ref_H_tokens).flatten()
1207
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 0] = pe_shift
1208
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 1] = row_ids
1209
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 2] = col_ids
1210
+
1211
+ pe_shift += max(ref_H_tokens, ref_W_tokens)
1212
+ pe_shift_len += ref_img_len
1213
+
1214
+ H, W = img_sizes[i]
1215
+ H_tokens, W_tokens = H // p, W // p
1216
+ assert H_tokens * W_tokens == l_effective_img_len[i]
1217
+
1218
+ row_ids = repeat(torch.arange(H_tokens, dtype=torch.int32, device=device), "h -> h w", w=W_tokens).flatten()
1219
+ col_ids = repeat(torch.arange(W_tokens, dtype=torch.int32, device=device), "w -> h w", h=H_tokens).flatten()
1220
+
1221
+ assert pe_shift_len + l_effective_img_len[i] == seq_len
1222
+ position_ids[i, pe_shift_len: seq_len, 0] = pe_shift
1223
+ position_ids[i, pe_shift_len: seq_len, 1] = row_ids
1224
+ position_ids[i, pe_shift_len: seq_len, 2] = col_ids
1225
+
1226
+ # Get combined rotary embeddings
1227
+ freqs_cis = self._get_freqs_cis(freqs_cis, position_ids)
1228
+
1229
+ # create separate rotary embeddings for captions and images
1230
+ cap_freqs_cis = torch.zeros(
1231
+ batch_size, encoder_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
1232
+ )
1233
+ ref_img_freqs_cis = torch.zeros(
1234
+ batch_size, max_ref_img_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
1235
+ )
1236
+ img_freqs_cis = torch.zeros(
1237
+ batch_size, max_img_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
1238
+ )
1239
+
1240
+ for i, (cap_seq_len, ref_img_len, img_len, seq_len) in enumerate(zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len, seq_lengths)):
1241
+ cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
1242
+ ref_img_freqs_cis[i, :sum(ref_img_len)] = freqs_cis[i, cap_seq_len:cap_seq_len + sum(ref_img_len)]
1243
+ img_freqs_cis[i, :img_len] = freqs_cis[i, cap_seq_len + sum(ref_img_len):cap_seq_len + sum(ref_img_len) + img_len]
1244
+
1245
+ return (
1246
+ cap_freqs_cis,
1247
+ ref_img_freqs_cis,
1248
+ img_freqs_cis,
1249
+ freqs_cis,
1250
+ l_effective_cap_len,
1251
+ seq_lengths,
1252
+ )
1253
+
1254
+
1255
+ class LuminaRMSNormZero(nn.Module):
1256
+ """
1257
+ Norm layer adaptive RMS normalization zero.
1258
+
1259
+ Parameters:
1260
+ embedding_dim (`int`): The size of each embedding vector.
1261
+ """
1262
+
1263
+ def __init__(
1264
+ self,
1265
+ embedding_dim: int,
1266
+ norm_eps: float,
1267
+ norm_elementwise_affine: bool,
1268
+ ):
1269
+ super().__init__()
1270
+ self.silu = nn.SiLU()
1271
+ self.linear = nn.Linear(
1272
+ min(embedding_dim, 1024),
1273
+ 4 * embedding_dim,
1274
+ bias=True,
1275
+ )
1276
+ self.norm = RMSNorm(embedding_dim, eps=norm_eps)
1277
+
1278
+ def forward(
1279
+ self,
1280
+ x: torch.Tensor,
1281
+ emb: Optional[torch.Tensor] = None,
1282
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1283
+ emb = self.linear(self.silu(emb))
1284
+ scale_msa, gate_msa, scale_mlp, gate_mlp = emb.chunk(4, dim=1)
1285
+ x = self.norm(x) * (1 + scale_msa[:, None])
1286
+ return x, gate_msa, scale_mlp, gate_mlp
1287
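+ # The conditioning embedding is projected to 4*dim and chunked into (scale_msa, gate_msa,
+ # scale_mlp, gate_mlp); only scale_msa is applied here, the gates and MLP scale are returned
+ # for the caller to use around the attention and feed-forward branches.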
+
1288
+
1289
+ class LuminaLayerNormContinuous(nn.Module):
1290
+ def __init__(
1291
+ self,
1292
+ embedding_dim: int,
1293
+ conditioning_embedding_dim: int,
1294
+ # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
1295
+ # because the output is immediately scaled and shifted by the projected conditioning embeddings.
1296
+ # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
1297
+ # However, this is how it was implemented in the original code, and it's rather likely you should
1298
+ # set `elementwise_affine` to False.
1299
+ elementwise_affine=True,
1300
+ eps=1e-5,
1301
+ bias=True,
1302
+ norm_type="layer_norm",
1303
+ out_dim: Optional[int] = None,
1304
+ ):
1305
+ super().__init__()
1306
+
1307
+ # AdaLN
1308
+ self.silu = nn.SiLU()
1309
+ self.linear_1 = nn.Linear(conditioning_embedding_dim, embedding_dim, bias=bias)
1310
+
1311
+ if norm_type == "layer_norm":
1312
+ self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine, bias)
1313
+ elif norm_type == "rms_norm":
1314
+ # Note: the fused triton RMSNorm defined above accepts (dim, eps) but no `elementwise_affine` kwarg
+ self.norm = RMSNorm(embedding_dim, eps=eps)
1315
+ else:
1316
+ raise ValueError(f"unknown norm_type {norm_type}")
1317
+
1318
+ self.linear_2 = None
1319
+ if out_dim is not None:
1320
+ self.linear_2 = nn.Linear(embedding_dim, out_dim, bias=bias)
1321
+
1322
+ def forward(
1323
+ self,
1324
+ x: torch.Tensor,
1325
+ conditioning_embedding: torch.Tensor,
1326
+ ) -> torch.Tensor:
1327
+ # convert back to the original dtype in case `conditioning_embedding` is upcast to float32 (needed for HunyuanDiT)
1328
+ emb = self.linear_1(self.silu(conditioning_embedding).to(x.dtype))
1329
+ scale = emb
1330
+ x = self.norm(x) * (1 + scale)[:, None, :]
1331
+
1332
+ if self.linear_2 is not None:
1333
+ x = self.linear_2(x)
1334
+
1335
+ return x
1336
+
1337
+
1338
+ class LuminaFeedForward(nn.Module):
1339
+ r"""
1340
+ A feed-forward layer.
1341
+
1342
+ Parameters:
1343
+ dim (`int`):
1344
+ The dimensionality of the input and output. This parameter determines the width of the model's
1345
+ hidden representations.
1346
+ inner_dim (`int`): The intermediate dimension of the feed-forward layer.
1347
+ multiple_of (`int`, *optional*): Value to ensure hidden dimension is a multiple
1348
+ of this value.
1349
+ ffn_dim_multiplier (float, *optional*): Custom multiplier for hidden
1350
+ dimension. Defaults to None.
1351
+ """
1352
+
1353
+ def __init__(
1354
+ self,
1355
+ dim: int,
1356
+ inner_dim: int,
1357
+ multiple_of: Optional[int] = 256,
1358
+ ffn_dim_multiplier: Optional[float] = None,
1359
+ ):
1360
+ super().__init__()
1361
+
1362
+ self.swiglu = swiglu
1363
+
1364
+ # custom hidden_size factor multiplier
1365
+ if ffn_dim_multiplier is not None:
1366
+ inner_dim = int(ffn_dim_multiplier * inner_dim)
1367
+ inner_dim = multiple_of * ((inner_dim + multiple_of - 1) // multiple_of)
1368
+
1369
+ self.linear_1 = nn.Linear(
1370
+ dim,
1371
+ inner_dim,
1372
+ bias=False,
1373
+ )
1374
+ self.linear_2 = nn.Linear(
1375
+ inner_dim,
1376
+ dim,
1377
+ bias=False,
1378
+ )
1379
+ self.linear_3 = nn.Linear(
1380
+ dim,
1381
+ inner_dim,
1382
+ bias=False,
1383
+ )
1384
+
1385
+ def forward(self, x):
1386
+ h1, h2 = self.linear_1(x), self.linear_3(x)
1387
+ return self.linear_2(self.swiglu(h1, h2))
1388
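+ # SwiGLU feed-forward: linear_1 is the gate projection, linear_3 the value projection and
+ # linear_2 the down projection, i.e. out = W2(SiLU(W1 x) * W3 x).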
+
1389
+
1390
+ class Lumina2CombinedTimestepCaptionEmbedding(nn.Module):
1391
+ def __init__(
1392
+ self,
1393
+ hidden_size: int = 4096,
1394
+ text_feat_dim: int = 204800, # 2048
1395
+ frequency_embedding_size: int = 256,
1396
+ norm_eps: float = 1e-5,
1397
+ timestep_scale: float = 1.0,
1398
+ ) -> None:
1399
+ super().__init__()
1400
+
1401
+ self.time_proj = Timesteps(
1402
+ num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=timestep_scale
1403
+ )
1404
+
1405
+ self.timestep_embedder = TimestepEmbedding(
1406
+ in_channels=frequency_embedding_size, time_embed_dim=min(hidden_size, 1024)
1407
+ )
1408
+
1409
+ self.caption_embedder = nn.Sequential(
1410
+ RMSNorm(text_feat_dim*2, eps=norm_eps),
1411
+ nn.Linear(text_feat_dim*2, hidden_size, bias=True),
1412
+ )
1413
+
1414
+ self._initialize_weights()
1415
+
1416
+ def _initialize_weights(self):
1417
+ for name, module in self.caption_embedder.named_modules():
1418
+ if hasattr(module, 'weight') and module.weight is not None:
1419
+ nn.init.trunc_normal_(module.weight, std=0.02)
1420
+ if hasattr(module, 'bias') and module.bias is not None:
1422
+ nn.init.zeros_(module.bias)
1423
+
1425
+ print("init caption_embedder done")
1426
+
1427
+
1428
+ def forward(
1429
+ self, timestep: torch.Tensor, text_hidden_states: torch.Tensor, dtype: torch.dtype
1430
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1431
+ timestep_proj = self.time_proj(timestep).to(dtype=dtype)
1432
+ time_embed = self.timestep_embedder(timestep_proj)
1433
+ caption_embed = self.caption_embedder(text_hidden_states)
1434
+ return time_embed, caption_embed
1435
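+ # Shape sketch (assumed dims): timestep [B] and text_hidden_states [B, L, 2*text_feat_dim]
+ # yield a [B, min(hidden_size, 1024)] time embedding and a [B, L, hidden_size] caption embedding.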
+
1436
+
1437
+ class ThinkGenAttnProcessor:
1438
+ """
1439
+ Processor for implementing scaled dot-product attention with flash attention and variable length sequences.
1440
+
1441
+ This processor is optimized for PyTorch 2.0 and implements:
1442
+ - Flash attention with variable length sequences
1443
+ - Rotary position embeddings (RoPE)
1444
+ - Query-Key normalization
1445
+ - Proportional attention scaling
1446
+
1447
+ Args:
1448
+ None
1449
+
1450
+ Raises:
1451
+ ImportError: If PyTorch version is less than 2.0
1452
+ """
1453
+
1454
+ def __init__(self) -> None:
1455
+ """Initialize the attention processor."""
1456
+ if not hasattr(F, "scaled_dot_product_attention"):
1457
+ raise ImportError(
1458
+ "ThinkGenAttnProcessorFlash2Varlen requires PyTorch 2.0. "
1459
+ "Please upgrade PyTorch to version 2.0 or later."
1460
+ )
1461
+
1462
+ def __call__(
1463
+ self,
1464
+ attn: Attention,
1465
+ hidden_states: torch.Tensor,
1466
+ encoder_hidden_states: torch.Tensor,
1467
+ attention_mask: Optional[torch.Tensor] = None,
1468
+ image_rotary_emb: Optional[torch.Tensor] = None,
1469
+ base_sequence_length: Optional[int] = None,
1470
+ ) -> torch.Tensor:
1471
+ """
1472
+ Process attention computation with flash attention.
1473
+
1474
+ Args:
1475
+ attn: Attention module
1476
+ hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
1477
+ encoder_hidden_states: Encoder hidden states tensor
1478
+ attention_mask: Optional attention mask tensor
1479
+ image_rotary_emb: Optional rotary embeddings for image tokens
1480
+ base_sequence_length: Optional base sequence length for proportional attention
1481
+
1482
+ Returns:
1483
+ torch.Tensor: Processed hidden states after attention computation
1484
+ """
1485
+ batch_size, sequence_length, _ = hidden_states.shape
1486
+
1487
+ # Get Query-Key-Value Pair
1488
+ query = attn.to_q(hidden_states)
1489
+ key = attn.to_k(encoder_hidden_states)
1490
+ value = attn.to_v(encoder_hidden_states)
1491
+
1492
+ query_dim = query.shape[-1]
1493
+ inner_dim = key.shape[-1]
1494
+ head_dim = query_dim // attn.heads
1495
+ dtype = query.dtype
1496
+
1497
+ # Get key-value heads
1498
+ kv_heads = inner_dim // head_dim
1499
+
1500
+ # Reshape tensors for attention computation
1501
+ query = query.view(batch_size, -1, attn.heads, head_dim)
1502
+ key = key.view(batch_size, -1, kv_heads, head_dim)
1503
+ value = value.view(batch_size, -1, kv_heads, head_dim)
1504
+
1505
+ # Apply Query-Key normalization
1506
+ if attn.norm_q is not None:
1507
+ query = attn.norm_q(query)
1508
+ if attn.norm_k is not None:
1509
+ key = attn.norm_k(key)
1510
+
1511
+ # Apply Rotary Position Embeddings
1512
+ if image_rotary_emb is not None:
1513
+ query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
1514
+ key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
1515
+
1516
+ query, key = query.to(dtype), key.to(dtype)
1517
+
1518
+ # Calculate attention scale
1519
+ if base_sequence_length is not None:
1520
+ softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
1521
+ else:
1522
+ softmax_scale = attn.scale
1523
+
1524
+ # scaled_dot_product_attention expects attention_mask shape to be
1525
+ # (batch, heads, source_length, target_length)
1526
+ if attention_mask is not None:
1527
+ attention_mask = attention_mask.bool().view(batch_size, 1, 1, -1)
1528
+
1529
+ query = query.transpose(1, 2)
1530
+ key = key.transpose(1, 2)
1531
+ value = value.transpose(1, 2)
1532
+
1533
+ # Explicitly repeat key/value heads to match the number of query heads; in our tests with PyTorch 2.6, using enable_gqa=True instead forced SDPA onto the MATH backend
1534
+ key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
1535
+ value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
1536
+
1537
+ hidden_states = F.scaled_dot_product_attention(
1538
+ query, key, value, attn_mask=attention_mask, scale=softmax_scale
1539
+ )
1540
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
1541
+ hidden_states = hidden_states.type_as(query)
1542
+
1543
+ # Apply output projection
1544
+ hidden_states = attn.to_out[0](hidden_states)
1545
+ hidden_states = attn.to_out[1](hidden_states)
1546
+
1547
+ return hidden_states
1548
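+ # Proportional attention note: with base_sequence_length set, the softmax scale becomes
+ # sqrt(log_base(seq_len)) * attn.scale, so seq_len == base_sequence_length recovers the default
+ # scale and longer sequences are scaled up smoothly.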
+
1549
+
1550
+
1551
+ class ThinkGenAttnProcessorFlash2Varlen:
1552
+ """
1553
+ Processor for implementing scaled dot-product attention with flash attention and variable length sequences.
1554
+
1555
+ This processor implements:
1556
+ - Flash attention with variable length sequences
1557
+ - Rotary position embeddings (RoPE)
1558
+ - Query-Key normalization
1559
+ - Proportional attention scaling
1560
+
1561
+ Args:
1562
+ None
1563
+ """
1564
+
1565
+ def __init__(self) -> None:
1566
+ """Initialize the attention processor."""
1567
+ if not is_flash_attn_available():
1568
+ raise ImportError(
1569
+ "ThinkGenAttnProcessorFlash2Varlen requires flash_attn. "
1570
+ "Please install flash_attn."
1571
+ )
1572
+
1573
+ def _upad_input(
1574
+ self,
1575
+ query_layer: torch.Tensor,
1576
+ key_layer: torch.Tensor,
1577
+ value_layer: torch.Tensor,
1578
+ attention_mask: torch.Tensor,
1579
+ query_length: int,
1580
+ num_heads: int,
1581
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
1582
+ """
1583
+ Unpad the input tensors for flash attention.
1584
+
1585
+ Args:
1586
+ query_layer: Query tensor of shape (batch_size, seq_len, num_heads, head_dim)
1587
+ key_layer: Key tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
1588
+ value_layer: Value tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
1589
+ attention_mask: Attention mask tensor of shape (batch_size, seq_len)
1590
+ query_length: Length of the query sequence
1591
+ num_heads: Number of attention heads
1592
+
1593
+ Returns:
1594
+ Tuple containing:
1595
+ - Unpadded query tensor
1596
+ - Unpadded key tensor
1597
+ - Unpadded value tensor
1598
+ - Query indices
1599
+ - Tuple of cumulative sequence lengths for query and key
1600
+ - Tuple of maximum sequence lengths for query and key
1601
+ """
1602
+ def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
1603
+ """Helper function to get unpadding data from attention mask."""
1604
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
1605
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
1606
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
1607
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
1608
+ return indices, cu_seqlens, max_seqlen_in_batch
1609
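+ # Worked example (hypothetical mask): attention_mask [[1,1,0],[1,1,1]] gives seqlens [2, 3],
+ # cu_seqlens [0, 2, 5] and max_seqlen_in_batch 3, matching flash_attn's varlen convention.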
+
1610
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
1611
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
1612
+
1613
+ # Unpad key and value layers
1614
+ key_layer = index_first_axis(
1615
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
1616
+ indices_k,
1617
+ )
1618
+ value_layer = index_first_axis(
1619
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
1620
+ indices_k,
1621
+ )
1622
+
1623
+ # Handle different query length cases
1624
+ if query_length == kv_seq_len:
1625
+ query_layer = index_first_axis(
1626
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
1627
+ indices_k,
1628
+ )
1629
+ cu_seqlens_q = cu_seqlens_k
1630
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
1631
+ indices_q = indices_k
1632
+ elif query_length == 1:
1633
+ max_seqlen_in_batch_q = 1
1634
+ cu_seqlens_q = torch.arange(
1635
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
1636
+ )
1637
+ indices_q = cu_seqlens_q[:-1]
1638
+ query_layer = query_layer.squeeze(1)
1639
+ else:
1640
+ attention_mask = attention_mask[:, -query_length:]
1641
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
1642
+
1643
+ return (
1644
+ query_layer,
1645
+ key_layer,
1646
+ value_layer,
1647
+ indices_q,
1648
+ (cu_seqlens_q, cu_seqlens_k),
1649
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
1650
+ )
1651
+
1652
+ def __call__(
1653
+ self,
1654
+ attn: Attention,
1655
+ hidden_states: torch.Tensor,
1656
+ encoder_hidden_states: torch.Tensor,
1657
+ attention_mask: Optional[torch.Tensor] = None,
1658
+ image_rotary_emb: Optional[torch.Tensor] = None,
1659
+ base_sequence_length: Optional[int] = None,
1660
+ ) -> torch.Tensor:
1661
+ """
1662
+ Process attention computation with flash attention.
1663
+
1664
+ Args:
1665
+ attn: Attention module
1666
+ hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
1667
+ encoder_hidden_states: Encoder hidden states tensor
1668
+ attention_mask: Optional attention mask tensor
1669
+ image_rotary_emb: Optional rotary embeddings for image tokens
1670
+ base_sequence_length: Optional base sequence length for proportional attention
1671
+
1672
+ Returns:
1673
+ torch.Tensor: Processed hidden states after attention computation
1674
+ """
1675
+ batch_size, sequence_length, _ = hidden_states.shape
1676
+
1677
+ # Get Query-Key-Value Pair
1678
+ query = attn.to_q(hidden_states)
1679
+ key = attn.to_k(encoder_hidden_states)
1680
+ value = attn.to_v(encoder_hidden_states)
1681
+
1682
+ query_dim = query.shape[-1]
1683
+ inner_dim = key.shape[-1]
1684
+ head_dim = query_dim // attn.heads
1685
+ dtype = query.dtype
1686
+
1687
+ # Get key-value heads
1688
+ kv_heads = inner_dim // head_dim
1689
+
1690
+ # Reshape tensors for attention computation
1691
+ query = query.view(batch_size, -1, attn.heads, head_dim)
1692
+ key = key.view(batch_size, -1, kv_heads, head_dim)
1693
+ value = value.view(batch_size, -1, kv_heads, head_dim)
1694
+
1695
+ # Apply Query-Key normalization
1696
+ if attn.norm_q is not None:
1697
+ query = attn.norm_q(query)
1698
+ if attn.norm_k is not None:
1699
+ key = attn.norm_k(key)
1700
+
1701
+ # Apply Rotary Position Embeddings
1702
+ if image_rotary_emb is not None:
1703
+ query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
1704
+ key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
1705
+
1706
+ query, key = query.to(dtype), key.to(dtype)
1707
+
1708
+ # Calculate attention scale
1709
+ if base_sequence_length is not None:
1710
+ softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
1711
+ else:
1712
+ softmax_scale = attn.scale
+
+         # Unpad input for flash attention
+         (
+             query_states,
+             key_states,
+             value_states,
+             indices_q,
+             cu_seq_lens,
+             max_seq_lens,
+         ) = self._upad_input(query, key, value, attention_mask, sequence_length, attn.heads)
+
+         cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+         max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+         # Repeat key/value heads for grouped-query attention (kv_heads < heads)
+         if kv_heads < attn.heads:
+             key_states = repeat(key_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
+             value_states = repeat(value_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
+
+         # Apply flash attention
+         attn_output_unpad = flash_attn_varlen_func(
+             query_states,
+             key_states,
+             value_states,
+             cu_seqlens_q=cu_seqlens_q,
+             cu_seqlens_k=cu_seqlens_k,
+             max_seqlen_q=max_seqlen_in_batch_q,
+             max_seqlen_k=max_seqlen_in_batch_k,
+             dropout_p=0.0,
+             causal=False,
+             softmax_scale=softmax_scale,
+         )
+
+         # Pad output and apply final transformations
+         hidden_states = pad_input(attn_output_unpad, indices_q, batch_size, sequence_length)
+         hidden_states = hidden_states.flatten(-2)
+         hidden_states = hidden_states.type_as(query)
+
+         # Apply output projection
+         hidden_states = attn.to_out[0](hidden_states)
+         hidden_states = attn.to_out[1](hidden_states)
+
+         return hidden_states
+
+
+ class ThinkGenTransformerBlock(nn.Module):
+     """
+     Transformer block for ThinkGen model.
+
+     This block implements a transformer layer with:
+     - Multi-head attention with flash attention
+     - Feed-forward network with SwiGLU activation
+     - RMS normalization
+     - Optional modulation for conditional generation
+
+     Args:
+         dim: Dimension of the input and output tensors
+         num_attention_heads: Number of attention heads
+         num_kv_heads: Number of key-value heads
+         multiple_of: The feed-forward hidden dimension is rounded to a multiple of this value
+         ffn_dim_multiplier: Multiplier for the feed-forward network dimension
+         norm_eps: Epsilon value for normalization layers
+         modulation: Whether to use modulation for conditional generation
+     """
+
+     def __init__(
+         self,
+         dim: int,
+         num_attention_heads: int,
+         num_kv_heads: int,
+         multiple_of: int,
+         ffn_dim_multiplier: float,
+         norm_eps: float,
+         modulation: bool = True,
+     ) -> None:
+         """Initialize the transformer block."""
+         super().__init__()
+         self.head_dim = dim // num_attention_heads
+         self.modulation = modulation
+
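+         # Prefer the flash-attention varlen processor; fall back to the eager
+         # processor when flash-attn is not installed.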
+         try:
+             processor = ThinkGenAttnProcessorFlash2Varlen()
+         except ImportError:
+             processor = ThinkGenAttnProcessor()
+
+         # Initialize attention layer
+         self.attn = Attention(
+             query_dim=dim,
+             cross_attention_dim=None,
+             dim_head=dim // num_attention_heads,
+             qk_norm="rms_norm",
+             heads=num_attention_heads,
+             kv_heads=num_kv_heads,
+             eps=1e-5,
+             bias=False,
+             out_bias=False,
+             processor=processor,
+         )
+
+         # Initialize feed-forward network
+         self.feed_forward = LuminaFeedForward(
+             dim=dim,
+             inner_dim=4 * dim,
+             multiple_of=multiple_of,
+             ffn_dim_multiplier=ffn_dim_multiplier,
+         )
+
+         # Initialize normalization layers
+         if modulation:
+             self.norm1 = LuminaRMSNormZero(
+                 embedding_dim=dim,
+                 norm_eps=norm_eps,
+                 norm_elementwise_affine=True,
+             )
+         else:
+             self.norm1 = RMSNorm(dim, eps=norm_eps)
+
+         self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
+         self.norm2 = RMSNorm(dim, eps=norm_eps)
+         self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
+
+         self.initialize_weights()
+
+     def initialize_weights(self) -> None:
+         """
+         Initialize the weights of the transformer block.
+
+         Uses Xavier uniform initialization for linear layers and zero initialization for biases.
+         """
+         nn.init.xavier_uniform_(self.attn.to_q.weight)
+         nn.init.xavier_uniform_(self.attn.to_k.weight)
+         nn.init.xavier_uniform_(self.attn.to_v.weight)
+         nn.init.xavier_uniform_(self.attn.to_out[0].weight)
+
+         nn.init.xavier_uniform_(self.feed_forward.linear_1.weight)
+         nn.init.xavier_uniform_(self.feed_forward.linear_2.weight)
+         nn.init.xavier_uniform_(self.feed_forward.linear_3.weight)
+
+         if self.modulation:
+             nn.init.zeros_(self.norm1.linear.weight)
+             nn.init.zeros_(self.norm1.linear.bias)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: torch.Tensor,
+         image_rotary_emb: torch.Tensor,
+         temb: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         """
+         Forward pass of the transformer block.
+
+         Args:
+             hidden_states: Input hidden states tensor
+             attention_mask: Attention mask tensor
+             image_rotary_emb: Rotary embeddings for image tokens
+             temb: Optional timestep embedding tensor
+
+         Returns:
+             torch.Tensor: Output hidden states after transformer block processing
+         """
+         enable_taylorseer = getattr(self, 'enable_taylorseer', False)
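+         # TaylorSeer caching: on 'full' steps the block is evaluated exactly
+         # and its output updates a Taylor-series cache of the feature
+         # trajectory; on 'Taylor' steps the output is extrapolated from that
+         # cache instead of recomputing attention and the feed-forward network.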
+         if enable_taylorseer:
+             if self.modulation:
+                 if temb is None:
+                     raise ValueError("temb must be provided when modulation is enabled")
+
+                 if self.current['type'] == 'full':
+                     self.current['module'] = 'total'
+                     taylor_cache_init(cache_dic=self.cache_dic, current=self.current)
+
+                     norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
+                     attn_output = self.attn(
+                         hidden_states=norm_hidden_states,
+                         encoder_hidden_states=norm_hidden_states,
+                         attention_mask=attention_mask,
+                         image_rotary_emb=image_rotary_emb,
+                     )
+                     hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
+                     mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
+                     hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
+
+                     derivative_approximation(cache_dic=self.cache_dic, current=self.current, feature=hidden_states)
+
+                 elif self.current['type'] == 'Taylor':
+                     self.current['module'] = 'total'
+                     hidden_states = taylor_formula(cache_dic=self.cache_dic, current=self.current)
+             else:
+                 norm_hidden_states = self.norm1(hidden_states)
+                 attn_output = self.attn(
+                     hidden_states=norm_hidden_states,
+                     encoder_hidden_states=norm_hidden_states,
+                     attention_mask=attention_mask,
+                     image_rotary_emb=image_rotary_emb,
+                 )
+                 hidden_states = hidden_states + self.norm2(attn_output)
+                 mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
+                 hidden_states = hidden_states + self.ffn_norm2(mlp_output)
+         else:
+             if self.modulation:
+                 if temb is None:
+                     raise ValueError("temb must be provided when modulation is enabled")
+
+                 norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
+                 attn_output = self.attn(
+                     hidden_states=norm_hidden_states,
+                     encoder_hidden_states=norm_hidden_states,
+                     attention_mask=attention_mask,
+                     image_rotary_emb=image_rotary_emb,
+                 )
+                 hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
+                 mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
+                 hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
+             else:
+                 norm_hidden_states = self.norm1(hidden_states)
+                 attn_output = self.attn(
+                     hidden_states=norm_hidden_states,
+                     encoder_hidden_states=norm_hidden_states,
+                     attention_mask=attention_mask,
+                     image_rotary_emb=image_rotary_emb,
+                 )
+                 hidden_states = hidden_states + self.norm2(attn_output)
+                 mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
+                 hidden_states = hidden_states + self.ffn_norm2(mlp_output)
+
+         return hidden_states
+
+
+ class ThinkGenTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+     """
+     ThinkGen Transformer 2D Model.
+
+     A transformer-based diffusion model for image generation with:
+     - Patch-based image processing
+     - Rotary position embeddings
+     - Multi-head attention
+     - Conditional generation support
+
+     Args:
+         patch_size: Size of image patches
+         in_channels: Number of input channels
+         out_channels: Number of output channels (defaults to in_channels)
+         hidden_size: Size of hidden layers
+         num_layers: Number of transformer layers
+         num_refiner_layers: Number of refiner layers
+         num_attention_heads: Number of attention heads
+         num_kv_heads: Number of key-value heads
+         multiple_of: The feed-forward hidden dimension is rounded to a multiple of this value
+         ffn_dim_multiplier: Multiplier for feed-forward network dimension
+         norm_eps: Epsilon value for normalization layers
+         axes_dim_rope: Dimensions for rotary position embeddings
+         axes_lens: Lengths for rotary position embeddings
+         text_feat_dim: Dimension of text features
+         timestep_scale: Scale factor for timestep embeddings
+     """
+
+     _supports_gradient_checkpointing = True
+     _no_split_modules = ["ThinkGenTransformerBlock"]
+     _skip_layerwise_casting_patterns = ["x_embedder", "norm"]
+
+     @register_to_config
+     def __init__(
+         self,
+         patch_size: int = 2,
+         in_channels: int = 16,
+         out_channels: Optional[int] = None,
+         hidden_size: int = 2304,
+         num_layers: int = 26,
+         num_refiner_layers: int = 2,
+         num_attention_heads: int = 24,
+         num_kv_heads: int = 8,
+         multiple_of: int = 256,
+         ffn_dim_multiplier: Optional[float] = None,
+         norm_eps: float = 1e-5,
+         axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
+         axes_lens: Tuple[int, int, int] = (300, 512, 512),
+         text_feat_dim: int = 1024,
+         timestep_scale: float = 1.0,
+     ) -> None:
+         """Initialize the ThinkGen transformer model."""
+         super().__init__()
+
+         # Validate configuration
+         if (hidden_size // num_attention_heads) != sum(axes_dim_rope):
+             raise ValueError(
+                 f"hidden_size // num_attention_heads ({hidden_size // num_attention_heads}) "
+                 f"must equal sum(axes_dim_rope) ({sum(axes_dim_rope)})"
+             )
+
+         self.out_channels = out_channels or in_channels
+
+         # Initialize embeddings
+         self.rope_embedder = ThinkGenRotaryPosEmbed(
+             theta=10000,
+             axes_dim=axes_dim_rope,
+             axes_lens=axes_lens,
+             patch_size=patch_size,
+         )
+
+         self.x_embedder = nn.Linear(
+             in_features=patch_size * patch_size * in_channels,
+             out_features=hidden_size,
+         )
+
+         self.ref_image_patch_embedder = nn.Linear(
+             in_features=patch_size * patch_size * in_channels,
+             out_features=hidden_size,
+         )
+
+         self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
+             hidden_size=hidden_size,
+             text_feat_dim=text_feat_dim,
+             norm_eps=norm_eps,
+             timestep_scale=timestep_scale,
+         )
+
+         # Initialize refiner blocks
+         self.noise_refiner = nn.ModuleList([
+             ThinkGenTransformerBlock(
+                 hidden_size,
+                 num_attention_heads,
+                 num_kv_heads,
+                 multiple_of,
+                 ffn_dim_multiplier,
+                 norm_eps,
+                 modulation=True,
+             )
+             for _ in range(num_refiner_layers)
+         ])
+
+         self.ref_image_refiner = nn.ModuleList([
+             ThinkGenTransformerBlock(
+                 hidden_size,
+                 num_attention_heads,
+                 num_kv_heads,
+                 multiple_of,
+                 ffn_dim_multiplier,
+                 norm_eps,
+                 modulation=True,
+             )
+             for _ in range(num_refiner_layers)
+         ])
+
+         self.context_refiner = nn.ModuleList([
+             ThinkGenTransformerBlock(
+                 hidden_size,
+                 num_attention_heads,
+                 num_kv_heads,
+                 multiple_of,
+                 ffn_dim_multiplier,
+                 norm_eps,
+                 modulation=False,
+             )
+             for _ in range(num_refiner_layers)
+         ])
+
+         # Main transformer blocks
+         self.layers = nn.ModuleList([
+             ThinkGenTransformerBlock(
+                 hidden_size,
+                 num_attention_heads,
+                 num_kv_heads,
+                 multiple_of,
+                 ffn_dim_multiplier,
+                 norm_eps,
+                 modulation=True,
+             )
+             for _ in range(num_layers)
+         ])
+
+         # Output norm & projection
+         self.norm_out = LuminaLayerNormContinuous(
+             embedding_dim=hidden_size,
+             conditioning_embedding_dim=min(hidden_size, 1024),
+             elementwise_affine=False,
+             eps=1e-6,
+             bias=True,
+             out_dim=patch_size * patch_size * self.out_channels,
+         )
+
+         # Learnable embeddings to distinguish different images
+         self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size))  # supports at most 5 ref images
+
+         self.gradient_checkpointing = False
+
+         self.initialize_weights()
+
+         # TeaCache settings
+         self.enable_teacache = False
+         self.teacache_rel_l1_thresh = 0.05
+         self.teacache_params = TeaCacheParams()
+
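+         # Empirical polynomial (from TeaCache) that rescales the raw
+         # relative-L1 change of the modulated input before it is accumulated
+         # against teacache_rel_l1_thresh in forward().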
+         coefficients = [-5.48259225, 11.48772289, -4.47407401, 2.47730926, -0.03316487]
+         self.rescale_func = np.poly1d(coefficients)
+
+         self.prepad_embed = nn.Parameter(torch.randn(1, 23, 8192))
+         print("Added prepad_embed parameter!")
+
+         self.register_buffer('prepad_mask', torch.ones(1, 23).to(torch.int64))
+
+
+     def initialize_weights(self) -> None:
+         """
+         Initialize the weights of the model.
+
+         Uses Xavier uniform initialization for linear layers.
+         """
+         nn.init.xavier_uniform_(self.x_embedder.weight)
+         nn.init.constant_(self.x_embedder.bias, 0.0)
+
+         nn.init.xavier_uniform_(self.ref_image_patch_embedder.weight)
+         nn.init.constant_(self.ref_image_patch_embedder.bias, 0.0)
+
+         nn.init.zeros_(self.norm_out.linear_1.weight)
+         nn.init.zeros_(self.norm_out.linear_1.bias)
+         nn.init.zeros_(self.norm_out.linear_2.weight)
+         nn.init.zeros_(self.norm_out.linear_2.bias)
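+         # The zero-initialized output norm/projection makes the final
+         # prediction start at zero, a common stabilization for diffusion
+         # transformers.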
+
+         nn.init.normal_(self.image_index_embedding, std=0.02)
+
+     def img_patch_embed_and_refine(
+         self,
+         hidden_states,
+         ref_image_hidden_states,
+         padded_img_mask,
+         padded_ref_img_mask,
+         noise_rotary_emb,
+         ref_img_rotary_emb,
+         l_effective_ref_img_len,
+         l_effective_img_len,
+         temb,
+     ):
+         batch_size = len(hidden_states)
+         max_combined_img_len = max([img_len + sum(ref_img_len) for img_len, ref_img_len in zip(l_effective_img_len, l_effective_ref_img_len)])
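+         # The combined per-sample sequence is all reference-image tokens
+         # followed by the noisy-image tokens.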
+
+         hidden_states = self.x_embedder(hidden_states)
+         ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)
+
+         # Add image_index_embedding so tokens from different reference images are distinguishable
+         for i in range(batch_size):
+             shift = 0
+             for j, ref_img_len in enumerate(l_effective_ref_img_len[i]):
+                 ref_image_hidden_states[i, shift:shift + ref_img_len, :] = ref_image_hidden_states[i, shift:shift + ref_img_len, :] + self.image_index_embedding[j]
+                 shift += ref_img_len
+
+         for layer in self.noise_refiner:
+             hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb)
+
+         flat_l_effective_ref_img_len = list(itertools.chain(*l_effective_ref_img_len))
+         num_ref_images = len(flat_l_effective_ref_img_len)
+         max_ref_img_len = max(flat_l_effective_ref_img_len)
+
+         batch_ref_img_mask = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, dtype=torch.bool)
+         batch_ref_image_hidden_states = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, self.config.hidden_size)
+         batch_ref_img_rotary_emb = hidden_states.new_zeros(num_ref_images, max_ref_img_len, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype)
+         batch_temb = temb.new_zeros(num_ref_images, *temb.shape[1:], dtype=temb.dtype)
+
+         # Scatter the per-sample sequences of ref-image tokens into a flat batch
+         idx = 0
+         for i in range(batch_size):
+             shift = 0
+             for ref_img_len in l_effective_ref_img_len[i]:
+                 batch_ref_img_mask[idx, :ref_img_len] = True
+                 batch_ref_image_hidden_states[idx, :ref_img_len] = ref_image_hidden_states[i, shift:shift + ref_img_len]
+                 batch_ref_img_rotary_emb[idx, :ref_img_len] = ref_img_rotary_emb[i, shift:shift + ref_img_len]
+                 batch_temb[idx] = temb[i]
+                 shift += ref_img_len
+                 idx += 1
+
+         # Refine ref images separately
+         for layer in self.ref_image_refiner:
+             batch_ref_image_hidden_states = layer(batch_ref_image_hidden_states, batch_ref_img_mask, batch_ref_img_rotary_emb, batch_temb)
+
+         # Gather the refined ref-image batch back into per-sample sequences
+         idx = 0
+         for i in range(batch_size):
+             shift = 0
+             for ref_img_len in l_effective_ref_img_len[i]:
+                 ref_image_hidden_states[i, shift:shift + ref_img_len] = batch_ref_image_hidden_states[idx, :ref_img_len]
+                 shift += ref_img_len
+                 idx += 1
+
+         combined_img_hidden_states = hidden_states.new_zeros(batch_size, max_combined_img_len, self.config.hidden_size)
+         for i, (ref_img_len, img_len) in enumerate(zip(l_effective_ref_img_len, l_effective_img_len)):
+             combined_img_hidden_states[i, :sum(ref_img_len)] = ref_image_hidden_states[i, :sum(ref_img_len)]
+             combined_img_hidden_states[i, sum(ref_img_len):sum(ref_img_len) + img_len] = hidden_states[i, :img_len]
+
+         return combined_img_hidden_states
+
+     def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
+         batch_size = len(hidden_states)
+         p = self.config.patch_size
+         device = hidden_states[0].device
+
+         img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
+         l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]
+
+         if ref_image_hidden_states is not None:
+             ref_img_sizes = [[(img.size(1), img.size(2)) for img in imgs] if imgs is not None else None for imgs in ref_image_hidden_states]
+             l_effective_ref_img_len = [[(ref_img_size[0] // p) * (ref_img_size[1] // p) for ref_img_size in _ref_img_sizes] if _ref_img_sizes is not None else [0] for _ref_img_sizes in ref_img_sizes]
+         else:
+             ref_img_sizes = [None for _ in range(batch_size)]
+             l_effective_ref_img_len = [[0] for _ in range(batch_size)]
+
+         max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
+         max_img_len = max(l_effective_img_len)
+
+         # Patchify reference images into flat token sequences
+         flat_ref_img_hidden_states = []
+         for i in range(batch_size):
+             if ref_img_sizes[i] is not None:
+                 imgs = []
+                 for ref_img in ref_image_hidden_states[i]:
+                     C, H, W = ref_img.size()
+                     ref_img = rearrange(ref_img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
+                     imgs.append(ref_img)
+
+                 img = torch.cat(imgs, dim=0)
+                 flat_ref_img_hidden_states.append(img)
+             else:
+                 flat_ref_img_hidden_states.append(None)
+
+         # Patchify target images into flat token sequences
+         flat_hidden_states = []
+         for i in range(batch_size):
+             img = hidden_states[i]
+             C, H, W = img.size()
+
+             img = rearrange(img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
+             flat_hidden_states.append(img)
+
+         padded_ref_img_hidden_states = torch.zeros(batch_size, max_ref_img_len, flat_hidden_states[0].shape[-1], device=device, dtype=flat_hidden_states[0].dtype)
+         padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
+         for i in range(batch_size):
+             if ref_img_sizes[i] is not None:
+                 padded_ref_img_hidden_states[i, :sum(l_effective_ref_img_len[i])] = flat_ref_img_hidden_states[i]
+                 padded_ref_img_mask[i, :sum(l_effective_ref_img_len[i])] = True
+
+         padded_hidden_states = torch.zeros(batch_size, max_img_len, flat_hidden_states[0].shape[-1], device=device, dtype=flat_hidden_states[0].dtype)
+         padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
+         for i in range(batch_size):
+             padded_hidden_states[i, :l_effective_img_len[i]] = flat_hidden_states[i]
+             padded_img_mask[i, :l_effective_img_len[i]] = True
+
+         return (
+             padded_hidden_states,
+             padded_ref_img_hidden_states,
+             padded_img_mask,
+             padded_ref_img_mask,
+             l_effective_ref_img_len,
+             l_effective_img_len,
+             ref_img_sizes,
+             img_sizes,
+         )
+
+     def forward(
+         self,
+         hidden_states: Union[torch.Tensor, List[torch.Tensor]],
+         timestep: torch.Tensor,
+         text_hidden_states: torch.Tensor,
+         freqs_cis: torch.Tensor,
+         text_attention_mask: torch.Tensor,
+         ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,
+         attention_kwargs: Optional[Dict[str, Any]] = None,
+         return_dict: bool = False,
+     ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+         enable_taylorseer = getattr(self, 'enable_taylorseer', False)
+
+         # if self.prepad_embed.dtype != text_hidden_states.dtype:
+         #     self.prepad_embed = self.prepad_embed.to(text_hidden_states.dtype)
+         # if self.prepad_mask.device != text_attention_mask.device:
+         #     self.prepad_mask = self.prepad_mask.to(text_attention_mask.device)
+
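+         # Prepend the 23 learnable prepad embeddings to the text features and
+         # extend the text attention mask with matching ones.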
+         bs = text_hidden_states.shape[0]
+         prepad_embed = self.prepad_embed.repeat(bs, 1, 1)
+         prepad_mask = self.prepad_mask.repeat(bs, 1)
+         text_hidden_states = torch.cat([prepad_embed, text_hidden_states], dim=1)
+         text_attention_mask = torch.cat([prepad_mask, text_attention_mask], dim=1)
+
+         if enable_taylorseer:
+             cal_type(self.cache_dic, self.current)
+
+         if attention_kwargs is not None:
+             attention_kwargs = attention_kwargs.copy()
+             lora_scale = attention_kwargs.pop("scale", 1.0)
+         else:
+             lora_scale = 1.0
+
+         if USE_PEFT_BACKEND:
+             # weight the lora layers by setting `lora_scale` for each PEFT layer
+             scale_lora_layers(self, lora_scale)
+         else:
+             if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                 logger.warning(
+                     "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                 )
+
+         # 1. Condition, positional & patch embedding
+         batch_size = len(hidden_states)
+         is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)
+
+         if is_hidden_states_tensor:
+             assert hidden_states.ndim == 4
+             hidden_states = [_hidden_states for _hidden_states in hidden_states]
+
+         device = hidden_states[0].device
+
+         temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)
+
+         (
+             hidden_states,
+             ref_image_hidden_states,
+             img_mask,
+             ref_img_mask,
+             l_effective_ref_img_len,
+             l_effective_img_len,
+             ref_img_sizes,
+             img_sizes,
+         ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
+
+         (
+             context_rotary_emb,
+             ref_img_rotary_emb,
+             noise_rotary_emb,
+             rotary_emb,
+             encoder_seq_lengths,
+             seq_lengths,
+         ) = self.rope_embedder(
+             freqs_cis,
+             text_attention_mask,
+             l_effective_ref_img_len,
+             l_effective_img_len,
+             ref_img_sizes,
+             img_sizes,
+             device,
+         )
+
+         # 2. Context refinement
+         for layer in self.context_refiner:
+             text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)
+
+         combined_img_hidden_states = self.img_patch_embed_and_refine(
+             hidden_states,
+             ref_image_hidden_states,
+             img_mask,
+             ref_img_mask,
+             noise_rotary_emb,
+             ref_img_rotary_emb,
+             l_effective_ref_img_len,
+             l_effective_img_len,
+             temb,
+         )
+
+         # 3. Joint transformer blocks over the concatenated text and image embeddings
+         max_seq_len = max(seq_lengths)
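+         # Per-sample joint layout: [text tokens | ref-image tokens | noisy-image tokens]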
+
+         attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
+         joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)
+         for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
+             attention_mask[i, :seq_len] = True
+             joint_hidden_states[i, :encoder_seq_len] = text_hidden_states[i, :encoder_seq_len]
+             joint_hidden_states[i, encoder_seq_len:seq_len] = combined_img_hidden_states[i, :seq_len - encoder_seq_len]
+
+         hidden_states = joint_hidden_states
+
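+         # TeaCache: estimate how much the block inputs changed since the last
+         # computed step; if the accumulated rescaled relative-L1 change stays
+         # below teacache_rel_l1_thresh, skip the transformer stack and reuse
+         # the cached residual.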
+         if self.enable_teacache:
+             teacache_hidden_states = hidden_states.clone()
+             teacache_temb = temb.clone()
+             modulated_inp, _, _, _ = self.layers[0].norm1(teacache_hidden_states, teacache_temb)
+             if self.teacache_params.is_first_or_last_step:
+                 should_calc = True
+                 self.teacache_params.accumulated_rel_l1_distance = 0
+             else:
+                 self.teacache_params.accumulated_rel_l1_distance += self.rescale_func(
+                     ((modulated_inp - self.teacache_params.previous_modulated_inp).abs().mean()
+                      / self.teacache_params.previous_modulated_inp.abs().mean()).cpu().item()
+                 )
+                 if self.teacache_params.accumulated_rel_l1_distance < self.teacache_rel_l1_thresh:
+                     should_calc = False
+                 else:
+                     should_calc = True
+                     self.teacache_params.accumulated_rel_l1_distance = 0
+             self.teacache_params.previous_modulated_inp = modulated_inp
+
+         if self.enable_teacache:
+             if not should_calc:
+                 hidden_states += self.teacache_params.previous_residual
+             else:
+                 ori_hidden_states = hidden_states.clone()
+                 for layer_idx, layer in enumerate(self.layers):
+                     if torch.is_grad_enabled() and self.gradient_checkpointing:
+                         hidden_states = self._gradient_checkpointing_func(
+                             layer, hidden_states, attention_mask, rotary_emb, temb
+                         )
+                     else:
+                         hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)
+                 self.teacache_params.previous_residual = hidden_states - ori_hidden_states
+         else:
+             if enable_taylorseer:
+                 self.current['stream'] = 'layers_stream'
+
+             for layer_idx, layer in enumerate(self.layers):
+                 if enable_taylorseer:
+                     layer.current = self.current
+                     layer.cache_dic = self.cache_dic
+                     layer.enable_taylorseer = True
+                     self.current['layer'] = layer_idx
+
+                 if torch.is_grad_enabled() and self.gradient_checkpointing:
+                     hidden_states = self._gradient_checkpointing_func(
+                         layer, hidden_states, attention_mask, rotary_emb, temb
+                     )
+                 else:
+                     hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)
+
+         # 4. Output norm & projection
+         hidden_states = self.norm_out(hidden_states, temb)
+
+         p = self.config.patch_size
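+         # Unpatchify: each sample's image tokens sit at the tail of its joint
+         # sequence; fold the patch dimension back into (C, H, W).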
+         output = []
+         for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
+             height, width = img_size
+             output.append(rearrange(hidden_states[i][seq_len - img_len:seq_len], '(h w) (p1 p2 c) -> c (h p1) (w p2)', h=height // p, w=width // p, p1=p, p2=p))
+         if is_hidden_states_tensor:
+             output = torch.stack(output, dim=0)
+
+         if USE_PEFT_BACKEND:
+             # remove `lora_scale` from each PEFT layer
+             unscale_lora_layers(self, lora_scale)
+
+         if enable_taylorseer:
+             self.current['step'] += 1
+
+         if not return_dict:
+             return output
+         return Transformer2DModelOutput(sample=output)
vae/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.34.1",
+   "_name_or_path": "/share_2/luoxin/modelscope/hub/models/FLUX.1-dev",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "force_upcast": true,
+   "in_channels": 3,
+   "latent_channels": 16,
+   "latents_mean": null,
+   "latents_std": null,
+   "layers_per_block": 2,
+   "mid_block_add_attention": true,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 1024,
+   "scaling_factor": 0.3611,
+   "shift_factor": 0.1159,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ],
+   "use_post_quant_conv": false,
+   "use_quant_conv": false
+ }
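The config above matches the FLUX.1-dev autoencoder (16 latent channels, `scaling_factor` 0.3611, `shift_factor` 0.1159). As a minimal sketch, assuming the repo id from this upload and the standard diffusers subfolder layout, the VAE can be loaded on its own:

```python
import torch
from diffusers import AutoencoderKL

# Hypothetical standalone load; the repo id and "vae" subfolder are
# assumptions based on this upload, not a documented entry point.
vae = AutoencoderKL.from_pretrained(
    "JSYuuu/ThinkGen-stage3", subfolder="vae", torch_dtype=torch.bfloat16
)

# Decoding latents back to pixels should undo the shift/scale recorded in the
# config: image = vae.decode(latents / vae.config.scaling_factor
#                            + vae.config.shift_factor).sample
```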
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c717328c8ad41faab2ccfd52ae17332505c6833cf176aad56e7b58f2c4d4c94
+ size 335306212