xinhe committed 11 days ago

Commit

63aad33

verified ·

1 Parent(s): 463bbc3

Upload folder using huggingface_hub

Browse files

Files changed (47) hide show

.gitattributes +1 -0
added_tokens.json +34 -0
chat_template.jinja +54 -0
config.json +85 -0
config.py +45 -0
generation_config.json +6 -0
merges.txt +0 -0
model-00001-of-00011.safetensors +3 -0
model-00002-of-00011.safetensors +3 -0
model-00003-of-00011.safetensors +3 -0
model-00004-of-00011.safetensors +3 -0
model-00005-of-00011.safetensors +3 -0
model-00006-of-00011.safetensors +3 -0
model-00007-of-00011.safetensors +3 -0
model-00008-of-00011.safetensors +3 -0
model-00009-of-00011.safetensors +3 -0
model-00010-of-00011.safetensors +3 -0
model-00011-of-00011.safetensors +3 -0
model.safetensors.index.json +0 -0
models/__pycache__/config.cpython-312.pyc +0 -0
models/__pycache__/gen_pipeline.cpython-312.pyc +0 -0
models/__pycache__/heads.cpython-312.pyc +0 -0
models/__pycache__/llama_model.cpython-312.pyc +0 -0
models/__pycache__/nextstep_model.cpython-312.pyc +0 -0
models/config.py +45 -0
models/gen_pipeline.py +398 -0
models/heads.py +283 -0
models/llama_model.py +568 -0
models/nextstep_model.py +553 -0
quantization_config.json +12 -0
special_tokens_map.json +27 -0
tokenizer.json +3 -0
tokenizer_config.json +284 -0
utils/__pycache__/compile_utils.cpython-312.pyc +0 -0
utils/__pycache__/image_utils.cpython-312.pyc +0 -0
utils/__pycache__/misc.cpython-312.pyc +0 -0
utils/__pycache__/model_utils.cpython-312.pyc +0 -0
utils/aspect_ratio.py +107 -0
utils/compile_utils.py +122 -0
utils/image_utils.py +314 -0
utils/misc.py +51 -0
utils/model_utils.py +128 -0
vae/__pycache__/nextstep_ae.cpython-312.pyc +0 -0
vae/checkpoint.pt +3 -0
vae/config.json +14 -0
vae/nextstep_ae.py +494 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|begin_of_image|>": 151667,
+  "<|begin_of_prompt_refinement|>": 151670,
+  "<|begin_of_thinking|>": 151672,
+  "<|beginoftext|>": 151674,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|end_of_image|>": 151668,
+  "<|end_of_prompt_refinement|>": 151671,
+  "<|end_of_thinking|>": 151673,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_area|>": 151666,
+  "<|image_pad|>": 151655,
+  "<|image_placeholder|>": 151669,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652,
+  "[PAD]": 151665
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,85 @@

+{
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "config.NextStepConfig",
+    "AutoModel": "models/nextstep_model.NextStep"
+  },
+  "base_image_grid_size": 64,
+  "boi": 151667,
+  "bos_token_id": 151643,
+  "create_kwargs": {
+    "snr_type": "lognorm"
+  },
+  "dtype": "float32",
+  "eoi": 151668,
+  "eos_token_id": 151643,
+  "fm_head_batch_mul": 4,
+  "fm_head_dim": 1536,
+  "fm_head_layers": 12,
+  "genloss_batch_mul": 4,
+  "genloss_depth": 12,
+  "genloss_net_arch": "mlp",
+  "genloss_num_sampling_steps": "100",
+  "genloss_type": "transport",
+  "genloss_width": 1536,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "im_loss_weight": 1.0,
+  "image_decoder_arch": "Trans_E",
+  "image_encoder_name": null,
+  "image_feature_layer": -2,
+  "image_loss_weight": 1.0,
+  "image_placeholder_id": 151669,
+  "image_size": 64,
+  "initializer_range": 0.02,
+  "intermediate_size": 13824,
+  "latent_channels": 16,
+  "latent_patch_size": 2,
+  "latent_size": 32,
+  "lm_loss_weight": 0.01,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 48,
+  "mlp_bias": false,
+  "model_type": "nextstep",
+  "noise_strength": 0.0,
+  "num_attention_heads": 40,
+  "num_channels": 16,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "o_attention_bias": false,
+  "pad_token_id_added": 151665,
+  "patch_size": 2,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "autoround_version": "0.13.0",
+    "batch_size": 1,
+    "bits": 4,
+    "block_name_to_quantize": "layers,image_head.net.res_blocks",
+    "data_type": "int",
+    "gradient_accumulate_steps": 8,
+    "group_size": 128,
+    "packing_format": "auto_round:auto_gptq",
+    "quant_method": "auto-round",
+    "sym": true
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.6",
+  "use_2d_rope": false,
+  "use_cache": true,
+  "use_gen_pos_embed": false,
+  "use_mlp_before_lm_head": false,
+  "use_sliding_window": false,
+  "use_token_length_weight": false,
+  "vae_name_or_path": "vae/",
+  "vocab_size": 152064
+}

config.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from transformers.models.llama.configuration_llama import LlamaConfig
+class NextStepConfig(LlamaConfig):
+    model_type = "nextstep"
+    def __init__(
+        self,
+        vae_name_or_path: str | None = None,
+        latent_size: int = 32,
+        latent_patch_size: int = 2,
+        latent_channels: int = 16,
+        boi: int | None = None,
+        eoi: int | None = None,
+        image_placeholder_id: int | None = None,
+        pad_token_id_added: int | None = None,
+        lm_loss_weight: float = 0.01,
+        im_loss_weight: float = 1.0,
+        fm_head_dim: int = 1536,
+        fm_head_layers: int = 12,
+        fm_head_batch_mul: int = 4,
+        o_attention_bias: bool | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.vae_name_or_path = vae_name_or_path
+        self.latent_size = latent_size
+        self.latent_patch_size = latent_patch_size
+        self.latent_channels = latent_channels
+        self.boi = boi
+        self.eoi = eoi
+        self.image_placeholder_id = image_placeholder_id
+        self.pad_token_id_added = pad_token_id_added
+        self.lm_loss_weight = lm_loss_weight
+        self.im_loss_weight = im_loss_weight
+        self.fm_head_dim = fm_head_dim
+        self.fm_head_layers = fm_head_layers
+        self.fm_head_batch_mul = fm_head_batch_mul
+        self.o_attention_bias = self.attention_bias if o_attention_bias is None else o_attention_bias

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "transformers_version": "4.57.6"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:077443740552ca1200c38d6cd4c4b4d0b7a521dfde20c07ec2a029286c7164bc
+size 1070748352

model-00002-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf6f83a31891db5b95558615e56616954a0ef3953d9ecbd2b6065d3cfa4303c2
+size 1073712592

model-00003-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:334bfc82a3872b08794934450f2e6931d6e24bb69ed80516c373d49429248d8a
+size 1071875280

model-00004-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b8847b0405a3dad6e9568694260d06cbc78c782350a48f47c6274b99e22b866
+size 1073712648

model-00005-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5595e49f4bf51f164eb10f38a06f09d0bb0695a7d4bcd5f0c7447f5ef225a561
+size 1071875280

model-00006-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fce180ad24077aa642a5e24d059580267f87e390fa9e88068708213babc2550a
+size 1073712648

model-00007-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb2886adb554edd8f65d038e7d0d2ae49a7a8f68a1d27927e1edf65e805aa60f
+size 504065392

model-00008-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b5f5fe250f5cbce219aadb61c9a44903739e510a66184cc960bfce87175bc34
+size 1557135464

model-00009-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e92a52899f9d6cbc039d0505ca8e138a6851d201dfa11c0c28a749e4291d8879
+size 10320

model-00010-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ddc7205f05cccae7a7ba77dda441576f4742084f47756571fd7e40c34e670c5
+size 1557135456

model-00011-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b93ab2f652b8ba6cc6889d50fdfd9cd89b6b6cce05f2f4282e8c1d048937177
+size 84188960

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (1.88 kB). View file

models/__pycache__/gen_pipeline.cpython-312.pyc ADDED Viewed

Binary file (19.3 kB). View file

models/__pycache__/heads.cpython-312.pyc ADDED Viewed

Binary file (18.5 kB). View file

models/__pycache__/llama_model.cpython-312.pyc ADDED Viewed

Binary file (27.8 kB). View file

models/__pycache__/nextstep_model.cpython-312.pyc ADDED Viewed

Binary file (26.9 kB). View file

models/config.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from transformers.models.llama.configuration_llama import LlamaConfig
+class NextStepConfig(LlamaConfig):
+    model_type = "nextstep"
+    def __init__(
+        self,
+        vae_name_or_path: str | None = None,
+        latent_size: int = 32,
+        latent_patch_size: int = 2,
+        latent_channels: int = 16,
+        boi: int | None = None,
+        eoi: int | None = None,
+        image_placeholder_id: int | None = None,
+        pad_token_id_added: int | None = None,
+        lm_loss_weight: float = 0.01,
+        im_loss_weight: float = 1.0,
+        fm_head_dim: int = 1536,
+        fm_head_layers: int = 12,
+        fm_head_batch_mul: int = 4,
+        o_attention_bias: bool | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.vae_name_or_path = vae_name_or_path
+        self.latent_size = latent_size
+        self.latent_patch_size = latent_patch_size
+        self.latent_channels = latent_channels
+        self.boi = boi
+        self.eoi = eoi
+        self.image_placeholder_id = image_placeholder_id
+        self.pad_token_id_added = pad_token_id_added
+        self.lm_loss_weight = lm_loss_weight
+        self.im_loss_weight = im_loss_weight
+        self.fm_head_dim = fm_head_dim
+        self.fm_head_layers = fm_head_layers
+        self.fm_head_batch_mul = fm_head_batch_mul
+        self.o_attention_bias = self.attention_bias if o_attention_bias is None else o_attention_bias

models/gen_pipeline.py ADDED Viewed

	@@ -0,0 +1,398 @@

+import re
+import copy
+from typing import Literal
+from PIL import Image
+from tqdm.auto import tqdm
+import torch
+import torch.nn as nn
+import torchvision.transforms as transforms
+from transformers import AutoTokenizer
+from transformers.cache_utils import Cache, StaticCache
+from models.nextstep_model import NextStep
+from vae.nextstep_ae import AutoencoderKL
+from utils.image_utils import to_pil
+from utils.model_utils import layer_norm
+from utils.compile_utils import compile_manager
+from utils.misc import set_seed
+DEFAULT_IMAGE_AREA_TOKEN = "<|image_area|>"
+def hw2str(h: int, w: int) -> str:
+    return f"{h}*{w}"
+class NextStepPipeline:
+    def __init__(
+        self,
+        model_name_or_path: str | None = None,
+        vae_name_or_path: str | None = None,
+        tokenizer: AutoTokenizer | None = None,
+        model: nn.Module | None = None,
+        vae: AutoencoderKL | None = None,
+    ):
+        if model is not None:
+            self.tokenizer = copy.deepcopy(tokenizer)
+            self.tokenizer.padding_side = "left"
+            self.model = model
+        elif model_name_or_path is not None:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_name_or_path,
+                local_files_only=True,
+                padding_side="left",
+                use_fast=True,
+            )
+            self.model: NextStep = NextStep.from_pretrained(model_name_or_path, local_files_only=True)
+        else:
+            raise ValueError("model or model_name_or_path is required")
+        self.tokenizer.add_eos_token = False
+        if vae_name_or_path is None:
+            vae_name_or_path = getattr(self.model.config, "vae_name_or_path", None)
+        if vae is not None:
+            self.vae = vae
+        elif vae_name_or_path is not None:
+            self.vae = AutoencoderKL.from_pretrained(vae_name_or_path)
+        else:
+            raise ValueError("vae or vae_name_or_path is required")
+        self.model.eval()
+        self.vae.eval()
+        vae_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.down_factor = vae_factor * self.model.config.latent_patch_size
+        self.shift_factor = getattr(self.vae.config, "shift_factor", 0.0)
+        self.scaling_factor = getattr(self.vae.config, "scaling_factor", 1.0)
+        self.boi = self.model.config.boi
+        self.eoi = self.model.config.eoi
+        self.image_placeholder_id = self.model.config.image_placeholder_id
+        self.pil2tensor = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
+            ]
+        )
+        self.__device = self.model.device
+        self.__dtype = self.model.dtype
+        self.to(self.device, self.dtype)
+    @property
+    def device(self):
+        """Device last recorded by ``__init__`` or ``to``."""
+        return self.__device
+    @property
+    def device_type(self):
+        if isinstance(self.__device, str):
+            return self.__device
+        return self.__device.type
+    @property
+    def dtype(self):
+        """Dtype last recorded by ``__init__`` or ``to``."""
+        return self.__dtype
+    def to(self, device: str | None = None, dtype: torch.dtype | None = None):
+        if device is not None:
+            self.__device = device
+        if dtype is not None:
+            self.__dtype = dtype
+        self.model.to(self.__device, dtype=self.__dtype)
+        self.vae.to(self.__device, dtype=self.__dtype)
+        return self
+    def _image_str(self, hw: tuple[int, int] = (256, 256)):
+        latent_hw = (hw[0] // self.down_factor, hw[1] // self.down_factor)
+        image_ids = [self.boi] + [self.image_placeholder_id] * (latent_hw[0] * latent_hw[1]) + [self.eoi]
+        image_str = DEFAULT_IMAGE_AREA_TOKEN + hw2str(*latent_hw) + self.tokenizer.decode(image_ids)
+        return image_str
+    def _check_input(
+        self, captions: str | list[str], images: Image.Image | list[Image.Image] | None
+    ) -> tuple[list[str], list[Image.Image] | None]:
+        if not isinstance(captions, list):
+            captions = [captions]
+        if images is not None:
+            if not isinstance(images, list):
+                images = [images]
+            # Validate image count matches <image> tokens in captions
+            image_token_count = 0
+            for caption in captions:
+                num_image_token = len(re.findall(r"<image>", caption))
+                assert num_image_token == 1, f"Caption `{caption}` has {num_image_token} image tokens, but only 1 is allowed."
+                image_token_count += num_image_token
+            if image_token_count != len(images):
+                raise ValueError(
+                    f"Number of images ({len(images)}) does not match number of image tokens ({image_token_count}).\n"
+                    f"Captions: {captions}"
+                )
+            hws = [(image.size[1], image.size[0]) for image in images]
+            # Replace <image> tokens sequentially with corresponding image_str based on hw
+            processed_captions = []
+            image_idx = 0
+            for caption in captions:
+                # Process each caption
+                processed_caption = caption
+                num_image_tokens = processed_caption.count("<image>")
+                # Replace each <image> token in order
+                for _ in range(num_image_tokens):
+                    processed_caption = processed_caption.replace("<image>", self._image_str(hws[image_idx]), 1)
+                    image_idx += 1
+                processed_captions.append(processed_caption)
+            captions = processed_captions
+        return captions, images
+    def _build_captions(
+        self,
+        captions: str | list[str],
+        images: list[Image.Image] | None = None,
+        num_images_per_caption: int = 1,
+        positive_prompt: str | None = None,
+        negative_prompt: str | None = None,
+        cfg: float = 1.0,
+        cfg_img: float = 1.0,
+    ):
+        # 1. repeat captions and images
+        if not isinstance(captions, list):
+            captions = [captions]
+        captions = [caption for caption in captions for _ in range(num_images_per_caption)]
+        if images is not None:
+            images = [image for image in images for _ in range(num_images_per_caption)]
+        # 2. add positive prompt
+        if positive_prompt is not None and positive_prompt != "":
+            captions = [f"{caption} {positive_prompt}" for caption in captions]
+        # 3. add negative prompt
+        if negative_prompt is None:
+            negative_prompt = ""
+        num_samples = len(captions)
+        if cfg != 1.0 and cfg_img != 1.0:  # use both image and text CFG
+            w, h = images[0].size
+            captions = (
+                captions + [self._image_str((h, w)) + negative_prompt] * num_samples
+            )
+            images = images + images
+            captions = captions + [negative_prompt] * num_samples
+        elif cfg != 1.0 and cfg_img == 1.0:  # use text CFG
+            captions = captions + [negative_prompt] * num_samples
+        elif cfg == 1.0 and cfg_img == 1.0:
+            pass
+        return captions, images
+    def _add_prefix_ids(self, hw: tuple[int, int], input_ids: torch.Tensor, attention_mask: torch.Tensor):
+        """Append the image-area prefix and the ``boi`` id to every prompt row.
+
+        The prefix text is the image-area token followed by the latent grid
+        size (``hw`` divided by ``self.down_factor``). It is tokenized once
+        and expanded across the batch dimension of ``input_ids``.
+
+        Returns:
+            ``(input_ids, attention_mask)`` with the prefix columns
+            concatenated on ``dim=1``; the added mask entries for the ``boi``
+            column are ones.
+        """
+        prefix_str = DEFAULT_IMAGE_AREA_TOKEN + hw2str(hw[0] // self.down_factor, hw[1] // self.down_factor)
+        prefix_output = self.tokenizer(
+            prefix_str,
+            truncation=False,
+            add_special_tokens=True,
+            return_tensors="pt"
+        )
+        # Match the device and dtype of the tensors being extended.
+        prefix_input_ids = prefix_output.input_ids.to(input_ids.device, dtype=input_ids.dtype)
+        prefix_attention_mask = prefix_output.attention_mask.to(attention_mask.device, dtype=attention_mask.dtype)
+        # remove bos token
+        # NOTE(review): this drops the first column whenever the tokenizer
+        # defines a bos token; it assumes the tokenizer actually prepended
+        # one with add_special_tokens=True — confirm for this tokenizer.
+        if self.tokenizer.bos_token is not None:
+            prefix_input_ids = prefix_input_ids[:, 1:]
+            prefix_attention_mask = prefix_attention_mask[:, 1:]
+        # add boi token
+        prefix_input_ids = torch.cat(
+            [
+                prefix_input_ids,
+                prefix_input_ids.new_tensor([self.model.config.boi]).unsqueeze(0),
+            ],
+            dim=1,
+        )
+        prefix_attention_mask = torch.cat(
+            [
+                prefix_attention_mask,
+                prefix_attention_mask.new_ones((prefix_attention_mask.shape[0], 1)),
+            ],
+            dim=1,
+        )
+        # Broadcast the single prefix row to the batch size of the prompts.
+        bsz = input_ids.shape[0]
+        input_ids = torch.cat([input_ids, prefix_input_ids.expand(bsz, -1)], dim=1)
+        attention_mask = torch.cat([attention_mask, prefix_attention_mask.expand(bsz, -1)], dim=1)
+        return input_ids, attention_mask
+    @torch.no_grad()
+    def decoding(
+        self,
+        c: torch.Tensor,
+        attention_mask: torch.Tensor,
+        past_key_values: Cache,
+        max_new_len: int,
+        num_images_per_caption: int,
+        use_norm: bool = False,
+        cfg: float = 1.0,
+        cfg_img: float = 1.0,
+        cfg_schedule: Literal["linear", "constant"] = "constant",
+        timesteps_shift: float = 1.0,
+        num_sampling_steps: int = 20,
+        progress: bool = True,
+        hw: tuple[int, int] = (256, 256),
+        step: int = 0,
+    ):
+        """Sample ``max_new_len`` image tokens one at a time.
+
+        Each iteration projects the current hidden state ``c`` with
+        ``image_out_projector``, samples one token from ``image_head``,
+        embeds it with ``image_in_projector`` and runs one cached forward
+        step to obtain the next ``c``.
+
+        Args:
+            c: Hidden state the first token is sampled from.
+            attention_mask: Mask for everything already in the cache; one
+                column of ones is appended per generated token.
+            past_key_values: Cache passed to, and replaced by the result of,
+                every forward step.
+            max_new_len: Number of tokens to generate.
+            num_images_per_caption: Forwarded as ``noise_repeat`` to the head.
+            use_norm: Apply ``layer_norm`` to each sampled token.
+            cfg, cfg_img: Guidance scales handed to the head each step.
+            cfg_schedule: ``"constant"`` uses the scales as given;
+                ``"linear"`` ramps them with the number of generated tokens.
+            timesteps_shift, num_sampling_steps: Forwarded to the head.
+            progress: Wrap the loop in a ``tqdm`` bar.
+            hw: Only read when ``config.use_gen_pos_embed`` is set.
+            step: Immediately overwritten by the loop variable of the same
+                name, so the passed value has no effect.
+
+        Returns:
+            The sampled tokens concatenated along ``dim=1`` (``None`` if
+            ``max_new_len`` is 0).
+
+        Raises:
+            NotImplementedError: For an unknown ``cfg_schedule``.
+        """
+        indices = list(range(max_new_len))
+        indices = tqdm(indices, unit="tokens") if progress else indices
+        tokens = None
+        for step in indices:
+            # cfg schedule follow Muse
+            if cfg_schedule == "linear":
+                tokens_len = 0 if tokens is None else tokens.shape[1]
+                # Ramp from 1 towards the full scale, but never below scale / 2.
+                cfg_iter = max(cfg / 2, 1 + (cfg - 1) * tokens_len / max_new_len)
+                cfg_img_iter = max(cfg_img / 2, 1 + (cfg_img - 1) * tokens_len / max_new_len)
+            elif cfg_schedule == "constant":
+                cfg_iter = cfg
+                cfg_img_iter = cfg_img
+            else:
+                raise NotImplementedError
+            c = self.model.image_out_projector(c)
+            token_sampled = self.model.image_head.sample(
+                c=c.squeeze(1),
+                cfg=cfg_iter,
+                cfg_img=cfg_img_iter,
+                timesteps_shift=timesteps_shift,
+                num_sampling_steps=num_sampling_steps,
+                noise_repeat=num_images_per_caption,
+            )
+            if use_norm:
+                token_sampled = layer_norm(token_sampled, normalized_shape=token_sampled.size()[1:])
+            if tokens is not None:
+                tokens = torch.cat([tokens, token_sampled.unsqueeze(1)], dim=1)
+            else:
+                tokens = token_sampled.unsqueeze(1)
+            # Only the newest token is embedded; earlier ones live in the cache.
+            cur_inputs_embeds = self.model.image_in_projector(tokens[:, -1:])
+            # Replicate the embedding along the batch dimension: twice for
+            # text-only guidance, three times when image guidance is active too.
+            if cfg != 1.0 and cfg_img == 1.0:
+                cur_inputs_embeds = torch.cat([cur_inputs_embeds, cur_inputs_embeds], dim=0)
+            elif cfg != 1.0 and cfg_img != 1.0:
+                cur_inputs_embeds = torch.cat([cur_inputs_embeds, cur_inputs_embeds, cur_inputs_embeds], dim=0)
+            attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+            outputs = self.model.forward_model(
+                inputs_embeds=cur_inputs_embeds,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                use_cache=True,
+            )
+            past_key_values = outputs.past_key_values
+            c = outputs.last_hidden_state[:, -1:]
+            if self.model.config.use_gen_pos_embed:
+                # Slot ``step + 1``: slot 0 was added by the caller after prefill.
+                c = c + self.model.gen_pos_embed_with_ar(hw[0], hw[1])[:, step + 1 : step + 2, :]
+        return tokens
+    @torch.no_grad()
+    def generate_image(
+        self,
+        captions: str | list[str],
+        images: list[Image.Image] | None = None,
+        num_images_per_caption: int = 1,
+        positive_prompt: str | None = None,
+        negative_prompt: str | None = None,
+        hw: tuple[int, int] = (256, 256),
+        use_norm: bool = False,
+        cfg: float = 1.0,
+        cfg_img: float = 1.0,
+        cfg_schedule: Literal["linear", "constant"] = "constant",
+        num_sampling_steps: int = 20,
+        timesteps_shift: float = 1.0,
+        seed: int | None = 42,
+        progress: bool = True,
+    ) -> list[Image.Image]:
+        """Generate images for the given captions (and optional input images).
+
+        Pipeline: seed, validate and expand the prompts, encode any input
+        images with the VAE, tokenize, prefill the language model, sample
+        image tokens with ``decoding``, then unpatchify and VAE-decode them.
+
+        Args:
+            captions: One caption or a list of captions.
+            images: Optional input images matched to ``<image>`` markers in
+                the captions (see ``_check_input``).
+            num_images_per_caption: How many times each caption is repeated.
+            positive_prompt: Text appended to every caption.
+            negative_prompt: Text used for the extra guidance rows.
+            hw: Output size; divided by ``self.down_factor`` to get the number
+                of tokens to generate.
+            use_norm, cfg, cfg_img, cfg_schedule, num_sampling_steps,
+                timesteps_shift, progress: Forwarded to ``decoding``.
+            seed: Passed to ``set_seed``; ``None`` skips seeding.
+
+        Returns:
+            One PIL image per decoded sample.
+        """
+        # 0. set seed
+        if seed is not None:
+            set_seed(seed)
+        # 1. check input
+        captions, images = self._check_input(captions, images)
+        # 2. build captions
+        captions, images = self._build_captions(
+            captions, images, num_images_per_caption, positive_prompt, negative_prompt, cfg, cfg_img
+        )
+        # 3. encode images
+        # `images` must be processed by `process_images` before calling this function
+        latents = None
+        if images is not None:
+            pixel_values = [self.pil2tensor(image) for image in images]
+            # torch.stack needs every image tensor to have the same shape.
+            pixel_values = torch.stack(pixel_values).to(self.device)
+            with compile_manager.compile_disabled():
+                posterior = self.vae.encode(pixel_values.to(self.vae.dtype)).latent_dist
+            # Inverse of the rescaling applied before VAE decoding in step 7.
+            latents = (posterior.sample() - self.shift_factor) * self.scaling_factor
+        # Prepend the bos token text by hand when the tokenizer defines one.
+        captions = [self.tokenizer.bos_token + caption if self.tokenizer.bos_token is not None else caption for caption in captions]
+        # 4. tokenize caption & add prefix ids
+        output = self.tokenizer(
+            captions,
+            padding="longest",
+            truncation=False,
+            add_special_tokens=True,
+            return_tensors="pt",
+            padding_side="left"
+        )
+        input_ids = output.input_ids.to(self.device)
+        attention_mask = output.attention_mask.to(self.device)
+        input_ids, attention_mask = self._add_prefix_ids(hw, input_ids, attention_mask)
+        # 5. LLM prefill
+        max_new_len = (hw[0] // self.down_factor) * (hw[1] // self.down_factor)
+        # The cache must hold the whole prompt plus every generated token.
+        max_cache_len = input_ids.shape[1] + max_new_len
+        past_key_values = StaticCache(
+            config=self.model.config,
+            max_batch_size=input_ids.shape[0],
+            max_cache_len=max_cache_len,
+            device=self.device,
+            dtype=self.dtype,
+        )
+        inputs_embeds = self.model.prepare_inputs_embeds(input_ids, latents)
+        with compile_manager.compile_disabled():
+            outputs = self.model.forward_model(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                use_cache=True,
+            )
+        past_key_values = outputs.past_key_values
+        # Hidden state at the last prompt position seeds the first image token.
+        c = outputs.last_hidden_state[:, -1:]
+        if self.model.config.use_gen_pos_embed:
+            c = c + self.model.gen_pos_embed_with_ar(hw[0], hw[1])[:, 0:1, :]
+        # 6. decoding
+        tokens = self.decoding(
+            c=c,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            max_new_len=max_new_len,
+            num_images_per_caption=num_images_per_caption,
+            use_norm=use_norm,
+            cfg=cfg,
+            cfg_img=cfg_img,
+            cfg_schedule=cfg_schedule,
+            timesteps_shift=timesteps_shift,
+            num_sampling_steps=num_sampling_steps,
+            progress=progress,
+            hw=hw,
+        )
+        # 7. unpatchify
+        latents = self.model.unpatchify(tokens)
+        latents = (latents / self.scaling_factor) + self.shift_factor
+        # 8. decode latents
+        with compile_manager.compile_disabled():
+            sampled_images = self.vae.decode(latents.to(self.vae.dtype)).sample
+        sampled_images = sampled_images.detach().cpu().to(torch.float32)
+        pil_images = [to_pil(img) for img in sampled_images]
+        return pil_images

models/heads.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import math
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from transformers.activations import ACT2FN
+from models.config import LlamaConfig
+from utils.misc import LargeInt
+from utils.model_utils import expand_t, randn_tensor
+from utils.compile_utils import smart_compile
+class LlamaMLP(nn.Module):
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+def modulate(x, shift, scale=None):
+    if shift is None:
+        return x * (1 + scale)
+    return x * (1 + scale) + shift
+class ResBlock(nn.Module):
+    def __init__(self, channels, mlp_ratio=1.0):
+        super().__init__()
+        self.channels = channels
+        self.intermediate_size = int(channels * mlp_ratio)
+        self.in_ln = nn.LayerNorm(self.channels, eps=1e-6)
+        self.mlp = nn.Sequential(
+            nn.Linear(self.channels, self.intermediate_size),
+            nn.SiLU(),
+            nn.Linear(self.intermediate_size, self.channels),
+        )
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(channels, 3 * channels, bias=True))
+    def forward(self, x, y):
+        shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(y).chunk(3, dim=-1)
+        h = modulate(self.in_ln(x), shift_mlp, scale_mlp)
+        h = self.mlp(h)
+        return x + gate_mlp * h
+class FinalLayer(nn.Module):
+    def __init__(self, model_channels, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(model_channels, out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(model_channels, 2 * model_channels, bias=True))
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t: torch.Tensor, dim: int, max_period: float = 10000.0):
+        """
+        Create sinusoidal timestep embeddings.
+        :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional.
+        :param dim: the dimension of the output.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
+            device=t.device
+        )
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype))
+        return t_emb
+class SimpleMLPAdaLN(nn.Module):
+    def __init__(self, input_dim, cond_dim, dim=1536, layers=12, mlp_ratio=1.0):
+        super().__init__()
+        self.input_dim = input_dim
+        self.cond_dim = cond_dim
+        self.dim = dim
+        self.layers = layers
+        self.mlp_ratio = mlp_ratio
+        self.time_embed = TimestepEmbedder(dim)
+        self.cond_embed = nn.Linear(cond_dim, dim)
+        self.input_proj = nn.Linear(input_dim, dim)
+        res_blocks = []
+        for _ in range(layers):
+            res_blocks.append(ResBlock(dim, mlp_ratio))
+        self.res_blocks = nn.ModuleList(res_blocks)
+        self.final_layer = FinalLayer(dim, input_dim)
+        self.grad_checkpointing = False
+        self.initialize_weights()
+    def initialize_weights(self):
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize timestep embedding MLP
+        nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers
+        for block in self.res_blocks:
+            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers
+        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
+        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+    @smart_compile()
+    def forward(self, x, t, c):
+        """
+        x.shape = (bsz, input_dim)
+        t.shape = (bsz,)
+        c.shape = (bsz, cond_dim)
+        """
+        x = self.input_proj(x)
+        t = self.time_embed(t)
+        c = self.cond_embed(c)
+        y = t + c
+        for block in self.res_blocks:
+            if self.grad_checkpointing and self.training:
+                x = checkpoint(block, x, y, use_reentrant=True)
+            else:
+                x = block(x, y)
+        return self.final_layer(x, y)
+class FlowMatchingHead(nn.Module):
+    def __init__(self, input_dim, cond_dim, dim=1536, layers=12, mlp_ratio=1.0):
+        super(FlowMatchingHead, self).__init__()
+        self.input_dim = input_dim
+        self.net = SimpleMLPAdaLN(input_dim=input_dim, cond_dim=cond_dim, dim=dim, layers=layers, mlp_ratio=mlp_ratio)
+    @property
+    def dtype(self):
+        return self.net.input_proj.weight.dtype
+    @property
+    def device(self):
+        return self.net.input_proj.weight.device
+    @property
+    def trainable_params(self) -> float:
+        n_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        return LargeInt(n_params)
+    def get_score_from_velocity(self, velocity, x, t):
+        """Wrapper function: transfrom velocity prediction model to score
+        Args:
+            velocity: [bsz, ...] shaped tensor; velocity model output
+            x:        [bsz, ...] shaped tensor; x_t data point
+            t:        [bsz,] time tensor
+        """
+        t = expand_t(t, x)
+        alpha_t, d_alpha_t = t, 1
+        sigma_t, d_sigma_t = 1 - t, -1
+        mean = x
+        reverse_alpha_ratio = alpha_t / d_alpha_t
+        var = sigma_t**2 - reverse_alpha_ratio * d_sigma_t * sigma_t
+        score = (reverse_alpha_ratio * velocity - mean) / var
+        return score
+    def get_velocity_from_cfg(self, velocity, cfg, cfg_img, cfg_mult):
+        if cfg_mult == 2:
+            cond_v, uncond_v = torch.chunk(velocity, 2, dim=0)
+            velocity = uncond_v + cfg * (cond_v - uncond_v)
+        elif cfg_mult == 3:
+            cond_v, uncond_v1, uncond_v2 = torch.chunk(velocity, 3, dim=0)
+            velocity = uncond_v2 + cfg_img * (uncond_v1 - uncond_v2) + cfg * (cond_v - uncond_v1)
+        return velocity
+    @smart_compile(options={"triton.cudagraphs": True}, fullgraph=True)
+    @torch.no_grad()
+    def sample(
+        self,
+        c: torch.Tensor,
+        cfg: float = 1.0,
+        cfg_img: float = 1.0,
+        timesteps_shift: float = 1.0,
+        num_sampling_steps: int = 20,
+        last_step_size: float = 0.0,
+        noise_repeat: int = 1,
+    ):
+        # """c.shape = (bsz, cond_dim)"""
+        cfg_mult = 1
+        if cfg > 1.0:
+            cfg_mult += 1
+        if cfg_img > 1.0:
+            cfg_mult += 1
+        noise = randn_tensor((c.shape[0] // cfg_mult, self.input_dim), noise_repeat, self.device)
+        mean_x = noise
+        x = noise
+        xs = []
+        t0, t1 = 0, 1
+        timesteps = torch.linspace(t0, t1, num_sampling_steps + 1, device=c.device)[:-1]
+        timesteps = timesteps / (timesteps_shift - (timesteps_shift - 1) * timesteps)
+        timesteps = torch.cat([timesteps, torch.ones(1, device=c.device)])
+        for ti, tj in zip(timesteps[:-1], timesteps[1:]):
+            dt = tj - ti
+            combined = torch.cat([x] * cfg_mult, dim=0)
+            velocity = self.net(combined.to(c.dtype), ti.expand(c.shape[0]).to(c), c)
+            velocity = velocity.to(torch.float32)
+            velocity = self.get_velocity_from_cfg(velocity, cfg, cfg_img, cfg_mult)
+            score = self.get_score_from_velocity(velocity, x, ti.expand(x.shape[0]).to(x))
+            drift = velocity + (1 - expand_t(ti.expand(x.shape[0]).to(x), x)) * score
+            w_cur = randn_tensor((c.shape[0] // cfg_mult, self.input_dim), noise_repeat, self.device)
+            dw = w_cur * torch.sqrt(dt)
+            mean_x = x + drift * dt
+            x = mean_x + torch.sqrt(2 * (1 - expand_t(ti.expand(x.shape[0]).to(x), x))) * dw
+            xs.append(x)
+        if len(xs) != num_sampling_steps:
+            raise ValueError(f"Samples ({len(xs)}) does not match the number of steps ({num_sampling_steps})")
+        return xs[-1].to(c.dtype)

models/llama_model.py ADDED Viewed

	@@ -0,0 +1,568 @@

+from typing import Optional, Tuple
+from loguru import logger
+import math
+import torch
+import torch.nn as nn
+from transformers.cache_utils import Cache, StaticCache
+from transformers.modeling_flash_attention_utils import _flash_attention_forward
+from transformers.utils import is_flash_attn_greater_or_equal_2_10
+from transformers import ROPE_INIT_FUNCTIONS
+from transformers.models.llama.configuration_llama import LlamaConfig
+from models.heads import LlamaMLP
+from utils.model_utils import apply_rotary_pos_emb, repeat_kv
+from models.config import NextStepConfig
+class LlamaRMSNorm(nn.Module):
+    """LlamaRMSNorm is equivalent to T5LayerNorm"""
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class LlamaRotaryEmbedding(nn.Module):
+    def __init__(self, device=None, config: Optional[LlamaConfig] = None):
+        super().__init__()
+        self.rope_type = "default"
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        # Core RoPE block
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class LlamaAttention(nn.Module):
+    def __init__(self, config: NextStepConfig, layer_idx: Optional[int]):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(
+            self.num_heads * self.head_dim, self.hidden_size, bias=getattr(config, "o_attention_bias", config.attention_bias)
+        )
+        self._flash_attn_uses_top_left_mask = False
+    def forward_sdpa(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = True if causal_mask is None and q_len > 1 else False
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None, past_key_value
+    def forward_flash(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if isinstance(past_key_value, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at GitHub - huggingface/transformers: 🤗 Transformers: the model-definition framework for state-of-the-a"
+            )
+        output_attentions = False
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        dropout_rate = self.attention_dropout if self.training else 0.0
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class LlamaFlashAttention2(LlamaAttention):
+    """
+    Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if isinstance(past_key_value, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+        output_attentions = False
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        dropout_rate = self.attention_dropout if self.training else 0.0
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            position_ids=None,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class LlamaSdpaAttention(LlamaAttention):
+    """
+    Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+    # Adapted from LlamaAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = True if causal_mask is None and q_len > 1 else False
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None, past_key_value
+# Registry of the available attention modules, keyed by the implementation name.
+# `LlamaDecoderLayer.__init__` indexes it with `config._attn_implementation`.
+LLAMA_ATTENTION_CLASSES = {
+    "eager": LlamaAttention,
+    "flash_attention_2": LlamaFlashAttention2,
+    "sdpa": LlamaSdpaAttention,
+}
+class LlamaDecoderLayer(nn.Module):
+    """Single decoder block: normalised self-attention and a normalised MLP, each added back to its input."""
+    def __init__(self, config: LlamaConfig, layer_idx: int):
+        """Create the attention module, the MLP and the two RMSNorm layers for layer `layer_idx`."""
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # The attention class is picked by the implementation name stored on the config.
+        attention_cls = LLAMA_ATTENTION_CLASSES[config._attn_implementation]
+        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)
+        self.mlp = LlamaMLP(config)
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Run the block on `hidden_states`.
+        Returns a tuple whose first element is the updated hidden states. The attention weights are
+        appended when `output_attentions` is truthy, and the key/value cache returned by the attention
+        module is appended when `use_cache` is truthy. Extra `**kwargs` are forwarded to the attention module.
+        """
+        # Attention sub-block: normalise, attend, then add the un-normalised input back.
+        attn_output, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_output
+        # Feed-forward sub-block with the same normalise / transform / add-back shape.
+        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
+        hidden_states = hidden_states + mlp_output
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs = outputs + (self_attn_weights,)
+        if use_cache:
+            outputs = outputs + (present_key_value,)
+        return outputs

models/nextstep_model.py ADDED Viewed

	@@ -0,0 +1,553 @@

+import os
+import json
+import inspect
+from loguru import logger
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+from safetensors.torch import safe_open
+from transformers.modeling_utils import PreTrainedModel
+from transformers.cache_utils import Cache, DynamicCache, StaticCache
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from models.config import NextStepConfig
+from models.llama_model import LlamaDecoderLayer, LlamaRMSNorm, LlamaRotaryEmbedding
+from models.heads import FlowMatchingHead
+from utils.misc import LargeInt
+from utils.compile_utils import smart_compile
+from utils.model_utils import get_2d_sincos_pos_embed
+@dataclass
+class NextStepOutputWithPast(CausalLMOutputWithPast):
+    """`CausalLMOutputWithPast` extended with two optional, separately reported loss tensors."""
+    # NOTE(review): presumably the text / language-modeling loss term -- confirm against the training code.
+    lm_loss: torch.FloatTensor | None = None
+    # NOTE(review): presumably the image-generation loss term -- confirm against the training code.
+    im_loss: torch.FloatTensor | None = None
+class NextStepPreTrainedModel(PreTrainedModel):
+    """Base class that plugs NextStep into `PreTrainedModel`: config class, capability flags and weight init."""
+    config_class = NextStepConfig
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    def _init_weights(self, module):
+        """
+        Initialise `module` in place when it is an `nn.Linear` or an `nn.Embedding`.
+        Weights are drawn from N(0, config.initializer_range); linear biases and the embedding
+        padding row (when one is configured) are zeroed. Other module types are left untouched.
+        """
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+        if isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            pad_row = module.padding_idx
+            if pad_row is not None:
+                module.weight.data[pad_row].zero_()
+    @property
+    def trainable_params(self) -> float:
+        """
+        Total element count of all parameters with `requires_grad=True`, wrapped in `LargeInt`.
+        NOTE(review): the annotation says `float` but the returned object is a `LargeInt`.
+        """
+        return LargeInt(sum(param.numel() for param in self.parameters() if param.requires_grad))
+class NextStep(NextStepPreTrainedModel):
+    def __init__(self, config: NextStepConfig):
+        """Build the token embedding, decoder stack, text head and the image projection / flow-matching head modules."""
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = LlamaRotaryEmbedding(config=config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+        # Everything below is created after `post_init()`; the two projectors are therefore initialised by hand here.
+        # Width of one flattened latent patch: channels * patch_size**2 (the last dimension produced by `patchify`).
+        token_dim = self.config.latent_channels * self.config.latent_patch_size**2
+        # Maps latent patches into the hidden size used by the decoder (see `prepare_inputs_embeds`).
+        self.image_in_projector = nn.Linear(token_dim, config.hidden_size)
+        self.image_in_projector.weight.data.normal_(mean=0.0, std=config.initializer_range)
+        self.image_in_projector.bias.data.zero_()
+        # Hidden-size to hidden-size projection; its use is outside this file section.
+        self.image_out_projector = nn.Linear(config.hidden_size, config.hidden_size)
+        self.image_out_projector.weight.data.normal_(mean=0.0, std=config.initializer_range)
+        self.image_out_projector.bias.data.zero_()
+        self.image_head = FlowMatchingHead(
+            input_dim=token_dim,
+            cond_dim=config.hidden_size,
+            dim=config.fm_head_dim,
+            layers=config.fm_head_layers,
+        )
+        # The positional-embedding buffer is only registered when the config asks for it.
+        if config.use_gen_pos_embed:
+            self.init_gen_pos_embed()
+    def init_gen_pos_embed(self):
+        """
+        Register the `gen_pos_embed` buffer.
+        The table comes from `get_2d_sincos_pos_embed(hidden_size, base_image_grid_size)`, is converted
+        to a float32 tensor and gets a leading axis of size 1.
+        """
+        table = get_2d_sincos_pos_embed(self.config.hidden_size, self.config.base_image_grid_size)
+        pos_embed = torch.from_numpy(table).float().unsqueeze(0)
+        self.register_buffer("gen_pos_embed", pos_embed)
+    def gen_pos_embed_with_ar(self, h, w):
+        """
+        Return the positional embeddings for the top-left `h` x `w` window of the square base grid.
+        The stored buffer is viewed as (bsz, side, side, dim) with side = int(sqrt(num_positions)),
+        cropped to the first `h` rows and `w` columns, and flattened back to (bsz, rows * cols, dim).
+        """
+        bsz, num_positions, dim = self.gen_pos_embed.shape
+        side = int(num_positions**0.5)
+        grid = self.gen_pos_embed.reshape(bsz, side, side, dim)
+        window = grid[:, :h, :w, :]
+        return window.reshape(bsz, -1, dim)
+    @property
+    def image_size(self):
+        """Shortcut for `config.image_size`."""
+        return self.config.image_size
+    @property
+    def image_patch_size(self):
+        """Shortcut for `config.patch_size`."""
+        return self.config.patch_size
+    @property
+    def image_grid_size(self):
+        """`image_size / image_patch_size`, passed through the built-in `round()`."""
+        return round(self.image_size / self.image_patch_size)
+    def get_input_embeddings(self):
+        """Return the token embedding module (`self.embed_tokens`)."""
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module with `value`."""
+        self.embed_tokens = value
+    def get_output_embeddings(self):
+        """Return the text output head (`self.lm_head`)."""
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the text output head with `new_embeddings`."""
+        self.lm_head = new_embeddings
+    def load_lm_head(self, lm_head_dir: str | None = None):
+        """
+        Copy the LM-head weight from a safetensors checkpoint directory into `self.lm_head`.
+        The tensor read is `lm_head.weight`, or `model.embed_tokens.weight` when
+        `config.tie_word_embeddings` is set. If `model.safetensors.index.json` exists in the
+        directory, its `weight_map` selects the shard file; otherwise `model.safetensors` is read.
+        Args:
+            lm_head_dir: Directory containing the safetensors checkpoint. Required despite the
+                `None` default, which is kept only for signature compatibility.
+        Raises:
+            ValueError: If `lm_head_dir` is None.
+            KeyError: If the index file has no entry for the head weight.
+        """
+        if lm_head_dir is None:
+            # Previously this fell through to `os.path.join(None, ...)` and failed with an opaque TypeError.
+            raise ValueError("`lm_head_dir` must point to a checkpoint directory, got None.")
+        index_json_file = os.path.join(lm_head_dir, "model.safetensors.index.json")
+        head_weight_name = "lm_head.weight" if not self.config.tie_word_embeddings else "model.embed_tokens.weight"
+        if os.path.exists(index_json_file):
+            with open(index_json_file, "r", encoding="utf-8") as f:
+                index = json.load(f)
+            model_name = index["weight_map"][head_weight_name]
+        else:
+            model_name = "model.safetensors"
+        with safe_open(os.path.join(lm_head_dir, model_name), framework="pt") as f:
+            loaded_weight = f.get_tensor(head_weight_name)
+            # Match the destination parameter's dtype and device before the in-place copy.
+            loaded_weight = loaded_weight.to(dtype=self.lm_head.weight.dtype, device=self.lm_head.weight.device)
+            self.lm_head.weight.data.copy_(loaded_weight)
+    def patchify(self, img: torch.Tensor):
+        """
+        Cut a latent map into non-overlapping square patches and flatten each patch into one token.
+        img: (bsz, C, H, W)
+        returns: (bsz, (H // p) * (W // p), C * p**2) with p = config.latent_patch_size
+        """
+        batch, channels, height, width = img.shape
+        patch = self.config.latent_patch_size
+        rows, cols = height // patch, width // patch
+        # Expose the intra-patch axes, then move them (together with channels) behind the patch-grid axes.
+        grid = img.reshape(batch, channels, rows, patch, cols, patch)
+        grid = torch.einsum("nchpwq->nhwcpq", grid)
+        return grid.reshape(batch, rows * cols, channels * patch**2)
+    def unpatchify(self, x: torch.Tensor, h: int | None = None, w: int | None = None):
+        """
+        Inverse of `patchify`: fold a token sequence back into a latent map.
+        x: (bsz, h * w, patch_size**2 * C)
+        img: (bsz, C, h * patch_size, w * patch_size)
+        Args:
+            x: Token sequence to fold.
+            h: Number of patch rows. Inferred when omitted.
+            w: Number of patch columns. Inferred when omitted.
+        When both `h` and `w` are omitted a square grid is assumed. When only one is given the
+        other is derived from the sequence length (previously this case failed with a TypeError).
+        Raises:
+            AssertionError: If `h * w` does not equal the sequence length.
+        """
+        bsz, seq_len = x.shape[0], x.shape[1]
+        p = self.config.latent_patch_size
+        c = self.config.latent_channels
+        if h is None and w is None:
+            h_ = w_ = int(seq_len**0.5)
+        elif h is None:
+            # `max(..., 1)` keeps a zero width from raising ZeroDivisionError; the check below rejects it instead.
+            w_ = w
+            h_ = seq_len // max(w_, 1)
+        elif w is None:
+            h_ = h
+            w_ = seq_len // max(h_, 1)
+        else:
+            h_, w_ = h, w
+        assert h_ * w_ == seq_len, f"Invalid sequence length {seq_len}."
+        x = x.reshape(bsz, h_, w_, c, p, p)
+        # Interleave the grid axes with the intra-patch axes so the final reshape yields full rows and columns.
+        x = torch.einsum("nhwcpq->nchpwq", x)
+        img = x.reshape(bsz, c, h_ * p, w_ * p)
+        return img
+    def prepare_inputs_embeds(self, input_ids: torch.LongTensor | None = None, latents: torch.FloatTensor | None = None):
+        if latents is None:
+            if not self.training:
+                return self.embed_tokens(input_ids)
+            else:  # dummy forward for image pass, for the consistent shape of gradient.
+                raise NotImplementedError("Dummy forward for image pass is not implemented.")
+        else:
+            bs, seq_length = input_ids.shape
+            inputs_embeds = torch.zeros(
+                (bs, seq_length, self.config.hidden_size),
+                device=self.embed_tokens.weight.device,
+                dtype=self.embed_tokens.weight.dtype,
+            )
+            im_indices = input_ids == self.config.image_placeholder_id
+            lm_indices = ~im_indices
+            if isinstance(latents, list):
+                tokens = torch.cat([self.patchify(latent) for latent in latents], dim=1)
+            else:
+                tokens = self.patchify(latents)
+                # tokens = tokens.reshape(1, -1, tokens.shape[-1])
+            image_embeds = self.image_in_projector(tokens)
+            image_embeds = image_embeds.view(-1, self.config.hidden_size)
+            token_embeds = self.embed_tokens(input_ids[lm_indices])
+            inputs_embeds[im_indices] = image_embeds.to(inputs_embeds.dtype)
+            inputs_embeds[lm_indices] = token_embeds
+            return inputs_embeds
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        """
+        Decide which attention mask (if any) is handed to the decoder layers.
+        Returns:
+            - flash_attention_2: `attention_mask` unchanged when it contains at least one zero, otherwise None.
+            - sdpa without a static cache and without `output_attentions`: None when
+              `AttentionMaskConverter._ignore_causal_mask_sdpa` reports the mask can be dropped.
+            - otherwise: the 4D mask built by `_prepare_4d_causal_attention_mask_with_cache_position`.
+        """
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+        dtype, device = input_tensor.dtype, input_tensor.device
+        sequence_length = input_tensor.shape[1]
+        # Key/value length of the mask: the full static-cache size, the width of the provided mask,
+        # or (with no mask tensor) the tokens seen so far plus the current chunk plus one.
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens + sequence_length + 1
+            )
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+        return causal_mask
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to place the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+            **kwargs:
+                Accepted and ignored, so callers may pass extra keyword arguments.
+        Returns:
+            `torch.Tensor`: the 4D mask; masked positions hold `torch.finfo(dtype).min`, attendable positions hold 0.
+            A 4D `attention_mask` is returned as is.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            # Keep the masking value only at key positions strictly after each query's cache position; zero elsewhere.
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                # The sum is exactly 0 where the causal mask allows attention (0) but the 2D mask marks padding (0);
+                # those positions are masked out as well.
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
+        return causal_mask
+    @smart_compile()
+    def forward_model(
+        self,
+        inputs_embeds: torch.FloatTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | list[torch.FloatTensor] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
+    ) -> tuple | BaseModelOutputWithPast:
+        """
+        Run the decoder stack and the final norm on already-embedded inputs.
+        Args:
+            inputs_embeds: Input embeddings; dimension 1 is used as the sequence length.
+            attention_mask: Optional mask, converted by `_update_causal_mask` before reaching the layers.
+            past_key_values: Optional cache; a `DynamicCache` is created when `use_cache` is set and none is given.
+            use_cache / output_attentions / output_hidden_states: Fall back to the config values when None.
+            cache_position: Positions of the current tokens; derived from the cache length when None.
+        Returns:
+            A `BaseModelOutputWithPast` (this method always returns the dataclass form). `past_key_values`
+            is None unless `use_cache` is set; `hidden_states` and `attentions` are None unless requested.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        # Caching is switched off whenever gradient checkpointing is active during training.
+        if self.gradient_checkpointing and self.training and use_cache:
+            use_cache = False
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache()
+        if cache_position is None:
+            # Continue counting from the number of tokens already stored in the cache.
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        # Position ids are the cache positions with a leading axis of size 1.
+        position_ids = cache_position.unsqueeze(0)
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                # Arguments are positional here and follow the parameter order of `LlamaDecoderLayer.forward`.
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    causal_mask,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                    position_embeddings,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Cache | None = None,
+        attention_mask: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        cache_position: torch.LongTensor | None = None,
+        **kwargs,
+    ):
+        """
+        Prepare the model inputs for generation. In includes operations like computing the 4D attention mask or
+        slicing inputs given the existing cache.
+        See the forward pass in the model documentation for expected arguments (different models might have different
+        requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
+        """
+        # 1. Handle BC:
+        model_inputs = {}
+        # - some models don't have `Cache` support (which implies they don't expect `cache_position` in `forward`)
+        if self._supports_cache_class:
+            model_inputs["cache_position"] = cache_position
+        # - `cache_position` was not a mandatory input in `prepare_inputs_for_generation` for those models, and this
+        #   function may be called outside of `generate`. Handle most use cases by creating `cache_position` on the fly
+        #   (this alternative is not as robust as calling `generate` and letting it create `cache_position`)
+        elif cache_position is None:
+            past_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+            cache_position = torch.arange(past_length, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        # 2. Generic cache-dependent input preparation
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case
+        if past_key_values is not None:
+            model_inputs["past_key_values"] = past_key_values
+            if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:  # Exception 1 or Exception 3
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        # 3. Prepare base model inputs
+        input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if not self.config.is_encoder_decoder:
+            if inputs_embeds is not None and cache_position[0] == 0:
+                model_inputs[input_ids_key] = None
+                model_inputs["inputs_embeds"] = inputs_embeds
+            else:
+                # `clone` calls in this function ensure a consistent stride. See #32227
+                model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)
+                model_inputs["inputs_embeds"] = None
+        else:
+            model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)
+        # 4. Create missing `position_ids` on the fly
+        if (
+            attention_mask is not None
+            and kwargs.get("position_ids") is None
+            and "position_ids" in set(inspect.signature(self.forward).parameters.keys())
+        ):
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            kwargs["position_ids"] = position_ids  # placed in kwargs for further processing (see below)
+        # 5. Slice model inputs if it's an input that should have the same length as `input_ids`
+        for model_input_name in ["position_ids", "token_type_ids"]:
+            model_input = kwargs.get(model_input_name)
+            if model_input is not None:
+                if past_key_values:
+                    model_input = model_input[:, -input_ids.shape[1] :]
+                    model_input = model_input.clone(memory_format=torch.contiguous_format)
+                model_inputs[model_input_name] = model_input
+        # 6. Create 4D attention mask is we are using a `StaticCache` (important for performant compiled forward pass)
+        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
+            else:
+                batch_size, sequence_length = model_inputs[input_ids_key].shape
+                device = model_inputs[input_ids_key].device
+            # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create
+            # the 4D causal mask exists, it should be present in the base model (XXXModel class).
+            base_model = getattr(self, self.base_model_prefix, None)
+            if base_model is None:
+                causal_mask_creation_function = getattr(self, "_prepare_4d_causal_attention_mask_with_cache_position", None)
+            else:
+                causal_mask_creation_function = getattr(
+                    base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None
+                )
+            if causal_mask_creation_function is None:
+                logger.warning_once(
+                    f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` method "
+                    "defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're "
+                    "writing code, see Llama for an example implementation. If you're a user, please report this "
+                    "issue on GitHub."
+                )
+            else:
+                attention_mask = causal_mask_creation_function(
+                    attention_mask,
+                    sequence_length=sequence_length,
+                    target_length=past_key_values.get_max_cache_shape(),
+                    dtype=self.dtype,
+                    device=device,
+                    cache_position=cache_position,
+                    batch_size=batch_size,
+                    config=self.config,
+                    past_key_values=past_key_values,
+                )
+        if attention_mask is not None:
+            model_inputs["attention_mask"] = attention_mask
+        # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+        # 8. Remove unexpected `generate` inputs (TODO @joao: fix trainer and examples)
+        model_inputs.pop("labels", None)
+        return model_inputs
+    @torch.no_grad()
+    def generate(self, inputs: torch.LongTensor = None, **kwargs):
+        """
+        Build the mixed text/image input embeddings, then delegate to the parent class's `generate`.
+        `input_ids` must be passed as a keyword argument: it is popped from `kwargs`, so a missing key
+        raises KeyError. `latents` is optional and is consumed here rather than forwarded.
+        """
+        input_ids = kwargs.pop("input_ids")
+        latents = kwargs.pop("latents", None)
+        # Embeddings are computed up front so image placeholder positions carry projected latents.
+        inputs_embeds = self.prepare_inputs_embeds(input_ids, latents)
+        return super().generate(inputs=inputs, input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)
+    def gradient_checkpointing_enable(self, **kwargs):
+        """Enable gradient checkpointing via the parent class, and also set the flag on the image head's network."""
+        super().gradient_checkpointing_enable(**kwargs)
+        # The image head's network carries its own `grad_checkpointing` flag, which is set here explicitly.
+        self.image_head.net.grad_checkpointing = True

quantization_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bits": 4,
+  "data_type": "int",
+  "group_size": 128,
+  "sym": true,
+  "batch_size": 1,
+  "gradient_accumulate_steps": 8,
+  "autoround_version": "0.13.0",
+  "block_name_to_quantize": "layers,image_head.net.res_blocks",
+  "quant_method": "auto-round",
+  "packing_format": "auto_round:auto_gptq"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "additional_special_tokens": [
+    "<|image_area|>",
+    "<|begin_of_image|>",
+    "<|end_of_image|>",
+    "<|image_placeholder|>",
+    "<|begin_of_prompt_refinement|>",
+    "<|end_of_prompt_refinement|>",
+    "<|begin_of_thinking|>",
+    "<|end_of_thinking|>",
+    "<|beginoftext|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:310b48c809fba04c32e7f7cdac4d0fb1c00140d8914e0b0163307f64e5330a92
+size 11423853

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,284 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|image_area|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|begin_of_image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|end_of_image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|image_placeholder|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|begin_of_prompt_refinement|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|end_of_prompt_refinement|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<|begin_of_thinking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<|end_of_thinking|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<|beginoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|image_area|>",
+    "<|begin_of_image|>",
+    "<|end_of_image|>",
+    "<|image_placeholder|>",
+    "<|begin_of_prompt_refinement|>",
+    "<|end_of_prompt_refinement|>",
+    "<|begin_of_thinking|>",
+    "<|end_of_thinking|>",
+    "<|beginoftext|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "[PAD]",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

utils/__pycache__/compile_utils.cpython-312.pyc ADDED Viewed

Binary file (3.72 kB). View file

utils/__pycache__/image_utils.cpython-312.pyc ADDED Viewed

Binary file (16.6 kB). View file

utils/__pycache__/misc.cpython-312.pyc ADDED Viewed

Binary file (3.1 kB). View file

utils/__pycache__/model_utils.cpython-312.pyc ADDED Viewed

Binary file (7.4 kB). View file

utils/aspect_ratio.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import numpy as np
+import PIL.Image
+# Sentinel (height, width) pair; presumably stands for "no aspect-ratio constraint" — TODO confirm against callers.
+ANY_ASPECT_RATIO = (0, 0)
+# Supported aspect-ratio buckets as (height, width) pairs, ordered from the widest (8:32) to the tallest (32:8).
+# The trailing comment on each entry is the product height * width; all are close to 16 * 16 = 256.
+HW_ASPECT_RATIOS = [
+    (8, 32),  # 256
+    (9, 28),  # 252
+    (10, 25),  # 250
+    (11, 23),  # 253
+    (12, 21),  # 252
+    (13, 19),  # 247
+    (14, 18),  # 252
+    (15, 17),  # 255
+    (16, 16),  # 256
+    (17, 15),  # 255
+    (18, 14),  # 252
+    (19, 13),  # 247
+    (21, 12),  # 252
+    (23, 11),  # 253
+    (25, 10),  # 250
+    (28, 9),  # 252
+    (32, 8),  # 256
+]
+def get_ar_base(ars: list[tuple[int, int]] = HW_ASPECT_RATIOS):
+    """
+    Return the base side length shared by a list of aspect ratios.
+    For every (height, width) pair the rounded square root of height * width is taken;
+    the result is the rounded mean of those values.
+    """
+    rounded_sides = []
+    for height, width in ars:
+        rounded_sides.append(round(np.sqrt(height * width)))
+    return round(np.mean(rounded_sides))
+def ar2str(h: int, w: int) -> str:
+    """Format an aspect ratio as the string "<h>*<w>"."""
+    return "{}*{}".format(h, w)
+def str2ar(s: str) -> tuple[int, int]:
+    """Parse a "<h>*<w>" string (as produced by `ar2str`) back into a tuple of ints."""
+    return tuple(int(part) for part in s.split("*"))
+def center_crop_arr_with_buckets(pil_image, ars: list[tuple[int, int]] = HW_ASPECT_RATIOS, crop=True, buckets: list[int] | tuple[int, ...] = (256, 512, 768, 1024)):
+    """
+    Pick a resolution bucket from the image area, then fit the image to the closest aspect ratio.
+    The largest bucket `b` with `width * height >= b * b` is selected; an image smaller than
+    every bucket falls back to the smallest bucket.
+    Args:
+        pil_image: PIL Image to be processed
+        ars: List of aspect ratios as (height, width) tuples
+        crop: Forwarded to `center_crop_arr_with_ar` (center crop when True, plain resize when False)
+        buckets: Candidate base sizes, in any order
+    Returns:
+        The result of `center_crop_arr_with_ar` for the selected bucket size
+    Raises:
+        ValueError: If `buckets` is empty
+    """
+    if not buckets:
+        raise ValueError("`buckets` must contain at least one size")
+    width, height = pil_image.size
+    area = width * height
+    # Scan from the largest bucket down; the smallest bucket is the fallback
+    descending = sorted(buckets, reverse=True)
+    image_size = next((bucket for bucket in descending if area >= bucket * bucket), descending[-1])
+    return center_crop_arr_with_ar(pil_image, image_size, ars, crop)
+def center_crop_arr_with_ar(pil_image, image_size: int, ars: list[tuple[int, int]] = HW_ASPECT_RATIOS, crop=True):
+    """
+    Fit an image to the aspect ratio in `ars` that is closest to its own height / width ratio.
+    Args:
+        pil_image: PIL Image to be processed
+        image_size: Base size; must be divisible by `get_ar_base(ars)`
+        ars: List of aspect ratios as (height, width) tuples
+        crop: When True, resize so the image covers the target and center crop it;
+            when False, resize directly to the target size without cropping
+    Returns:
+        PIL Image with the dimensions of the closest aspect ratio, scaled by image_size / base
+    """
+    ar_base = get_ar_base(ars)
+    assert image_size % ar_base == 0, f"image_size must be divisible by {ar_base}"
+    src_w, src_h = pil_image.size
+    src_ratio = src_h / src_w
+    # Pick the bucket whose height / width ratio is nearest to the image's ratio
+    ratio_gaps = [abs(src_ratio - (h / w)) for h, w in ars]
+    target_h, target_w = ars[np.argmin(ratio_gaps)]
+    if not crop:
+        unit = image_size // ar_base
+        return pil_image.resize((round(target_w * unit), round(target_h * unit)), resample=PIL.Image.LANCZOS)
+    target_h = round(image_size / ar_base * target_h)
+    target_w = round(image_size / ar_base * target_w)
+    # Scale (keeping the aspect ratio) so that both sides reach at least the target size
+    scale = max(target_h / src_h, target_w / src_w)
+    scaled_h = round(src_h * scale)
+    scaled_w = round(src_w * scale)
+    pixels = np.array(pil_image.resize((scaled_w, scaled_h), resample=PIL.Image.LANCZOS))
+    # Cut the centered target_h x target_w window out of the resized image
+    top = (scaled_h - target_h) // 2
+    left = (scaled_w - target_w) // 2
+    return PIL.Image.fromarray(pixels[top : top + target_h, left : left + target_w])

utils/compile_utils.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import contextlib
+import functools
+import os
+from typing import Callable, Dict, Optional
+import torch
+from loguru import logger
+"""
+Usage:
+1. Control through environment variable (at startup):
+    export ENABLE_TORCH_COMPILE=true
+    python your_script.py
+2. Control through environment variable (disable):
+    export ENABLE_TORCH_COMPILE=false  # or leave it unset
+    python your_script.py
+3. Dynamically control in code:
+    compile_manager.set_compile_enabled(True)   # enable
+    compile_manager.set_compile_enabled(False)  # disable
+4. Select version at runtime:
+    # use the version configured
+    result = my_function(args)
+    # force use the original version
+    result = my_function.original(args)
+    # force use the compiled version
+    result = my_function.compiled(args)
+"""
+# Global configuration: torch.compile is enabled only when the ENABLE_TORCH_COMPILE
+# environment variable equals "true" (case-insensitive); it is treated as "false" when unset.
+ENABLE_TORCH_COMPILE = os.getenv("ENABLE_TORCH_COMPILE", "false").lower() == "true"
+class CompileManager:
+    """Holds the global on/off switch for torch.compile and registries of decorated functions."""
+    def __init__(self):
+        # Registries keyed by "<module>.<qualname>"
+        self.original_functions: Dict[str, Callable] = {}
+        self.compiled_functions: Dict[str, Callable] = {}
+        # Initial state comes from the environment-derived module constant
+        self.compile_enabled = ENABLE_TORCH_COMPILE
+    def set_compile_enabled(self, enabled: bool):
+        """Turn compile on or off at runtime."""
+        self.compile_enabled = enabled
+    def get_compile_status(self):
+        """Return whether compile is currently enabled."""
+        return self.compile_enabled
+    @contextlib.contextmanager
+    def compile_disabled(self):
+        """Context manager that switches compile off and restores the previous state on exit."""
+        previous = self.compile_enabled
+        self.compile_enabled = False
+        try:
+            yield
+        finally:
+            self.compile_enabled = previous
+# Module-level singleton: `smart_compile` reads its `compile_enabled` flag and records functions in its registries
+compile_manager = CompileManager()
+def smart_compile(func: Optional[Callable] = None, **compile_kwargs):
+    """
+    Decorator that routes calls to either the torch.compile-d or the original version of a function.
+    The choice is made on every call from `compile_manager.compile_enabled`, so
+    `compile_manager.set_compile_enabled(...)` and `compile_manager.compile_disabled()` take effect
+    at runtime, including for functions that were decorated while compile was disabled.
+    The compiled version is created on first need and reused afterwards.
+    Supports both `@smart_compile` and `@smart_compile(**compile_kwargs)`.
+    Args:
+        func: The function to decorate (passed implicitly when used without parentheses)
+        **compile_kwargs: Other compile parameters, see https://pytorch.org/docs/stable/generated/torch.compile.html
+    Returns:
+        The wrapped function, or a decorator when `func` is None. The wrapper exposes
+        `.original` (always the original function) and `.compiled` (always the compiled version,
+        which is the original function if compilation failed).
+    """
+    def decorator(fn: Callable) -> Callable:
+        # Use module + qualified name so same-named functions in different classes/files do not collide
+        func_name = f"{fn.__module__}.{fn.__qualname__}"
+        compile_manager.original_functions[func_name] = fn
+        compiled_func: Optional[Callable] = None
+        def get_compiled() -> Callable:
+            # Build the compiled version once; later calls reuse it
+            nonlocal compiled_func
+            if compiled_func is None:
+                try:
+                    compiled_func = torch.compile(fn, **compile_kwargs)
+                    compile_manager.compiled_functions[func_name] = compiled_func
+                except Exception as e:
+                    logger.warning(f"[WARNING] Failed to compile function {func_name}: {e}")
+                    # if compile fails, revert to the original function
+                    compiled_func = fn
+            return compiled_func
+        @functools.wraps(fn)
+        def wrapper(*args, **kwargs):
+            # check whether to use the compiled version at runtime
+            if compile_manager.compile_enabled:
+                return get_compiled()(*args, **kwargs)
+            return fn(*args, **kwargs)
+        @functools.wraps(fn)
+        def force_compiled(*args, **kwargs):
+            return get_compiled()(*args, **kwargs)
+        # add attributes to the wrapper for later access (the decorated function itself is left untouched)
+        wrapper.original = fn
+        wrapper.compiled = force_compiled
+        # When compile is already enabled, register the compiled version at decoration time as before
+        if compile_manager.compile_enabled:
+            get_compiled()
+        return wrapper
+    # support direct use of @smart_compile or @smart_compile(...)
+    if func is not None:
+        return decorator(func)
+    return decorator

utils/image_utils.py ADDED Viewed

	@@ -0,0 +1,314 @@

+import io
+import os
+from typing import Literal, TypeAlias
+import numpy as np
+import PIL.Image
+import PIL.ImageOps
+import requests
+import torch
+"""
+- pil: `PIL.Image.Image`, size (w, h), seamless conversion between `uint8`
+- np: `np.ndarray`, shape (h, w, c), default `np.uint8`
+- pt: `torch.Tensor`, shape (c, h, w), default `torch.uint8`
+"""
+# Any of the three in-memory image representations handled by this module.
+ImageType: TypeAlias = PIL.Image.Image | np.ndarray | torch.Tensor
+# String tags naming the representations above.
+ImageTypeStr: TypeAlias = Literal["pil", "np", "pt"]
+# Image file format names.
+ImageFormat: TypeAlias = Literal["JPEG", "PNG"]
+# Value-range tags used by `to_dataformat`: "255" = uint8 in [0, 255], "01" = float in [0, 1], "11" = float in [-1, 1].
+DataFormat: TypeAlias = Literal["255", "01", "11"]
+# PIL modes accepted by `to_rgb`.
+IMG_SUPPORT_MODE = ["L", "LA", "RGB", "RGBA", "CMYK", "P", "1"]
+# Image file extensions, in lower-case and (via IMAGE_EXT) upper-case spelling.
+IMAGE_EXT_LOWER = ["png", "jpeg", "jpg", "webp"]
+IMAGE_EXT = IMAGE_EXT_LOWER + [_ext.upper() for _ext in IMAGE_EXT_LOWER]
+def check_image_type(image: ImageType):
+    """Raise `TypeError` unless `image` is a PIL Image, a NumPy array or a PyTorch tensor."""
+    supported = (PIL.Image.Image, np.ndarray, torch.Tensor)
+    if isinstance(image, supported):
+        return
+    raise TypeError(f"`image` should be PIL Image, ndarray or Tensor. Got `{type(image)}`.")
+def to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
+    """
+    Convert a PIL image to RGB mode.
+    The EXIF orientation is applied first; images with an alpha channel ("LA", "RGBA")
+    are composited onto a white background before the conversion.
+    Raises:
+        ValueError: If the image mode is not listed in `IMG_SUPPORT_MODE`.
+    """
+    # Rotate/flip the pixels according to the EXIF orientation tag
+    image = PIL.ImageOps.exif_transpose(image)
+    if image.mode not in IMG_SUPPORT_MODE:
+        raise ValueError(f"Only support mode in `{IMG_SUPPORT_MODE}`, got `{image.mode}`")
+    if image.mode in ("LA", "RGBA"):
+        # "LA" is promoted to RGBA so that both alpha modes share the compositing path
+        rgba = image if image.mode == "RGBA" else image.convert("RGBA")
+        white = PIL.Image.new("RGBA", rgba.size, "white")
+        image = PIL.Image.alpha_composite(white, rgba).convert("RGB")
+    return image.convert("RGB")
+def load_image(
+    image: str | os.PathLike | PIL.Image.Image | bytes,
+    *,
+    output_type: ImageTypeStr = "pil",
+) -> ImageType:
+    """
+    Loads `image` as RGB into a PIL Image, NumPy array or PyTorch tensor.
+    Args:
+        image (str | os.PathLike | PIL.Image.Image | bytes): A local file path, an `http://` / `https://` URL,
+            an already opened PIL Image, or the encoded bytes of an image file.
+        output_type (ImageTypeStr, optional): The type of the output image. Defaults to "pil".
+            The current version supports "pil", "np", "pt".
+    Returns:
+        ImageType: The loaded image, converted to RGB by `to_rgb`, in the given type.
+    Raises:
+        ValueError: If the HTTP request times out, the string is neither a URL nor an existing file,
+            or `image` / `output_type` is of an unsupported kind.
+    """
+    timeout = 10
+    # Path-like objects (e.g. `pathlib.Path`) have no `startswith`; normalize them to `str` first.
+    if isinstance(image, os.PathLike):
+        image = os.fsdecode(image)
+    # Load the `image` into a PIL Image.
+    if isinstance(image, str):
+        if image.startswith(("http://", "https://")):
+            try:
+                # Read the whole (content-decoded) body and release the connection before decoding the image
+                with requests.get(image, timeout=timeout) as response:
+                    payload = response.content
+            except requests.exceptions.Timeout as err:
+                raise ValueError(f"HTTP request timed out after {timeout} seconds") from err
+            image = PIL.Image.open(io.BytesIO(payload))
+        elif os.path.isfile(image):
+            image = PIL.Image.open(image)
+        else:
+            raise ValueError(
+                f"Incorrect path or url, URLs must start with `http://` or `https://`, and `{image}` is not a valid path."
+            )
+    elif isinstance(image, bytes):
+        image = PIL.Image.open(io.BytesIO(image))
+    elif not isinstance(image, PIL.Image.Image):
+        raise ValueError(f"`image` must be a path or PIL Image, got `{type(image)}`")
+    image = to_rgb(image)
+    if output_type == "pil":
+        return image
+    if output_type == "np":
+        return to_np(image)
+    if output_type == "pt":
+        return to_pt(image)
+    raise ValueError(f"`output_type` must be one of `{ImageTypeStr}`, got `{output_type}`")
+def to_pil(image: ImageType, image_mode: DataFormat | None = None) -> PIL.Image.Image:
+    """
+    Convert a NumPy array (h, w, c) or a PyTorch tensor (c, h, w) to a PIL image.
+    A PIL image is returned as is. One-channel data becomes mode "L", three-channel data mode "RGB".
+    """
+    check_image_type(image)
+    if isinstance(image, PIL.Image.Image):
+        return image
+    if isinstance(image, torch.Tensor):
+        chw = normalize_pt(image, image_mode)
+        # channel-first tensor -> channel-last array
+        image = chw.cpu().permute(1, 2, 0).numpy()
+        assert image.dtype == np.uint8, f"Supposed to convert `torch.uint8` to `np.uint8`, but got `{image.dtype}`"
+    else:
+        # `check_image_type` leaves only np.ndarray here
+        image = normalize_np(image, image_mode)
+    channels = image.shape[-1]
+    mode = {1: "L", 3: "RGB"}[channels]
+    if channels == 1:
+        # drop the channel axis for grayscale data
+        image = image[:, :, 0]
+    return PIL.Image.fromarray(image, mode=mode)
+def to_np(image: ImageType, image_mode: DataFormat | None = None) -> np.ndarray:
+    """
+    Convert a PIL image or a PyTorch tensor (c, h, w) to a uint8 NumPy array of shape (h, w, c).
+    A NumPy array input is normalized by `normalize_np`.
+    """
+    check_image_type(image)
+    if isinstance(image, torch.Tensor):
+        hwc = normalize_pt(image, image_mode).cpu().permute(1, 2, 0).numpy()
+        assert hwc.dtype == np.uint8, f"Supposed to convert `torch.uint8` to `np.uint8`, but got `{hwc.dtype}`"
+        return hwc
+    if isinstance(image, PIL.Image.Image):
+        image = np.array(image, np.uint8, copy=True)
+    return normalize_np(image, image_mode)
+def to_pt(image: ImageType, image_mode: DataFormat | None = None) -> torch.Tensor:
+    """
+    Convert a PIL image or a NumPy array (h, w, c) to a uint8 PyTorch tensor of shape (c, h, w).
+    A tensor input is normalized by `normalize_pt`.
+    """
+    check_image_type(image)
+    if isinstance(image, torch.Tensor):
+        return normalize_pt(image, image_mode)
+    # PIL images go through a NumPy copy first
+    hwc = np.array(image, np.uint8, copy=True) if isinstance(image, PIL.Image.Image) else image
+    hwc = normalize_np(hwc, image_mode)
+    # channel-last array -> channel-first tensor
+    chw = torch.from_numpy(hwc.transpose((2, 0, 1))).contiguous()
+    assert chw.dtype == torch.uint8, f"Supposed to convert `np.uint8` to `torch.uint8`, but got `{chw.dtype}`"
+    return chw
+def normalize_np(image: np.ndarray, image_mode: DataFormat | None = None) -> np.ndarray:
+    """
+    Bring a NumPy image into the standard layout: shape (h, w, c) with c in {1, 3}, values in "255" format.
+    A 2D array gets a trailing channel axis.
+    Raises:
+        ValueError: If the array is not 2D/3D or has an unsupported channel count.
+    """
+    ndim = image.ndim
+    if ndim == 2:
+        # (h, w) -> (h, w, 1)
+        image = image[:, :, np.newaxis]
+    elif ndim != 3:
+        raise ValueError(f"`image` should be 2 or 3 dimensions. Got {ndim} dimensions.")
+    channels = image.shape[-1]
+    if channels != 1 and channels != 3:
+        raise ValueError(f"`image` should have 1 (`L`) or 3 (`RGB`) channels. Got {channels} channels.")
+    return to_dataformat(image, image_mode=image_mode, mode="255")
+def normalize_pt(image: torch.Tensor, image_mode: DataFormat | None = None) -> torch.Tensor:
+    """
+    Bring a PyTorch image into the standard layout: shape (c, h, w) with c in {1, 3}, values in "255" format.
+    A 2D tensor gets a leading channel axis.
+    Raises:
+        ValueError: If the tensor is not 2D/3D or has an unsupported channel count.
+    """
+    ndim = image.dim()
+    if ndim == 2:
+        # (h, w) -> (1, h, w)
+        image = image[None]
+    elif ndim != 3:
+        raise ValueError(f"`image` should be 2 or 3 dimensions. Got {ndim} dimensions.")
+    channels = image.shape[-3]
+    if channels != 1 and channels != 3:
+        raise ValueError(f"`image` should have 1 (`L`) or 3 (`RGB`) channels. Got {channels} channels.")
+    return to_dataformat(image, image_mode=image_mode, mode="255")
+def to_dataformat(
+    image: ImageType,
+    *,
+    image_mode: DataFormat | None = None,
+    mode: DataFormat = "255",
+) -> np.ndarray | torch.Tensor:
+    """
+    Convert an image between value-range formats.
+    Formats: "255" is uint8 in [0, 255], "01" is float in [0, 1], "11" is float in [-1, 1].
+    Args:
+        image: PIL Image, NumPy array or PyTorch tensor. A PIL Image is converted to a uint8 NumPy array.
+        image_mode: Format of the input. When None it is guessed: uint8 -> "255"; float16/float32 ->
+            "11" if any value is negative, else "01". When given, it is used even if the guess
+            disagrees (a message is printed in that case).
+        mode: Format to convert to.
+    Returns:
+        The converted array/tensor. The input is never modified in place; when no conversion is
+        needed the input object itself is returned.
+    Raises:
+        TypeError: If `image` is not a supported type.
+        ValueError: If the dtype is not uint8, float16 or float32.
+    """
+    check_image_type(image)
+    # convert PIL Image to NumPy array
+    if isinstance(image, PIL.Image.Image):
+        image = np.array(image, np.uint8, copy=True)
+        image_mode = "255"
+    # guess image mode; dtypes are only compared against those of the matching library
+    if isinstance(image, np.ndarray):
+        is_uint8 = image.dtype == np.uint8
+        is_float = image.dtype == np.float32 or image.dtype == np.float16
+    else:
+        is_uint8 = image.dtype == torch.uint8
+        is_float = image.dtype == torch.float32 or image.dtype == torch.float16
+    if is_uint8:
+        guess_image_mode = "255"
+    elif is_float:
+        guess_image_mode = "11" if image.min() < 0.0 else "01"
+    else:
+        raise ValueError(f"Unsupported dtype `{image.dtype}`")
+    if image_mode is None:
+        image_mode = guess_image_mode
+    elif guess_image_mode != image_mode:
+        print(f"Guess image mode is `{guess_image_mode}`, but image mode is `{image_mode}`")
+    if isinstance(image, np.ndarray):
+        # Every step builds a new array: writing through `out=` would overwrite the caller's data
+        # and cannot store float results in a uint8 array.
+        if image_mode == "255" and mode != "255":
+            image = np.clip(image.astype(np.float32) / 255, 0, 1)
+            if mode == "11":
+                image = np.clip(image * 2 - 1, -1, 1)
+        elif image_mode == "01" and mode != "01":
+            if mode == "255":
+                image = (np.clip(image, 0, 1) * 255).round().astype(np.uint8)
+            elif mode == "11":
+                image = np.clip(image * 2 - 1, -1, 1)
+        elif image_mode == "11" and mode != "11":
+            image = np.clip(image / 2 + 0.5, 0, 1)
+            if mode == "255":
+                image = (image * 255).round().astype(np.uint8)
+    elif isinstance(image, torch.Tensor):
+        if image_mode == "255" and mode != "255":
+            image = image.to(dtype=torch.float32).div(255).clamp(0, 1)
+            if mode == "11":
+                image = (image * 2 - 1).clamp(-1, 1)
+        elif image_mode == "01" and mode != "01":
+            if mode == "255":
+                image = image.clamp(0, 1)
+                image = (image * 255).round().to(dtype=torch.uint8)
+            elif mode == "11":
+                image = (image * 2 - 1).clamp(-1, 1)
+        elif image_mode == "11" and mode != "11":
+            image = (image / 2 + 0.5).clamp(0, 1)
+            if mode == "255":
+                image = image.mul(255).round().to(dtype=torch.uint8)
+    return image
+def resize_image(pil_image, image_size):
+    """
+    Resize a PIL image, keeping its aspect ratio, so that its shorter side becomes `image_size`.
+    Large images are first halved repeatedly with a box filter, then resized with bicubic resampling.
+    """
+    # Halve while the shorter side is still at least twice the target
+    while min(*pil_image.size) >= 2 * image_size:
+        halved = tuple(side // 2 for side in pil_image.size)
+        pil_image = pil_image.resize(halved, resample=PIL.Image.BOX)
+    factor = image_size / min(*pil_image.size)
+    final_size = tuple(round(side * factor) for side in pil_image.size)
+    return pil_image.resize(final_size, resample=PIL.Image.BICUBIC)
+def center_crop_arr(pil_image, image_size, crop=True):
+    """
+    Resize a PIL image via `resize_image`, either center cropping it or padding it to a square first.
+    With `crop=True` the resized image is center cropped to `image_size` x `image_size`
+    (center cropping implementation from ADM):
+    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
+    With `crop=False` a non-square image is pasted centered onto a black square canvas before resizing.
+    """
+    if not crop:
+        # Pad the image to a square
+        width, height = pil_image.size
+        if width != height:
+            # Create a square canvas whose side is the longer edge
+            side = max(width, height)
+            canvas = PIL.Image.new(pil_image.mode, (side, side), (0, 0, 0))
+            # Paste the original image centered on the square canvas
+            canvas.paste(pil_image, ((side - width) // 2, (side - height) // 2))
+            pil_image = canvas
+        return resize_image(pil_image, image_size)
+    pixels = np.array(resize_image(pil_image, image_size))
+    top = (pixels.shape[0] - image_size) // 2
+    left = (pixels.shape[1] - image_size) // 2
+    return PIL.Image.fromarray(pixels[top : top + image_size, left : left + image_size])

utils/misc.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import os
+import numpy as np
+import random
+import torch
+def set_seed(seed: int, rank: int = 0):
+    """
+    Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) random generators with `seed + rank`.
+    Also switches cuDNN to deterministic mode and stores the seed in the PYTHONHASHSEED environment variable.
+    """
+    effective_seed = seed + rank
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
+        seed_fn(effective_seed)
+    torch.backends.cudnn.deterministic = True
+    os.environ["PYTHONHASHSEED"] = str(effective_seed)
+class LargeInt(int):
+    """
+    An `int` that can be built from strings with a K/M/B/T suffix (e.g. "1.5K") and that prints
+    itself in the same abbreviated form.
+    """
+    def __new__(cls, value):
+        if not isinstance(value, str):
+            return int.__new__(cls, value)
+        multipliers = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}
+        # The suffix is matched case-insensitively
+        suffix = value[-1].upper()
+        if suffix not in multipliers:
+            return int.__new__(cls, int(value))
+        scaled = float(value[:-1]) * multipliers[suffix]
+        return int.__new__(cls, int(scaled))
+    def __str__(self):
+        magnitude = int(self)
+        if abs(magnitude) < 1000:
+            return f"{magnitude}"
+        # Divide by 1000 until the value fits the current unit
+        scaled = magnitude / 1000
+        for suffix in ("K", "M", "B", "T"):
+            if abs(scaled) < 1000:
+                return f"{scaled:.1f}{suffix}"
+            scaled /= 1000
+        return f"{scaled:.1f}P"  # P stands for Peta, or 10^15
+    def __repr__(self):
+        # Same text as __str__, wrapped in double quotes
+        return f'"{self.__str__()}"'
+    def __json__(self):
+        return f'"{self.__str__()}"'
+    def __add__(self, other):
+        if not isinstance(other, int):
+            return NotImplemented
+        return LargeInt(int.__add__(self, other))
+    def __radd__(self, other):
+        # Addition is commutative, so reuse __add__
+        return self.__add__(other)

utils/model_utils.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import torch
+import numpy as np
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, pe_interpolation=1.0):
+    """
+    Build 2D sine/cosine position embeddings for a square grid.
+    grid_size: int of the grid height and width
+    pe_interpolation: divisor applied to the grid coordinates
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim], or [extra_tokens+grid_size*grid_size, embed_dim] with
+    `extra_tokens` zero rows in front when `cls_token` is set and `extra_tokens > 0`
+    """
+    axis_coords = np.arange(grid_size, dtype=np.float32) / pe_interpolation
+    # meshgrid puts the w coordinate first
+    mesh = np.stack(np.meshgrid(axis_coords, axis_coords), axis=0).reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
+    if not (cls_token and extra_tokens > 0):
+        return pos_embed
+    zero_rows = np.zeros([extra_tokens, embed_dim])
+    return np.concatenate([zero_rows, pos_embed], axis=0)
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """
+    Encode the two coordinate planes of `grid` with 1D sine/cosine embeddings.
+    The first half of the output dimensions comes from grid[0], the second half from grid[1].
+    return: (H*W, embed_dim)
+    """
+    assert embed_dim % 2 == 0
+    half_dim = embed_dim // 2
+    halves = [get_1d_sincos_pos_embed_from_grid(half_dim, grid[axis]) for axis in (0, 1)]  # 2 x (H*W, D/2)
+    return np.concatenate(halves, axis=1)  # (H*W, D)
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Sine/cosine embedding of scalar positions.
+    embed_dim: output dimension for each position (must be even)
+    pos: array of positions to be encoded; it is flattened to size (M,)
+    out: (M, D), sine values in the first D/2 columns and cosine values in the last D/2
+    """
+    assert embed_dim % 2 == 0
+    exponents = np.arange(embed_dim // 2, dtype=np.float64)
+    exponents /= embed_dim / 2.0
+    freqs = 1.0 / 10000**exponents  # (D/2,)
+    flat_pos = pos.reshape(-1)  # (M,)
+    angles = np.outer(flat_pos, freqs)  # (M, D/2)
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
+def expand_t(t, x):
+    """Reshape the time vector t so that it broadcasts against x
+    Args:
+        t: [bsz,], time vector
+        x: [bsz,...], data point
+    Returns:
+        t viewed as [bsz, 1, ..., 1] with as many dimensions as x
+    """
+    trailing_dims = x.dim() - 1
+    return t.view(t.size(0), *([1] * trailing_dims))
+def randn_tensor(shape, noise_repeat, device, dtype=torch.float32):
+    """
+    Draw standard normal noise for `noise_repeat` samples and tile it along the batch dimension.
+    Args:
+        shape: Target shape (bsz, ...) as a tuple, list or torch.Size, of any rank >= 1
+        noise_repeat: Number of distinct noise samples; row i equals row i + noise_repeat
+        device: Device on which the noise is created
+        dtype: Data type of the noise
+    Returns:
+        Tensor of shape `shape`
+    Raises:
+        ValueError: If the batch size is not divisible by `noise_repeat`
+    """
+    shape = tuple(shape)
+    bsz = shape[0]
+    if bsz % noise_repeat != 0:
+        raise ValueError(f"Batch size ({bsz}) must be divisible by noise repeat ({noise_repeat})")
+    base = torch.randn((noise_repeat,) + shape[1:], device=device, dtype=dtype)
+    # One repeat factor per dimension: tile the batch axis only, whatever the rank of the tensor
+    return base.repeat(bsz // noise_repeat, *([1] * (len(shape) - 1)))
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the half that moves to the front."""
+    split = x.shape[-1] // 2
+    front, back = x[..., :split], x[..., split:]
+    return torch.cat((-back, front), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    """
+    Apply rotary position embedding to the query and key tensors.
+    `cos` and `sin` get an extra axis at `unsqueeze_dim` so that they broadcast against q and k.
+    Returns:
+        Tuple (q_embed, k_embed) with the rotated query and key
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    q_embed, k_embed = ((states * cos_b) + (rotate_half(states) * sin_b) for states in (q, k))
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep).
+    The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
+    (batch, num_attention_heads, seqlen, head_dim). With n_rep == 1 the input is returned unchanged.
+    """
+    batch, kv_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis after the head axis, expand it, then fold it into the head axis
+    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, slen, head_dim)
+    return expanded.reshape(batch, kv_heads * n_rep, slen, head_dim)
+def identity(input: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+    """Return `input` unchanged, ignoring any extra positional or keyword arguments."""
+    return input
+def rms_norm(
+    input: torch.Tensor,
+    normalized_shape: torch.Size,
+    eps: float = 1e-6,
+    ) -> torch.Tensor:
+    """
+    RMS normalization without learnable parameters over the last `len(normalized_shape)` dimensions.
+    The computation runs in float32 and the result is cast back to the input dtype.
+    Only the length of `normalized_shape` is used.
+    """
+    orig_dtype = input.dtype
+    num_dims = len(normalized_shape)
+    x = input.to(torch.float32)
+    mean_square = x.flatten(-num_dims).pow(2).mean(dim=-1)
+    # Re-append the reduced dimensions as size-1 axes so the statistic broadcasts against x
+    mean_square = mean_square[(...,) + (None,) * num_dims]
+    return (x * torch.rsqrt(mean_square + eps)).to(orig_dtype)
+def layer_norm(
+    input: torch.Tensor,
+    normalized_shape: torch.Size,
+    eps: float = 1e-6,
+    ) -> torch.Tensor:
+    """
+    Layer normalization without learnable parameters over the last `len(normalized_shape)` dimensions.
+    The computation runs in float32 and the result is cast back to the input dtype.
+    Only the length of `normalized_shape` is used.
+    """
+    orig_dtype = input.dtype
+    num_dims = len(normalized_shape)
+    # Index that re-appends the reduced dimensions as size-1 axes
+    broadcast_idx = (...,) + (None,) * num_dims
+    x = input.to(torch.float32)
+    mean = x.flatten(-num_dims).mean(dim=-1)[broadcast_idx]
+    centered = x - mean
+    variance = centered.flatten(-num_dims).pow(2).mean(dim=-1)[broadcast_idx]
+    return (centered * torch.rsqrt(variance + eps)).to(orig_dtype)

vae/__pycache__/nextstep_ae.cpython-312.pyc ADDED Viewed

Binary file (28.3 kB). View file

vae/checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99293255229a29297e2851858db3794497d1b0b09b20c308c1062636ea4bcdd9
+size 335365010

vae/config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+    "resolution": 256,
+    "in_channels": 3,
+    "ch": 128,
+    "out_ch": 3,
+    "ch_mult": [1, 2, 4, 4],
+    "num_res_blocks": 2,
+    "z_channels": 16,
+    "shift_factor": 0,
+    "scaling_factor": 1,
+    "deterministic": true,
+    "encoder_norm": true,
+    "psz": 1
+}

vae/nextstep_ae.py ADDED Viewed

	@@ -0,0 +1,494 @@

+import os
+import json
+import inspect
+from dataclasses import dataclass, field, asdict
+from loguru import logger
+from omegaconf import OmegaConf
+from tabulate import tabulate
+from einops import rearrange
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.utils.checkpoint import checkpoint
+from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
+from diffusers.models.modeling_outputs import AutoencoderKLOutput
+from utils.misc import LargeInt
+from utils.model_utils import randn_tensor
+from utils.compile_utils import smart_compile
+@dataclass
+class AutoEncoderParams:
+    """Configuration fields of the autoencoder; each value below is the default used when the field is not supplied."""
+    resolution: int = 256
+    in_channels: int = 3
+    ch: int = 128
+    out_ch: int = 3
+    # default_factory gives every instance its own list
+    ch_mult: list[int] = field(default_factory=lambda: [1, 2, 4, 4])
+    num_res_blocks: int = 2
+    z_channels: int = 16
+    # NOTE(review): presumably applied to the latents as (z - shift_factor) * scaling_factor — confirm in the encode/decode code
+    scaling_factor: float = 0.3611
+    shift_factor: float = 0.1159
+    deterministic: bool = False
+    encoder_norm: bool = False
+    # NOTE(review): looks like a patch size; None when unset — confirm against the model code
+    psz: int | None = None
+def swish(x: Tensor) -> Tensor:
+    return x * torch.sigmoid(x)
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+    def attention(self, h_: Tensor) -> Tensor:
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        b, c, h, w = q.shape
+        q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
+        k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
+        v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
+        h_ = nn.functional.scaled_dot_product_attention(q, k, v)
+        return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.proj_out(self.attention(x))
+class ResnetBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x):
+        h = x
+        h = self.norm1(h)
+        h = swish(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = swish(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            x = self.nin_shortcut(x)
+        return x + h
+class Downsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        # no asymmetric padding in torch conv, must do it ourselves
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+    def forward(self, x: Tensor):
+        pad = (0, 1, 0, 1)
+        x = nn.functional.pad(x, pad, mode="constant", value=0)
+        x = self.conv(x)
+        return x
+class Upsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x: Tensor):
+        x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        x = self.conv(x)
+        return x
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+        curr_res = resolution
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        block_in = self.ch
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
+        self.grad_checkpointing = False
+    @smart_compile()
+    def forward(self, x: Tensor) -> Tensor:
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                block_fn = self.down[i_level].block[i_block]
+                if self.grad_checkpointing:
+                    h = checkpoint(block_fn, hs[-1])
+                else:
+                    h = block_fn(hs[-1])
+                if len(self.down[i_level].attn) > 0:
+                    attn_fn = self.down[i_level].attn[i_block]
+                    if self.grad_checkpointing:
+                        h = checkpoint(attn_fn, h)
+                    else:
+                        h = attn_fn(h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        ch: int,
+        out_ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        in_channels: int,
+        resolution: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.ffactor = 2 ** (self.num_resolutions - 1)
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        # z to block_in
+        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+        self.grad_checkpointing = False
+    @smart_compile()
+    def forward(self, z: Tensor) -> Tensor:
+        # get dtype for proper tracing
+        upscale_dtype = next(self.up.parameters()).dtype
+        # z to block_in
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # cast to proper dtype
+        h = h.to(upscale_dtype)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                block_fn = self.up[i_level].block[i_block]
+                if self.grad_checkpointing:
+                    h = checkpoint(block_fn, h)
+                else:
+                    h = block_fn(h)
+                if len(self.up[i_level].attn) > 0:
+                    attn_fn = self.up[i_level].attn[i_block]
+                    if self.grad_checkpointing:
+                        h = checkpoint(attn_fn, h)
+                    else:
+                        h = attn_fn(h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+def layer_norm_2d(input: torch.Tensor, normalized_shape: torch.Size, eps: float = 1e-6) -> torch.Tensor:
+    # input.shape = (bsz, c, h, w)
+    _input = input.permute(0, 2, 3, 1)
+    _input = F.layer_norm(_input, normalized_shape, None, None, eps)
+    _input = _input.permute(0, 3, 1, 2)
+    return _input
+class AutoencoderKL(nn.Module):
+    def __init__(self, params: AutoEncoderParams):
+        super().__init__()
+        self.config = params
+        self.config = OmegaConf.create(asdict(self.config))
+        self.config.latent_channels = params.z_channels
+        self.config.block_out_channels = params.ch_mult
+        self.params = params
+        self.encoder = Encoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.decoder = Decoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            out_ch=params.out_ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.encoder_norm = params.encoder_norm
+        self.psz = params.psz
+        self.apply(self._init_weights)
+    def _init_weights(self, module):
+        std = 0.02
+        if isinstance(module, (nn.Conv2d, nn.Linear)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.GroupNorm):
+            if module.weight is not None:
+                module.weight.data.fill_(1.0)
+            if module.bias is not None:
+                module.bias.data.zero_()
+    def gradient_checkpointing_enable(self):
+        self.encoder.grad_checkpointing = True
+        self.decoder.grad_checkpointing = True
+    @property
+    def dtype(self):
+        return self.encoder.conv_in.weight.dtype
+    @property
+    def device(self):
+        return self.encoder.conv_in.weight.device
+    @property
+    def trainable_params(self) -> float:
+        n_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        return LargeInt(n_params)
+    @property
+    def params_info(self) -> str:
+        encoder_params = str(LargeInt(sum(p.numel() for p in self.encoder.parameters())))
+        decoder_params = str(LargeInt(sum(p.numel() for p in self.decoder.parameters())))
+        table = [["encoder", encoder_params], ["decoder", decoder_params]]
+        return tabulate(table, headers=["Module", "Params"], tablefmt="grid")
+    def get_last_layer(self):
+        return self.decoder.conv_out.weight
+    def patchify(self, img: torch.Tensor):
+        """
+        img: (bsz, C, H, W)
+        x: (bsz, patch_size**2 * C, H / patch_size, W / patch_size)
+        """
+        bsz, c, h, w = img.shape
+        p = self.psz
+        h_, w_ = h // p, w // p
+        img = img.reshape(bsz, c, h_, p, w_, p)
+        img = torch.einsum("nchpwq->ncpqhw", img)
+        x = img.reshape(bsz, c * p**2, h_, w_)
+        return x
+    def unpatchify(self, x: torch.Tensor):
+        """
+        x: (bsz, patch_size**2 * C, H / patch_size, W / patch_size)
+        img: (bsz, C, H, W)
+        """
+        bsz = x.shape[0]
+        p = self.psz
+        c = self.config.latent_channels
+        h_, w_ = x.shape[2], x.shape[3]
+        x = x.reshape(bsz, c, p, p, h_, w_)
+        x = torch.einsum("ncpqhw->nchpwq", x)
+        img = x.reshape(bsz, c, h_ * p, w_ * p)
+        return img
+    def encode(self, x: torch.Tensor, return_dict: bool = True):
+        moments = self.encoder(x)
+        mean, logvar = torch.chunk(moments, 2, dim=1)
+        if self.psz is not None:
+            mean = self.patchify(mean)
+        if self.encoder_norm:
+            mean = layer_norm_2d(mean, mean.size()[-1:])
+        if self.psz is not None:
+            mean = self.unpatchify(mean)
+        moments = torch.cat([mean, logvar], dim=1).contiguous()
+        posterior = DiagonalGaussianDistribution(moments, deterministic=self.params.deterministic)
+        if not return_dict:
+            return (posterior,)
+        return AutoencoderKLOutput(latent_dist=posterior)
+    def decode(self, z: torch.Tensor, return_dict: bool = True):
+        dec = self.decoder(z)
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
+    def forward(self, input, sample_posterior=True, noise_strength=0.0):
+        posterior = self.encode(input).latent_dist
+        z = posterior.sample() if sample_posterior else posterior.mode()
+        if noise_strength > 0.0:
+            p = torch.distributions.Uniform(0, noise_strength)
+            z = z + p.sample((z.shape[0],)).reshape(-1, 1, 1, 1).to(z.device) * randn_tensor(
+                z.shape, device=z.device, dtype=z.dtype
+            )
+        dec = self.decode(z).sample
+        return dec, posterior
+    @classmethod
+    def from_pretrained(cls, model_path, **kwargs):
+        config_path = os.path.join(model_path, "config.json")
+        ckpt_path = os.path.join(model_path, "checkpoint.pt")
+        if not os.path.isdir(model_path) or not os.path.isfile(config_path) or not os.path.isfile(ckpt_path):
+            raise ValueError(
+                f"Invalid model path: {model_path}. The path should contain both config.json and checkpoint.pt files."
+            )
+        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
+        with open(config_path, "r") as f:
+            config: dict = json.load(f)
+        config.update(kwargs)
+        kwargs = config
+        # Filter out kwargs that are not in AutoEncoderParams
+        # This ensures we only pass parameters that the model can accept
+        valid_kwargs = {}
+        param_signature = inspect.signature(AutoEncoderParams.__init__).parameters
+        for key, value in kwargs.items():
+            if key in param_signature:
+                valid_kwargs[key] = value
+            else:
+                logger.info(f"Ignoring parameter '{key}' as it's not defined in AutoEncoderParams")
+        params = AutoEncoderParams(**valid_kwargs)
+        model = cls(params)
+        try:
+            msg = model.load_state_dict(state_dict, strict=False)
+            logger.info(f"Loaded state_dict from {ckpt_path}")
+            logger.info(f"Missing keys:\n{msg.missing_keys}")
+            logger.info(f"Unexpected keys:\n{msg.unexpected_keys}")
+        except Exception as e:
+            logger.error(e)
+            logger.warning(f"Failed to load state_dict from {ckpt_path}, using random initialization")
+        return model

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff