SupraLabs
/

SupraMNST-IMG-200k

@@ -1,61 +1,22 @@
 {
-  "_class_name": "UNet2DConditionModel",
-  "_diffusers_version": "0.37.1",
-  "act_fn": "silu",
-  "addition_embed_type": null,
-  "addition_embed_type_num_heads": 64,
-  "addition_time_embed_dim": null,
-  "attention_head_dim": 8,
-  "attention_type": "default",
-  "block_out_channels": [
-    12,
-    16
-  ],
-  "center_input_sample": false,
   "class_embed_type": null,
-  "class_embeddings_concat": false,
-  "conv_in_kernel": 3,
-  "conv_out_kernel": 3,
   "cross_attention_dim": 8,
-  "cross_attention_norm": null,
-  "down_block_types": [
-    "DownBlock2D",
-    "DownBlock2D"
-  ],
-  "downsample_padding": 1,
-  "dropout": 0.0,
-  "dual_cross_attention": false,
-  "encoder_hid_dim": null,
-  "encoder_hid_dim_type": null,
-  "flip_sin_to_cos": true,
-  "freq_shift": 0,
   "in_channels": 1,
   "layers_per_block": 8,
-  "mid_block_only_cross_attention": null,
-  "mid_block_scale_factor": 1,
   "mid_block_type": "UNetMidBlock2D",
-  "norm_eps": 1e-05,
   "norm_num_groups": 4,
-  "num_attention_heads": null,
   "num_class_embeds": 10,
-  "only_cross_attention": false,
   "out_channels": 1,
-  "projection_class_embeddings_input_dim": null,
-  "resnet_out_scale_factor": 1.0,
-  "resnet_skip_time_act": false,
-  "resnet_time_scale_shift": "default",
-  "reverse_transformer_layers_per_block": null,
   "sample_size": 32,
-  "time_cond_proj_dim": null,
-  "time_embedding_act_fn": null,
-  "time_embedding_dim": null,
-  "time_embedding_type": "positional",
-  "timestep_post_act": null,
-  "transformer_layers_per_block": 1,
-  "up_block_types": [
-    "UpBlock2D",
-    "UpBlock2D"
-  ],
-  "upcast_attention": false,
-  "use_linear_projection": false
 }

 {
+  "architectures": ["DigitDiffusionModel"],
+  "auto_map": {
+    "AutoConfig": "configuration.DigitDiffusionConfig",
+    "AutoModel": "modeling.DigitDiffusionModel"
+  },
+  "block_out_channels": [12, 16, 20],
   "class_embed_type": null,
   "cross_attention_dim": 8,
+  "down_block_types": ["DownBlock2D", "DownBlock2D", "DownBlock2D"],
+  "image_size": 32,
   "in_channels": 1,
   "layers_per_block": 8,
   "mid_block_type": "UNetMidBlock2D",
+  "model_type": "digit_diffusion",
   "norm_num_groups": 4,
   "num_class_embeds": 10,
+  "num_classes": 10,
   "out_channels": 1,
   "sample_size": 32,
+  "up_block_types": ["UpBlock2D", "UpBlock2D", "UpBlock2D"]
 }

configuration.py ADDED Viewed

	@@ -0,0 +1,80 @@

+#!/usr/bin/env python3
+# Configuration for the MNiST-IMG-390k model.
+from __future__ import annotations
+from typing import Iterable, Tuple
+from transformers import PretrainedConfig
+class DigitDiffusionConfig(PretrainedConfig):
+    model_type = "digit_diffusion"
+    def __init__(
+        self,
+        image_size: int = 32,
+        in_channels: int = 1,
+        out_channels: int = 1,
+        num_classes: int = 10,
+        block_out_channels: Iterable[int] = (12, 16, 20),
+        layers_per_block: int = 8,
+        norm_num_groups: int = 4,
+        cross_attention_dim: int = 8,
+        class_embed_type: str | None = None,
+        sample_size: int | None = None,
+        **kwargs,
+    ) -> None:
+        image_size = int(image_size)
+        sample_size = int(sample_size) if sample_size is not None else image_size
+        block_out_channels = tuple(int(v) for v in block_out_channels)
+        if not block_out_channels:
+            raise ValueError("block_out_channels must contain at least one entry.")
+        if any(v <= 0 for v in block_out_channels):
+            raise ValueError("block_out_channels must contain only positive integers.")
+        if image_size <= 0:
+            raise ValueError("image_size must be a positive integer.")
+        if sample_size <= 0:
+            raise ValueError("sample_size must be a positive integer.")
+        if in_channels <= 0 or out_channels <= 0:
+            raise ValueError("in_channels and out_channels must be positive integers.")
+        if num_classes <= 0:
+            raise ValueError("num_classes must be a positive integer.")
+        if layers_per_block <= 0:
+            raise ValueError("layers_per_block must be a positive integer.")
+        if norm_num_groups <= 0:
+            raise ValueError("norm_num_groups must be a positive integer.")
+        if cross_attention_dim <= 0:
+            raise ValueError("cross_attention_dim must be a positive integer.")
+        self.image_size = image_size
+        self.sample_size = sample_size
+        self.in_channels = int(in_channels)
+        self.out_channels = int(out_channels)
+        self.num_classes = int(num_classes)
+        self.block_out_channels = block_out_channels
+        self.layers_per_block = int(layers_per_block)
+        self.norm_num_groups = int(norm_num_groups)
+        self.cross_attention_dim = int(cross_attention_dim)
+        self.class_embed_type = class_embed_type
+        # Handy for HF model pages and AutoClass loading.
+        kwargs.setdefault("architectures", ["DigitDiffusionModel"])
+        super().__init__(**kwargs)
+    @property
+    def num_blocks(self) -> int:
+        return len(self.block_out_channels)
+    def to_dict(self):
+        data = super().to_dict()
+        # Keep the serialized values compact and JSON-friendly.
+        data["block_out_channels"] = list(self.block_out_channels)
+        return data
+DigitDiffusionConfig.register_for_auto_class()

modeling.py ADDED Viewed

	@@ -0,0 +1,177 @@

+#!/usr/bin/env python3
+# Model for MNiST-IMG-390k
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Optional
+import torch
+from diffusers import UNet2DConditionModel
+from transformers import PreTrainedModel
+from transformers.utils import ModelOutput
+from configuration import DigitDiffusionConfig
+@dataclass
+class DigitDiffusionOutput(ModelOutput):
+    """Output container returned by ``DigitDiffusionModel.forward``.
+
+    The single field ``sample`` holds the tensor taken from the wrapped
+    UNet's ``.sample`` output; it defaults to ``None``.
+    """
+    sample: torch.FloatTensor | None = None
+class DigitDiffusionModel(PreTrainedModel):
+    config_class = DigitDiffusionConfig
+    base_model_prefix = "unet"
+    main_input_name = "noisy_images"
+    def __init__(self, config: DigitDiffusionConfig) -> None:
+        super().__init__(config)
+        block_count = len(config.block_out_channels)
+        self.unet = UNet2DConditionModel(
+            sample_size=config.sample_size,
+            in_channels=config.in_channels,
+            out_channels=config.out_channels,
+            layers_per_block=config.layers_per_block,
+            block_out_channels=tuple(config.block_out_channels),
+            down_block_types=("DownBlock2D",) * block_count,
+            up_block_types=("UpBlock2D",) * block_count,
+            mid_block_type="UNetMidBlock2D",
+            norm_num_groups=config.norm_num_groups,
+            num_class_embeds=config.num_classes,
+            cross_attention_dim=config.cross_attention_dim,
+            class_embed_type=config.class_embed_type,
+        )
+    def _init_weights(self, module):
+        # Diffusers initializes the UNet internally, so there is nothing extra
+        # to initialize here.
+        return
+    def _make_dummy_context(
+        self,
+        batch_size: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
+        return torch.zeros(
+            batch_size,
+            1,
+            self.config.cross_attention_dim,
+            device=device,
+            dtype=dtype,
+        )
+    def _normalize_inputs(
+        self,
+        noisy_images: Optional[torch.Tensor] = None,
+        timesteps: Optional[torch.Tensor | int] = None,
+        sample: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.Tensor | int] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if noisy_images is None:
+            noisy_images = sample
+        if timesteps is None:
+            timesteps = timestep
+        if noisy_images is None:
+            raise ValueError("Either `noisy_images` or `sample` must be provided.")
+        if timesteps is None:
+            raise ValueError("Either `timesteps` or `timestep` must be provided.")
+        if not torch.is_tensor(timesteps):
+            timesteps = torch.tensor(
+                timesteps,
+                device=noisy_images.device,
+                dtype=torch.long,
+            )
+        if timesteps.ndim == 0:
+            timesteps = timesteps.expand(noisy_images.shape[0])
+        elif timesteps.shape[0] != noisy_images.shape[0]:
+            timesteps = timesteps.reshape(-1)
+            if timesteps.numel() == 1:
+                timesteps = timesteps.expand(noisy_images.shape[0])
+            elif timesteps.shape[0] != noisy_images.shape[0]:
+                raise ValueError(
+                    "Timesteps must be a scalar, a batch-sized tensor, or a single-value tensor."
+                )
+        return noisy_images, timesteps.to(device=noisy_images.device, dtype=torch.long)
+    def forward(
+        self,
+        noisy_images: Optional[torch.Tensor] = None,
+        timesteps: Optional[torch.Tensor | int] = None,
+        class_labels: Optional[torch.Tensor] = None,
+        sample: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.Tensor | int] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        **kwargs: Any,
+    ):
+        noisy_images, timesteps = self._normalize_inputs(
+            noisy_images=noisy_images,
+            timesteps=timesteps,
+            sample=sample,
+            timestep=timestep,
+        )
+        batch_size = noisy_images.shape[0]
+        if class_labels is None:
+            class_labels = torch.zeros(
+                batch_size,
+                device=noisy_images.device,
+                dtype=torch.long,
+            )
+        else:
+            class_labels = class_labels.to(device=noisy_images.device, dtype=torch.long)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = self._make_dummy_context(
+                batch_size=batch_size,
+                device=noisy_images.device,
+                dtype=noisy_images.dtype,
+            )
+        noise_pred = self.unet(
+            sample=noisy_images,
+            timestep=timesteps,
+            encoder_hidden_states=encoder_hidden_states,
+            class_labels=class_labels,
+            return_dict=True,
+            **kwargs,
+        ).sample
+        if return_dict:
+            return DigitDiffusionOutput(sample=noise_pred)
+        return (noise_pred,)
+    def load_state_dict(self, state_dict, strict: bool = True, assign: bool = False):
+        if state_dict:
+            keys = list(state_dict.keys())
+            has_prefixed = any(k.startswith("unet.") for k in keys)
+            has_plain_unet = any(
+                k.startswith(
+                    (
+                        "conv_in.",
+                        "conv_norm_out.",
+                        "conv_out.",
+                        "time_embedding.",
+                        "class_embedding.",
+                        "down_blocks.",
+                        "up_blocks.",
+                        "mid_block.",
+                    )
+                )
+                for k in keys
+            )
+            if has_plain_unet and not has_prefixed:
+                state_dict = {f"unet.{k}": v for k, v in state_dict.items()}
+        return super().load_state_dict(state_dict, strict=strict, assign=assign)
+DigitDiffusionModel.register_for_auto_class("AutoModel")