data-archetype committed on
Commit
ec37736
·
verified ·
1 Parent(s): 68589fe

Upload folder using huggingface_hub

Browse files
fcdm_diffae/__init__.py CHANGED
@@ -26,8 +26,8 @@ from .encoder import EncoderPosterior
26
  from .model import FCDMDiffAE
27
 
28
  __all__ = [
 
29
  "FCDMDiffAE",
30
  "FCDMDiffAEConfig",
31
  "FCDMDiffAEInferenceConfig",
32
- "EncoderPosterior",
33
  ]
 
26
  from .model import FCDMDiffAE
27
 
28
  __all__ = [
29
+ "EncoderPosterior",
30
  "FCDMDiffAE",
31
  "FCDMDiffAEConfig",
32
  "FCDMDiffAEInferenceConfig",
 
33
  ]
fcdm_diffae/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/fcdm_diffae/__pycache__/__init__.cpython-312.pyc and b/fcdm_diffae/__pycache__/__init__.cpython-312.pyc differ
 
fcdm_diffae/__pycache__/config.cpython-312.pyc CHANGED
Binary files a/fcdm_diffae/__pycache__/config.cpython-312.pyc and b/fcdm_diffae/__pycache__/config.cpython-312.pyc differ
 
fcdm_diffae/__pycache__/decoder.cpython-312.pyc CHANGED
Binary files a/fcdm_diffae/__pycache__/decoder.cpython-312.pyc and b/fcdm_diffae/__pycache__/decoder.cpython-312.pyc differ
 
fcdm_diffae/__pycache__/encoder.cpython-312.pyc CHANGED
Binary files a/fcdm_diffae/__pycache__/encoder.cpython-312.pyc and b/fcdm_diffae/__pycache__/encoder.cpython-312.pyc differ
 
fcdm_diffae/__pycache__/model.cpython-312.pyc CHANGED
Binary files a/fcdm_diffae/__pycache__/model.cpython-312.pyc and b/fcdm_diffae/__pycache__/model.cpython-312.pyc differ
 
fcdm_diffae/__pycache__/samplers.cpython-312.pyc CHANGED
Binary files a/fcdm_diffae/__pycache__/samplers.cpython-312.pyc and b/fcdm_diffae/__pycache__/samplers.cpython-312.pyc differ
 
fcdm_diffae/config.py CHANGED
@@ -26,12 +26,30 @@ class FCDMDiffAEConfig:
26
  bottleneck_posterior_kind: str = "diagonal_gaussian"
27
  # Post-bottleneck normalization: "channel_wise" or "disabled"
28
  bottleneck_norm_mode: str = "disabled"
 
 
 
 
29
  # VP diffusion schedule endpoints
30
  logsnr_min: float = -10.0
31
  logsnr_max: float = 10.0
32
  # Pixel-space noise std for VP diffusion initialization
33
  pixel_noise_std: float = 0.558
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def save(self, path: str | Path) -> None:
36
  """Save config as JSON."""
37
  p = Path(path)
 
26
  bottleneck_posterior_kind: str = "diagonal_gaussian"
27
  # Post-bottleneck normalization: "channel_wise" or "disabled"
28
  bottleneck_norm_mode: str = "disabled"
29
+ # Bottleneck patchification: "off" or "patch_2x2"
30
+ # When "patch_2x2", encoder latents are 2x2 patchified after the bottleneck
31
+ # (channels * 4, spatial / 2), and decode unpatchifies before the decoder.
32
+ bottleneck_patchify_mode: str = "off"
33
  # VP diffusion schedule endpoints
34
  logsnr_min: float = -10.0
35
  logsnr_max: float = 10.0
36
  # Pixel-space noise std for VP diffusion initialization
37
  pixel_noise_std: float = 0.558
38
 
39
@property
def latent_channels(self) -> int:
    """Channel width of the exported latent space."""
    # 2x2 patchification folds the four spatial positions into channels,
    # so the exported width is quadrupled in that mode.
    patchified = self.bottleneck_patchify_mode == "patch_2x2"
    return self.bottleneck_dim * 4 if patchified else self.bottleneck_dim
45
+
46
@property
def effective_patch_size(self) -> int:
    """Effective spatial stride from image to latent grid."""
    if self.bottleneck_patchify_mode != "patch_2x2":
        return self.patch_size
    # The 2x2 patchify halves the latent grid, doubling the stride.
    return 2 * self.patch_size
52
+
53
  def save(self, path: str | Path) -> None:
54
  """Save config as JSON."""
55
  p = Path(path)
fcdm_diffae/model.py CHANGED
@@ -71,14 +71,14 @@ class FCDMDiffAE(nn.Module):
71
  super().__init__()
72
  self.config = config
73
 
74
- # Latent running stats for whitening/dewhitening
75
  self.register_buffer(
76
  "latent_norm_running_mean",
77
- torch.zeros((config.bottleneck_dim,), dtype=torch.float32),
78
  )
79
  self.register_buffer(
80
  "latent_norm_running_var",
81
- torch.ones((config.bottleneck_dim,), dtype=torch.float32),
82
  )
83
 
84
  self.encoder = Encoder(
@@ -205,6 +205,20 @@ class FCDMDiffAE(nn.Module):
205
  mean, std = self._latent_norm_stats()
206
  return z * std.to(device=z.device) + mean.to(device=z.device)
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  def encode(self, images: Tensor) -> Tensor:
209
  """Encode images to whitened latents (posterior mode).
210
 
@@ -212,16 +226,19 @@ class FCDMDiffAE(nn.Module):
212
  use by downstream latent-space diffusion models.
213
 
214
  Args:
215
- images: [B, 3, H, W] in [-1, 1], H and W divisible by patch_size.
 
216
 
217
  Returns:
218
- Whitened latents [B, bottleneck_dim, H/patch, W/patch].
219
  """
220
  try:
221
  model_dtype = next(self.parameters()).dtype
222
  except StopIteration:
223
  model_dtype = torch.float32
224
  z = self.encoder(images.to(dtype=model_dtype))
 
 
225
  return self.whiten(z).to(dtype=model_dtype)
226
 
227
  def encode_posterior(self, images: Tensor) -> EncoderPosterior:
@@ -250,12 +267,13 @@ class FCDMDiffAE(nn.Module):
250
  ) -> Tensor:
251
  """Decode whitened latents to images via VP diffusion.
252
 
253
- Latents are dewhitened internally before being passed to the decoder.
 
254
 
255
  Args:
256
- latents: [B, bottleneck_dim, h, w] whitened encoder latents.
257
- height: Output image height (divisible by patch_size).
258
- width: Output image width (divisible by patch_size).
259
  inference_config: Optional inference parameters.
260
 
261
  Returns:
@@ -271,8 +289,11 @@ class FCDMDiffAE(nn.Module):
271
  except StopIteration:
272
  model_dtype = torch.float32
273
 
274
- # Dewhiten back to raw encoder scale for the decoder
275
- latents = self.dewhiten(latents).to(dtype=model_dtype)
 
 
 
276
 
277
  if height % config.patch_size != 0 or width % config.patch_size != 0:
278
  raise ValueError(
 
71
  super().__init__()
72
  self.config = config
73
 
74
+ # Latent running stats for whitening/dewhitening (at exported latent channels)
75
  self.register_buffer(
76
  "latent_norm_running_mean",
77
+ torch.zeros((config.latent_channels,), dtype=torch.float32),
78
  )
79
  self.register_buffer(
80
  "latent_norm_running_var",
81
+ torch.ones((config.latent_channels,), dtype=torch.float32),
82
  )
83
 
84
  self.encoder = Encoder(
 
205
  mean, std = self._latent_norm_stats()
206
  return z * std.to(device=z.device) + mean.to(device=z.device)
207
 
208
+ def _patchify(self, z: Tensor) -> Tensor:
209
+ """2x2 patchify: [B, C, H, W] -> [B, 4C, H/2, W/2]."""
210
+ b, c, h, w = z.shape
211
+ z = z.reshape(b, c, h // 2, 2, w // 2, 2)
212
+ z = z.permute(0, 1, 3, 5, 2, 4)
213
+ return z.reshape(b, c * 4, h // 2, w // 2)
214
+
215
+ def _unpatchify(self, z: Tensor) -> Tensor:
216
+ """2x2 unpatchify: [B, 4C, H/2, W/2] -> [B, C, H, W]."""
217
+ b, c, h, w = z.shape
218
+ z = z.reshape(b, c // 4, 2, 2, h, w)
219
+ z = z.permute(0, 1, 4, 2, 5, 3)
220
+ return z.reshape(b, c // 4, h * 2, w * 2)
221
+
222
  def encode(self, images: Tensor) -> Tensor:
223
  """Encode images to whitened latents (posterior mode).
224
 
 
226
  use by downstream latent-space diffusion models.
227
 
228
  Args:
229
+ images: [B, 3, H, W] in [-1, 1], H and W divisible by
230
+ effective_patch_size.
231
 
232
  Returns:
233
+ Whitened latents [B, latent_channels, H/effective_patch, W/effective_patch].
234
  """
235
  try:
236
  model_dtype = next(self.parameters()).dtype
237
  except StopIteration:
238
  model_dtype = torch.float32
239
  z = self.encoder(images.to(dtype=model_dtype))
240
+ if self.config.bottleneck_patchify_mode == "patch_2x2":
241
+ z = self._patchify(z)
242
  return self.whiten(z).to(dtype=model_dtype)
243
 
244
  def encode_posterior(self, images: Tensor) -> EncoderPosterior:
 
267
  ) -> Tensor:
268
  """Decode whitened latents to images via VP diffusion.
269
 
270
+ Latents are dewhitened and (if applicable) unpatchified internally
271
+ before being passed to the decoder.
272
 
273
  Args:
274
+ latents: [B, latent_channels, h, w] whitened encoder latents.
275
+ height: Output image height (divisible by effective_patch_size).
276
+ width: Output image width (divisible by effective_patch_size).
277
  inference_config: Optional inference parameters.
278
 
279
  Returns:
 
289
  except StopIteration:
290
  model_dtype = torch.float32
291
 
292
+ # Dewhiten and unpatchify back to raw encoder scale for the decoder
293
+ latents = self.dewhiten(latents)
294
+ if config.bottleneck_patchify_mode == "patch_2x2":
295
+ latents = self._unpatchify(latents)
296
+ latents = latents.to(dtype=model_dtype)
297
 
298
  if height % config.patch_size != 0 or width % config.patch_size != 0:
299
  raise ValueError(