JTriggerFish committed on
Commit
e12172b
·
1 Parent(s): 0f964b6

Fix encode_posterior patchify, input/output size validation for p32 mode

Browse files
Files changed (1) hide show
  1. fcdm_diffae/model.py +28 -9
fcdm_diffae/model.py CHANGED
@@ -232,6 +232,13 @@ class FCDMDiffAE(nn.Module):
232
  Returns:
233
  Whitened latents [B, latent_channels, H/effective_patch, W/effective_patch].
234
  """
 
 
 
 
 
 
 
235
  try:
236
  model_dtype = next(self.parameters()).dtype
237
  except StopIteration:
@@ -244,17 +251,28 @@ class FCDMDiffAE(nn.Module):
244
  def encode_posterior(self, images: Tensor) -> EncoderPosterior:
245
  """Encode images and return the full posterior (mean + logsnr).
246
 
 
 
 
247
  Args:
248
- images: [B, 3, H, W] in [-1, 1], H and W divisible by patch_size.
 
249
 
250
  Returns:
251
- EncoderPosterior with mean and logsnr tensors.
 
252
  """
253
  try:
254
  model_dtype = next(self.parameters()).dtype
255
  except StopIteration:
256
  model_dtype = torch.float32
257
- return self.encoder.encode_posterior(images.to(dtype=model_dtype))
 
 
 
 
 
 
258
 
259
  @torch.no_grad()
260
  def decode(
@@ -289,18 +307,19 @@ class FCDMDiffAE(nn.Module):
289
  except StopIteration:
290
  model_dtype = torch.float32
291
 
 
 
 
 
 
 
 
292
  # Dewhiten and unpatchify back to raw encoder scale for the decoder
293
  latents = self.dewhiten(latents)
294
  if config.bottleneck_patchify_mode == "patch_2x2":
295
  latents = self._unpatchify(latents)
296
  latents = latents.to(dtype=model_dtype)
297
 
298
- if height % config.patch_size != 0 or width % config.patch_size != 0:
299
- raise ValueError(
300
- f"height={height} and width={width} must be divisible by "
301
- f"patch_size={config.patch_size}"
302
- )
303
-
304
  shape = (batch, config.in_channels, height, width)
305
  noise = sample_noise(
306
  shape,
 
232
  Returns:
233
  Whitened latents [B, latent_channels, H/effective_patch, W/effective_patch].
234
  """
235
+ eff_patch = self.config.effective_patch_size
236
+ h, w = int(images.shape[2]), int(images.shape[3])
237
+ if h % eff_patch != 0 or w % eff_patch != 0:
238
+ raise ValueError(
239
+ f"Image height={h} and width={w} must be divisible by "
240
+ f"effective_patch_size={eff_patch}"
241
+ )
242
  try:
243
  model_dtype = next(self.parameters()).dtype
244
  except StopIteration:
 
251
  def encode_posterior(self, images: Tensor) -> EncoderPosterior:
252
  """Encode images and return the full posterior (mean + logsnr).
253
 
254
+ In patch-32 mode, the posterior is returned in the patchified space
255
+ (512ch at H/32), consistent with encode() and whiten().
256
+
257
  Args:
258
+ images: [B, 3, H, W] in [-1, 1], H and W divisible by
259
+ effective_patch_size.
260
 
261
  Returns:
262
+ EncoderPosterior with mean and logsnr tensors in the exported
263
+ latent space.
264
  """
265
  try:
266
  model_dtype = next(self.parameters()).dtype
267
  except StopIteration:
268
  model_dtype = torch.float32
269
+ posterior = self.encoder.encode_posterior(images.to(dtype=model_dtype))
270
+ if self.config.bottleneck_patchify_mode == "patch_2x2":
271
+ return EncoderPosterior(
272
+ mean=self._patchify(posterior.mean),
273
+ logsnr=self._patchify(posterior.logsnr),
274
+ )
275
+ return posterior
276
 
277
  @torch.no_grad()
278
  def decode(
 
307
  except StopIteration:
308
  model_dtype = torch.float32
309
 
310
+ eff_patch = config.effective_patch_size
311
+ if height % eff_patch != 0 or width % eff_patch != 0:
312
+ raise ValueError(
313
+ f"height={height} and width={width} must be divisible by "
314
+ f"effective_patch_size={eff_patch}"
315
+ )
316
+
317
  # Dewhiten and unpatchify back to raw encoder scale for the decoder
318
  latents = self.dewhiten(latents)
319
  if config.bottleneck_patchify_mode == "patch_2x2":
320
  latents = self._unpatchify(latents)
321
  latents = latents.to(dtype=model_dtype)
322
 
 
 
 
 
 
 
323
  shape = (batch, config.in_channels, height, width)
324
  noise = sample_noise(
325
  shape,