Upload folder using huggingface_hub
capacitor_diffae/model.py  CHANGED  (+14 -5)
@@ -206,19 +206,23 @@ class CapacitorDiffAE(nn.Module):
         return z * std.to(device=z.device) + mean.to(device=z.device)
 
     def encode(self, images: Tensor) -> Tensor:
-        """Encode images to latents (posterior mode).
+        """Encode images to whitened latents (posterior mode).
+
+        Returns latents whitened using per-channel running stats, ready for
+        use by downstream latent-space diffusion models.
 
         Args:
             images: [B, 3, H, W] in [-1, 1], H and W divisible by patch_size.
 
         Returns:
-            Latents [B, bottleneck_dim, H/patch, W/patch].
+            Whitened latents [B, bottleneck_dim, H/patch, W/patch].
         """
         try:
             model_dtype = next(self.parameters()).dtype
         except StopIteration:
             model_dtype = torch.float32
-        return self.encoder(images.to(dtype=model_dtype))
+        z = self.encoder(images.to(dtype=model_dtype))
+        return self.whiten(z).to(dtype=model_dtype)
 
     def encode_posterior(self, images: Tensor) -> EncoderPosterior:
         """Encode images and return the full posterior (mean + logsnr).
@@ -244,10 +248,12 @@ class CapacitorDiffAE(nn.Module):
         *,
         inference_config: CapacitorDiffAEInferenceConfig | None = None,
     ) -> Tensor:
-        """Decode latents to images via VP diffusion.
+        """Decode whitened latents to images via VP diffusion.
+
+        Latents are dewhitened internally before being passed to the decoder.
 
         Args:
-            latents: [B, bottleneck_dim, h, w] encoder latents.
+            latents: [B, bottleneck_dim, h, w] whitened encoder latents.
             height: Output image height (divisible by patch_size).
             width: Output image width (divisible by patch_size).
             inference_config: Optional inference parameters.
@@ -265,6 +271,9 @@ class CapacitorDiffAE(nn.Module):
         except StopIteration:
             model_dtype = torch.float32
 
+        # Dewhiten back to raw encoder scale for the decoder
+        latents = self.dewhiten(latents).to(dtype=model_dtype)
+
         if height % config.patch_size != 0 or width % config.patch_size != 0:
             raise ValueError(
                 f"height={height} and width={width} must be divisible by "
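
For context on the change itself: the `return z * std.to(device=z.device) + mean.to(device=z.device)` line visible at the top of the first hunk, together with the new docstring's mention of per-channel running stats, suggests that whiten/dewhiten are a pair of per-channel affine maps. A minimal sketch of that shape, assuming buffer names `running_mean` and `running_std` that do not appear in the diff:

import torch
from torch import Tensor, nn

class PerChannelWhitener(nn.Module):
    """Sketch of a whiten/dewhiten pair over [B, C, h, w] latents."""

    def __init__(self, bottleneck_dim: int) -> None:
        super().__init__()
        # Running stats with singleton dims so they broadcast over B, h, w.
        self.register_buffer("running_mean", torch.zeros(1, bottleneck_dim, 1, 1))
        self.register_buffer("running_std", torch.ones(1, bottleneck_dim, 1, 1))

    def whiten(self, z: Tensor) -> Tensor:
        # Raw encoder latents -> roughly zero-mean, unit-variance per channel.
        mean = self.running_mean.to(device=z.device)
        std = self.running_std.to(device=z.device)
        return (z - mean) / std

    def dewhiten(self, z: Tensor) -> Tensor:
        # Inverse map; matches the `z * std + mean` context line in the diff.
        mean = self.running_mean.to(device=z.device)
        std = self.running_std.to(device=z.device)
        return z * std + mean

Registering the stats as buffers keeps them in the state_dict and moves them with the module, which fits the explicit `.to(device=z.device)` casts seen in the diff.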
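
From the caller's side, the net effect of this commit is that whitening becomes invisible: encode hands back whitened latents and decode dewhitens them before running VP diffusion. A hypothetical round-trip, where the constructor, the config object, and a patch_size that divides 256 are all assumptions not shown in the diff:

import torch

model = CapacitorDiffAE(config)  # hypothetical construction; actual API not shown here
model.eval()

# Images in [-1, 1] with H and W divisible by patch_size, per the docstring.
images = torch.rand(2, 3, 256, 256) * 2 - 1

with torch.no_grad():
    latents = model.encode(images)  # whitened [B, bottleneck_dim, H/patch, W/patch]
    recon = model.decode(latents, height=256, width=256)  # dewhitened internally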