asdf98
/

microforge

Model card Files Files and versions

xet

Community

asdf98 committed 9 days ago

Commit

6e28f73

verified ·

1 Parent(s): b0bf58d

Add microforge/vae.py

Browse files

Files changed (1) hide show

microforge/vae.py +337 -0

microforge/vae.py ADDED Viewed

	@@ -0,0 +1,337 @@

+"""
+MicroForge VAE: Deep Compression Autoencoder
+=============================================
+Inspired by DC-AE (arxiv:2410.10733) and TinyVAE (DreamLite).
+Key innovations for mobile:
+- 32x spatial compression (512px -> 16x16 latent grid)
+- Residual autoencoding with space-to-channel shortcuts
+- Lightweight decoder (<3M params) for mobile deployment
+- KL-regularized continuous latent space
+Architecture:
+  Encoder: [3,H,W] -> conv_in -> DownBlock x4 (stride 2 each, 16x) -> extra stride-2 downsample -> [C_latent, H/32, W/32]
+  Each DownBlock: ResBlock + optional Attention (only at lowest res) + Downsample
+  Residual shortcut: space_to_channel rearrange on skip connections
+  Decoder: Mirror of encoder with PixelShuffle upsampling
+For 512px input:
+  Latent = [32, 16, 16] = 8192 values (vs SD-VAE's 16384)
+  Spatial tokens for backbone = 256 (16x16) = 16x fewer than SD-VAE's 4096
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple
class ResBlock(nn.Module):
    """Residual block built from two (GroupNorm -> SiLU -> 3x3 conv) stages.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        groups: Group count handed to both ``nn.GroupNorm`` layers.

    The skip path is a 1x1 convolution when ``in_ch != out_ch`` and an
    identity otherwise. Both 3x3 convolutions use ``padding=1``, so the
    spatial size of the input is preserved.
    """

    def __init__(self, in_ch: int, out_ch: int, groups: int = 8):
        super().__init__()
        # Stage 1 changes the channel count (in_ch -> out_ch).
        self.norm1 = nn.GroupNorm(groups, in_ch)
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1)
        # Stage 2 keeps the channel count at out_ch.
        self.norm2 = nn.GroupNorm(groups, out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
        # Only pay for a projection when the channel counts differ.
        if in_ch == out_ch:
            self.skip = nn.Identity()
        else:
            self.skip = nn.Conv2d(in_ch, out_ch, 1)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply both stages to ``x`` and add the (possibly projected) input."""
        # The in-place SiLU only overwrites the fresh norm outputs, never ``x``.
        stage1 = self.conv1(self.act(self.norm1(x)))
        stage2 = self.conv2(self.act(self.norm2(stage1)))
        return stage2 + self.skip(x)
class ExpandedSeparableConv(nn.Module):
    """
    Expanded separable convolution with a residual connection.

    Pipeline: GroupNorm -> 3x3 depthwise conv -> 1x1 expand -> SiLU ->
    1x1 project, then the block input is added back. Channel count and
    spatial size are unchanged.

    The original author describes this as UIB-style (from SnapGen) and
    quotes a ~24% parameter saving versus a standard conv; neither claim
    is verified by this code.

    Args:
        channels: Number of input and output channels.
        expansion: Multiplier for the hidden width between the two 1x1 convs.
        groups: Group count for the leading ``nn.GroupNorm``. Defaults to 8,
            the value that used to be hard-coded; ``channels`` must be
            divisible by it.
    """

    def __init__(self, channels: int, expansion: int = 2, groups: int = 8):
        super().__init__()
        expanded = channels * expansion
        # groups=channels makes this a depthwise convolution.
        self.dw = nn.Conv2d(channels, channels, 3, padding=1, groups=channels)
        self.pw_expand = nn.Conv2d(channels, expanded, 1)
        self.act = nn.SiLU(inplace=True)
        self.pw_project = nn.Conv2d(expanded, channels, 1)
        self.norm = nn.GroupNorm(groups, channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the normalized, expanded and re-projected features."""
        h = self.norm(x)
        h = self.dw(h)
        h = self.pw_expand(h)
        h = self.act(h)
        h = self.pw_project(h)
        return h + x
class SpaceToChannel(nn.Module):
    """
    Non-parametric space-to-channel rearrangement (DC-AE style shortcut).

    Folds each ``factor x factor`` spatial patch into the channel axis:
    ``[..., C, H, W] -> [..., C * factor^2, H / factor, W / factor]``.

    Output channel ``c * factor^2 + i * factor + j`` holds the pixel at row
    offset ``i`` and column offset ``j`` of each patch, which is exactly the
    layout of ``torch.nn.functional.pixel_unshuffle``.

    Args:
        factor: Spatial reduction factor; ``H`` and ``W`` must be divisible
            by it.

    Raises:
        RuntimeError: From ``pixel_unshuffle`` when ``H`` or ``W`` is not
            divisible by ``factor``.
    """

    def __init__(self, factor: int = 2):
        super().__init__()
        self.factor = factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Rearrange spatial patches of ``x`` into channels."""
        # Built-in op: same element order as the manual reshape/permute,
        # with explicit divisibility checks.
        return F.pixel_unshuffle(x, self.factor)
class ChannelToSpace(nn.Module):
    """
    Inverse of SpaceToChannel: unfold channels back into spatial patches.

    ``[..., C, H, W] -> [..., C / factor^2, H * factor, W * factor]``.

    Input channel ``c * factor^2 + i * factor + j`` is written to row offset
    ``i`` and column offset ``j`` of each output patch, which is exactly the
    layout of ``torch.nn.functional.pixel_shuffle``.

    Args:
        factor: Spatial expansion factor; ``C`` must be divisible by
            ``factor ** 2``.

    Raises:
        RuntimeError: From ``pixel_shuffle`` when ``C`` is not divisible by
            ``factor ** 2``.
    """

    def __init__(self, factor: int = 2):
        super().__init__()
        self.factor = factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Rearrange channel groups of ``x`` into spatial patches."""
        # Built-in op: same element order as the manual reshape/permute,
        # with an explicit channel-divisibility check.
        return F.pixel_shuffle(x, self.factor)
class EncoderBlock(nn.Module):
    """Encoder stage: ResBlocks -> separable conv -> optional attention -> 2x downsample.

    A parallel shortcut folds 2x2 spatial patches of the block input into
    channels (``SpaceToChannel``), projects them to ``out_ch`` with a 1x1
    conv, and is added to the downsampled main path.

    Args:
        in_ch: Channels of the incoming feature map.
        out_ch: Channels of the outgoing feature map.
        num_res: Number of ResBlocks; at least one is always created.
        use_attn: If True, apply self-attention over all spatial positions
            before the downsample (i.e. at the block's input resolution).

    Input height and width must be even so that the shortcut and the
    stride-2 convolution produce the same spatial size.
    """

    def __init__(self, in_ch: int, out_ch: int, num_res: int = 2, use_attn: bool = False):
        super().__init__()
        self.res_blocks = nn.ModuleList()
        # The first ResBlock changes the channel count; the rest keep it.
        self.res_blocks.append(ResBlock(in_ch, out_ch))
        for _ in range(num_res - 1):
            self.res_blocks.append(ResBlock(out_ch, out_ch))
        self.sep_conv = ExpandedSeparableConv(out_ch)
        # Attention layers are only created when requested by the caller.
        self.use_attn = use_attn
        if use_attn:
            self.attn_norm = nn.GroupNorm(8, out_ch)
            self.attn = nn.MultiheadAttention(out_ch, num_heads=4, batch_first=True)
        self.downsample = nn.Conv2d(out_ch, out_ch, 3, stride=2, padding=1)
        # Residual shortcut: 2x2 patches -> 4*in_ch channels -> 1x1 projection.
        self.space_to_channel = SpaceToChannel(factor=2)
        self.shortcut_proj = nn.Conv2d(in_ch * 4, out_ch, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the downsampled features, shape ``[B, out_ch, H/2, W/2]``."""
        # Build the shortcut from the untouched block input.
        shortcut = self.space_to_channel(x)
        shortcut = self.shortcut_proj(shortcut)
        for res in self.res_blocks:
            x = res(x)
        x = self.sep_conv(x)
        if self.use_attn:
            B, C, H, W = x.shape
            # Flatten the spatial grid into a token sequence: [B, H*W, C].
            h = self.attn_norm(x).reshape(B, C, -1).permute(0, 2, 1)
            h, _ = self.attn(h, h, h)
            x = x + h.permute(0, 2, 1).reshape(B, C, H, W)
        x = self.downsample(x)
        x = x + shortcut  # Residual autoencoding
        return x
class DecoderBlock(nn.Module):
    """Decoder stage: 2x upsample -> ResBlocks -> separable conv -> optional attention.

    A parallel shortcut unfolds channels of the block input into 2x2
    spatial patches (``ChannelToSpace``), projects them to ``out_ch`` when
    needed, and is added to the main path at the end.

    Args:
        in_ch: Channels of the incoming feature map.
        out_ch: Channels of the outgoing feature map.
        num_res: Number of ResBlocks; at least one is always created.
        use_attn: If True, apply self-attention over all spatial positions
            after the upsample.
    """

    def __init__(self, in_ch: int, out_ch: int, num_res: int = 2, use_attn: bool = False):
        super().__init__()
        # Main path upsample: widen to 4*in_ch, then PixelShuffle back to in_ch at 2x size.
        widen = nn.Conv2d(in_ch, in_ch * 4, 3, padding=1)
        self.upsample = nn.Sequential(widen, nn.PixelShuffle(2))
        # Shortcut path: channel-to-space leaves in_ch // 4 channels at 2x size.
        self.channel_to_space = ChannelToSpace(factor=2)
        shortcut_ch = in_ch // 4
        if shortcut_ch != out_ch:
            self.shortcut_proj = nn.Conv2d(shortcut_ch, out_ch, 1)
        else:
            self.shortcut_proj = nn.Identity()
        # The first ResBlock changes the channel count; the rest keep it.
        self.res_blocks = nn.ModuleList(
            [ResBlock(in_ch, out_ch)]
            + [ResBlock(out_ch, out_ch) for _ in range(num_res - 1)]
        )
        self.sep_conv = ExpandedSeparableConv(out_ch)
        self.use_attn = use_attn
        if use_attn:
            self.attn_norm = nn.GroupNorm(8, out_ch)
            self.attn = nn.MultiheadAttention(out_ch, num_heads=4, batch_first=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the upsampled features, shape ``[B, out_ch, 2H, 2W]``."""
        # The shortcut is taken from the block input before any processing.
        skip = self.shortcut_proj(self.channel_to_space(x))
        out = self.upsample(x)
        for block in self.res_blocks:
            out = block(out)
        out = self.sep_conv(out)
        if self.use_attn:
            batch, chans, height, width = out.shape
            # Flatten the spatial grid into a token sequence: [B, H*W, C].
            tokens = self.attn_norm(out).reshape(batch, chans, -1).permute(0, 2, 1)
            attended, _ = self.attn(tokens, tokens, tokens)
            out = out + attended.permute(0, 2, 1).reshape(batch, chans, height, width)
        return out + skip
class MicroForgeVAE(nn.Module):
    """
    MicroForge VAE: deep-compression autoencoder with a KL-regularized latent.

    The encoder applies ``conv_in``, one ``EncoderBlock`` per entry of the
    config's ``enc_channels`` (each halves the resolution), and one extra
    stride-2 stage. With the shipped four-entry configs this is 32x spatial
    compression, so a 512px input with the default ``'small'`` config gives
    a ``[32, 16, 16]`` latent. The decoder mirrors this with PixelShuffle
    upsampling.

    Author-stated sizes (not verified by this code):
    - Tiny (for mobile decode): ~2.5M params decoder
    - Small (for training): ~12M params total
    - Base (full quality): ~25M params total

    Args:
        in_channels: Channels of the input (and reconstructed) image.
        config: Key into ``CONFIGS``; an unknown key raises ``KeyError``.
        latent_channels: Overrides the config's latent width when truthy.
    """

    CONFIGS = {
        'tiny': {
            'enc_channels': [32, 64, 128, 256],
            'latent_channels': 16,
            'num_res_blocks': 1,
        },
        'small': {
            'enc_channels': [64, 128, 256, 512],
            'latent_channels': 32,
            'num_res_blocks': 2,
        },
        'base': {
            'enc_channels': [128, 256, 512, 512],
            'latent_channels': 32,
            'num_res_blocks': 2,
        }
    }

    def __init__(
        self,
        in_channels: int = 3,
        config: str = 'small',
        latent_channels: Optional[int] = None,
    ):
        super().__init__()
        cfg = self.CONFIGS[config]
        channels = cfg['enc_channels']
        self.latent_channels = latent_channels or cfg['latent_channels']
        num_res = cfg['num_res_blocks']
        # Encoder: len(channels) blocks of 2x downsample (16x for the shipped
        # configs); the extra stage below adds the final 2x.
        self.conv_in = nn.Conv2d(in_channels, channels[0], 3, padding=1)
        self.encoder_blocks = nn.ModuleList()
        in_ch = channels[0]
        for i, out_ch in enumerate(channels):
            use_attn = (i == len(channels) - 1)  # Attention only in the last encoder block
            self.encoder_blocks.append(EncoderBlock(in_ch, out_ch, num_res, use_attn))
            in_ch = out_ch
        # Extra downsample to reach 32x (4 blocks = 16x, need one more 2x)
        self.extra_down = nn.Sequential(
            ResBlock(channels[-1], channels[-1]),
            nn.Conv2d(channels[-1], channels[-1], 3, stride=2, padding=1),
        )
        # To latent: mu and log_var
        self.to_mu = nn.Conv2d(channels[-1], self.latent_channels, 1)
        self.to_logvar = nn.Conv2d(channels[-1], self.latent_channels, 1)
        # From latent
        self.from_latent = nn.Conv2d(self.latent_channels, channels[-1], 1)
        # Extra upsample (mirror of extra_down)
        self.extra_up = nn.Sequential(
            ResBlock(channels[-1], channels[-1]),
            nn.Conv2d(channels[-1], channels[-1] * 4, 3, padding=1),
            nn.PixelShuffle(2),
        )
        # Decoder: mirror of encoder
        self.decoder_blocks = nn.ModuleList()
        dec_channels = list(reversed(channels))
        in_ch = dec_channels[0]
        for i, out_ch in enumerate(dec_channels):
            use_attn = (i == 0)  # Attention only in the first decoder block
            self.decoder_blocks.append(DecoderBlock(in_ch, out_ch, num_res, use_attn))
            in_ch = out_ch
        self.conv_out = nn.Sequential(
            nn.GroupNorm(8, dec_channels[-1]),
            nn.SiLU(),
            nn.Conv2d(dec_channels[-1], in_channels, 3, padding=1),
        )
        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal init (fan_out, relu gain) for every Conv2d; biases zeroed.

        Only ``nn.Conv2d`` modules are touched; all other layers keep their
        PyTorch default initialization.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def encode(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode an image batch to latent distribution parameters.

        Args:
            x: Image tensor whose last two dims are height and width.

        Returns:
            ``(mu, logvar)``, with ``logvar`` clamped to ``[-30, 20]``.

        Raises:
            ValueError: If height or width is not a multiple of the total
                downsampling factor (32 for the shipped configs).
        """
        # One 2x stage per encoder block plus the extra stride-2 stage.
        factor = 2 ** (len(self.encoder_blocks) + 1)
        height, width = x.shape[-2:]
        if height % factor or width % factor:
            # Without this check, sizes divisible by factor/2 but not factor
            # encode with a ceil-rounded latent and decode to a different
            # size than the input; other sizes fail deep inside a reshape.
            raise ValueError(
                f"Input height and width must be multiples of {factor}, "
                f"got {height}x{width}"
            )
        h = self.conv_in(x)
        for block in self.encoder_blocks:
            h = block(h)
        h = self.extra_down(h)
        mu = self.to_mu(h)
        logvar = self.to_logvar(h).clamp(-30.0, 20.0)  # Clamp for numerical stability
        return mu, logvar

    def reparameterize(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
        """Sample ``mu + eps * std`` in training mode; return ``mu`` in eval mode."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        return mu

    def decode(self, z: torch.Tensor) -> torch.Tensor:
        """Decode a latent tensor back to image space."""
        h = self.from_latent(z)
        h = self.extra_up(h)
        for block in self.decoder_blocks:
            h = block(h)
        return self.conv_out(h)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Full pass: encode -> reparameterize -> decode.

        Returns:
            ``(x_recon, mu, logvar)``.
        """
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        x_recon = self.decode(z)
        return x_recon, mu, logvar

    def get_latent(self, x: torch.Tensor) -> torch.Tensor:
        """Return the deterministic latent (``mu`` only) for ``x``."""
        mu, _ = self.encode(x)
        return mu

    @staticmethod
    def kl_loss(mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
        """KL divergence to a standard normal, averaged over every element."""
        return -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())

    @staticmethod
    def recon_loss(x_recon: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        """Reconstruction loss: plain L1 (no perceptual term is implemented here)."""
        return F.l1_loss(x_recon, x)