asdf98
/

LiquidGen

Model card Files Files and versions

xet

Community

asdf98 committed 8 days ago

Commit

4f46baa

verified ·

1 Parent(s): fe0d9c3

Add model architecture

Browse files

Files changed (1) hide show

model.py +474 -0

model.py ADDED Viewed

	@@ -0,0 +1,474 @@

+"""
+LiquidGen: A Novel Liquid Neural Network Image Generation Model
+Architecture Overview:
+- Frozen VAE encoder/decoder (FLUX.1-schnell, 16ch latent, 8x compression)
+- Liquid backbone for denoising (fully parallelizable, no attention, no sequential ODE)
+- Flow matching training objective (velocity prediction)
+Key Innovation: Replaces attention with Liquid Neural Network dynamics:
+- CfC-inspired closed-form update: x_new = α·x + (1-α)·h(x)
+- Per-channel learnable decay rates (liquid time constants)
+- Depthwise + pointwise convolutions for spatial context (no attention needed)
+- Zigzag spatial scanning for global receptive field
+- Gated stimulus with biologically-inspired sign constraints
+- U-Net style long skip connections from shallow to deep blocks
+Math Foundation (from Hasani et al., CfC paper):
+  x_{t+1} = exp(-Δt/τ_t) · x_t + (1 - exp(-Δt/τ_t)) · h(x_t, u_t)
+Our parallelizable adaptation (inspired by LiquidTAD):
+  α = exp(-softplus(ρ))  [per-channel learnable decay]
+  h = gate · stimulus    [gated depthwise conv output]
+  out = α · x + (1 - α) · h  [liquid relaxation blend]
+This removes the input-dependent τ (which requires sequential computation)
+and replaces it with a per-channel learned decay — making it fully parallel
+while preserving the liquid dynamics' ability to blend old state with new input.
+Design for 16GB VRAM (Colab free tier):
+- VAE frozen: ~1GB
+- Backbone: ~55-280M params (~100-550MB in fp16)
+- Training overhead (grads + optimizer): ~3-8GB
+- Batch of latents: ~1-2GB
+- Total: fits comfortably in 16GB
+References:
+- Hasani et al., "Liquid Time-constant Networks" (NeurIPS 2020)
+- Hasani et al., "Closed-form Continuous-depth Models" (Nature Machine Intelligence 2022)
+- Lechner et al., "Neural Circuit Policies" (Nature Machine Intelligence 2020)
+- LiquidTAD (2025) - Parallelized liquid dynamics
+- ZigMa (ECCV 2024) - Zigzag scanning for SSM-based diffusion
+- DiMSUM (NeurIPS 2024) - Attention-free diffusion
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from typing import Optional, Tuple
+# =============================================================================
+# Building Blocks
+# =============================================================================
class LiquidTimeConstant(nn.Module):
    """
    Per-channel liquid relaxation blend (closed-form, no ODE solve).

    Computes ``out = alpha * x + (1 - alpha) * stimulus`` where
    ``alpha = exp(-(softplus(rho) + 1e-5))`` is a learnable decay factor,
    one per channel. Because ``alpha`` depends only on the parameter ``rho``
    (not on the input), every position is updated independently.

    Inputs are channel-first tensors ``[B, C, *spatial]`` of any rank >= 2,
    e.g. ``[B, C, H, W]`` feature maps or ``[B, C, L]`` sequences.

    Args:
        channels: Number of channels C; one decay parameter is learned per
            channel.
    """

    def __init__(self, channels: int):
        super().__init__()
        # rho = 0 -> softplus(0) = ln 2 -> alpha ~= 0.5 (equal blend at init).
        self.rho = nn.Parameter(torch.zeros(channels))

    def forward(self, x: torch.Tensor, stimulus: torch.Tensor) -> torch.Tensor:
        """
        Blend the current state with its target.

        Args:
            x: ``[B, C, *spatial]`` current state (residual path).
            stimulus: Tensor broadcastable to ``x`` - the relaxation target.

        Returns:
            Tensor with the broadcast shape of ``x`` and ``stimulus``.

        Raises:
            ValueError: If ``x`` has fewer than 2 dimensions, i.e. no
                channel axis at position 1.
        """
        if x.dim() < 2:
            raise ValueError(
                f"expected a channel-first tensor [B, C, ...], got shape {tuple(x.shape)}"
            )
        # The small epsilon keeps the decay rate strictly positive, so alpha < 1.
        lam = F.softplus(self.rho) + 1e-5
        # Shape alpha as [1, C, 1, ..., 1] to match the rank of x. A fixed
        # 4-D view would silently mis-broadcast against 3-D or 5-D inputs.
        alpha = torch.exp(-lam).view(1, -1, *([1] * (x.dim() - 2)))
        return alpha * x + (1.0 - alpha) * stimulus
class GatedDepthwiseStimulusConv(nn.Module):
    """
    Sigmoid-gated, depthwise-separable spatial stimulus.

    Two branches read the same input:
      * stimulus: depthwise conv -> 1x1 expand -> GELU -> 1x1 project back
      * gate:     depthwise conv -> 1x1 conv (with bias) -> sigmoid
    The module returns their element-wise product.

    Args:
        channels: Input/output channel count.
        kernel_size: Depthwise kernel size; padding is ``kernel_size // 2``.
        expand_ratio: Hidden width of the stimulus branch is
            ``int(channels * expand_ratio)``.
    """

    def __init__(self, channels: int, kernel_size: int = 7, expand_ratio: float = 2.0):
        super().__init__()
        pad = kernel_size // 2
        width = int(channels * expand_ratio)

        # Stimulus branch.
        self.stim_dw = nn.Conv2d(
            channels, channels, kernel_size, padding=pad, groups=channels, bias=False
        )
        self.stim_pw = nn.Conv2d(channels, width, 1, bias=False)
        self.stim_act = nn.GELU()
        self.stim_proj = nn.Conv2d(width, channels, 1, bias=False)

        # Gate branch.
        self.gate_dw = nn.Conv2d(
            channels, channels, kernel_size, padding=pad, groups=channels, bias=False
        )
        self.gate_pw = nn.Conv2d(channels, channels, 1, bias=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``stimulus(x) * sigmoid(gate(x))``."""
        local = self.stim_dw(x)
        expanded = self.stim_act(self.stim_pw(local))
        stimulus = self.stim_proj(expanded)

        gate_logits = self.gate_pw(self.gate_dw(x))
        return stimulus * torch.sigmoid(gate_logits)
class ChannelMixMLP(nn.Module):
    """
    Position-wise two-layer MLP implemented with 1x1 convolutions.

    Args:
        channels: Input/output channel count.
        expand_ratio: Hidden width is ``int(channels * expand_ratio)``.
    """

    def __init__(self, channels: int, expand_ratio: float = 4.0):
        super().__init__()
        width = int(channels * expand_ratio)
        self.fc1 = nn.Conv2d(channels, width, kernel_size=1, bias=True)
        self.act = nn.GELU()
        self.fc2 = nn.Conv2d(width, channels, kernel_size=1, bias=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Expand channels, apply GELU, project back to ``channels``."""
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        return self.fc2(hidden)
class AdaptiveGroupNorm(nn.Module):
    """
    GroupNorm whose scale and shift are predicted from a conditioning vector.

    ``out = GroupNorm(x) * (1 + scale) + shift`` where ``scale`` and ``shift``
    are the two halves of a linear projection of ``cond``. The projection is
    zero-initialised here, so a freshly constructed module is a plain
    (non-affine) GroupNorm.

    Args:
        channels: Channel count of ``x``.
        cond_dim: Size of the conditioning vector.
        num_groups: Number of GroupNorm groups.
    """

    def __init__(self, channels: int, cond_dim: int, num_groups: int = 32):
        super().__init__()
        self.norm = nn.GroupNorm(num_groups, channels, affine=False)
        self.proj = nn.Linear(cond_dim, channels * 2)
        # Start with scale = shift = 0 (identity modulation).
        for tensor in (self.proj.weight, self.proj.bias):
            nn.init.zeros_(tensor)

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` and modulate it with parameters derived from ``cond``."""
        scale, shift = self.proj(cond).chunk(2, dim=-1)
        # Append two singleton axes so the per-channel values broadcast over H, W.
        scale = scale.unsqueeze(-1).unsqueeze(-1)
        shift = shift.unsqueeze(-1).unsqueeze(-1)
        normed = self.norm(x)
        return normed * (1.0 + scale) + shift
class ZigzagScan1D(nn.Module):
    """
    1D mixing along a zigzag (boustrophedon) scan of the 2D grid.

    The ``[B, C, H, W]`` input is flattened row by row with every odd row
    reversed, so consecutive sequence positions are always spatial
    neighbours. A depthwise ``Conv1d`` followed by GELU and a pointwise
    ``Conv1d`` mixes along that sequence, and the result is scattered back
    to the original layout.

    Args:
        channels: Input/output channel count.
        kernel_size: Depthwise 1D kernel size; padding is ``kernel_size // 2``.
    """

    def __init__(self, channels: int, kernel_size: int = 31):
        super().__init__()
        self.conv1d = nn.Conv1d(channels, channels, kernel_size,
                                padding=kernel_size // 2, groups=channels, bias=False)
        self.pw = nn.Conv1d(channels, channels, 1, bias=True)
        self.act = nn.GELU()

    def _zigzag_indices(self, H: int, W: int, device: torch.device) -> torch.Tensor:
        """
        Return the length ``H * W`` gather index of the zigzag scan.

        Entry ``k`` is the row-major flat index of the pixel visited at scan
        step ``k``. Built with tensor ops instead of a Python loop over every
        pixel.
        """
        rows = torch.arange(H, device=device, dtype=torch.long).unsqueeze(1)
        cols = torch.arange(W, device=device, dtype=torch.long).unsqueeze(0)
        # Odd rows are traversed right-to-left.
        cols = torch.where(rows % 2 == 1, W - 1 - cols, cols)
        return (rows * W + cols).reshape(-1)

    def _inverse_zigzag_indices(self, H: int, W: int, device: torch.device) -> torch.Tensor:
        """
        Return the index that undoes ``_zigzag_indices``.

        Reversing the odd rows twice restores the original order, so the
        permutation is its own inverse and the same table is returned.
        """
        return self._zigzag_indices(H, W, device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: ``[B, C, H, W]`` feature map.

        Returns:
            ``[B, C, H, W]`` tensor mixed along the zigzag sequence.
        """
        B, C, H, W = x.shape
        # One table serves both directions (the permutation is an involution),
        # so it is computed once per call rather than twice.
        perm = self._zigzag_indices(H, W, x.device)
        seq = x.reshape(B, C, H * W)[:, :, perm]
        mixed = self.pw(self.act(self.conv1d(seq)))
        return mixed[:, :, perm].reshape(B, C, H, W)
+# =============================================================================
+# Liquid Block: The core building block
+# =============================================================================
class LiquidBlock(nn.Module):
    """
    One conditioned liquid block: spatial mixing, then channel mixing.

    Stage 1 (spatial):
        AdaGN(x, cond) -> gated depthwise stimulus, plus an optional zigzag
        1D scan scaled by ``sigmoid(zigzag_gate)`` -> dropout -> liquid blend
        with ``x``.
    Stage 2 (channel):
        AdaGN(x, cond) -> channel-mix MLP -> dropout -> second liquid blend
        with ``x``.

    Args:
        channels: Feature channel count.
        cond_dim: Size of the conditioning vector fed to both AdaGN layers.
        spatial_kernel: Kernel size of the depthwise 2D stimulus convs.
        scan_kernel: Kernel size of the zigzag 1D conv.
        expand_ratio: Hidden expansion of the stimulus branch.
        mlp_ratio: Hidden expansion of the channel-mix MLP.
        drop_rate: ``Dropout2d`` probability; ``<= 0`` disables dropout.
        use_zigzag: Whether to add the zigzag-scan branch.
    """

    def __init__(
        self,
        channels: int,
        cond_dim: int,
        spatial_kernel: int = 7,
        scan_kernel: int = 31,
        expand_ratio: float = 2.0,
        mlp_ratio: float = 4.0,
        drop_rate: float = 0.0,
        use_zigzag: bool = True,
    ):
        super().__init__()
        self.norm1 = AdaptiveGroupNorm(channels, cond_dim)
        self.norm2 = AdaptiveGroupNorm(channels, cond_dim)
        self.spatial_stim = GatedDepthwiseStimulusConv(channels, spatial_kernel, expand_ratio)

        self.use_zigzag = use_zigzag
        if self.use_zigzag:
            self.zigzag = ZigzagScan1D(channels, scan_kernel)
            # Logit of the mixing weight; 0 -> sigmoid = 0.5 at construction.
            self.zigzag_gate = nn.Parameter(torch.zeros(1))

        self.liquid = LiquidTimeConstant(channels)
        self.channel_mix = ChannelMixMLP(channels, mlp_ratio)
        self.liquid2 = LiquidTimeConstant(channels)

        if drop_rate > 0:
            self.drop = nn.Dropout2d(drop_rate)
        else:
            self.drop = nn.Identity()

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Feature map passed to the AdaGN / conv sub-modules.
            cond: Conditioning vector consumed by both AdaGN layers.

        Returns:
            Updated feature map after the spatial and channel liquid blends.
        """
        # --- Stage 1: spatial stimulus -> liquid blend ---
        normed = self.norm1(x, cond)
        target = self.spatial_stim(normed)
        if self.use_zigzag:
            scanned = self.zigzag(normed)
            target = target + torch.sigmoid(self.zigzag_gate) * scanned
        x = self.liquid(x, self.drop(target))

        # --- Stage 2: channel mixing -> liquid blend ---
        mixed = self.channel_mix(self.norm2(x, cond))
        return self.liquid2(x, self.drop(mixed))
+# =============================================================================
+# Timestep and Class Embeddings
+# =============================================================================
class TimestepEmbedding(nn.Module):
    """
    Sinusoidal timestep features followed by a two-layer MLP.

    The feature vector is ``[cos(t * f_0..f_{h-1}), sin(t * f_0..f_{h-1})]``
    with ``h = freq_dim // 2`` and ``f_k = exp(-ln(10000) * k / h)``. When
    ``freq_dim`` is odd, one zero column is appended so the feature width
    always equals ``freq_dim`` (the input width of the first Linear layer).

    Args:
        dim: Output embedding size.
        freq_dim: Width of the sinusoidal feature vector.
    """

    def __init__(self, dim: int, freq_dim: int = 256):
        super().__init__()
        self.freq_dim = freq_dim
        self.mlp = nn.Sequential(nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """
        Args:
            t: ``[B]`` timesteps (floating point or integer).

        Returns:
            ``[B, dim]`` embedding. The features handed to the MLP keep the
            dtype of ``t`` when it is floating point, otherwise the default
            float dtype.
        """
        out_dtype = t.dtype if t.is_floating_point() else torch.get_default_dtype()
        # Build the frequency table in at least float32: in fp16/bf16 the
        # high-index frequencies and the products t * f lose most of their
        # precision.
        compute_dtype = torch.promote_types(out_dtype, torch.float32)

        half = self.freq_dim // 2
        freqs = torch.exp(
            -math.log(10000.0)
            * torch.arange(half, device=t.device, dtype=compute_dtype)
            / half
        )
        args = t.to(compute_dtype).unsqueeze(-1) * freqs.unsqueeze(0)
        emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if self.freq_dim % 2 == 1:
            # cos/sin give 2 * half = freq_dim - 1 columns; pad to freq_dim.
            emb = F.pad(emb, (0, 1))
        return self.mlp(emb.to(out_dtype))
class ClassEmbedding(nn.Module):
    """
    Class-label embedding with a learned "null" vector for label dropout.

    In training mode with ``drop_prob > 0``, each sample's embedding is
    independently replaced by ``null_embed`` with probability ``drop_prob``.
    In eval mode, or with ``drop_prob <= 0``, labels are embedded unchanged.

    Args:
        num_classes: Number of classes in the embedding table.
        dim: Embedding size.
    """

    def __init__(self, num_classes: int, dim: int):
        super().__init__()
        self.embed = nn.Embedding(num_classes, dim)
        self.null_embed = nn.Parameter(torch.randn(dim) * 0.02)

    def forward(self, labels: torch.Tensor, drop_prob: float = 0.0) -> torch.Tensor:
        """
        Args:
            labels: ``[B]`` integer class ids.
            drop_prob: Per-sample probability of substituting the null
                embedding (only applied while ``self.training`` is True).

        Returns:
            ``[B, dim]`` embeddings.
        """
        embedded = self.embed(labels)
        if not (self.training and drop_prob > 0):
            return embedded

        # One Bernoulli draw per sample; [B, 1] broadcasts across the embedding dim.
        dropped = torch.rand(labels.shape[0], 1, device=labels.device) < drop_prob
        return torch.where(dropped, self.null_embed, embedded)
+# =============================================================================
+# LiquidGen: Full Model
+# =============================================================================
class LiquidGen(nn.Module):
    """
    Attention-free velocity-prediction backbone built from ``LiquidBlock``s.

    Pipeline (see ``forward``):
      1. Conditioning vector = timestep embedding, plus a class (or null)
         embedding when the model is class-conditional.
      2. Strided-conv patch embedding, a learned positional embedding
         (bilinearly resized when the patch grid is not 32x32), and a
         depthwise-separable input projection.
      3. ``depth`` LiquidBlocks with U-Net style long skips: the inputs of
         the first ``depth // 2`` blocks are stacked and added back,
         last-in first-out, to the inputs of the following blocks.
      4. GroupNorm(32) -> 3x3 conv + GELU -> transposed-conv un-patching back
         to ``in_channels``.

    Initialisation: Conv2d layers get Kaiming-normal weights, Linear layers
    Xavier-uniform weights, Embeddings N(0, 0.02). Afterwards every
    ``AdaptiveGroupNorm`` projection and the un-patch layer are reset to
    zero, so a fresh model applies identity modulation and outputs zeros.

    Args:
        in_channels: Latent channel count of the input/output.
        patch_size: Kernel size and stride of the patch embedding / un-patch
            layers.
        embed_dim: Backbone width; also used as the conditioning size.
            ``final_norm`` uses 32 groups, so this must be a multiple of 32.
        depth: Number of LiquidBlocks.
        spatial_kernel: Depthwise 2D kernel size inside each block.
        scan_kernel: Zigzag 1D kernel size inside each block.
        expand_ratio: Stimulus-branch expansion inside each block.
        mlp_ratio: Channel-mix MLP expansion inside each block.
        drop_rate: Dropout probability inside each block.
        num_classes: Number of classes; ``0`` builds an unconditional model.
        class_drop_prob: Label-dropout probability used while training.
        use_zigzag: Whether blocks include the zigzag-scan branch.
    """

    def __init__(
        self,
        in_channels: int = 16,
        patch_size: int = 2,
        embed_dim: int = 512,
        depth: int = 16,
        spatial_kernel: int = 7,
        scan_kernel: int = 31,
        expand_ratio: float = 2.0,
        mlp_ratio: float = 4.0,
        drop_rate: float = 0.0,
        num_classes: int = 0,
        class_drop_prob: float = 0.1,
        use_zigzag: bool = True,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_classes = num_classes
        self.class_drop_prob = class_drop_prob
        cond_dim = embed_dim
        self.time_embed = TimestepEmbedding(cond_dim)
        self.class_embed = ClassEmbedding(num_classes, cond_dim) if num_classes > 0 else None
        self.patch_embed = nn.Conv2d(in_channels, embed_dim, patch_size, stride=patch_size)
        # Native positional grid; other grids are handled by interpolation.
        self.pos_embed_size = 32
        self.pos_embed = nn.Parameter(
            torch.randn(1, embed_dim, self.pos_embed_size, self.pos_embed_size) * 0.02
        )
        self.input_proj = nn.Sequential(
            nn.Conv2d(embed_dim, embed_dim, 3, padding=1, groups=embed_dim, bias=False),
            nn.Conv2d(embed_dim, embed_dim, 1, bias=True),
            nn.GELU(),
        )
        self.blocks = nn.ModuleList([
            LiquidBlock(embed_dim, cond_dim, spatial_kernel, scan_kernel,
                        expand_ratio, mlp_ratio, drop_rate, use_zigzag)
            for _ in range(depth)
        ])
        self.final_norm = nn.GroupNorm(32, embed_dim)
        self.final_proj = nn.Sequential(
            nn.Conv2d(embed_dim, embed_dim, 3, padding=1, bias=True),
            nn.GELU(),
        )
        self.unpatch = nn.ConvTranspose2d(embed_dim, in_channels, patch_size, stride=patch_size)

        # Order matters: ``apply`` visits every submodule, including the
        # Linear inside each AdaptiveGroupNorm, and would overwrite its zero
        # weights with Xavier values. Run the generic init first, then
        # restore the intentional zero-inits.
        self.apply(self._init_weights)
        self._zero_init_modulation_and_output()

    def _init_weights(self, m):
        """Generic per-module initialiser used with ``nn.Module.apply``."""
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, std=0.02)

    def _zero_init_modulation_and_output(self) -> None:
        """Zero the AdaGN projections and the un-patch layer (post generic init)."""
        for module in self.modules():
            if isinstance(module, AdaptiveGroupNorm):
                nn.init.zeros_(module.proj.weight)
                nn.init.zeros_(module.proj.bias)
        nn.init.zeros_(self.unpatch.weight)
        nn.init.zeros_(self.unpatch.bias)

    def _interpolate_pos_embed(self, H: int, W: int) -> torch.Tensor:
        """Return the positional embedding resized to an ``H x W`` patch grid."""
        if H == self.pos_embed_size and W == self.pos_embed_size:
            return self.pos_embed
        return F.interpolate(self.pos_embed, size=(H, W), mode='bilinear', align_corners=False)

    def forward(
        self, x: torch.Tensor, t: torch.Tensor, class_labels: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Predict the velocity field for flow matching.

        Args:
            x: ``[B, C, H, W]`` noisy latent with ``C == in_channels``.
            t: ``[B]`` timesteps.
            class_labels: ``[B]`` class ids, or ``None`` for the
                unconditional path. On a class-conditional model ``None``
                selects the learned null embedding, the same vector that
                label dropout substitutes during training. This keeps the
                unconditional branch of classifier-free guidance consistent
                with what the model saw while training.

        Returns:
            ``[B, C, H, W]`` predicted velocity.
        """
        cond = self.time_embed(t)
        if self.class_embed is not None:
            if class_labels is None:
                cond = cond + self.class_embed.null_embed.unsqueeze(0)
            else:
                drop_p = self.class_drop_prob if self.training else 0.0
                cond = cond + self.class_embed(class_labels, drop_prob=drop_p)

        h = self.patch_embed(x)
        H_p, W_p = h.shape[-2:]
        h = h + self._interpolate_pos_embed(H_p, W_p)
        h = self.input_proj(h)

        # U-Net style long skips: push the inputs of the first half, pop them
        # (LIFO) onto the inputs of the second half. With an odd depth the
        # final block receives no skip.
        skip_connections = []
        mid = self.depth // 2
        for i, block in enumerate(self.blocks):
            if i < mid:
                skip_connections.append(h)
            elif skip_connections:
                h = h + skip_connections.pop()
            h = block(h, cond)

        h = self.final_norm(h)
        h = self.final_proj(h)
        return self.unpatch(h)

    def count_params(self) -> int:
        """Return the number of trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
+# =============================================================================
+# Model Presets
+# =============================================================================
def liquidgen_small(**kwargs) -> LiquidGen:
    """Build the small preset (advertised as ~55M params, 256px, fast training/testing).

    Any keyword argument overrides the preset value or is forwarded to
    ``LiquidGen`` unchanged.
    """
    preset = {
        "embed_dim": 512,
        "depth": 12,
        "spatial_kernel": 7,
        "scan_kernel": 31,
        "expand_ratio": 2.0,
        "mlp_ratio": 3.0,
        "use_zigzag": True,
    }
    return LiquidGen(**{**preset, **kwargs})
def liquidgen_base(**kwargs) -> LiquidGen:
    """Build the base preset (advertised as ~140M params, 256/512px, balanced).

    Any keyword argument overrides the preset value or is forwarded to
    ``LiquidGen`` unchanged.
    """
    preset = {
        "embed_dim": 640,
        "depth": 18,
        "spatial_kernel": 7,
        "scan_kernel": 31,
        "expand_ratio": 2.0,
        "mlp_ratio": 4.0,
        "use_zigzag": True,
    }
    return LiquidGen(**{**preset, **kwargs})
def liquidgen_large(**kwargs) -> LiquidGen:
    """Build the large preset (advertised as ~280M params, 512px, high quality).

    Any keyword argument overrides the preset value or is forwarded to
    ``LiquidGen`` unchanged.
    """
    preset = {
        "embed_dim": 768,
        "depth": 24,
        "spatial_kernel": 7,
        "scan_kernel": 31,
        "expand_ratio": 2.5,
        "mlp_ratio": 4.0,
        "use_zigzag": True,
    }
    return LiquidGen(**{**preset, **kwargs})
if __name__ == "__main__":
    # Smoke test: build each preset on CPU, report its trainable parameter
    # count, and check that the output shape matches the input shape for a
    # 32x32 latent (256px image) and a 64x64 latent (512px image).
    device = "cpu"
    presets = {
        "Small": liquidgen_small,
        "Base": liquidgen_base,
        "Large": liquidgen_large,
    }
    for name, factory in presets.items():
        model = factory(num_classes=27).to(device)
        millions = model.count_params() / 1e6
        print(f"LiquidGen-{name}: {millions:.1f}M params")

        x = torch.randn(2, 16, 32, 32, device=device)
        t = torch.rand(2, device=device)
        labels = torch.randint(0, 27, (2,), device=device)
        v = model(x, t, labels)
        assert v.shape == x.shape

        x512 = torch.randn(1, 16, 64, 64, device=device)
        v512 = model(x512, t[:1], labels[:1])
        assert v512.shape == x512.shape

        print("  256px ✅ 512px ✅")
        # Release the model before building the next, larger preset.
        del model
    print("\n✅ All tests passed!")