Upload liqmamba/mamba2_ssd.py
Browse files

liqmamba/mamba2_ssd.py
CHANGED  (+235 −200)
"""
Mamba-2 SSD Block with Liquid (CfC) Gating

Implements Structured State Space Duality (SSD) from Mamba-2 with CfC gating.
O(N) complexity, fully parallelizable scan (no sequential loops at train time).
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

from .cfc import CfCGate

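
# The classes below implement the discretized selective SSM
#
#     h_i = exp(delta_i * A) * h_{i-1} + delta_i * B_i * u_i
#     y_i = C_i . h_i + D * u_i
#
# with input-dependent delta, B and C (the "selective" part) and a diagonal
# A stored in log-space. The recurrence over i is evaluated with a parallel
# prefix scan rather than a sequential loop.
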
class Mamba2SSDBlock(nn.Module):
    """
    Mamba-2 SSD block with CfC liquid gating.

    Args:
        dim: Hidden dimension
        d_state: SSM state dimension (default 16)
        d_conv: Convolution kernel size
        expand: Expansion factor
        n_groups: Number of groups (like GQA heads)
        use_cfc_modulation: Use CfC gate instead of SiLU
        dropout: Dropout probability applied to the output projection
    """

    def __init__(self, dim, d_state=16, d_conv=4, expand=2, n_groups=1,
                 use_cfc_modulation=True, dropout=0.0):
        super().__init__()
        self.dim = dim
        self.d_state = d_state
        self.inner_dim = dim * expand
        self.n_groups = n_groups
        self.d_inner_group = self.inner_dim // n_groups
        self.use_cfc_modulation = use_cfc_modulation

        # Pre-norm (RMSNorm)
        self.norm = nn.RMSNorm(dim)

        # Input projection (x -> x, z gate branch)
        self.in_proj = nn.Linear(dim, self.inner_dim * 2, bias=False)

        # Depthwise 1D conv for local mixing
        self.conv1d = nn.Conv1d(self.inner_dim, self.inner_dim, d_conv,
                                groups=self.inner_dim, padding=d_conv - 1)

        # A parameter (log-space for stability)
        self.A_log = nn.Parameter(torch.randn(n_groups, d_state) * 0.01)

        # D skip parameter (one scalar per group)
        self.D = nn.Parameter(torch.ones(n_groups))

        # dt / B / C projections (input-dependent)
        dt_rank = max(1, dim // 16)
        self.dt_proj = nn.Sequential(
            nn.Linear(dim, dt_rank, bias=False),
            nn.Linear(dt_rank, self.inner_dim, bias=True),
        )
        self.B_proj = nn.Linear(dim, n_groups * d_state, bias=False)
        self.C_proj = nn.Linear(dim, n_groups * d_state, bias=False)

        # CfC gate or SiLU
        self.cfc_gate = CfCGate(self.inner_dim) if use_cfc_modulation else None

        # Output
        self.out_proj = nn.Linear(self.inner_dim, dim, bias=False)
        self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()

    def _apply_conv1d(self, x):
        """Causal depthwise 1D conv over the sequence dimension."""
        B, L, D = x.shape
        x = x.transpose(1, 2)  # (B, D, L)
        x = self.conv1d(x)     # padded output has length L + d_conv - 1
        x = x[..., :L]         # causal: keep the first L positions only
        x = F.silu(x)
        return x.transpose(1, 2)  # (B, L, D)

    def _selective_scan(self, u, delta, A, B, C, D):
        """
        Selective scan using SSD (structured state space duality).

        The key insight from Mamba-2: SSD is the matrix form of the SSM,
        computable via an associative scan in O(N) parallel time.

        Uses PyTorch native ops (no custom CUDA needed).
        """
        B_sz, L, ID = u.shape
        G = self.n_groups
        N = self.d_state
        DG = ID // G

        # Reshape for grouped processing
        u = u.view(B_sz, L, G, DG)          # (B, L, G, DG)
        delta = delta.view(B_sz, L, G, DG)  # (B, L, G, DG)
        B = B.view(B_sz, L, G, N)           # (B, L, G, N)
        C = C.view(B_sz, L, G, N)           # (B, L, G, N)

        # Discretization: A_bar = exp(delta * A), element-wise.
        # A is reshaped to (1, 1, G, 1, N) so it broadcasts over DG for any
        # number of groups, and negated for stability.
        A_neg = -torch.exp(A).view(1, 1, G, 1, N)
        deltaA = torch.exp(delta.unsqueeze(-1) * A_neg)  # (B, L, G, DG, N)
        deltaB_u = delta.unsqueeze(-1) * B.unsqueeze(-2) * u.unsqueeze(-1)
        # deltaB_u: (B, L, G, DG, N)

        # Parallel associative scan:
        # h_i = deltaA_i * h_{i-1} + deltaB_u_i, then y_i = C_i . h_i
        y = self._associative_scan(deltaA, deltaB_u, C)  # (B, L, G, DG)

        # Add skip connection (D holds one scalar per group)
        y = y + u * D.view(1, 1, G, 1)
        return y.view(B_sz, L, ID)

    def _associative_scan(self, A, X, C):
        """
        Parallel prefix scan for the first-order linear recurrence

            h_i = A_i * h_{i-1} + X_i

        where the multiplication is element-wise (diagonal A). The combine
        (A1, X1) o (A2, X2) = (A1*A2, A2*X1 + X2) is associative, which is
        what makes a parallel scan valid.

        Implemented as a Hillis-Steele style doubling scan: O(log L) fully
        vectorized steps (O(L log L) work), with the running products of A
        carried alongside the partial results.
        """
        B, L, G, DG, N = A.shape

        h = X.clone()      # partial results over a growing window
        A_acc = A.clone()  # product of A over the same window

        stride = 1
        while stride < L:
            # Combine position i with position i - stride:
            #   h_i     <- A_acc_i * h_{i-stride} + h_i
            #   A_acc_i <- A_acc_i * A_acc_{i-stride}
            h = torch.cat(
                [h[:, :stride],
                 A_acc[:, stride:] * h[:, :-stride] + h[:, stride:]],
                dim=1,
            )
            A_acc = torch.cat(
                [A_acc[:, :stride],
                 A_acc[:, stride:] * A_acc[:, :-stride]],
                dim=1,
            )
            stride *= 2

        # Project through C: y_i = sum_n C_{i,n} * h_{i,n}
        y = (h * C.unsqueeze(-2)).sum(dim=-1)  # (B, L, G, DG)
        return y
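    @staticmethod
    def _sequential_scan_reference(A, X, C):
        """
        Slow sequential reference for the same recurrence (a sanity-check
        sketch, not used in the forward path). Added only to make the
        parallel scan easy to verify, e.g. with torch.allclose on random
        inputs; see the smoke test at the bottom of the file.
        """
        Bsz, L, G, DG, N = A.shape
        h = torch.zeros(Bsz, G, DG, N, device=A.device, dtype=A.dtype)
        ys = []
        for i in range(L):
            h = A[:, i] * h + X[:, i]  # h_i = A_i * h_{i-1} + X_i
            ys.append((h * C[:, i].unsqueeze(-2)).sum(-1))  # project through C
        return torch.stack(ys, dim=1)  # (B, L, G, DG)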
|
    def forward(self, x, scan_direction="forward"):
        """
        Args:
            x: (B, L, D) input sequence
            scan_direction: 'forward' or 'reverse' (for bidirectional scanning)
        Returns:
            (B, L, D) output with residual connection applied
        """
        residual = x
        B, L, D = x.shape

        # Optional: reverse sequence for bidirectional scanning
        if scan_direction == "reverse":
            x = torch.flip(x, dims=[1])

        # Pre-norm
        x_norm = self.norm(x)

        # Input projection
        proj = self.in_proj(x_norm)
        u, z = proj.chunk(2, dim=-1)

        # 1D conv
        u = self._apply_conv1d(u)

        # SSD parameters (input-dependent)
        delta = F.softplus(self.dt_proj(x_norm))
        B_proj = self.B_proj(x_norm)
        C_proj = self.C_proj(x_norm)

        # Selective scan
        u = self._selective_scan(u, delta, self.A_log, B_proj, C_proj, self.D)

        # Gate
        if self.use_cfc_modulation and self.cfc_gate is not None:
            gate = self.cfc_gate(x_norm)
            u = u * gate
        else:
            u = u * F.silu(z)

        # Output
        out = self.out_proj(u)
        out = self.dropout(out)

        # Restore direction
        if scan_direction == "reverse":
            out = torch.flip(out, dims=[1])

        return out + residual


class BidirectionalMambaBlock(nn.Module):
    """Bidirectional Mamba: forward + reverse scans merged."""

    def __init__(self, dim, **kwargs):
        super().__init__()
        self.fwd = Mamba2SSDBlock(dim, **kwargs)
        self.rev = Mamba2SSDBlock(dim, **kwargs)
        self.merge = nn.Linear(dim * 2, dim, bias=False)
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        f = self.fwd(x, "forward")
        r = self.rev(x, "reverse")
        out = self.merge(torch.cat([f, r], dim=-1))
        return self.norm(out)

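# Usage sketch (shapes as assumed throughout this file; CfCGate comes from
# .cfc and is only exercised when use_cfc_modulation=True):
#
#     x = torch.randn(2, 128, 64)              # (B, L, D)
#     bi = BidirectionalMambaBlock(dim=64, use_cfc_modulation=False)
#     y = bi(x)                                # (2, 128, 64)
#
# Each direction gets its own Mamba2SSDBlock; the two outputs are
# concatenated and merged back to `dim` by a linear layer, then normed.
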

class MultiDirectionalScan(nn.Module):
    """
    2D-adapted Mamba layer with multi-directional scanning.

    From the DiM paper: cycles through 4 scan patterns across layers:
    - Row-major forward
    - Row-major reverse
    - Column-major forward
    - Column-major reverse

    Includes learnable EOL (end-of-line) tokens.
    """

    def __init__(self, dim, pattern="row_fwd", eol_tokens=1, **kwargs):
        super().__init__()
        self.dim = dim
        self.pattern = pattern
        self.eol_tokens = eol_tokens
        self.ssd = BidirectionalMambaBlock(dim, **kwargs) if pattern == "bidir" \
            else Mamba2SSDBlock(dim, **kwargs)

        # Learnable end-of-line tokens
        if eol_tokens > 0:
            self.eol_token = nn.Parameter(torch.randn(1, eol_tokens, dim) * 0.02)

    def _unflatten_2d(self, x, H, W, pattern):
        """Convert a 1D sequence to a 2D spatial layout."""
        B, L, D = x.shape
        if pattern == "row_fwd":
            return x.view(B, H, W, D)
        elif pattern == "row_rev":
            return torch.flip(x.view(B, H, W, D), dims=[2])
        elif pattern == "col_fwd":
            return x.view(B, W, H, D).transpose(1, 2)
        elif pattern == "col_rev":
            return torch.flip(x.view(B, W, H, D).transpose(1, 2), dims=[2])
        return x.view(B, H, W, D)

    def _flatten_2d(self, x, pattern):
        """Convert a 2D spatial layout back to a 1D sequence."""
        B, H, W, D = x.shape
        if pattern == "row_fwd":
            return x.reshape(B, H * W, D)
        elif pattern == "row_rev":
            return torch.flip(x, dims=[2]).reshape(B, H * W, D)
        elif pattern == "col_fwd":
            return x.transpose(1, 2).reshape(B, H * W, D)
        elif pattern == "col_rev":
            return torch.flip(x.transpose(1, 2), dims=[2]).reshape(B, H * W, D)
        return x.reshape(B, H * W, D)

    def _add_eol_tokens_row(self, x, H, W):
        """Append EOL tokens at the end of each row."""
        B, L, D = x.shape
        x = x.view(B, H, W, D)
        eol = self.eol_token.expand(B, H, -1, -1)
        x_with_eol = torch.cat([x, eol], dim=2)  # (B, H, W+eol, D)
        return x_with_eol.reshape(B, H * (W + self.eol_tokens), D)

    def forward(self, x, H, W):
        """
        Args:
            x: (B, H*W, D) flattened image tokens
            H, W: spatial dimensions
        """
        B, L, D = x.shape

        # Add EOL tokens
        if self.eol_tokens > 0:
            if "row" in self.pattern:
                x = self._add_eol_tokens_row(x, H, W)
                H_2d, W_2d = H, W + self.eol_tokens
            else:  # col scan
                x_t = x.view(B, W, H, D).transpose(1, 2)  # (B, H, W, D)
                # Add EOL for columns
                eol = self.eol_token.expand(B, H, -1, -1)
                x_t = torch.cat([x_t, eol], dim=2)
                H_2d, W_2d = H, W + self.eol_tokens
                x = x_t.reshape(B, H_2d * W_2d, D)
        else:
            H_2d, W_2d = H, W

        # Apply SSD (a BidirectionalMambaBlock scans both ways itself and
        # takes no direction argument)
        if isinstance(self.ssd, BidirectionalMambaBlock):
            x = self.ssd(x)
        elif "rev" in self.pattern:
            x = self.ssd(x, "reverse")
        else:
            x = self.ssd(x, "forward")

        # Remove EOL tokens
        if self.eol_tokens > 0:
            x = x.view(B, H_2d, W_2d, D)
            x = x[:, :, :W_2d - self.eol_tokens, :]  # drop EOL columns
            if "col" in self.pattern:
                x = x.transpose(1, 2)  # (B, W, H, D)
            x = x.reshape(B, H * W, D)

        return x
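

# ---------------------------------------------------------------------------
# Minimal smoke test: a sketch, not part of the library API. It avoids the
# CfC path (use_cfc_modulation=False) so it does not depend on the CfCGate
# interface; run as `python -m liqmamba.mamba2_ssd` from the package root.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(0)

    # Block forward pass keeps the (B, L, D) shape.
    blk = Mamba2SSDBlock(dim=32, d_state=8, use_cfc_modulation=False)
    x = torch.randn(2, 64, 32)
    assert blk(x).shape == (2, 64, 32)

    # Parallel scan agrees with the sequential reference.
    A = torch.rand(2, 64, 1, 4, 8)   # decay factors in (0, 1)
    X = torch.randn(2, 64, 1, 4, 8)
    C = torch.randn(2, 64, 1, 8)
    y_par = blk._associative_scan(A, X, C)
    y_seq = Mamba2SSDBlock._sequential_scan_reference(A, X, C)
    assert torch.allclose(y_par, y_seq, atol=1e-4)

    # 2D scan with EOL tokens round-trips the token count.
    scan = MultiDirectionalScan(dim=32, pattern="row_fwd",
                                use_cfc_modulation=False)
    assert scan(torch.randn(2, 8 * 8, 32), 8, 8).shape == (2, 64, 32)
    print("smoke tests passed")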