asdf98 committed
Commit bc114d4 · verified · 1 Parent(s): 5e89621

Upload luminars/ssm.py

Files changed (1)
  1. luminars/ssm.py +88 -55
luminars/ssm.py CHANGED
@@ -1,6 +1,7 @@
  """
- Lightweight Selective Linear Recurrent Unit (SLRU) -- RWKV/Mamba hybrid.
- No heavy deps. Pure PyTorch. Linear O(n) in seq len.
  """
  import math
  import torch
@@ -8,73 +9,105 @@ import torch.nn as nn
  import torch.nn.functional as F


- def rmsnorm(x):
-     # RMS norm: x / sqrt(mean(x^2) + eps)
-     return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + 1e-6)


- class SiluGLU(nn.Module):
-     """Gated MLP: SiLU(x·W_gate) ⊙ (x·W_up) · W_down"""
-     def __init__(self, dim_in, dim_out=None, expand=2):
          super().__init__()
-         dim_out = dim_out or dim_in
-         hidden = int(dim_in * expand)
-         self.W_gate = nn.Linear(dim_in, hidden, bias=False)
-         self.W_up = nn.Linear(dim_in, hidden, bias=False)
-         self.W_down = nn.Linear(hidden, dim_out, bias=False)

      def forward(self, x):
-         return self.W_down(F.silu(self.W_gate(x)) * self.W_up(x))


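For the gated MLP above, a quick shape sketch with hypothetical sizes dim_in = 4, expand = 2:

    x: (..., 4) -> W_gate, W_up: (..., 8) -> SiLU-gated elementwise product: (..., 8) -> W_down: (..., 4)
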
- class SelectiveLRU(nn.Module):
      """
-     Simplified selective linear recurrent cell:
-         h_t = decay_t * h_{t-1} + (1 - decay_t) * (x_t · B_proj)
-         y_t = C_proj(h_t) + D_skip * x_t

-     Key: B_t, C_t, decay_t are ALL input-dependent (selective).
-     Merges RWKV's time-mixing with Mamba's selective SSM in a tiny form.
      """
-     def __init__(self, dim, d_state=64, expand=2):
          super().__init__()
          self.dim = dim
          self.d_state = d_state
-         self.expand = expand
-         hidden = dim * expand

-         # Fused input projection: input -> [B_gate, C_gate, delta, value]
-         self.in_proj = nn.Linear(dim, hidden * 4, bias=False)

-         # State transition
-         self.W_B = nn.Linear(hidden, d_state)             # input -> state
-         self.W_C = nn.Linear(d_state, hidden)             # state -> output
-         self.log_A = nn.Parameter(torch.randn(d_state))   # decay rates: exp(log_A) > 0
-         self.D = nn.Parameter(torch.randn(hidden))        # skip connection

-         # Output gate
-         self.out_gate = nn.Linear(dim, hidden, bias=False)
-         self.out_proj = nn.Linear(hidden, dim, bias=False)

      def forward(self, x):
-         """x: (B, L, dim) -> y: (B, L, dim)"""
-         bsz, L, _ = x.shape

-         # Input-dependent gates
-         gates = self.in_proj(x)                                 # (B, L, hidden*4)
-         B_gate, C_gate, delta, value = gates.chunk(4, dim=-1)   # each (B, L, hidden)

-         # Selective parameters (per-token)
-         B_t = torch.tanh(B_gate)              # bound selective B
-         C_t = torch.tanh(C_gate)              # bound selective C
-         delta_t = F.softplus(delta).mean(-1)  # pool to one positive step per token, (B, L)
-         # per-state decay in (0, 1): exp(-delta_t * exp(log_A)) -> (B, L, d_state)
-         decay = torch.exp(-delta_t.unsqueeze(-1) * torch.exp(self.log_A))

-         # Sequential recurrent scan over tokens; state: (B, d_state)
-         state = x.new_zeros(bsz, self.d_state)
-         outputs = []
-         for t in range(L):
-             inp = self.W_B(B_t[:, t] * value[:, t])               # input -> state
-             state = decay[:, t] * state + (1 - decay[:, t]) * inp
-             outputs.append(C_t[:, t] * self.W_C(state) + self.D * value[:, t])
-         y = torch.stack(outputs, dim=1)                           # (B, L, hidden)

-         # Output gate, then project back to model dim
-         return self.out_proj(y * F.silu(self.out_gate(x)))
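For intuition, the removed recurrence is exponential smoothing with an input-dependent rate. A scalar sketch with hypothetical values decay_t = 0.9 and x_t · B_proj = 1.0 at every step:

    h_1 = 0.9 * 0.00 + 0.1 * 1.0 = 0.100
    h_2 = 0.9 * 0.10 + 0.1 * 1.0 = 0.190
    h_3 = 0.9 * 0.19 + 0.1 * 1.0 = 0.271   (h_t -> 1.0 as t grows)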

  """
+ Spatial Recurrent Block (SRB) -- inspired by RWKV + VMamba-UNet.
+ Uses depthwise conv for spatial token-shift and channel-wise decay mixing.
+ Pure PyTorch, no heavy deps.
  """
  import math
  import torch
  import torch.nn as nn
  import torch.nn.functional as F


+ def rmsnorm(x, eps=1e-6):
+     # RMS norm: x / sqrt(mean(x^2) + eps)
+     return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)


+ class RMSNorm(nn.Module):
+     def __init__(self, dim, eps=1e-6):
          super().__init__()
+         self.eps = eps
+         self.gamma = nn.Parameter(torch.ones(dim))

      def forward(self, x):
+         # x: (..., dim); norm / sqrt(dim) equals sqrt(mean(x^2)), the RMS
+         norm = x.norm(2, dim=-1, keepdim=True) / math.sqrt(x.shape[-1])
+         return self.gamma * x / (norm + self.eps)


+ class DropPath(nn.Module):
+     """Stochastic depth: randomly drops the residual branch per sample."""
+     def __init__(self, p=0.0):
+         super().__init__()
+         self.p = p
+
+     def forward(self, x):
+         if self.p == 0.0 or not self.training:
+             return x
+         keep = 1.0 - self.p
+         mask = x.new_empty(x.shape[0], *([1] * (x.dim() - 1))).bernoulli_(keep)
+         return x * mask / keep


+ class SpatialRecurrentBlock(nn.Module):
      """
+     A block that:
+     1. Token-shifts spatially with a 3x3 depthwise conv (spatial mixing)
+     2. Applies channel-wise decay-mixing (RWKV time-mix equivalent)
+     3. Returns residual output

+     No sequential token scan: the conv output stands in for the previous
+     token, so the decay mixing runs in parallel over all positions.
+     Spatial dims are folded into batch.
      """
+     def __init__(self, dim, d_state=64, drop_path=0.0):
          super().__init__()
          self.dim = dim
          self.d_state = d_state

+         # Spatial token shift (depthwise 3x3 conv)
+         self.spatial_conv = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
+         self.spatial_norm = RMSNorm(dim)

+         # Input-dependent selective projections -> [B, C, decay]
+         self.x_proj_in = nn.Linear(dim, d_state * 2 + 1, bias=False)
+         self.x_proj_A = nn.Parameter(torch.arange(d_state).float() * -math.log(10000) / d_state)  # S4D init

+         # State-to-output
+         self.state_out = nn.Linear(d_state, dim, bias=False)
+         self.D = nn.Parameter(torch.ones(dim))  # skip

+         # Post-MLP
+         self.mlp = nn.Sequential(
+             RMSNorm(dim),
+             nn.Linear(dim, dim * 2),
+             nn.GELU(),
+             nn.Linear(dim * 2, dim),
+         )

+         # Drop path (stochastic depth)
+         self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

      def forward(self, x):
+         """
+         x: (B, C, H, W) -> (B, C, H, W)
+         """
+         B, C, H, W = x.shape
+         shortcut = x

+         # --- SPATIAL TOKEN SHIFT ---
+         # Scanning H*W pixel tokens in raster order would be a slow Python
+         # loop (1024 steps for a 32x32 map). Instead, the depthwise conv
+         # supplies each pixel's neighborhood as its "previous token":
+         # RWKV's time-mixing generalized to 2D, parallel over positions.
+         prev = self.spatial_conv(x)  # (B, C, H, W)

+         # Channels-last, spatial positions folded into batch: (B*H*W, C)
+         x_flat = x.permute(0, 2, 3, 1).reshape(-1, C)
+         prev_flat = self.spatial_norm(prev.permute(0, 2, 3, 1).reshape(-1, C))

+         # --- SELECTIVE DECAY MIXING (Mamba-style parameters) ---
+         # Per-pixel selectivity
+         params = self.x_proj_in(x_flat)  # (BHW, d_state*2 + 1)
+         B_cur, C_cur, delta_log = params.split([self.d_state, self.d_state, 1], dim=-1)
+         delta = F.softplus(delta_log)    # (BHW, 1), positive time-step

+         # Discretize A
+         A = -torch.exp(self.x_proj_A)    # negative for stability
+         A_bar = torch.exp(delta * A)     # (BHW, d_state), decay in (0, 1)

+         # One decayed update per pixel: the shifted context plays h_{t-1},
+         # the current pixel plays x_t -- no sequential scan needed.
+         B_prev = self.x_proj_in(prev_flat)[:, :self.d_state]
+         state = A_bar * B_prev + (1.0 - A_bar) * B_cur  # (BHW, d_state)

+         # State-to-output plus skip, back to (B, C, H, W)
+         y = self.state_out(C_cur * state) + self.D * x_flat
+         y = y.view(B, H, W, C).permute(0, 3, 1, 2)
+         x = shortcut + self.drop_path(y)

+         # Post-MLP (channels-last), residual
+         h = self.mlp(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+         return x + self.drop_path(h)
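
A minimal smoke test for the new block, assuming toy sizes (dim = 32, an 8x8 feature map; all values below are illustrative, not part of the upload):

    block = SpatialRecurrentBlock(dim=32, d_state=16, drop_path=0.1)
    feats = torch.randn(2, 32, 8, 8)    # (B, C, H, W)
    out = block(feats)
    assert out.shape == feats.shape     # residual block preserves shape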