asdf98 committed
Commit 5e89621 · verified · 1 Parent(s): cbb87e4

Upload luminars/ssm.py

Files changed (1)
luminars/ssm.py +71 -33
luminars/ssm.py CHANGED
@@ -1,42 +1,80 @@
  """
- Selective State Space (Mamba2) cell + SelectiveScanKernel.
- No dependencies on mamba_ssm -- pure PyTorch.
  """
  import math
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
- from einops import rearrange, einsum

- def selective_scan_oneshot(x, delta, A, B, C, D):
      """
-     x: (B, L, N) -- input tokens
-     delta: (B, L, N) -- time-step, elementwise
-     A: (N,) -- diagonal S4D real part
-     B, C: (B, L, N) -- input-dependent
-     D: (N,) -- skip connection
-     Returns y: (B, L, N)
      """
-     B_, L, N = x.shape
-     # discretize: A_bar = exp(delta * A), B_bar = delta * B
-     # A is negative (stable), delta > 0
-     A = -torch.abs(A)  # force stability
-     delta = F.softplus(delta)  # >0
-     A_bar = torch.exp(delta.unsqueeze(-1) * A)  # (B, L, N, N)?? No, A is (N,)
-     A_bar = torch.exp(delta * A)  # (B, L, N)
-     B_x = delta * B * x  # (B, L, N)
-
-     # recurrent scan
-     h = torch.zeros(B_, N, device=x.device, dtype=x.dtype)
-     ys = []
-     for t in range(L):
-         h = A_bar[:, t] * h + B_x[:, t]
-         y = einsum(h, C[:, t], 'b n, b n -> b')
-         ys.append(y)
-     # Actually y = (C_t * h).sum(-1) gives scalar per token... reshape needed.
-     # Let's do it vectorised:
-     # We actually need y_t = sum_n C_{b,t,n} * h_{b,n} = inner product in N dim
-     y = torch.stack(ys, dim=1).unsqueeze(-1) * C  # no, this is wrong dimension
-     # FIX: h is (B,N), output is (B,N) from h*C where C is (B,L,N)
-     # Let me rewrite properly
-     pass
  """
+ Lightweight Selective Linear Recurrent Unit (SLRU) -- RWKV/Mamba hybrid.
+ No heavy deps. Pure PyTorch. Linear O(n) in seq len.
  """
  import math
  import torch
  import torch.nn as nn
  import torch.nn.functional as F

+
+ def rmsnorm(x):
+     # Normalize by the root-mean-square over the last dim (no learned gain)
+     return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + 1e-6)
+
+
+ class SiluGLU(nn.Module):
+     """Gated MLP: W_down(SiLU(x·W_gate) ⊙ (x·W_up))"""
+     def __init__(self, dim_in, dim_out=None, expand=2):
+         super().__init__()
+         dim_out = dim_out or dim_in
+         hidden = int(dim_in * expand)
+         self.W_gate = nn.Linear(dim_in, hidden, bias=False)
+         self.W_up = nn.Linear(dim_in, hidden, bias=False)
+         self.W_down = nn.Linear(hidden, dim_out, bias=False)
+     def forward(self, x):
+         return self.W_down(F.silu(self.W_gate(x)) * self.W_up(x))
+
+
+ class SelectiveLRU(nn.Module):
      """
+     Simplified selective linear recurrent cell.
+         h_t = decay_t * h_{t-1} + (1 - decay_t) * (x_t · B_proj)
+         y_t = C_proj(h_t) + D_skip * x_t
+
+     Key: B_t, C_t, decay_t are ALL input-dependent (selective).
+     Merges RWKV's time-mixing with Mamba's selective SSM in a tiny form.
      """
+     def __init__(self, dim, d_state=64, expand=2):
+         super().__init__()
+         self.dim = dim
+         self.d_state = d_state
+         self.expand = expand
+         hidden = dim * expand
+
+         # Fused input projection: input -> [B, C, delta, skip]
+         self.in_proj = nn.Linear(dim, hidden * 4, bias=False)
+
+         # State transition
+         self.W_B = nn.Linear(hidden, d_state)                  # input -> state
+         self.W_C = nn.Linear(d_state, hidden)                  # state -> output
+         self.W_delta = nn.Linear(hidden, d_state, bias=False)  # time-step into state space
+         self.log_A = nn.Parameter(torch.randn(d_state))        # stable: decay uses -exp(log_A)
+         self.D = nn.Parameter(torch.randn(hidden))             # skip connection
+
+         # Output gate
+         self.out_gate = nn.Linear(dim, hidden, bias=False)
+         self.out_proj = nn.Linear(hidden, dim, bias=False)
+
+     def forward(self, x):
+         """x: (B, L, dim) -> y: (B, L, dim)"""
+         Bsz, L, _ = x.shape
+
+         # Input-dependent gates
+         gates = self.in_proj(x)                               # (B, L, hidden*4)
+         B_gate, C_gate, delta, skip = gates.chunk(4, dim=-1)  # each (B, L, hidden)
+
+         # Selective parameters (per-token, per-channel)
+         B_t = torch.tanh(B_gate)                              # bounded selective B
+         C_t = torch.tanh(C_gate)                              # bounded selective C
+         delta_t = F.softplus(self.W_delta(delta))             # positive time-step, (B, L, d_state)
+         decay = torch.exp(-delta_t * torch.exp(self.log_A))   # (B, L, d_state), in (0, 1)
+
+         # Project the gated input into state space once, outside the loop
+         Bx = self.W_B(B_t * skip)                             # (B, L, d_state)
+
+         # Recurrent scan over time (vectorized over batch and state channels)
+         state = torch.zeros(Bsz, self.d_state, device=x.device, dtype=x.dtype)
+         states = []
+         for t in range(L):
+             state = decay[:, t] * state + (1.0 - decay[:, t]) * Bx[:, t]
+             states.append(state)
+         h = torch.stack(states, dim=1)                        # (B, L, d_state)
+
+         # Readout: selective C, skip connection, then gated output projection
+         y = C_t * self.W_C(h) + self.D * skip                 # (B, L, hidden)
+         y = y * F.silu(self.out_gate(x))                      # input-conditioned output gate
+         return self.out_proj(y)                               # (B, L, dim)
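For context on the + side's decay: with per-channel rate a = exp(log_A) > 0 and step delta = softplus(·) > 0, decay = exp(-delta · a) always lies in (0, 1), so each state update is a convex blend of the previous state and the projected input. A quick standalone sanity check (values illustrative, not from this repo):

import torch
import torch.nn.functional as F

log_A = torch.tensor([0.0, 1.0])              # two example state channels
delta = F.softplus(torch.tensor(0.5))         # positive step, ~0.974
decay = torch.exp(-delta * torch.exp(log_A))  # exp(-delta * a) with a > 0
print(decay)                                  # tensor([0.3777, 0.0708]) -- larger a forgets faster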
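And a minimal smoke test for the rewritten cell, a sketch assuming luminars/ssm.py is importable as uploaded in this commit (dims are illustrative):

import torch
from luminars.ssm import SelectiveLRU  # path per this commit

torch.manual_seed(0)
cell = SelectiveLRU(dim=128, d_state=64, expand=2)
x = torch.randn(2, 16, 128)   # (batch, seq_len, dim)
y = cell(x)
assert y.shape == x.shape     # the cell is shape-preserving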