Upload liqmamba/cfc.py
liqmamba/cfc.py  ADDED  (+164 -0)
@@ -0,0 +1,164 @@
"""
Liquid CfC Cell — Closed-form Continuous-time Neural Network

Implements Theorem 1 from Hasani et al. (2021): an approximate closed-form
solution for Liquid Time-Constant (LTC) networks.

The CfC cell computes:
    x(t) = (x(0) - A) * exp(-[w_tau + f(I, theta)] * t) * f(-I, theta) + A

This gives us O(1) computation (no ODE solver needed) while preserving the
expressive continuous-time dynamics of LTCs.

In our architecture, CfC replaces static activation functions (SiLU/GELU)
with learnable temporal dynamics, giving each token adaptive computation depth.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

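# Editorial sketch (not part of the original upload): a minimal worked example
# of the closed-form update from the module docstring, showing that one tensor
# expression replaces an ODE solver step. As in CfCCell below, the f(-I, theta)
# factor from Theorem 1 is dropped; the helper name `_closed_form_step` and the
# constants are assumptions made purely for illustration.
def _closed_form_step(
    x0: torch.Tensor, A: torch.Tensor, decay: torch.Tensor, t: float
) -> torch.Tensor:
    """x(t) = (x(0) - A) * exp(-decay * t) + A, computed in O(1)."""
    return (x0 - A) * torch.exp(-decay * t) + A


# As t grows, exp(-decay * t) -> 0 and x(t) -> A, so A is the steady state:
# _closed_form_step(torch.ones(3), torch.zeros(3), torch.ones(3), t=100.0)
# is approximately torch.zeros(3).
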
class CfCCell(nn.Module):
    """
    Single CfC cell implementing the closed-form solution.

    Args:
        dim: Input/output dimension
        hidden_dim: Hidden dimension for the backbone network f
        time_constant_init: Initial value for the time constant w_tau
        bias_init: Initial value for the bias vector A
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int | None = None,
        time_constant_init: float = 1.0,
        bias_init: float = 0.0,
    ):
        super().__init__()
        hidden_dim = hidden_dim or dim * 2

        # Backbone network f(x, I, theta): maps input to activation
        self.backbone = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, dim),
        )

        # Time constant w_tau — controls how fast the state decays
        self.w_tau = nn.Parameter(torch.full((dim,), time_constant_init))

        # Bias vector A — steady-state target
        self.A = nn.Parameter(torch.full((dim,), bias_init))

        self.dim = dim

    def forward(self, x: torch.Tensor, t: float | torch.Tensor = 1.0) -> torch.Tensor:
        """
        Args:
            x: Input tensor of shape (..., dim)
            t: Time delta (scalar or tensor matching the batch dims of x)

        Returns:
            Updated state of shape (..., dim)
        """
        # f(x, theta) — nonlinear transformation
        fx = self.backbone(x)

        # w_tau + f(x, theta): effective decay rate
        # Softplus keeps w_tau positive (biological plausibility)
        decay = F.softplus(self.w_tau) + fx

        # exp(-decay * t): temporal decay factor
        if isinstance(t, (int, float)):
            exp_term = torch.exp(-decay * t)
        else:
            exp_term = torch.exp(-decay * t.unsqueeze(-1))

        # Closed-form solution: x(t) = (x(0) - A) * exp(-decay * t) + A,
        # i.e. the simplified form without the f(-I, theta) factor of Theorem 1.
        # The input x plays the role of the initial state x(0).
        state = (x - self.A) * exp_term + self.A

        return state

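# Editorial usage sketch (not part of the original upload): runs the cell on
# random data and shows that a larger time delta pulls the output toward the
# steady-state bias A, as long as the learned decay stays positive. The helper
# name `_example_cfc_cell` and all shapes are assumptions for this demo only.
def _example_cfc_cell() -> None:
    torch.manual_seed(0)
    cell = CfCCell(dim=8)
    x = torch.randn(4, 16, 8)  # (batch, seq, dim)

    out_short = cell(x, t=0.1)   # small time delta: output stays near x
    out_long = cell(x, t=10.0)   # large time delta: output drifts toward A

    print((out_short - cell.A).abs().mean().item())
    print((out_long - cell.A).abs().mean().item())   # typically much smaller
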
class CfCLayer(nn.Module):
    """
    Multi-neuron CfC layer with dense connectivity: neurons interact through
    linear projections around a shared CfC update, and each expanded dimension
    keeps its own time constant.

    This implements the CfC model from the paper in that:
    - Each neuron has an independent time constant
    - Neurons interact through a learned weight matrix
    - The closed-form solution gives O(1) per-step computation
    """

    def __init__(
        self,
        dim: int,
        expansion_factor: int = 2,
        use_residual: bool = True,
        dropout: float = 0.0,
    ):
        super().__init__()

        # Input projection (mixes information between neurons)
        self.input_proj = nn.Linear(dim, dim * expansion_factor)

        # CfC cell operating on the expanded dimension
        self.cfc = CfCCell(dim * expansion_factor)

        # Output projection back to the original dim
        self.output_proj = nn.Linear(dim * expansion_factor, dim)

        self.use_residual = use_residual
        self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor, t: float = 1.0) -> torch.Tensor:
        residual = x

        # Expand → CfC → Contract
        h = self.input_proj(x)
        h = torch.tanh(h)  # Activation before CfC (as in the paper)
        h = self.cfc(h, t)
        h = self.output_proj(h)
        h = self.dropout(h)

        if self.use_residual:
            h = residual + h

        return self.norm(h)

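# Editorial usage sketch (not part of the original upload): checks that
# CfCLayer is shape-preserving, so it can slot into a residual stack where an
# MLP/GLU block would normally sit. The helper name `_example_cfc_layer` and
# the sizes are assumptions for this demo only.
def _example_cfc_layer() -> None:
    layer = CfCLayer(dim=32, expansion_factor=2, dropout=0.1)
    x = torch.randn(2, 128, 32)  # (batch, seq, dim)
    y = layer(x, t=1.0)
    assert y.shape == x.shape    # expand -> CfC -> contract keeps model width
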
class CfCGate(nn.Module):
    """
    CfC-inspired gating mechanism for Mamba blocks.

    Instead of the standard SiLU gating used in Mamba, we use a CfC cell
    that computes an adaptive, time-dependent gate value. This lets the
    model:
    1. Learn per-token computation depth (as in adaptive computation time)
    2. Keep smooth temporal dynamics that help mitigate exploding gradients
    3. Handle long-range dependencies through its liquid state

    The gate is computed as:
        gate = sigmoid(linear(CfC(linear(x))))

    where the sigmoid keeps the gate in (0, 1) and the CfC cell provides
    smooth, time-aware dynamics suitable for gating.
    """

    def __init__(self, dim: int, hidden_dim: int | None = None):
        super().__init__()
        hidden_dim = hidden_dim or dim
        self.proj = nn.Linear(dim, hidden_dim)
        self.cfc = CfCCell(hidden_dim)
        self.gate_proj = nn.Linear(hidden_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.proj(x)
        h = self.cfc(h)
        gate = torch.sigmoid(self.gate_proj(h))
        return gate

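# Editorial usage sketch (not part of the original upload): gating a branch
# with CfCGate in place of SiLU gating, i.e. `branch * CfCGate(dim)(x)` rather
# than `branch * F.silu(gate_branch)`. The tensors and this __main__ wiring are
# assumptions made for the demo; how the Mamba block wires this in is outside
# this file.
if __name__ == "__main__":
    dim = 16
    gate_mod = CfCGate(dim)
    x = torch.randn(2, 64, dim)        # token stream entering the block
    branch = torch.randn(2, 64, dim)   # e.g. an SSM branch output to be gated

    gate = gate_mod(x)                 # sigmoid output in (0, 1)
    print(gate.shape, gate.min().item(), gate.max().item())
    print((branch * gate).shape)       # adaptive, input-dependent gating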