asdf98
/

IRIS-architecture

@@ -433,29 +433,36 @@ class GatedLinearRecurrence(nn.Module):
         self.proj_out = nn.Linear(recurrence_dim * 2, dim)
     def _scan(self, x: torch.Tensor) -> torch.Tensor:
-        """Sequential scan for a single direction. x: [B, N, rec_dim]"""
         B, N, D = x.shape
-        # Compute gates (can be parallelized)
         a_base = torch.sigmoid(self.Lambda)  # [D]
-        r = torch.sigmoid(self.W_a(x))       # [B, N, D] - recurrence gate
-        i = torch.sigmoid(self.W_x(x))       # [B, N, D] - input gate
         # a_t = a_base^(c * r_t) — data-dependent decay
         a = a_base.pow(self.c * r)            # [B, N, D]
         # Normalized input: sqrt(1 - a^2) for variance preservation
         input_scale = torch.sqrt(1.0 - a * a + 1e-8)
-        scaled_input = input_scale * (i * x)  # [B, N, D]
-        # Sequential recurrence (use parallel scan in production)
-        outputs = []
         h = torch.zeros(B, D, device=x.device, dtype=x.dtype)
         for t in range(N):
-            h = a[:, t] * h + scaled_input[:, t]
-            outputs.append(h)
-        return torch.stack(outputs, dim=1)  # [B, N, D]
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, D = x.shape
@@ -476,7 +483,7 @@ class GatedLinearRecurrence(nn.Module):
 class ManhattanSpatialGate(nn.Module):
     """Pathway 3: Manhattan distance spatial decay gating.
     Provides learned 2D spatial inductive bias with per-head multi-scale receptive fields.
-    Uses windowed computation for efficiency.
     """
     def __init__(self, dim: int, num_heads: int, window: int = 16):
         super().__init__()
@@ -493,49 +500,59 @@ class ManhattanSpatialGate(nn.Module):
         self.v_proj = nn.Linear(dim, dim)
         self.g_proj = nn.Linear(dim, dim)
         self.o_proj = nn.Linear(dim, dim)
     def _get_manhattan_mask(self, H: int, W: int, device: torch.device) -> torch.Tensor:
-        """Compute Manhattan distance matrix between all 2D positions."""
-        coords = torch.stack(torch.meshgrid(
-            torch.arange(H, device=device),
-            torch.arange(W, device=device),
-            indexing='ij'
-        ), dim=-1).reshape(-1, 2).float()  # [N, 2]
-        # Manhattan distance: |x1-x2| + |y1-y2|
-        dist = torch.cdist(coords, coords, p=1)  # [N, N]
         return dist
     def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
         B, N, D = x.shape
-        # Compute spatial decay
         gamma = torch.sigmoid(self.gamma_logit)  # [num_heads]
         manhattan_dist = self._get_manhattan_mask(H, W, x.device)  # [N, N]
-        # Window the distance matrix for efficiency
-        # Only compute decay for positions within window distance
-        decay_mask = (manhattan_dist <= self.window).float()
-        # Per-head decay: gamma_h^dist
         decay = gamma[:, None, None].pow(manhattan_dist[None, :, :])  # [heads, N, N]
-        decay = decay * decay_mask[None, :, :]
         # Value and gate
         v = self.v_proj(x).reshape(B, N, self.num_heads, self.head_dim)
         g = torch.sigmoid(self.g_proj(x))
         # Apply spatial decay to values
-        # [B, heads, N, head_dim] = [heads, N, N] @ [B, heads, N, head_dim]
         v = v.permute(0, 2, 1, 3)  # [B, heads, N, head_dim]
         out = torch.matmul(decay.unsqueeze(0), v)  # [B, heads, N, head_dim]
         # Normalize by decay sum
-        decay_sum = decay.sum(dim=-1, keepdim=True).unsqueeze(0) + 1e-8  # [1, heads, N, 1]
         out = out / decay_sum
-        out = out.permute(0, 2, 1, 3).reshape(B, N, D)  # [B, N, D]
-        out = out * g  # Gating
         return self.o_proj(out)

         self.proj_out = nn.Linear(recurrence_dim * 2, dim)
     def _scan(self, x: torch.Tensor) -> torch.Tensor:
+        """Gated linear recurrence scan. x: [B, N, rec_dim]
+        Runs one Python-level step per sequence position, writing into a pre-allocated output tensor.
+        For production, replace with a CUDA parallel scan kernel.
+        """
         B, N, D = x.shape
+        # Compute all gates in one shot (parallelized)
         a_base = torch.sigmoid(self.Lambda)  # [D]
+        r = torch.sigmoid(self.W_a(x))       # [B, N, D]
+        i = torch.sigmoid(self.W_x(x))       # [B, N, D]
         # a_t = a_base^(c * r_t) — data-dependent decay
         a = a_base.pow(self.c * r)            # [B, N, D]
         # Normalized input: sqrt(1 - a^2) for variance preservation
         input_scale = torch.sqrt(1.0 - a * a + 1e-8)
+        u = input_scale * (i * x)             # [B, N, D]
+        # Sequential recurrence — use contiguous tensors for speed
+        a = a.contiguous()
+        u = u.contiguous()
         h = torch.zeros(B, D, device=x.device, dtype=x.dtype)
+        outputs = torch.empty_like(u)         # Pre-allocate output
         for t in range(N):
+            h = a[:, t] * h + u[:, t]
+            outputs[:, t] = h
+        return outputs
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, D = x.shape
 class ManhattanSpatialGate(nn.Module):
     """Pathway 3: Manhattan distance spatial decay gating.
     Provides learned 2D spatial inductive bias with per-head multi-scale receptive fields.
+    Caches the distance matrix; the decay is computed densely over [heads, N, N] and then masked to the window.
     """
     def __init__(self, dim: int, num_heads: int, window: int = 16):
         super().__init__()
         self.v_proj = nn.Linear(dim, dim)
         self.g_proj = nn.Linear(dim, dim)
         self.o_proj = nn.Linear(dim, dim)
+        # Cache for distance matrix (rebuilt only when (H, W, device) changes)
+        self._cached_dist = None
+        self._cached_shape = None
     def _get_manhattan_mask(self, H: int, W: int, device: torch.device) -> torch.Tensor:
+        """Compute Manhattan distance matrix — CACHED after first call."""
+        shape_key = (H, W, device)
+        if self._cached_dist is not None and self._cached_shape == shape_key:
+            return self._cached_dist
+        # Build coordinate grid
+        rows = torch.arange(H, device=device)
+        cols = torch.arange(W, device=device)
+        grid_r, grid_c = torch.meshgrid(rows, cols, indexing='ij')
+        coords = torch.stack([grid_r.reshape(-1), grid_c.reshape(-1)], dim=-1).float()  # [N, 2]
+        # Manhattan distance via broadcasting (intended as a faster alternative to cdist — unbenchmarked here; builds an [N, N, 2] intermediate)
+        dist = (coords[:, None, :] - coords[None, :, :]).abs().sum(dim=-1)  # [N, N]
+        self._cached_dist = dist
+        self._cached_shape = shape_key
         return dist
     def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
         B, N, D = x.shape
+        # Compute spatial decay (distance matrix is cached)
         gamma = torch.sigmoid(self.gamma_logit)  # [num_heads]
         manhattan_dist = self._get_manhattan_mask(H, W, x.device)  # [N, N]
+        # Window mask — only positions within window distance contribute
+        decay_mask = (manhattan_dist <= self.window)  # bool [N, N]
+        # Per-head decay: gamma_h^dist, masked to window
+        # NOTE: pow is evaluated for every [N, N] pair; the mask below zeroes entries outside the window
         decay = gamma[:, None, None].pow(manhattan_dist[None, :, :])  # [heads, N, N]
+        decay = decay * decay_mask.unsqueeze(0).float()
         # Value and gate
         v = self.v_proj(x).reshape(B, N, self.num_heads, self.head_dim)
         g = torch.sigmoid(self.g_proj(x))
         # Apply spatial decay to values
         v = v.permute(0, 2, 1, 3)  # [B, heads, N, head_dim]
         out = torch.matmul(decay.unsqueeze(0), v)  # [B, heads, N, head_dim]
         # Normalize by decay sum
+        decay_sum = decay.sum(dim=-1, keepdim=True).unsqueeze(0) + 1e-8
         out = out / decay_sum
+        out = out.permute(0, 2, 1, 3).reshape(B, N, D)
+        out = out * g
         return self.o_proj(out)