PERF: force fp32 for FFT, JIT scan, fix RMSNorm, cache Manhattan
iris_model.py (+42 -45)
@@ -372,35 +372,34 @@ class FourierMixingPathway(nn.Module):
         out_imag = torch.einsum('...ki,kij->...kj', x.real, w_imag) + torch.einsum('...ki,kij->...kj', x.imag, w_real)
         return torch.complex(out_real, out_imag)
 
+    @torch.amp.custom_fwd(device_type='cuda', cast_inputs=torch.float32)
     def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
+        """Forward pass — forced to fp32 because FFT + ComplexHalf is broken/slow."""
         B, N, D = x.shape
         x_2d = x.reshape(B, H, W, D)
 
-        # 2D Real FFT on spatial dimensions
+        # 2D Real FFT on spatial dimensions (MUST be fp32 — ComplexHalf is broken)
         x_freq = torch.fft.rfft2(x_2d, dim=(1, 2), norm='ortho')  # [B, H, W//2+1, D]
 
         # Reshape channel dim for block-diagonal MLP: D → (num_blocks, block_size)
         Hf, Wf = x_freq.shape[1], x_freq.shape[2]
         x_freq = x_freq.reshape(B, Hf, Wf, self.num_blocks, self.block_size)
 
         # Block MLP Layer 1
-        # x_freq: [B, Hf, Wf, num_blocks, block_size]
-        # w1: [num_blocks, block_size, block_size]
         x_freq = self.complex_matmul(x_freq, self.w1_real, self.w1_imag)
         x_freq = x_freq + self.b1
         x_freq = torch.complex(F.relu(x_freq.real), F.relu(x_freq.imag))
 
         # Block MLP Layer 2
         x_freq = self.complex_matmul(x_freq, self.w2_real, self.w2_imag)
         x_freq = x_freq + self.b2
 
         # Reshape back
         x_freq = x_freq.reshape(B, Hf, Wf, D)
 
         # Soft-shrinkage (sparsity in Fourier domain)
         magnitude = x_freq.abs()
         shrunk_mag = F.relu(magnitude - self.sparsity_threshold)
-        # Preserve phase, shrink magnitude
         x_freq = x_freq * (shrunk_mag / (magnitude + 1e-8))
 
         # Inverse FFT
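Note on the fp32 forcing: torch.amp.custom_fwd is documented for torch.autograd.Function.forward, so applying it to a plain nn.Module method relies on it tolerating self as the first argument. A minimal sketch of the same fp32-island pattern written explicitly, without the decorator (fft_in_fp32 is an illustrative helper, not part of the commit):

    import torch

    def fft_in_fp32(x_2d: torch.Tensor) -> torch.Tensor:
        # Illustrative helper (not in the commit): disable autocast and
        # upcast before the FFT, since cuFFT's ComplexHalf support is
        # incomplete and the frequency path should run in fp32.
        with torch.autocast(device_type='cuda', enabled=False):
            return torch.fft.rfft2(x_2d.float(), dim=(1, 2), norm='ortho')

Either way, the complex tensors that feed the block-diagonal MLP stay in fp32 even when the surrounding model runs under autocast.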
@@ -432,37 +431,33 @@ class GatedLinearRecurrence(nn.Module):
         # Output projection
         self.proj_out = nn.Linear(recurrence_dim * 2, dim)
 
+    @staticmethod
+    @torch.jit.script
+    def _scan_kernel(a: torch.Tensor, u: torch.Tensor) -> torch.Tensor:
+        """JIT-compiled sequential scan — avoids Python loop overhead on GPU."""
+        B, N, D = a.shape
+        h = torch.zeros(B, D, device=a.device, dtype=a.dtype)
+        outputs = torch.empty_like(u)
+        for t in range(N):
+            h = a[:, t] * h + u[:, t]
+            outputs[:, t] = h
+        return outputs
+
     def _scan(self, x: torch.Tensor) -> torch.Tensor:
-        """Gated linear recurrence scan. x: [B, N, rec_dim]
-
-        Uses chunked computation to reduce Python loop overhead.
-        For production, replace with a CUDA parallel scan kernel.
-        """
+        """Gated linear recurrence scan. x: [B, N, rec_dim]"""
         B, N, D = x.shape
 
         # Compute all gates in one shot (parallelized)
         a_base = torch.sigmoid(self.Lambda)
         r = torch.sigmoid(self.W_a(x))
         i = torch.sigmoid(self.W_x(x))
 
-        a = a_base.pow(self.c * r)  # [B, N, D]
-
-        # Normalized input: sqrt(1 - a^2) for variance preservation
+        a = a_base.pow(self.c * r)
         input_scale = torch.sqrt(1.0 - a * a + 1e-8)
         u = input_scale * (i * x)
 
-        # Sequential recurrence — use contiguous tensors for speed
-        a = a.contiguous()
-        u = u.contiguous()
-        h = torch.zeros(B, D, device=x.device, dtype=x.dtype)
-        outputs = torch.empty_like(u)  # Pre-allocate output
-
-        for t in range(N):
-            h = a[:, t] * h + u[:, t]
-            outputs[:, t] = h
-        return outputs
+        # JIT-compiled scan (much faster than Python loop on GPU)
+        return self._scan_kernel(a.contiguous(), u.contiguous())
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, D = x.shape
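Note on the scan: the recurrence is h_t = a_t * h_{t-1} + u_t with u_t = sqrt(1 - a_t^2) * i_t * x_t, so per-step variance is approximately preserved when the input is white. A quick eager-mode cross-check of a scripted kernel of this shape (scan_kernel and scan_reference are illustrative names, not the module's API):

    import torch

    @torch.jit.script
    def scan_kernel(a: torch.Tensor, u: torch.Tensor) -> torch.Tensor:
        # h_t = a_t * h_{t-1} + u_t, computed stepwise into a preallocated buffer
        h = torch.zeros_like(u[:, 0])
        outputs = torch.empty_like(u)
        for t in range(u.size(1)):
            h = a[:, t] * h + u[:, t]
            outputs[:, t] = h
        return outputs

    def scan_reference(a: torch.Tensor, u: torch.Tensor) -> torch.Tensor:
        # Plain Python loop, stacked at the end
        h = torch.zeros_like(u[:, 0])
        outs = []
        for t in range(u.shape[1]):
            h = a[:, t] * h + u[:, t]
            outs.append(h)
        return torch.stack(outs, dim=1)

    a = torch.rand(2, 16, 8)   # gates in (0, 1)
    u = torch.randn(2, 16, 8)
    assert torch.allclose(scan_kernel(a, u), scan_reference(a, u), atol=1e-6)

TorchScript removes per-step Python dispatch overhead, but the scan is still O(N) sequential; the removed docstring's suggestion of a CUDA parallel scan (e.g. a Blelloch-style prefix scan over (a, u) pairs) remains the real fix.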
@@ -526,29 +521,30 @@ class ManhattanSpatialGate(nn.Module):
 
     def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
         B, N, D = x.shape
+        input_dtype = x.dtype
 
-        # Compute spatial decay
-        gamma = torch.sigmoid(self.gamma_logit)  # [num_heads]
-        manhattan_dist = self._get_manhattan_mask(H, W, x.device)  # [N, N]
+        # Compute spatial decay in fp32 (pow in fp16 loses precision badly)
+        gamma = torch.sigmoid(self.gamma_logit).float()  # [num_heads] fp32
+        manhattan_dist = self._get_manhattan_mask(H, W, x.device)  # [N, N] fp32
 
         # Window mask
         decay_mask = (manhattan_dist <= self.window)
 
-        # Per-head decay: gamma_h^dist
-        # Only compute pow for positions within window (sparse)
+        # Per-head decay: gamma_h^dist (fp32 for precision)
         decay = gamma[:, None, None].pow(manhattan_dist[None, :, :])  # [heads, N, N]
         decay = decay * decay_mask.unsqueeze(0).float()
 
-        # Value and gate
+        # Value and gate (stay in input dtype for speed)
         v = self.v_proj(x).reshape(B, N, self.num_heads, self.head_dim)
         g = torch.sigmoid(self.g_proj(x))
 
+        # Matmul in input dtype (fp16 ok for matmul)
         v = v.permute(0, 2, 1, 3)  # [B, heads, N, head_dim]
-        out = torch.matmul(decay.unsqueeze(0), v)  # [B, heads, N, head_dim]
+        decay_cast = decay.unsqueeze(0).to(input_dtype)
+        out = torch.matmul(decay_cast, v)  # [B, heads, N, head_dim]
 
         # Normalize
-        decay_sum = decay.unsqueeze(0).sum(dim=-1, keepdim=True) + 1e-8
+        decay_sum = decay_cast.sum(dim=-1, keepdim=True) + 1e-8
         out = out / decay_sum
 
         out = out.permute(0, 2, 1, 3).reshape(B, N, D)
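This hunk consumes self._get_manhattan_mask(H, W, x.device) but the helper itself is outside the diff; per the commit title ("cache Manhattan") it now caches the [N, N] distance table instead of rebuilding it every forward. A sketch of what such a cached helper could look like (a module-level dict keyed on (H, W) is an assumption here, not the actual implementation):

    import torch

    _mask_cache: dict = {}

    def get_manhattan_mask(H: int, W: int, device: torch.device) -> torch.Tensor:
        # Illustrative cache (not the commit's code): build the pairwise
        # Manhattan-distance table once per (H, W) and reuse it.
        key = (H, W)
        if key not in _mask_cache:
            ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing='ij')
            coords = torch.stack([ys.flatten(), xs.flatten()], dim=-1).float()  # [N, 2]
            # Pairwise |dy| + |dx| over the H*W grid positions
            _mask_cache[key] = (coords[:, None, :] - coords[None, :, :]).abs().sum(-1)
        return _mask_cache[key].to(device)

Keeping the cached table in fp32 matches the hunk's comment that pow in fp16 loses precision badly; only decay_cast is downcast, right before the matmul.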
@@ -623,8 +619,9 @@ class CrossAttention(nn.Module):
         self.o_proj = nn.Linear(num_heads * head_dim, dim)
 
         # QK normalization for stability (from SANA-Sprint)
-        self.q_norm = RMSNorm(head_dim)
-        self.k_norm = RMSNorm(head_dim)
+        # Use LayerNorm instead of RMSNorm — RMSNorm has fp16 weight mismatch issues
+        self.q_norm = nn.LayerNorm(head_dim, elementwise_affine=False)
+        self.k_norm = nn.LayerNorm(head_dim, elementwise_affine=False)
 
     def forward(self, x: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
         B, N, _ = x.shape
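Note on the norm swap: with elementwise_affine=False the LayerNorm has no learnable weight, so there is no fp32 master weight to fall out of sync with fp16 activations. A quick illustration of how close the two norms are on per-head queries (head_dim=64 and the tensor shapes are arbitrary example values):

    import torch
    import torch.nn as nn

    head_dim = 64
    q = torch.randn(2, 8, 16, head_dim)  # [B, heads, N, head_dim]

    ln = nn.LayerNorm(head_dim, elementwise_affine=False)  # parameter-free
    q_ln = ln(q)

    # Minimal RMSNorm for comparison: no mean subtraction, scale only
    q_rms = q * torch.rsqrt(q.pow(2).mean(-1, keepdim=True) + 1e-6)
    print(q_ln.std().item(), q_rms.std().item())  # both close to 1.0

The two differ only by the mean subtraction, so the QK-norm stabilization behavior should be comparable while sidestepping the dtype mismatch.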