onnx-community
/

needle-onnx

+"""Needle Simple Attention Network — PyTorch port.
+Encoder, Decoder, NeedleModel — parametric on TransformerConfig.
+Key design decisions:
+  - No FFN (no_feedforward=True is the production default; we never implement it).
+  - ZCRMSNorm, GQA, RoPE all match architecture.py line-for-line.
+  - Decoder.step() is ONNX-traceable: no data-dependent control flow.
+  - Tied embedding: decoder logits = hidden @ embedding.weight.T
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .config import TransformerConfig
+from .layers import ZCRMSNorm, RoPE, MultiHeadAttention, make_causal_mask
+# ---------------------------------------------------------------------------
+# EncoderBlock
+# ---------------------------------------------------------------------------
+class EncoderBlock(nn.Module):
+    """Pre-norm self-attention with sigmoid-gated residual.
+    Matches Flax EncoderBlock.__call__:
+        gate = sigmoid(attn_gate)
+        x = ZCRMSNorm(x)
+        x = self_attn(x, x, ...)
+        x = residual + gate * attn_out
+    """
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        # Scalar gate initialized to zero — sigmoid(0) = 0.5
+        self.attn_gate = nn.Parameter(torch.zeros(()))
+        self.norm = ZCRMSNorm(config.d_model)
+        self.self_attn = MultiHeadAttention(config, is_cross_attn=False, is_causal=False)
+    def forward(self, x: torch.Tensor, mask=None, rope=None):
+        """
+        x:    (B, T, d_model)
+        mask: (B, 1, T, T) bool
+        rope: (cos, sin) from RoPE buffers
+        """
+        gate = torch.sigmoid(self.attn_gate)
+        residual = x
+        x = self.norm(x)
+        attn_out, _ = self.self_attn(x, x, mask=mask, rope=rope)
+        x = residual + gate * attn_out
+        return x
+# ---------------------------------------------------------------------------
+# DecoderBlock
+# ---------------------------------------------------------------------------
+class DecoderBlock(nn.Module):
+    """Causal self-attn + cross-attn with independent sigmoid-gated residuals.
+    Matches Flax DecoderBlock.__call__:
+        self_gate  = sigmoid(self_attn_gate)
+        x = ZCRMSNorm(x) -> self_attn(x, x) -> x = residual + self_gate * out
+        cross_gate = sigmoid(cross_attn_gate)
+        x = ZCRMSNorm(x) -> cross_attn(x, encoder_out) -> x = residual + cross_gate * out
+    """
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.self_attn_gate = nn.Parameter(torch.zeros(()))
+        self.cross_attn_gate = nn.Parameter(torch.zeros(()))
+        # ZCRMSNorm_0 = pre-norm for self-attn
+        # ZCRMSNorm_1 = pre-norm for cross-attn
+        self.self_norm = ZCRMSNorm(config.d_model)
+        self.cross_norm = ZCRMSNorm(config.d_model)
+        self.self_attn = MultiHeadAttention(config, is_cross_attn=False, is_causal=True)
+        self.cross_attn = MultiHeadAttention(config, is_cross_attn=True, is_causal=False)
+    def forward(
+        self,
+        x: torch.Tensor,
+        encoder_out: torch.Tensor,
+        self_mask=None,
+        cross_mask=None,
+        rope=None,
+        past_self_kv=None,
+    ):
+        """
+        Args:
+            x:            (B, T_dec, d_model)
+            encoder_out:  (B, T_enc, d_model)
+            self_mask:    (B, 1, T_dec, T_total) bool
+            cross_mask:   (B, 1, T_dec, T_enc) bool
+            rope:         (cos, sin) for self-attention RoPE
+            past_self_kv: (k, v) each (B, num_kv_heads, past_T, head_dim)
+        Returns:
+            x:            (B, T_dec, d_model)
+            present_self_kv: (k, v) each (B, num_kv_heads, T_total, head_dim)
+        """
+        # --- Causal self-attention ---
+        self_gate = torch.sigmoid(self.self_attn_gate)
+        residual = x
+        x = self.self_norm(x)
+        self_out, present_self_kv = self.self_attn(
+            x, x, mask=self_mask, rope=rope, past_kv=past_self_kv
+        )
+        x = residual + self_gate * self_out
+        # --- Cross-attention ---
+        cross_gate = torch.sigmoid(self.cross_attn_gate)
+        residual = x
+        x = self.cross_norm(x)
+        cross_out, _ = self.cross_attn(x, encoder_out, mask=cross_mask)
+        x = residual + cross_gate * cross_out
+        return x, present_self_kv
+# ---------------------------------------------------------------------------
+# Encoder
+# ---------------------------------------------------------------------------
+class Encoder(nn.Module):
+    """Embedding lookup + N EncoderBlocks + final ZCRMSNorm.
+    Returns encoder hidden states: (B, T_enc, d_model).
+    Note: embedding is shared with Decoder and set externally via .embedding.
+    """
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.config = config
+        # Embedding is shared; the NeedleModel assigns it after construction.
+        self.embedding: nn.Embedding | None = None
+        self.embed_scale = math.sqrt(config.d_model)
+        self.layers = nn.ModuleList([
+            EncoderBlock(config) for _ in range(config.num_encoder_layers)
+        ])
+        self.final_norm = ZCRMSNorm(config.d_model)
+        head_dim = config.d_model // config.num_heads
+        self.rope = RoPE(head_dim, config.max_seq_len, config.rope_theta)
+    def forward(self, input_ids: torch.Tensor, mask=None) -> torch.Tensor:
+        """
+        input_ids: (B, T_enc) long
+        mask:      (B, 1, 1, T_enc) bool padding mask (optional)
+        Returns: (B, T_enc, d_model)
+        """
+        assert self.embedding is not None, "Encoder.embedding must be set by NeedleModel"
+        x = self.embedding(input_ids) * self.embed_scale
+        T = input_ids.shape[1]
+        cos, sin = self.rope.get_cos_sin(T)
+        rope = (cos, sin)
+        for layer in self.layers:
+            x = layer(x, mask=mask, rope=rope)
+        x = self.final_norm(x)
+        return x
+# ---------------------------------------------------------------------------
+# Decoder
+# ---------------------------------------------------------------------------
+class Decoder(nn.Module):
+    """Embedding lookup + N DecoderBlocks + final ZCRMSNorm + LM head.
+    The LM head is a tied projection: logits = hidden @ embedding.weight.T
+    The embedding weight is shared with the Encoder/NeedleModel.
+    """
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.config = config
+        # Embedding is shared; set by NeedleModel after construction.
+        self.embedding: nn.Embedding | None = None
+        self.embed_scale = math.sqrt(config.d_model)
+        self.layers = nn.ModuleList([
+            DecoderBlock(config) for _ in range(config.num_decoder_layers)
+        ])
+        # ZCRMSNorm_0 in the decoder (final norm after all layers)
+        self.final_norm = ZCRMSNorm(config.d_model)
+        head_dim = config.d_model // config.num_heads
+        self.rope = RoPE(head_dim, config.max_seq_len, config.rope_theta)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        encoder_out: torch.Tensor,
+        self_mask=None,
+        cross_mask=None,
+    ) -> torch.Tensor:
+        """Full-sequence decode (training / teacher-forcing).
+        Args:
+            input_ids:   (B, T_dec) long
+            encoder_out: (B, T_enc, d_model)
+            self_mask:   (B, 1, T_dec, T_dec) bool causal mask
+            cross_mask:  (B, 1, T_dec, T_enc) bool
+        Returns:
+            logits: (B, T_dec, vocab_size)
+        """
+        assert self.embedding is not None
+        x = self.embedding(input_ids) * self.embed_scale
+        T = input_ids.shape[1]
+        cos, sin = self.rope.get_cos_sin(T)
+        rope = (cos, sin)
+        for layer in self.layers:
+            x, _ = layer(x, encoder_out, self_mask=self_mask, cross_mask=cross_mask,
+                         rope=rope, past_self_kv=None)
+        x = self.final_norm(x)
+        # Tied output projection: (B, T, d_model) @ (d_model, vocab_size)
+        logits = x.float() @ self.embedding.weight.T
+        return logits
+    # ------------------------------------------------------------------
+    # Autoregressive step — the entry point for ONNX export (Task 7)
+    # ------------------------------------------------------------------
+    def initial_past_kv(self, batch: int = 1) -> torch.Tensor:
+        """Return a zero past_kv tensor for the first step.
+        Shape: (num_decoder_layers, 2, batch, num_kv_heads, 0, head_dim)
+        Using length-0 in the sequence dimension so the first step's cat
+        produces just the current step's KV.
+        """
+        cfg = self.config
+        head_dim = cfg.d_model // cfg.num_heads
+        return torch.zeros(
+            cfg.num_decoder_layers, 2, batch, cfg.num_kv_heads, 0, head_dim,
+            dtype=torch.float32,
+        )
+    def step(
+        self,
+        decoder_input_ids: torch.Tensor,
+        encoder_kv: torch.Tensor,
+        past_self_kv: torch.Tensor,
+    ):
+        """Single autoregressive decoder step.
+        Accepts explicit past KV cache and returns updated KV (present).
+        This signature is what torch.onnx.export traces in Task 7.
+        Args:
+            decoder_input_ids: (B, 1) long — single token per step
+            encoder_kv:        (B, T_enc, d_model) — frozen encoder output
+            past_self_kv:      (num_decoder_layers, 2, B, num_kv_heads, past_T, head_dim)
+                               Use initial_past_kv() for the first step.
+        Returns:
+            logits:      (B, 1, vocab_size)
+            present_kv:  (num_decoder_layers, 2, B, num_kv_heads, past_T+1, head_dim)
+        NOTE: No Python control flow that depends on tensor *values* — only
+        shape-derived constants — so this is safely ONNX-traceable.
+        """
+        assert self.embedding is not None
+        B = decoder_input_ids.shape[0]
+        x = self.embedding(decoder_input_ids) * self.embed_scale  # (B, 1, d_model)
+        # RoPE for this one position: offset by past_T
+        past_T = past_self_kv.shape[4]
+        # We use position (past_T) for the current token.
+        # Slice cos/sin at that single position: (1, head_dim//2)
+        cos_full, sin_full = self.rope.get_cos_sin(past_T + 1)
+        cos = cos_full[past_T:past_T + 1]   # (1, head_dim//2)
+        sin = sin_full[past_T:past_T + 1]
+        rope = (cos, sin)
+        # Causal mask: shape (1, 1, 1, past_T+1) — current token attends all past+self
+        self_mask = make_causal_mask(1, past_T, device=x.device)  # (1,1,1, past_T+1)
+        present_layers = []
+        for i, layer in enumerate(self.layers):
+            # Unpack this layer's past KV: each (B, num_kv_heads, past_T, head_dim)
+            layer_past_k = past_self_kv[i, 0]  # (B, num_kv_heads, past_T, head_dim)
+            layer_past_v = past_self_kv[i, 1]
+            layer_past = (layer_past_k, layer_past_v)
+            x, (k_new, v_new) = layer(
+                x, encoder_kv,
+                self_mask=self_mask,
+                cross_mask=None,
+                rope=rope,
+                past_self_kv=layer_past,
+            )
+            # k_new, v_new: (B, num_kv_heads, past_T+1, head_dim)
+            present_layers.append(torch.stack([k_new, v_new], dim=0))  # (2, B, nkv, T+1, hd)
+        # Stack layers: (num_decoder_layers, 2, B, num_kv_heads, past_T+1, head_dim)
+        present_kv = torch.stack(present_layers, dim=0)
+        x = self.final_norm(x)
+        logits = x.float() @ self.embedding.weight.T  # (B, 1, vocab_size)
+        return logits, present_kv
+# ---------------------------------------------------------------------------
+# NeedleModel
+# ---------------------------------------------------------------------------
+class NeedleModel(nn.Module):
+    """Top-level Needle Simple Attention Network — PyTorch port.
+    Mirrors SimpleAttentionNetwork (Flax).
+    Parameters
+    ----------
+    config : TransformerConfig
+        Architecture hyperparameters.  Pass production dims to get the 26M model.
+    """
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.config = config
+        # Shared embedding (tied output projection in decoder)
+        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
+        nn.init.normal_(self.embedding.weight, std=0.02)
+        self.encoder = Encoder(config)
+        self.decoder = Decoder(config)
+        # Wire up shared embedding
+        self.encoder.embedding = self.embedding
+        self.decoder.embedding = self.embedding
+        # Contrastive head — present in the Flax param tree
+        # contrastive_hidden: (d_model, d_model//4) with bias
+        self.contrastive_hidden = nn.Linear(config.d_model, config.d_model // 4, bias=True)
+        # contrastive_proj: (d_model//4, contrastive_dim) no bias
+        self.contrastive_proj = nn.Linear(config.d_model // 4, config.contrastive_dim, bias=False)
+        # Scalar contrastive temperature
+        self.log_temp = nn.Parameter(torch.zeros(()))
+    def forward(
+        self,
+        src: torch.Tensor,
+        tgt: torch.Tensor,
+        src_mask=None,
+        tgt_mask=None,
+        cross_mask=None,
+    ) -> torch.Tensor:
+        """Full encoder-decoder forward pass (training).
+        Returns logits: (B, T_dec, vocab_size)
+        """
+        encoder_out = self.encoder(src, mask=src_mask)
+        logits = self.decoder(tgt, encoder_out, self_mask=tgt_mask, cross_mask=cross_mask)
+        return logits