Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

config.json +36 -0
configuration_seqcond.py +111 -0
generation_utils.py +302 -0
model.safetensors +3 -0
modeling_seqcond.py +985 -0
tokenization_seqcond.py +220 -0
tokenizer_config.json +19 -0
triton_kernels.py +394 -0

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "model_type": "seqcond",
+  "architectures": [
+    "SeqCondForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_seqcond.SeqCondConfig",
+    "AutoModelForCausalLM": "modeling_seqcond.SeqCondForCausalLM",
+    "AutoTokenizer": [
+      "tokenization_seqcond.SeqCondTokenizer",
+      null
+    ]
+  },
+  "transformers_version": "5.3.0",
+  "d_model": 1024,
+  "d_ff": 2730,
+  "num_layers": 24,
+  "vocab_size": 100300,
+  "maxlen": 4096,
+  "num_heads": 16,
+  "num_kv_heads": 4,
+  "qk_norm": true,
+  "qk_norm_eps": 1e-06,
+  "seqcond_heads": 16,
+  "num_query_heads": 16,
+  "num_thetas": 2,
+  "conv_kernel_size": 4,
+  "expand_factor": 2.0,
+  "out_expand_factor": 3,
+  "seqcond_ratio": 2,
+  "skip_low_rank": false,
+  "num_anchor_heads": 0,
+  "eos_token_id": 100279,
+  "pad_token_id": 100279,
+  "bos_token_id": null
+}

configuration_seqcond.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""
+SeqCond HuggingFace configuration.
+"""
+from transformers import PretrainedConfig
+class SeqCondConfig(PretrainedConfig):
+    """
+    Configuration class for SeqCond models.
+    SeqCond is a hybrid recurrent-transformer architecture that interleaves
+    SeqCond (sequential conditioning) blocks with standard Transformer decoder
+    blocks. SeqCond blocks replace softmax attention with a closed-form
+    complex-exponential accumulator, enabling O(1) per-token decoding.
+    Args:
+        d_model: Hidden dimension.
+        d_ff: Feed-forward dimension (typically 3×d_model).
+        num_layers: Total number of blocks (SeqCond + Transformer).
+        vocab_size: Vocabulary size.
+        maxlen: Maximum sequence length (also sets KV-cache size).
+        dropout: Dropout rate (0.0 disables).
+        tie_weights: Whether to tie embedding and LM-head weights.
+        num_heads: Number of attention heads in Transformer blocks.
+        num_kv_heads: Number of KV heads (GQA). None = full MHA.
+        qk_norm: Whether to apply QK-normalization in Transformer blocks.
+        qk_norm_eps: Epsilon for QK-norm.
+        seqcond_heads: Number of SeqCond memory heads (K).
+        num_query_heads: Number of query heads in SeqCond (K_q, must divide K).
+        num_thetas: Number of frequency components per head (M).
+        derivative_order: Unused — kept for checkpoint compatibility.
+        num_anchor_heads: Number of anchor heads (no decay) in SeqCond.
+        conv_kernel_size: Depthwise conv kernel size inside SeqCond.
+        expand_factor: Inner expansion factor for SeqCond memory dimension.
+        out_expand_factor: SwiGLU expansion factor in SeqCond.
+        use_positional_embedding: Whether to add learnable positional embeddings.
+        seqcond_ratio: Block interleaving ratio. Every (seqcond_ratio+1)-th
+            block (1-indexed) is a Transformer block; the rest are SeqCond.
+        chunk_size: Chunk size for chunked computation (unused in PyTorch path).
+        use_square_matrix: Unused — kept for checkpoint compatibility.
+        bos_token_id: Beginning-of-sequence token ID, forwarded to
+            ``PretrainedConfig.__init__``.
+        eos_token_id: End-of-sequence token ID, forwarded to
+            ``PretrainedConfig.__init__``.
+        pad_token_id: Padding token ID, forwarded to
+            ``PretrainedConfig.__init__``.
+        **kwargs: Remaining keyword arguments, forwarded unchanged to
+            ``PretrainedConfig.__init__``.
+    NOTE(review): the defaults ``seqcond_heads=32`` and ``num_query_heads=6``
+    do not satisfy the "must divide K" constraint documented above
+    (32 % 6 != 0); set both explicitly when building a config by hand.
+    """
+    model_type = "seqcond"
+    def __init__(
+        self,
+        # Core
+        d_model: int = 768,
+        d_ff: int = 2304,
+        num_layers: int = 12,
+        vocab_size: int = 100300,
+        maxlen: int = 768,
+        dropout: float = 0.0,
+        tie_weights: bool = True,
+        # Transformer block params
+        num_heads: int = 8,
+        num_kv_heads=None,
+        qk_norm: bool = True,
+        qk_norm_eps: float = 1e-6,
+        # SeqCond block params
+        seqcond_heads: int = 32,
+        num_query_heads: int = 6,
+        num_thetas: int = 4,
+        derivative_order: int = 0,
+        num_anchor_heads: int = 0,
+        conv_kernel_size: int = 4,
+        expand_factor: float = 2.0,
+        out_expand_factor: int = 3,
+        use_positional_embedding: bool = False,
+        seqcond_ratio: int = 5,
+        chunk_size: int = 128,
+        use_square_matrix: bool = False,
+        # Special token IDs (filled in by convert_checkpoint.py)
+        bos_token_id=None,
+        eos_token_id=None,
+        pad_token_id=None,
+        **kwargs,
+    ):
+        # Store every model hyper-parameter as a plain attribute first ...
+        self.d_model = d_model
+        self.d_ff = d_ff
+        self.num_layers = num_layers
+        self.vocab_size = vocab_size
+        self.maxlen = maxlen
+        self.dropout = dropout
+        self.tie_weights = tie_weights
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.qk_norm = qk_norm
+        self.qk_norm_eps = qk_norm_eps
+        self.seqcond_heads = seqcond_heads
+        self.num_query_heads = num_query_heads
+        self.num_thetas = num_thetas
+        self.derivative_order = derivative_order
+        self.num_anchor_heads = num_anchor_heads
+        self.conv_kernel_size = conv_kernel_size
+        self.expand_factor = expand_factor
+        self.out_expand_factor = out_expand_factor
+        self.use_positional_embedding = use_positional_embedding
+        self.seqcond_ratio = seqcond_ratio
+        self.chunk_size = chunk_size
+        self.use_square_matrix = use_square_matrix
+        # ... then hand the special-token IDs and any extra kwargs to the base class.
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            **kwargs,
+        )

generation_utils.py ADDED Viewed

	@@ -0,0 +1,302 @@

+"""
+generation_utils.py — High-level generation helpers for SeqCond models.
+These functions drive the model's low-level prefill() / step() decoding API
+(reached through ``model.model``) behind a more user-friendly interface that
+handles tokenization, chat formatting, sampling, and streaming.
+Example usage:
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    model = AutoModelForCausalLM.from_pretrained("path/to/model", trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained("path/to/model", trust_remote_code=True)
+    model.eval().cuda()
+    text = generate(model, tokenizer, "What is 2 + 2?")
+    print(text)
+    # Batched
+    texts = generate_batch(model, tokenizer, ["What is 2+2?", "Name a planet."])
+"""
+from typing import Iterator, List, Optional
+import torch
+import torch.nn.functional as F
+_SEQ_LENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]  # power-of-2 for CUDA graphs
+def _quantized_seq_len(pos: int) -> int:
+    needed = pos + 1
+    for s in _SEQ_LENS:
+        if s >= needed:
+            return s
+    return _SEQ_LENS[-1]
+@torch.no_grad()
+def generate(
+    model,
+    tokenizer,
+    prompt: str,
+    max_new_tokens: int = 512,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.0,
+    use_chat_template: bool = True,
+    use_triton: bool = False,
+    strip_thinking: bool = False,
+    max_thinking_tokens: Optional[int] = None,
+) -> str:
+    """
+    Generate a single completion for *prompt*.
+    Args:
+        model: SeqCondForCausalLM instance.
+        tokenizer: SeqCondTokenizer instance.
+        prompt: Plain-text user prompt.
+        max_new_tokens: Maximum tokens to generate.
+        temperature: Sampling temperature (0 = greedy).
+        top_p: Nucleus sampling probability.
+        top_k: Top-k filtering (0 = disabled).
+        repetition_penalty: Penalty for repeating tokens.
+        use_chat_template: If True, wrap prompt in <|im_start|>user…<|think_start|>.
+        use_triton: If True, use Triton kernels for SeqCond steps.
+        strip_thinking: If True, return only the text after <|think_end|>.
+        max_thinking_tokens: If set, inject <|think_end|> after this many
+            thinking tokens to cap reasoning length.
+    Returns:
+        Generated text (completion only, EOS stripped).
+    """
+    device = next(model.parameters()).device
+    eos_id = tokenizer.im_end_id
+    think_end_id = tokenizer.think_end_id
+    if use_chat_template:
+        ids = tokenizer.encode_chat(prompt, add_think_start=True)
+    else:
+        ids = tokenizer.encode(prompt)
+    input_ids = torch.tensor([ids], dtype=torch.long, device=device)
+    logits, states = model.model.prefill(input_ids)
+    logits = logits.squeeze(1)
+    generated: List[int] = []
+    token_buf = torch.zeros((1, 1), dtype=torch.long, device=device)
+    seq_len = len(ids)
+    in_thinking = use_chat_template
+    thinking_tokens = 0
+    think_end_injected = False
+    counts: dict = {}
+    for _ in range(max_new_tokens):
+        ls = logits[0] / max(temperature, 1e-8) if temperature > 0 else logits[0].clone()
+        if repetition_penalty != 1.0:
+            for t in set(generated):
+                if 0 <= t < model.config.vocab_size:
+                    ls[t] /= repetition_penalty
+        if temperature == 0:
+            next_token = int(torch.argmax(ls))
+        else:
+            if top_k > 0:
+                kth = torch.topk(ls, top_k).values[-1]
+                ls = ls.masked_fill(ls < kth, float("-inf"))
+            if top_p < 1.0:
+                sorted_ls, sorted_idx = torch.sort(ls, descending=True)
+                cum = torch.cumsum(F.softmax(sorted_ls, dim=-1), dim=-1)
+                remove = cum > top_p
+                remove[1:] = remove[:-1].clone(); remove[0] = False
+                ls[sorted_idx[remove]] = float("-inf")
+            probs = F.softmax(ls, dim=-1)
+            next_token = int(torch.multinomial(probs, 1))
+        # Thinking budget
+        if next_token == think_end_id:
+            in_thinking = False
+        if in_thinking:
+            thinking_tokens += 1
+        if (
+            max_thinking_tokens is not None
+            and in_thinking
+            and thinking_tokens >= max_thinking_tokens
+            and not think_end_injected
+        ):
+            next_token = think_end_id
+            in_thinking = False
+            think_end_injected = True
+        generated.append(next_token)
+        if next_token == eos_id:
+            break
+        token_buf[0, 0] = next_token
+        seq_len += 1
+        logits, states = model.model.step(token_buf, states, seq_len=seq_len, use_triton=use_triton)
+    # Decode
+    if generated and generated[-1] == eos_id:
+        generated = generated[:-1]
+    text = tokenizer.decode(generated)
+    if strip_thinking and "<|think_end|>" in text:
+        text = text.split("<|think_end|>", 1)[1].strip()
+    return text
+@torch.no_grad()
+def generate_batch(
+    model,
+    tokenizer,
+    prompts: List[str],
+    max_new_tokens: int = 512,
+    temperature: float = 0.7,
+    use_chat_template: bool = True,
+    use_triton: bool = False,
+    strip_thinking: bool = False,
+) -> List[str]:
+    """
+    Batched generation for a list of prompts.
+    Each prompt is prefilled individually (no padding noise), then all
+    sequences are decoded in lockstep with per-sample early stopping.
+    Returns a list of completion strings (EOS stripped).
+    """
+    device = next(model.parameters()).device
+    eos_id = tokenizer.im_end_id
+    B = len(prompts)
+    if use_chat_template:
+        all_ids = [tokenizer.encode_chat(p, add_think_start=True) for p in prompts]
+    else:
+        all_ids = [tokenizer.encode(p) for p in prompts]
+    # Individual prefills
+    all_logits, all_states = [], []
+    for ids in all_ids:
+        inp = torch.tensor([ids], dtype=torch.long, device=device)
+        lg, st = model.model.prefill(inp)
+        all_logits.append(lg.squeeze(1))
+        all_states.append(st)
+    logits = torch.cat(all_logits, dim=0)
+    num_blocks = len(all_states[0])
+    states = [
+        tuple(torch.cat([s[i][j] for s in all_states], dim=0) for j in range(len(all_states[0][i])))
+        for i in range(num_blocks)
+    ]
+    generated = [[] for _ in range(B)]
+    finished = [False] * B
+    active_map = list(range(B))
+    token_buf = torch.zeros((B, 1), dtype=torch.long, device=device)
+    seq_len = max(len(ids) for ids in all_ids)
+    for _ in range(max_new_tokens):
+        B_cur = len(active_map)
+        if B_cur == 0:
+            break
+        if temperature == 0:
+            next_tokens = torch.argmax(logits, dim=-1)
+        else:
+            probs = F.softmax(logits / max(temperature, 1e-8), dim=-1)
+            next_tokens = torch.multinomial(probs, 1).squeeze(-1)
+        newly_done: set = set()
+        for bi in range(B_cur):
+            oi = active_map[bi]
+            tok = int(next_tokens[bi])
+            generated[oi].append(tok)
+            if tok == eos_id:
+                finished[oi] = True
+                newly_done.add(bi)
+            else:
+                token_buf[bi, 0] = tok
+        if all(finished):
+            break
+        if newly_done:
+            keep = [bi for bi in range(B_cur) if bi not in newly_done]
+            if not keep:
+                break
+            keep_idx = torch.tensor(keep, device=device)
+            token_buf = token_buf[keep_idx].contiguous()
+            states = [tuple(s[keep_idx].contiguous() for s in st) for st in states]
+            logits = logits[keep_idx]
+            active_map = [active_map[bi] for bi in keep]
+        seq_len += 1
+        logits, states = model.model.step(token_buf, states, seq_len=seq_len, use_triton=use_triton)
+    results = []
+    for toks in generated:
+        if toks and toks[-1] == eos_id:
+            toks = toks[:-1]
+        text = tokenizer.decode(toks)
+        if strip_thinking and "<|think_end|>" in text:
+            text = text.split("<|think_end|>", 1)[1].strip()
+        results.append(text)
+    return results
+@torch.no_grad()
+def stream(
+    model,
+    tokenizer,
+    prompt: str,
+    max_new_tokens: int = 512,
+    temperature: float = 0.7,
+    use_chat_template: bool = True,
+    use_triton: bool = False,
+) -> Iterator[str]:
+    """
+    Streaming token-by-token generation.
+    Yields decoded text fragments as they are produced. Useful for interactive
+    applications (e.g., a chat interface).
+    Example:
+        for fragment in stream(model, tokenizer, "Explain gravity."):
+            print(fragment, end="", flush=True)
+    """
+    device = next(model.parameters()).device
+    eos_id = tokenizer.im_end_id
+    if use_chat_template:
+        ids = tokenizer.encode_chat(prompt, add_think_start=True)
+    else:
+        ids = tokenizer.encode(prompt)
+    input_ids = torch.tensor([ids], dtype=torch.long, device=device)
+    logits, states = model.model.prefill(input_ids)
+    logits = logits.squeeze(1)
+    token_buf = torch.zeros((1, 1), dtype=torch.long, device=device)
+    seq_len = len(ids)
+    for _ in range(max_new_tokens):
+        if temperature == 0:
+            next_token = int(torch.argmax(logits[0]))
+        else:
+            probs = F.softmax(logits[0] / max(temperature, 1e-8), dim=-1)
+            next_token = int(torch.multinomial(probs, 1))
+        if next_token == eos_id:
+            break
+        try:
+            yield tokenizer.decode([next_token])
+        except Exception:
+            yield ""
+        token_buf[0, 0] = next_token
+        seq_len += 1
+        logits, states = model.model.step(token_buf, states, seq_len=seq_len, use_triton=use_triton)

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff37a8ff2b5b7f7fbe456efae39a5c7f82460b31e27239acee0210e3f044a0dc
+size 949771696

modeling_seqcond.py ADDED Viewed

	@@ -0,0 +1,985 @@

+"""
+SeqCond model — self-contained HuggingFace implementation.
+All model code is embedded here so that trust_remote_code=True works without
+any dependency on the original seqcond package.
+Architecture:
+  - Hybrid recurrent-transformer: every (seqcond_ratio+1)-th block (1-indexed)
+    is a standard Transformer decoder block; the rest are SeqCond blocks.
+  - SeqCond blocks use complex-exponential accumulators (den_acc, re_acc, im_acc)
+    for O(1) per-token autoregressive decoding.
+  - Transformer blocks use GQA with RoPE and KV-cache for autoregressive decoding.
+"""
+import math
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from .configuration_seqcond import SeqCondConfig
+# ---------------------------------------------------------------------------
+# Optional Triton kernels (accelerates SeqCond step, not required)
+# ---------------------------------------------------------------------------
+try:
+    from .triton_kernels import (
+        gated_rmsnorm_triton,
+        seqcond_step_triton,
+        TRITON_AVAILABLE,
+    )
+except ImportError:
+    gated_rmsnorm_triton = None
+    TRITON_AVAILABLE = False
+    seqcond_step_triton = None
+# ---------------------------------------------------------------------------
+# Normalisation layers
+# ---------------------------------------------------------------------------
+class RMSNorm(nn.Module):
+    def __init__(self, hidden_size: int, epsilon: float = 1e-5):
+        super().__init__()
+        self.epsilon = epsilon
+        self.scale = nn.Parameter(torch.ones(hidden_size))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        orig = x.dtype
+        x = x.float()
+        x = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.epsilon)
+        return (x * self.scale.float()).to(orig)
+class GatedRMSNorm(nn.Module):
+    """RMSNorm with SiLU gating: rmsnorm(x * silu(residual))."""
+    def __init__(self, hidden_size: int, epsilon: float = 1e-6):
+        super().__init__()
+        self.epsilon = epsilon
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+    def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
+        orig = x.dtype
+        x = x.float() * F.silu(residual.float())
+        x = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.epsilon)
+        return (x * self.weight.float()).to(orig)
+# ---------------------------------------------------------------------------
+# Rotary Position Embedding
+# ---------------------------------------------------------------------------
+def precompute_freqs(maxlen: int, head_dim: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    half_d = head_dim // 2
+    pos = np.arange(maxlen)[:, None]
+    dim = np.arange(half_d)[None, :]
+    angles = pos * (1.0 / (10000 ** (dim / half_d)))
+    cos = torch.from_numpy(np.cos(angles).astype(np.float32))
+    sin = torch.from_numpy(np.sin(angles).astype(np.float32))
+    return cos, sin
+def apply_rope(tensor: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+    dim = tensor.shape[-1] // 2
+    cos = cos[..., :dim]
+    sin = sin[..., :dim]
+    x1, x2 = tensor[..., :dim], tensor[..., dim:]
+    return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1).view(tensor.shape)
+# ---------------------------------------------------------------------------
+# Transformer decoder block (GQA + RoPE)
+# ---------------------------------------------------------------------------
+class RotarySelfAttention(nn.Module):
+    """
+    Causal multi-head self-attention with rotary position embeddings.
+    Supports grouped KV heads (fewer K/V heads than query heads, each shared
+    by ``num_groups`` query heads) and optional parameter-free RMS
+    normalisation of queries and keys. ``forward`` processes a whole sequence;
+    ``step`` processes one token against a KV cache.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        num_kv_heads: Optional[int] = None,
+        dropout: float = 0.0,
+        qk_norm: bool = False,
+        qk_norm_eps: float = 1e-6,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.num_heads = num_heads
+        # None means one KV head per query head (plain multi-head attention).
+        self._num_kv_heads = num_kv_heads if num_kv_heads is not None else num_heads
+        # Number of query heads that share each KV head.
+        self.num_groups = num_heads // self._num_kv_heads
+        self.head_dim = d_model // num_heads
+        self.dropout = dropout
+        self.qk_norm = qk_norm
+        self.qk_norm_eps = qk_norm_eps
+        self.q_proj = nn.Linear(d_model, d_model, bias=False)
+        self.k_proj = nn.Linear(d_model, self._num_kv_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(d_model, self._num_kv_heads * self.head_dim, bias=False)
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+    def _repeat_kv(self, x: torch.Tensor) -> torch.Tensor:
+        """Expand the KV-head axis (dim 2) from ``_num_kv_heads`` to ``num_heads``
+        by repeating every KV head ``num_groups`` times."""
+        if self.num_groups == 1:
+            return x
+        b, l = x.shape[:2]
+        extra = x.shape[2:]
+        x = x.view(b, l, self._num_kv_heads, 1, *extra[1:])
+        x = x.expand(b, l, self._num_kv_heads, self.num_groups, *extra[1:])
+        return x.reshape(b, l, self.num_heads, *extra[1:])
+    def forward(
+        self,
+        x: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        return_state: bool = False,
+    ):
+        """
+        Full-sequence causal attention.
+        Args:
+            x: Input of shape (batch, length, d_model).
+            cos: Rotary cosine table for these positions.
+            sin: Rotary sine table for these positions.
+            mask: Accepted for interface compatibility but not used here; a
+                causal mask is always built internally.
+            return_state: If True, also return the (k, v) tensors with
+                ``_num_kv_heads`` heads (before group repetition).
+        Returns:
+            The projected attention output, or ``(output, (k, v))`` when
+            ``return_state`` is True.
+        """
+        b, l = x.shape[0], x.shape[1]
+        q = self.q_proj(x).reshape(b, l, self.num_heads, self.head_dim)
+        k = self.k_proj(x).reshape(b, l, self._num_kv_heads, self.head_dim)
+        v = self.v_proj(x).reshape(b, l, self._num_kv_heads, self.head_dim)
+        q = apply_rope(q, cos, sin)
+        # With fewer KV heads, slice dim 2 of the tables down to the KV head
+        # count (assumes cos/sin are 4-D with a head axis at dim 2 — TODO confirm).
+        cos_kv = cos[:, :, : self._num_kv_heads, :] if self._num_kv_heads < self.num_heads else cos
+        sin_kv = sin[:, :, : self._num_kv_heads, :] if self._num_kv_heads < self.num_heads else sin
+        k = apply_rope(k, cos_kv, sin_kv)
+        if self.qk_norm:
+            # Parameter-free RMS normalisation over head_dim, computed in float32.
+            q_f = q.float(); k_f = k.float()
+            q = (q_f * torch.rsqrt(q_f.pow(2).mean(-1, keepdim=True) + self.qk_norm_eps)).to(q.dtype)
+            k = (k_f * torch.rsqrt(k_f.pow(2).mean(-1, keepdim=True) + self.qk_norm_eps)).to(k.dtype)
+        # Keep the un-repeated K/V for the returned state.
+        k_cache = k; v_cache = v
+        k = self._repeat_kv(k); v = self._repeat_kv(v)
+        scale = 1.0 / math.sqrt(self.head_dim)
+        scores = torch.einsum("blhd,bmhd->bhlm", q, k) * scale
+        causal = torch.tril(torch.ones(l, l, dtype=torch.bool, device=x.device)).unsqueeze(0).unsqueeze(0)
+        # Future positions get a large finite negative score (-1e4), not -inf
+        # (``step`` below uses -inf instead).
+        scores = torch.where(causal, scores, torch.full_like(scores, -1e4))
+        attn = F.softmax(scores.float(), dim=-1).to(v.dtype)
+        if self.dropout > 0 and self.training:
+            attn = F.dropout(attn, p=self.dropout)
+        out = torch.einsum("bhql,blhd->bqhd", attn, v).reshape(b, l, self.d_model).to(x.dtype)
+        if return_state:
+            return self.out_proj(out), (k_cache, v_cache)
+        return self.out_proj(out)
+    def step(
+        self,
+        x_t: torch.Tensor,
+        kv_cache: Tuple[torch.Tensor, torch.Tensor],
+        pos: torch.Tensor,
+        cos_t: torch.Tensor,
+        sin_t: torch.Tensor,
+        seq_len: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, Tuple]:
+        """
+        Single-token attention against a KV cache.
+        Args:
+            x_t: Current-token input; one row per batch element.
+            kv_cache: ``(k_cache, v_cache)``; both are updated IN PLACE at
+                index ``pos`` along dim 1.
+            pos: Per-sample position of the current token (one value per
+                batch element).
+            cos_t: Rotary cosine values for the current position.
+            sin_t: Rotary sine values for the current position.
+            seq_len: If given, attend only to the first ``seq_len`` cache
+                slots; otherwise to the whole cache.
+        Returns:
+            ``(output, (k_cache, v_cache))`` with output of shape (batch, d_model).
+        """
+        b = x_t.shape[0]
+        q = self.q_proj(x_t).reshape(b, 1, self.num_heads, self.head_dim)
+        k_new = self.k_proj(x_t).reshape(b, 1, self._num_kv_heads, self.head_dim)
+        v_new = self.v_proj(x_t).reshape(b, 1, self._num_kv_heads, self.head_dim)
+        q = apply_rope(q, cos_t, sin_t)
+        cos_kv = cos_t[:, :, : self._num_kv_heads, :] if self._num_kv_heads < self.num_heads else cos_t
+        sin_kv = sin_t[:, :, : self._num_kv_heads, :] if self._num_kv_heads < self.num_heads else sin_t
+        k_new = apply_rope(k_new, cos_kv, sin_kv)
+        if self.qk_norm:
+            q_f = q.float(); k_f = k_new.float()
+            q = (q_f * torch.rsqrt(q_f.pow(2).mean(-1, keepdim=True) + self.qk_norm_eps)).to(q.dtype)
+            k_new = (k_f * torch.rsqrt(k_f.pow(2).mean(-1, keepdim=True) + self.qk_norm_eps)).to(k_new.dtype)
+        k_cache, v_cache = kv_cache
+        # Write the new K/V into slot ``pos`` of each sample's cache (in place).
+        pos_idx = pos.long().view(b, 1, 1, 1).expand(-1, 1, k_new.size(2), k_new.size(3))
+        k_cache.scatter_(1, pos_idx, k_new.to(k_cache.dtype))
+        v_cache.scatter_(1, pos_idx, v_new.to(v_cache.dtype))
+        if seq_len is not None:
+            k_slice, v_slice = k_cache[:, :seq_len], v_cache[:, :seq_len]; L = seq_len
+        else:
+            k_slice, v_slice = k_cache, v_cache; L = k_cache.shape[1]
+        k_r = self._repeat_kv(k_slice); v_r = self._repeat_kv(v_slice)
+        # True for cache slots after the current position; those are masked out.
+        mask = torch.arange(L, device=k_cache.device).view(1, 1, 1, L) > pos.long().view(b, 1, 1, 1)
+        scale = 1.0 / math.sqrt(self.head_dim)
+        scores = torch.einsum("bqhd,bkhd->bhqk", q, k_r) * scale
+        scores = scores.masked_fill(mask, float("-inf"))
+        attn = F.softmax(scores.float(), dim=-1).to(v_r.dtype)
+        out = torch.einsum("bhqk,bkhd->bqhd", attn, v_r).reshape(b, self.d_model).to(x_t.dtype)
+        return self.out_proj(out), (k_cache, v_cache)
+class TransformerDecoderBlock(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        d_ff: int,
+        num_kv_heads: Optional[int] = None,
+        dropout: float = 0.0,
+        norm_eps: float = 1e-6,
+        qk_norm: bool = False,
+        qk_norm_eps: float = 1e-6,
+    ):
+        super().__init__()
+        self.norm1 = RMSNorm(d_model, epsilon=norm_eps)
+        self.attn = RotarySelfAttention(d_model, num_heads, num_kv_heads, dropout, qk_norm, qk_norm_eps)
+        self.norm2 = RMSNorm(d_model, epsilon=norm_eps)
+        self.ff_in = nn.Linear(d_model, 2 * d_ff, bias=True)
+        self.ff_out = nn.Linear(d_ff, d_model, bias=True)
+        self.dropout = dropout
+    def forward(self, x, cos, sin, mask=None, return_state=False):
+        y = self.norm1(x)
+        if return_state:
+            y, kv = self.attn(y, cos=cos, sin=sin, mask=mask, return_state=True)
+        else:
+            y = self.attn(y, cos=cos, sin=sin, mask=mask)
+        if self.dropout > 0 and self.training:
+            y = F.dropout(y, p=self.dropout)
+        x = x + y
+        y = self.norm2(x)
+        u, v = self.ff_in(y).chunk(2, dim=-1)
+        y = self.ff_out(F.silu(v) * u)
+        if self.dropout > 0 and self.training:
+            y = F.dropout(y, p=self.dropout)
+        out = x + y
+        return (out, kv) if return_state else out
+    def step(self, x_t, kv_cache, pos, cos_t, sin_t, seq_len=None):
+        y = self.norm1(x_t)
+        y, new_kv = self.attn.step(y, kv_cache, pos, cos_t, sin_t, seq_len=seq_len)
+        x_t = x_t + y
+        y = self.norm2(x_t)
+        u, v = self.ff_in(y).chunk(2, dim=-1)
+        return x_t + self.ff_out(F.silu(v) * u), new_kv
+# ---------------------------------------------------------------------------
+# SeqCond attention block
+# ---------------------------------------------------------------------------
+class SeqCondAttention(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int = 12,
+        num_query_heads: int = 6,
+        num_anchor_heads: int = 0,
+        num_thetas: int = 1,
+        conv_kernel_size: int = 4,
+        expand_factor: int = 1,
+        out_expand_factor: int = 3,
+        dropout: float = 0.0,
+        maxlen: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Build the SeqCond layer's projections and learnable parameters.
+        Args:
+            d_model: Model (input/output) dimension.
+            num_heads: Number of memory heads (stored as ``K``).
+            num_query_heads: Number of query heads (``K_q``); must divide
+                ``num_heads`` (asserted).
+            num_anchor_heads: Heads that use ``anchor_slopes``; the remaining
+                ``num_heads - num_anchor_heads`` heads use ``decay_slopes``.
+            num_thetas: Number of frequency components per head (``M``).
+            conv_kernel_size: Kernel width of the per-channel convolution weight.
+            expand_factor: Multiplier on ``d_model`` for the inner width
+                (passed through ``int(d_model * expand_factor)``, so
+                non-integer values work despite the ``int`` annotation).
+            out_expand_factor: Multiplier on ``H`` for the readout width.
+            dropout: Stored as ``dropout_rate``.
+            maxlen: Stored as ``maxlen``.
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__()
+        assert num_heads % num_query_heads == 0
+        self.d_model = d_model
+        self.K = num_heads
+        self.K_q = num_query_heads
+        # Memory heads served by each query head.
+        self.n_rep = num_heads // num_query_heads
+        self.M = num_thetas
+        self.num_decay_heads = num_heads - num_anchor_heads
+        self.num_anchor_heads = num_anchor_heads
+        self.conv_kernel_size = conv_kernel_size
+        self.dropout_rate = dropout
+        self.maxlen = maxlen
+        d_inner = int(d_model * expand_factor)
+        # Per-head feature width: inner width split across K heads and M thetas.
+        self.H = max(1, d_inner // (self.K * self.M))
+        self.dim_memory = self.K * self.H
+        # Each query head carries H * M pairs (the factor 2) of values.
+        self.dim_query_head = self.H * self.M * 2
+        self.dim_query_total = self.K_q * self.dim_query_head
+        self.dim_expand = self.H * out_expand_factor
+        self.dim_swiglu_head = self.dim_expand * 2
+        self.dim_swiglu_total = self.K * self.dim_swiglu_head
+        # Memory channels plus one extra channel per head.
+        self.dim_mem_total = self.dim_memory + self.K
+        self.dim_conv_total = self.dim_mem_total + self.dim_query_total
+        self.in_proj = nn.Linear(d_model, self.dim_conv_total, bias=False)
+        # One kernel per channel: shape (channels, 1, kernel_size).
+        self.conv_weight = nn.Parameter(torch.empty(self.dim_conv_total, 1, conv_kernel_size))
+        nn.init.kaiming_normal_(self.conv_weight)
+        # Cached buffers (computed lazily)
+        self.register_buffer("_conv_kernel_t", None)
+        self.register_buffer("_theta_cached", None)
+        self.register_buffer("_w_int_cached", None)
+        self.register_buffer("_decay_slopes_cached", None)
+        self.register_buffer("_anchor_slopes_cached", None)
+        self.register_buffer("_phase_scale_b", None)
+        self.register_buffer("_score_scale_b", None)
+        self.register_buffer("_score_bias_b", None)
+        # Scratch slots, initially empty (presumably filled by the Triton path — TODO confirm).
+        self._triton_out_re_buffer = None
+        self._triton_out_im_buffer = None
+        self._triton_norm_buffer = None
+        if self.M == 1:
+            # Single theta per head: geometric spacing in [0.001, 3.0] across the
+            # K heads, stored as the logit of its position within that range.
+            init_theta = np.geomspace(0.001, 3.0, self.K).reshape(1, 1, self.K, 1, 1)
+            init_theta = np.tile(init_theta, (1, 1, 1, self.H, 1))
+            x = np.clip((init_theta - 0.001) / 2.999, 1e-4, 1 - 1e-4)
+            self.theta_raw = nn.Parameter(torch.from_numpy((np.log(x) - np.log(1 - x)).astype(np.float32)))
+            self.w_int_raw = nn.Parameter(torch.zeros(1, 1, self.K_q, self.n_rep, self.H, 1))
+        else:
+            # Several thetas per head: geometric spacing across the M components,
+            # stored as log(exp(v) - 1 + 1e-4).
+            init_vals = np.geomspace(0.001, 3.0, self.M).reshape(1, 1, 1, 1, self.M)
+            init_vals = np.tile(init_vals, (1, 1, self.K, self.H, 1))
+            self.theta_d_raw = nn.Parameter(torch.from_numpy(np.log(np.exp(init_vals) - 1.0 + 1e-4).astype(np.float32)))
+            self.w_int_raw = nn.Parameter(torch.zeros(1, 1, self.K_q, self.n_rep, self.H, self.M))
+        if self.num_decay_heads > 0:
+            # Stored as log(exp(s) - 1) for s geometrically spaced in [0.001, 0.1].
+            self.decay_slopes = nn.Parameter(
+                torch.from_numpy(np.log(np.exp(np.geomspace(0.001, 0.1, self.num_decay_heads)) - 1).astype(np.float32))
+            )
+        if self.num_anchor_heads > 0:
+            # Same encoding, for s geometrically spaced in [0.01, 0.1].
+            self.anchor_slopes = nn.Parameter(
+                torch.from_numpy(np.log(np.exp(np.geomspace(0.01, 0.1, self.num_anchor_heads)) - 1).astype(np.float32))
+            )
+        # Per-head affine on the score channel and per-head phase scale.
+        self.score_scale = nn.Parameter(torch.ones(self.K))
+        self.score_bias = nn.Parameter(torch.zeros(self.K))
+        self.phase_scale = nn.Parameter(torch.ones(self.K))
+        self.gate_proj = nn.Linear(d_model, self.K * 2 * self.H, bias=False)
+        self.gated_norm = GatedRMSNorm(self.K * 2 * self.H)
+        self.W_readout = nn.Parameter(torch.empty(self.K, 2 * self.H, self.dim_swiglu_head))
+        nn.init.xavier_uniform_(self.W_readout)
+        # Input width is half of dim_swiglu_total (presumably because the SwiGLU
+        # gate halves the readout width — TODO confirm against forward()).
+        self.out_proj = nn.Linear(self.dim_swiglu_total // 2, d_model, bias=False)
+    def forward(self, x: torch.Tensor, mask=None, return_state: bool = False):
+        B, L, D = x.shape
+        z_conv = self.in_proj(x)
+        z_conv_t = F.pad(z_conv.transpose(1, 2), (self.conv_kernel_size - 1, 0))
+        z_conv = F.silu(F.conv1d(z_conv_t, self.conv_weight, groups=self.dim_conv_total).transpose(1, 2))
+        z_mem = z_conv[..., : self.dim_mem_total]
+        q_raw = z_conv[..., self.dim_mem_total :]
+        k_val = z_mem[..., : self.dim_memory].reshape(B, L, self.K, self.H)
+        s_raw = z_mem[..., self.dim_memory :]
+        q_raw = q_raw.reshape(B, L, self.K_q, 1, self.H, self.M, 2)
+        q_re, q_im = q_raw[..., 0], q_raw[..., 1]
+        if self.M == 1:
+            theta = 0.001 + 2.999 * torch.sigmoid(self.theta_raw)
+        else:
+            theta_d = F.softplus(self.theta_d_raw) + 1e-4
+            theta_accum = torch.cumsum(theta_d, dim=-1)
+            theta = 0.001 + (theta_accum / theta_accum[..., -1:]) * 2.999
+        w_int = torch.exp(self.w_int_raw)
+        w_int = w_int / (w_int.sum(dim=-1, keepdim=True) + 1e-6)
+        pos = torch.arange(L, dtype=torch.float32, device=x.device)
+        log_w_list = []
+        if self.num_decay_heads > 0:
+            slopes = F.softplus(self.decay_slopes).view(1, 1, -1)
+            dist = torch.clamp((self.maxlen or L) - 1 - pos, min=0.0).view(1, L, 1)
+            log_w_list.append(-slopes * dist)
+        if self.num_anchor_heads > 0:
+            log_w_list.append(-F.softplus(self.anchor_slopes).view(1, 1, -1) * pos.view(1, L, 1))
+        log_tw = torch.cat(log_w_list, dim=2) if log_w_list else torch.zeros(1, L, self.K, device=x.device)
+        score_raw = self.score_scale.view(1, 1, -1) * s_raw.float() + self.score_bias.view(1, 1, -1)
+        p_w = (F.softplus(score_raw) * torch.exp(log_tw)).clamp(1e-4, 5000.0)
+        k_f32 = k_val.float().unsqueeze(-1)
+        p_w_b = p_w.unsqueeze(-1).unsqueeze(-1)
+        phase_scale_b = self.phase_scale.view(1, 1, self.K, 1, 1)
+        k_scaled = k_f32 * phase_scale_b
+        phi = (k_scaled / (1.0 + k_scaled.abs())) * theta
+        kvw = k_f32 * p_w_b
+        re = kvw * torch.cos(phi)
+        im = kvw * torch.sin(phi)
+        flat_size = self.K * self.H * self.M
+        stack = torch.cat([p_w.float(), re.reshape(B, L, -1), im.reshape(B, L, -1)], dim=-1)
+        cumsum = torch.cumsum(stack, dim=1)
+        den_acc = cumsum[..., : self.K]
+        re_acc = cumsum[..., self.K : self.K + flat_size].reshape(B, L, self.K, self.H, self.M)
+        im_acc = cumsum[..., self.K + flat_size :].reshape(B, L, self.K, self.H, self.M)
+        inv_den = (1.0 / torch.clamp(den_acc, min=1e-4)).unsqueeze(-1).unsqueeze(-1)
+        state_re_g = (re_acc * inv_den).reshape(B, L, self.K_q, self.n_rep, self.H, self.M)
+        state_im_g = (im_acc * inv_den).reshape(B, L, self.K_q, self.n_rep, self.H, self.M)
+        scale = 1.0 / (self.H ** 0.5)
+        match_re = ((state_re_g * q_re + state_im_g * q_im) * scale).float()
+        match_im = ((state_im_g * q_re - state_re_g * q_im) * scale).float()
+        out_re = ((match_re * w_int.float()).sum(dim=-1)).reshape(B, L, self.K, self.H).to(x.dtype)
+        out_im = ((match_im * w_int.float()).sum(dim=-1)).reshape(B, L, self.K, self.H).to(x.dtype)
+        out_complex = self.gated_norm(torch.cat([out_re, out_im], dim=-1).reshape(B, L, -1), self.gate_proj(x))
+        out_complex = out_complex.reshape(B, L, self.K, 2 * self.H)
+        y_raw = torch.einsum("blkf,kfn->blkn", out_complex, self.W_readout.to(out_complex.dtype))
+        y_val, y_gate = y_raw.chunk(2, dim=-1)
+        output = self.out_proj((y_val * torch.sigmoid(y_gate)).reshape(B, L, -1).to(x.dtype))
+        if return_state:
+            z_pre = self.in_proj(x)
+            buf_sz = self.conv_kernel_size - 1
+            conv_buf = z_pre[:, -buf_sz:] if L >= buf_sz else torch.cat([
+                torch.zeros(B, buf_sz - L, self.dim_conv_total, device=x.device, dtype=z_pre.dtype), z_pre], dim=1)
+            state = (
+                p_w.sum(dim=1),
+                re_acc[:, -1],
+                im_acc[:, -1],
+                torch.full((B,), L, dtype=torch.float32, device=x.device),
+                conv_buf,
+            )
+            return output, state
+        return output
+    def step(self, x_t: torch.Tensor, state: Tuple, use_triton: bool = False) -> Tuple:
+        """Advance the recurrence by one token (incremental decoding).
+        Args:
+            x_t: Current-token input of shape (B, D).
+            state: ``(den_acc, re_acc, im_acc, pos, conv_buffer)`` as produced
+                by ``forward(..., return_state=True)``.
+            use_triton: Request the Triton kernels; they are only used when the
+                extra conditions checked below hold, otherwise the PyTorch
+                path runs.
+        Returns:
+            ``(out, state)`` where ``state`` holds the same tensor objects that
+            were passed in. On the PyTorch path the accumulators, ``pos`` and
+            ``conv_buffer`` are updated in place.
+        NOTE(review): several derived tensors are cached on ``self`` the first
+        time this runs and are never refreshed in this method, so they go stale
+        if the underlying parameters change afterwards.
+        """
+        B, D = x_t.shape
+        den_acc, re_acc, im_acc, pos, conv_buffer = state
+        z_conv = self.in_proj(x_t)
+        # Transposed depthwise kernel, rebuilt only when missing or when the
+        # device changes (dtype changes are not checked).
+        if self._conv_kernel_t is None or self._conv_kernel_t.device != z_conv.device:
+            self._conv_kernel_t = self.conv_weight[:, 0, :].t().contiguous()
+        # Causal convolution for one step: buffered past projections plus the
+        # current one, weighted by the kernel and summed over the time axis.
+        conv_input = torch.cat([conv_buffer, z_conv.unsqueeze(1)], dim=1)
+        z_conv_act = F.silu((conv_input * self._conv_kernel_t).sum(dim=1))
+        z_mem = z_conv_act[..., : self.dim_mem_total]
+        q_raw = z_conv_act[..., self.dim_mem_total :]
+        k_val = z_mem[..., : self.dim_memory].reshape(B, self.K, self.H)
+        s_raw = z_mem[..., self.dim_memory :]
+        q_raw = q_raw.reshape(B, self.K_q, 1, self.H, self.M, 2)
+        q_re, q_im = q_raw[..., 0], q_raw[..., 1]
+        # Same theta / w_int formulas as in forward(), computed once and cached;
+        # the [0, 0] index drops the two leading dimensions.
+        if self._theta_cached is None:
+            if self.M == 1:
+                self._theta_cached = (0.001 + 2.999 * torch.sigmoid(self.theta_raw))[0, 0]
+            else:
+                theta_d = F.softplus(self.theta_d_raw) + 1e-4
+                theta_accum = torch.cumsum(theta_d, dim=-1)
+                self._theta_cached = (0.001 + (theta_accum / theta_accum[..., -1:]) * 2.999)[0, 0]
+            w = torch.exp(self.w_int_raw)
+            self._w_int_cached = w / (w.sum(dim=-1, keepdim=True) + 1e-6)
+            self._w_int_cached = self._w_int_cached[0, 0]
+        theta = self._theta_cached
+        w_int = self._w_int_cached
+        if self._decay_slopes_cached is None and self.num_decay_heads > 0:
+            self._decay_slopes_cached = F.softplus(self.decay_slopes).view(1, -1)
+        if self._anchor_slopes_cached is None and self.num_anchor_heads > 0:
+            self._anchor_slopes_cached = F.softplus(self.anchor_slopes).view(1, -1)
+        if self._score_scale_b is None:
+            self._score_scale_b = self.score_scale.view(1, -1)
+            self._score_bias_b = self.score_bias.view(1, -1)
+            self._phase_scale_b = self.phase_scale.view(1, self.K, 1, 1)
+        # Per-head log time weights for the current position.
+        log_w_list = []
+        if self.num_decay_heads > 0:
+            # Falls back to 2048 when maxlen is falsy (forward() falls back to L).
+            dist = (self.maxlen or 2048) - 1 - pos.unsqueeze(-1)
+            log_w_list.append(-self._decay_slopes_cached * dist.clamp(min=0.0))
+        if self.num_anchor_heads > 0:
+            log_w_list.append(-self._anchor_slopes_cached * pos.unsqueeze(-1))
+        log_tw = torch.cat(log_w_list, dim=1) if log_w_list else torch.zeros(B, self.K, device=x_t.device)
+        # Fused Triton kernel: only for CUDA inputs with n_rep == 1 and when
+        # the kernel was importable.
+        if (
+            use_triton
+            and x_t.is_cuda
+            and self.n_rep == 1
+            and TRITON_AVAILABLE
+            and seqcond_step_triton is not None
+        ):
+            # Output buffers are reused across calls and reallocated only when
+            # the batch shape or device changes.
+            if (
+                self._triton_out_re_buffer is None
+                or self._triton_out_re_buffer.shape != (B, self.K, self.H)
+                or self._triton_out_re_buffer.device != x_t.device
+            ):
+                self._triton_out_re_buffer = torch.empty(
+                    B, self.K, self.H, device=x_t.device, dtype=torch.float32
+                )
+                self._triton_out_im_buffer = torch.empty_like(
+                    self._triton_out_re_buffer
+                )
+            # NOTE(review): presumably the kernel updates re_acc / im_acc /
+            # den_acc in place like the PyTorch branch does — confirm against
+            # the kernel source, which is not visible here.
+            out_re, out_im = seqcond_step_triton(
+                k_val,
+                s_raw,
+                q_re.squeeze(2),
+                q_im.squeeze(2),
+                re_acc,
+                im_acc,
+                den_acc,
+                theta,
+                w_int,
+                self.phase_scale,
+                self.score_scale,
+                self.score_bias,
+                log_tw,
+                out_re_buffer=self._triton_out_re_buffer,
+                out_im_buffer=self._triton_out_im_buffer,
+            )
+            out_complex = torch.cat([out_re, out_im], dim=-1)
+        else:
+            # PyTorch reference path: mirrors forward() for a single position.
+            score_raw = self._score_scale_b * s_raw.float() + self._score_bias_b
+            p_w = (F.softplus(score_raw) * torch.exp(log_tw)).clamp(1e-4, 5000.0)
+            k_f32 = k_val.float().unsqueeze(-1)
+            k_scaled = k_f32 * self._phase_scale_b
+            phi = (k_scaled / (1.0 + k_scaled.abs())) * theta
+            kvw = k_f32 * p_w.unsqueeze(-1).unsqueeze(-1)
+            re = kvw * torch.cos(phi)
+            im = kvw * torch.sin(phi)
+            # In-place accumulation: the caller's state tensors are mutated.
+            den_acc.add_(p_w); re_acc.add_(re); im_acc.add_(im)
+            inv_den = (1.0 / torch.clamp(den_acc, min=1e-4)).unsqueeze(-1).unsqueeze(-1)
+            state_re_g = (re_acc * inv_den).reshape(B, self.K_q, self.n_rep, self.H, self.M)
+            state_im_g = (im_acc * inv_den).reshape(B, self.K_q, self.n_rep, self.H, self.M)
+            scale = 1.0 / (self.H ** 0.5)
+            match_re = ((state_re_g * q_re + state_im_g * q_im) * scale).float()
+            match_im = ((state_im_g * q_re - state_re_g * q_im) * scale).float()
+            out_re = ((match_re * w_int.float()).sum(-1)).reshape(B, self.K, self.H).to(x_t.dtype)
+            out_im = ((match_im * w_int.float()).sum(-1)).reshape(B, self.K, self.H).to(x_t.dtype)
+            out_complex = torch.cat([out_re, out_im], dim=-1)
+        out_complex = out_complex.reshape(B, self.K, 2 * self.H)
+        out_complex_flat = out_complex.reshape(B, -1)
+        gate_for_norm = self.gate_proj(x_t)
+        # Gated RMSNorm: Triton kernel when requested and available, otherwise
+        # the module implementation.
+        if use_triton and x_t.is_cuda and gated_rmsnorm_triton is not None:
+            if (
+                self._triton_norm_buffer is None
+                or self._triton_norm_buffer.shape != out_complex_flat.shape
+                or self._triton_norm_buffer.device != x_t.device
+            ):
+                self._triton_norm_buffer = torch.empty(
+                    out_complex_flat.shape,
+                    device=x_t.device,
+                    dtype=torch.float32,
+                )
+            out_flat = gated_rmsnorm_triton(
+                out_complex_flat,
+                gate_for_norm,
+                self.gated_norm.weight,
+                self.gated_norm.epsilon,
+                out_buffer=self._triton_norm_buffer,
+            )
+        else:
+            out_flat = self.gated_norm(out_complex_flat, gate_for_norm)
+        out_complex = out_flat.to(x_t.dtype).reshape(B, self.K, 2 * self.H)
+        y_raw = torch.einsum("bkf,kfn->bkn", out_complex, self.W_readout.to(out_complex.dtype))
+        y_val, y_gate = y_raw.chunk(2, dim=-1)
+        out = self.out_proj((y_val * torch.sigmoid(y_gate)).reshape(B, -1).to(x_t.dtype))
+        # Advance the position counter in place, saturating at the last index.
+        pos.add_(1).clamp_(max=(self.maxlen or 2048) - 1)
+        if self.conv_kernel_size > 1:
+            if self.conv_kernel_size > 2:
+                # Shift the buffer left by one slot; clone() avoids copying
+                # between overlapping views of the same storage.
+                conv_buffer[:, :-1, :].copy_(conv_buffer[:, 1:, :].clone())
+            # Newest slot holds the pre-convolution projection of this token.
+            conv_buffer[:, -1, :].copy_(z_conv)
+        return out, (den_acc, re_acc, im_acc, pos, conv_buffer)
+class SeqCondBlock(nn.Module):
+    def __init__(self, d_model: int, norm_eps: float = 1e-6, **kwargs):
+        super().__init__()
+        self.norm = RMSNorm(d_model, epsilon=norm_eps)
+        self.attn = SeqCondAttention(d_model=d_model, **kwargs)
+    def forward(self, x, mask=None, return_state=False):
+        if return_state:
+            out, state = self.attn(self.norm(x), mask=mask, return_state=True)
+            return x + out, state
+        return x + self.attn(self.norm(x), mask=mask)
+    def step(self, x_t, state, use_triton=False):
+        out, new_state = self.attn.step(self.norm(x_t), state, use_triton=use_triton)
+        return x_t + out, new_state
+# ---------------------------------------------------------------------------
+# Core SeqCond language model
+# ---------------------------------------------------------------------------
+class SeqCondModel(nn.Module):
+    """Core SeqCond model (no HF wrapper). Used internally by SeqCondForCausalLM."""
+    def __init__(self, config: SeqCondConfig):
+        super().__init__()
+        self.d_model = config.d_model
+        self.d_ff = config.d_ff
+        self.num_layers = config.num_layers
+        self.vocab_size = config.vocab_size
+        self.maxlen = config.maxlen
+        self.num_heads = config.num_heads
+        self.num_kv_heads = config.num_kv_heads if config.num_kv_heads is not None else config.num_heads
+        self.seqcond_ratio = config.seqcond_ratio
+        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
+        self.use_positional_embedding = config.use_positional_embedding
+        if config.use_positional_embedding:
+            self.position_embedding = nn.Embedding(config.maxlen, config.d_model)
+        head_dim = config.d_model // config.num_heads
+        cos, sin = precompute_freqs(config.maxlen, head_dim)
+        self.register_buffer("cos_emb", cos)
+        self.register_buffer("sin_emb", sin)
+        self.blocks = nn.ModuleList()
+        self.block_types = []
+        for i in range(config.num_layers):
+            if (i + 1) % (config.seqcond_ratio + 1) == 0:
+                block = TransformerDecoderBlock(
+                    d_model=config.d_model,
+                    num_heads=config.num_heads,
+                    d_ff=config.d_ff,
+                    num_kv_heads=self.num_kv_heads,
+                    dropout=config.dropout,
+                    qk_norm=config.qk_norm,
+                    qk_norm_eps=config.qk_norm_eps,
+                )
+                self.block_types.append("transformer")
+            else:
+                block = SeqCondBlock(
+                    d_model=config.d_model,
+                    num_heads=config.seqcond_heads,
+                    num_query_heads=config.num_query_heads,
+                    num_anchor_heads=config.num_anchor_heads,
+                    num_thetas=config.num_thetas,
+                    conv_kernel_size=config.conv_kernel_size,
+                    expand_factor=config.expand_factor,
+                    out_expand_factor=config.out_expand_factor,
+                    dropout=config.dropout,
+                    maxlen=config.maxlen,
+                )
+                self.block_types.append("seqcond")
+            self.blocks.append(block)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        if config.tie_weights:
+            self.lm_head.weight = self.embedding.weight
+    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
+        B, L = input_ids.shape
+        x = self.embedding(input_ids)
+        if self.use_positional_embedding:
+            x = x + self.position_embedding(torch.arange(L, device=input_ids.device))
+        cos = self.cos_emb[:L].unsqueeze(0).unsqueeze(2).expand(B, L, self.num_heads, -1)
+        sin = self.sin_emb[:L].unsqueeze(0).unsqueeze(2).expand(B, L, self.num_heads, -1)
+        for block, bt in zip(self.blocks, self.block_types):
+            x = block(x, cos, sin) if bt == "transformer" else block(x)
+        return self.lm_head(x)
+    def prefill(self, input_ids: torch.Tensor, return_all_logits: bool = False):
+        B, L = input_ids.shape
+        device = input_ids.device
+        x = self.embedding(input_ids)
+        if self.use_positional_embedding:
+            x = x + self.position_embedding(torch.arange(L, device=device))
+        cos = self.cos_emb[:L].unsqueeze(0).unsqueeze(2).expand(B, L, self.num_heads, -1)
+        sin = self.sin_emb[:L].unsqueeze(0).unsqueeze(2).expand(B, L, self.num_heads, -1)
+        states = []
+        for block, bt in zip(self.blocks, self.block_types):
+            if bt == "transformer":
+                x, kv = block(x, cos, sin, return_state=True)
+                k, v = kv
+                k_cache = torch.zeros(B, self.maxlen, self.num_kv_heads, self.d_model // self.num_heads, device=device, dtype=k.dtype)
+                v_cache = torch.zeros_like(k_cache)
+                k_cache[:, :L] = k; v_cache[:, :L] = v
+                states.append((k_cache, v_cache))
+            else:
+                x, state = block(x, return_state=True)
+                states.append(state)
+        logits = self.lm_head(x)
+        if return_all_logits:
+            return logits, states
+        return logits[:, -1:, :], states
+    def init_state(self, batch_size: int, device: torch.device) -> List:
+        states = []
+        for block, bt in zip(self.blocks, self.block_types):
+            if bt == "transformer":
+                k = torch.zeros(batch_size, self.maxlen, self.num_kv_heads, self.d_model // self.num_heads, device=device)
+                states.append((k, torch.zeros_like(k)))
+            else:
+                a = block.attn
+                states.append((
+                    torch.zeros(batch_size, a.K, device=device),
+                    torch.zeros(batch_size, a.K, a.H, a.M, device=device),
+                    torch.zeros(batch_size, a.K, a.H, a.M, device=device),
+                    torch.zeros(batch_size, device=device),
+                    torch.zeros(batch_size, a.conv_kernel_size - 1, a.dim_conv_total, device=device),
+                ))
+        return states
+    def step(self, token_id: torch.Tensor, states: List, pos=None, seq_len=None, use_triton=False):
+        B = token_id.size(0)
+        if pos is None:
+            for state, bt in zip(states, self.block_types):
+                if bt == "seqcond":
+                    pos = state[3]; break
+            if pos is None:
+                pos = torch.zeros(B, device=token_id.device, dtype=torch.long)
+        x = self.embedding(token_id).squeeze(1)
+        pos = pos.clamp(max=self.maxlen - 1)
+        if self.use_positional_embedding:
+            x = x + torch.index_select(self.position_embedding.weight, 0, pos.long())
+        pos_idx = pos.long()
+        cos_t = torch.index_select(self.cos_emb, 0, pos_idx).unsqueeze(1).unsqueeze(1).expand(B, 1, self.num_heads, -1)
+        sin_t = torch.index_select(self.sin_emb, 0, pos_idx).unsqueeze(1).unsqueeze(1).expand(B, 1, self.num_heads, -1)
+        new_states = []
+        for block, bt, state in zip(self.blocks, self.block_types, states):
+            if bt == "transformer":
+                x, ns = block.step(x, state, pos, cos_t, sin_t, seq_len=seq_len)
+            else:
+                x, ns = block.step(x, state, use_triton=use_triton)
+            new_states.append(ns)
+        return self.lm_head(x), new_states
+# ---------------------------------------------------------------------------
+# HuggingFace wrapper
+# ---------------------------------------------------------------------------
+class SeqCondPreTrainedModel(PreTrainedModel):
+    config_class = SeqCondConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = False
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            nn.init.normal_(module.weight, std=0.02)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, std=0.02)
+class SeqCondForCausalLM(SeqCondPreTrainedModel):
+    """
+    SeqCond causal language model, HuggingFace-compatible.
+    Supports:
+    - Standard HF forward() for training / perplexity evaluation.
+    - Custom generate() using state-based O(1) decoding.
+    - generate_batch() for batched generation with per-sample early stopping.
+    """
+    def __init__(self, config: SeqCondConfig):
+        super().__init__(config)
+        self.model = SeqCondModel(config)
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embedding
+    def set_input_embeddings(self, value):
+        self.model.embedding = value
+    def get_output_embeddings(self):
+        return self.model.lm_head
+    def set_output_embeddings(self, value):
+        self.model.lm_head = value
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        """
+        Standard forward pass (used for training / perplexity).
+        Note: attention_mask is accepted for API compatibility but is not used
+        in the forward pass — SeqCond is always causal.
+        """
+        logits = self.model(input_ids)
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+            )
+        return CausalLMOutputWithPast(loss=loss, logits=logits)
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: torch.LongTensor,
+        max_new_tokens: int = 256,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 50,
+        repetition_penalty: float = 1.0,
+        eos_token_id: Optional[int] = None,
+        use_triton: bool = False,
+        **kwargs,
+    ) -> torch.LongTensor:
+        """
+        Autoregressive generation with state-based O(1) decoding.
+        Returns the full sequence (prompt + generated tokens) as a LongTensor.
+        """
+        if eos_token_id is None:
+            eos_token_id = self.config.eos_token_id
+        device = input_ids.device
+        B = input_ids.size(0)
+        # Prefill
+        logits, states = self.model.prefill(input_ids)
+        logits = logits.squeeze(1)  # (B, vocab)
+        generated = input_ids.tolist()
+        finished = [False] * B
+        token_buf = torch.zeros((B, 1), dtype=torch.long, device=device)
+        seq_len = input_ids.size(1)
+        for _ in range(max_new_tokens):
+            # Temperature scaling
+            if temperature > 0:
+                ls = logits / temperature
+            else:
+                ls = logits.clone()
+            # Repetition penalty
+            if repetition_penalty != 1.0:
+                for bi, toks in enumerate(generated):
+                    for t in set(toks):
+                        if 0 <= t < self.config.vocab_size:
+                            ls[bi, t] /= repetition_penalty
+            # Sampling
+            if temperature == 0:
+                next_tokens = torch.argmax(ls, dim=-1)
+            else:
+                if top_k > 0:
+                    kth = torch.topk(ls, top_k, dim=-1).values[:, -1:]
+                    ls = ls.masked_fill(ls < kth, float("-inf"))
+                if top_p < 1.0:
+                    sorted_ls, sorted_idx = torch.sort(ls, dim=-1, descending=True)
+                    cum_probs = torch.cumsum(F.softmax(sorted_ls, dim=-1), dim=-1)
+                    sorted_remove = cum_probs > top_p
+                    sorted_remove[:, 1:] = sorted_remove[:, :-1].clone()
+                    sorted_remove[:, 0] = False
+                    remove = torch.zeros_like(sorted_remove)
+                    remove.scatter_(1, sorted_idx, sorted_remove)
+                    ls = ls.masked_fill(remove, float("-inf"))
+                probs = F.softmax(ls, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)
+            for bi in range(B):
+                tok = next_tokens[bi].item()
+                generated[bi].append(tok)
+                if eos_token_id is not None and tok == eos_token_id:
+                    finished[bi] = True
+                token_buf[bi, 0] = tok
+            if all(finished):
+                break
+            seq_len += 1
+            logits, states = self.model.step(token_buf, states, seq_len=seq_len, use_triton=use_triton)
+        max_len = max(len(g) for g in generated)
+        pad_id = self.config.pad_token_id or 0
+        out = torch.full((B, max_len), pad_id, dtype=torch.long, device=device)
+        for bi, g in enumerate(generated):
+            out[bi, : len(g)] = torch.tensor(g, dtype=torch.long, device=device)
+        return out
+    @torch.no_grad()
+    def generate_batch(
+        self,
+        input_ids_list: List[torch.LongTensor],
+        max_new_tokens: int = 256,
+        temperature: float = 0.7,
+        eos_token_id: Optional[int] = None,
+        use_triton: bool = False,
+    ) -> List[List[int]]:
+        """
+        Batched generation: each prompt is prefilled independently, then
+        decoded in lockstep with per-sample early stopping.
+        Args:
+            input_ids_list: List of 1D LongTensors, one per prompt.
+        Returns:
+            List of generated token id lists (completion only, EOS stripped).
+        """
+        if eos_token_id is None:
+            eos_token_id = self.config.eos_token_id
+        device = input_ids_list[0].device
+        B = len(input_ids_list)
+        # Per-sample prefill
+        all_logits, all_states = [], []
+        for ids in input_ids_list:
+            lg, st = self.model.prefill(ids.unsqueeze(0))
+            all_logits.append(lg.squeeze(1))
+            all_states.append(st)
+        logits = torch.cat(all_logits, dim=0)
+        # Stack states
+        num_blocks = len(all_states[0])
+        states = [
+            tuple(torch.cat([s[i][j] for s in all_states], dim=0) for j in range(len(all_states[0][i])))
+            for i in range(num_blocks)
+        ]
+        generated = [[] for _ in range(B)]
+        finished = [False] * B
+        active_map = list(range(B))
+        token_buf = torch.zeros((B, 1), dtype=torch.long, device=device)
+        seq_len = max(ids.size(0) for ids in input_ids_list)
+        for _ in range(max_new_tokens):
+            B_cur = len(active_map)
+            if B_cur == 0:
+                break
+            if temperature == 0:
+                next_tokens = torch.argmax(logits, dim=-1)
+            else:
+                probs = F.softmax(logits / temperature, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)
+            newly_done = set()
+            for bi in range(B_cur):
+                oi = active_map[bi]
+                tok = next_tokens[bi].item()
+                generated[oi].append(tok)
+                if eos_token_id is not None and tok == eos_token_id:
+                    finished[oi] = True
+                    newly_done.add(bi)
+                else:
+                    token_buf[bi, 0] = tok
+            if all(finished):
+                break
+            if newly_done:
+                keep = [bi for bi in range(B_cur) if bi not in newly_done]
+                if not keep:
+                    break
+                keep_idx = torch.tensor(keep, device=device)
+                token_buf = token_buf[keep_idx].contiguous()
+                states = [tuple(s[keep_idx].contiguous() for s in st) for st in states]
+                active_map = [active_map[bi] for bi in keep]
+            seq_len += 1
+            logits, states = self.model.step(token_buf, states, seq_len=seq_len, use_triton=use_triton)
+        results = []
+        for toks in generated:
+            if toks and toks[-1] == eos_token_id:
+                toks = toks[:-1]
+            results.append(toks)
+        return results

tokenization_seqcond.py ADDED Viewed

	@@ -0,0 +1,220 @@

+"""
+SeqCond tokenizer — tiktoken cl100k_base with 4 additional special tokens.
+Special tokens (assigned in order after the base vocab):
+  <|im_start|>    — marks the start of a chat turn
+  <|im_end|>      — marks the end of a chat turn (also used as EOS)
+  <|think_start|> — marks the start of chain-of-thought reasoning
+  <|think_end|>   — marks the end of chain-of-thought reasoning
+Chat template:
+  <|im_start|>user
+  {prompt}
+  <|im_end|><|im_start|>assistant
+  <|think_start|>{thinking}<|think_end|>
+  {answer}
+  <|im_end|>
+"""
+import os
+from typing import Dict, List, Optional, Tuple
+from transformers import PreTrainedTokenizer
+# Chat / reasoning markers; used to build the tokenizer's default
+# ``additional_special_tokens`` list.
+_SPECIAL_TOKENS = ["<|im_start|>", "<|im_end|>", "<|think_start|>", "<|think_end|>"]
+# Name -> id table handed to tiktoken as the encoding's special tokens.
+# Besides the four markers above it re-registers five further special tokens
+# at ids 100282-100286.
+_SPECIAL_TOKEN_IDS = {
+    "<|im_start|>": 100278,
+    "<|im_end|>": 100279,
+    "<|think_start|>": 100280,
+    "<|think_end|>": 100281,
+    "<|endoftext|>": 100282,
+    "<|fim_prefix|>": 100283,
+    "<|fim_middle|>": 100284,
+    "<|fim_suffix|>": 100285,
+    "<|endofprompt|>": 100286,
+}
+# NOTE(review): presumably the number of cl100k_base BPE ranks; no use of this
+# constant is visible in this part of the file — confirm before relying on it.
+_BASE_VOCAB_SIZE = 100256
+# One past the largest special-token id (100286 + 1 = 100287).
+_VOCAB_SIZE = max(_SPECIAL_TOKEN_IDS.values()) + 1
+def _build_tiktoken_enc():
+    """Build tiktoken encoding with SeqCond special tokens."""
+    try:
+        import tiktoken
+    except ImportError as e:
+        raise ImportError("tiktoken is required: pip install tiktoken") from e
+    base = tiktoken.get_encoding("cl100k_base")
+    return tiktoken.Encoding(
+        name="seqcond",
+        pat_str=base._pat_str,
+        mergeable_ranks=base._mergeable_ranks,
+        special_tokens=_SPECIAL_TOKEN_IDS,
+    )
+class SeqCondTokenizer(PreTrainedTokenizer):
+    """
+    Tokenizer for SeqCond models, backed by tiktoken cl100k_base.
+    This is a slow tokenizer that wraps tiktoken. Tokens are represented
+    internally as their stringified integer IDs (e.g. "42", "100256").
+    This avoids building a full vocab dict while remaining compatible with
+    HuggingFace's PreTrainedTokenizer interface.
+    Requires: pip install tiktoken
+    """
+    vocab_files_names: Dict[str, str] = {}
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        eos_token: str = "<|im_end|>",
+        bos_token: Optional[str] = None,
+        unk_token: Optional[str] = None,
+        pad_token: str = "<|im_end|>",
+        add_bos_token: bool = False,
+        **kwargs,
+    ):
+        self._enc = _build_tiktoken_enc()
+        self._id_to_special: Dict[int, str] = {idx: tok for tok, idx in _SPECIAL_TOKEN_IDS.items()}
+        self._special_to_id: Dict[str, int] = {v: k for k, v in self._id_to_special.items()}
+        # Register special tokens before calling super().__init__
+        kwargs.setdefault("additional_special_tokens", [t for t in _SPECIAL_TOKENS if t not in (eos_token, bos_token, unk_token, pad_token)])
+        super().__init__(
+            eos_token=eos_token,
+            bos_token=bos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            **kwargs,
+        )
+    @property
+    def vocab_size(self) -> int:
+        """Return the module-level ``_VOCAB_SIZE`` (largest special-token id + 1)."""
+        return _VOCAB_SIZE
+    # ------------------------------------------------------------------
+    # Core token ↔ id mappings
+    # ------------------------------------------------------------------
+    def _tokenize(self, text: str, **kwargs) -> List[str]:
+        """Encode text into a list of token-id strings."""
+        ids = self._enc.encode(text, allowed_special="all")
+        # Shift non-special BPE IDs by +1 to match convectors.Tiktokenize
+        # offset used during training (ID 0 reserved).
+        shifted = [i if i in self._id_to_special else i + 1 for i in ids]
+        return [str(i) for i in shifted]
+    def _convert_token_to_id(self, token: str) -> int:
+        """Convert a token string (or id-string) to an integer id."""
+        if token in self._special_to_id:
+            return self._special_to_id[token]
+        try:
+            return int(token)
+        except ValueError:
+            return 0
+    def _convert_id_to_token(self, index: int) -> str:
+        """Convert an integer id to its token string."""
+        if index in self._id_to_special:
+            return self._id_to_special[index]
+        return str(index)
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """Decode a list of token strings back to text."""
+        ids = []
+        for t in tokens:
+            if t in self._special_to_id:
+                ids.append(self._special_to_id[t])
+            else:
+                try:
+                    ids.append(int(t))
+                except ValueError:
+                    pass
+        # Reverse the +1 BPE shift before decoding; skip invalid/ID 0 tokens.
+        real_ids = []
+        for i in ids:
+            if i in self._id_to_special:
+                real_ids.append(i)
+            elif i >= 1:
+                real_ids.append(i - 1)
+        return self._enc.decode(real_ids)
+    def get_vocab(self) -> Dict[str, int]:
+        """
+        Return a vocab dict. Only special tokens are included with their names;
+        regular BPE tokens are included as their id-string representation.
+        (Building a full 100k-entry reverse BPE map is expensive and rarely needed.)
+        """
+        vocab = {str(i): i for i in range(self.vocab_size)}
+        for tok, idx in self._special_to_id.items():
+            vocab[tok] = idx
+        return vocab
+    def save_vocabulary(
+        self, save_directory: str, filename_prefix: Optional[str] = None
+    ) -> Tuple[str, ...]:
+        """
+        No vocabulary file is needed — the tiktoken encoding is fetched from
+        the tiktoken package at runtime. Returns an empty tuple.
+        """
+        return ()
+    # ------------------------------------------------------------------
+    # Convenience helpers
+    # ------------------------------------------------------------------
+    @property
+    def im_start_id(self) -> int:
+        return self._special_to_id["<|im_start|>"]
+    @property
+    def im_end_id(self) -> int:
+        return self._special_to_id["<|im_end|>"]
+    @property
+    def think_start_id(self) -> int:
+        return self._special_to_id["<|think_start|>"]
+    @property
+    def think_end_id(self) -> int:
+        return self._special_to_id["<|think_end|>"]
+    def encode_chat(self, prompt: str, add_think_start: bool = True) -> List[int]:
+        """
+        Format and encode a user prompt using the standard chat template.
+        Args:
+            prompt: The user's message (plain text).
+            add_think_start: If True (default), append <|think_start|> so the
+                model begins generating its chain-of-thought immediately.
+        Returns:
+            List of token ids (prompt already encoded, ready for prefill).
+        """
+        text = f"<|im_start|>user\n{prompt}\n<|im_end|><|im_start|>assistant\n"
+        if add_think_start:
+            text += "<|think_start|>"
+        ids = self._enc.encode(text, allowed_special="all")
+        return [i if i in self._id_to_special else i + 1 for i in ids]
+    def apply_chat_template(self, conversation, add_generation_prompt: bool = True, **kwargs) -> List[int]:
+        """
+        Minimal chat template support for HF pipeline compatibility.
+        Expects conversation as a list of {"role": ..., "content": ...} dicts.
+        Only the last user turn is supported for now.
+        """
+        text = ""
+        for msg in conversation:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+            text += f"<|im_start|>{role}\n{content}\n<|im_end|>"
+        if add_generation_prompt:
+            text += "<|im_start|>assistant\n<|think_start|>"
+        ids = self._enc.encode(text, allowed_special="all")
+        return [i if i in self._id_to_special else i + 1 for i in ids]

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "tokenizer_class": "SeqCondTokenizer",
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_seqcond.SeqCondTokenizer",
+      null
+    ]
+  },
+  "model_max_length": 4096,
+  "eos_token": "<|im_end|>",
+  "bos_token": null,
+  "unk_token": null,
+  "pad_token": "<|im_end|>",
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|think_start|>",
+    "<|think_end|>"
+  ]
+}

triton_kernels.py ADDED Viewed

	@@ -0,0 +1,394 @@

+import torch
+try:
+    import triton
+    import triton.language as tl
+    TRITON_AVAILABLE = True
+except ImportError:
+    TRITON_AVAILABLE = False
+    triton = None
+    tl = None
+if TRITON_AVAILABLE:
+    def _select_seqcond_launch_config(H: int, M: int) -> tuple[int, int]:
+        if M <= 1:
+            block_m = 1
+        elif M <= 2:
+            block_m = 2
+        elif M <= 4:
+            block_m = 4
+        elif M <= 8:
+            block_m = 8
+        else:
+            block_m = 16
+        if H >= 64:
+            block_h = 64
+        elif H >= 32:
+            block_h = 32
+        elif H >= 16:
+            block_h = 16
+        elif H >= 8:
+            block_h = 8
+        elif H >= 4:
+            block_h = 4
+        elif H >= 2:
+            block_h = 2
+        else:
+            block_h = 1
+        return block_m, block_h
+    @triton.jit
+    def _seqcond_fully_fused_kernel_impl(
+        k_ptr,
+        s_raw_ptr,
+        q_re_ptr,
+        q_im_ptr,
+        re_acc_ptr,
+        im_acc_ptr,
+        den_acc_ptr,
+        theta_ptr,
+        w_int_ptr,
+        phase_scale_ptr,
+        score_scale_ptr,
+        score_bias_ptr,
+        log_tw_ptr,
+        out_re_ptr,
+        out_im_ptr,
+        K: tl.constexpr,
+        H: tl.constexpr,
+        M: tl.constexpr,
+        stride_k_b,
+        stride_k_k,
+        stride_k_h,
+        stride_acc_b,
+        stride_acc_k,
+        stride_acc_h,
+        stride_acc_m,
+        stride_theta_k,
+        stride_theta_h,
+        stride_theta_m,
+        stride_q_b,
+        stride_q_k,
+        stride_q_h,
+        stride_q_m,
+        stride_w_k,
+        stride_w_h,
+        stride_w_m,
+        stride_out_b,
+        stride_out_k,
+        stride_out_h,
+        BLOCK_M: tl.constexpr,
+        BLOCK_H: tl.constexpr,
+    ):
+        # Fused single-step update and readout. Each program handles one
+        # (b, k, block of BLOCK_H indices along H): it adds the current step's
+        # contribution to re_acc / im_acc / den_acc in place, then writes a
+        # readout of the updated state against q, weighted by w_int and summed
+        # over M, to out_re / out_im.
+        #
+        # Layout expectations visible in the addressing below:
+        #   - s_raw, log_tw and den_acc are addressed as flat b * K + k;
+        #   - score_scale, score_bias and phase_scale are addressed as flat k;
+        #   - im_acc uses the re_acc strides, q_im the q_re strides and out_im
+        #     the out_re strides, so each pair must share one layout.
+        # Unpack the flat program id into (b, k, h_block).
+        pid = tl.program_id(0)
+        num_h_blocks = (H + BLOCK_H - 1) // BLOCK_H
+        b = pid // (K * num_h_blocks)
+        rem = pid % (K * num_h_blocks)
+        k = rem // num_h_blocks
+        h_block = rem % num_h_blocks
+        h_start = h_block * BLOCK_H
+        # Per-(b, k) and per-k scalars.
+        s_raw = tl.load(s_raw_ptr + b * K + k)
+        score_scale = tl.load(score_scale_ptr + k)
+        score_bias = tl.load(score_bias_ptr + k)
+        log_tw = tl.load(log_tw_ptr + b * K + k)
+        phase_scale = tl.load(phase_scale_ptr + k)
+        # Step weight: softplus of the affine-transformed score (the score
+        # itself is selected above 20), scaled by exp(log_tw) and clamped to
+        # [1e-4, 5000].
+        score = score_scale * s_raw + score_bias
+        p_w_content = tl.where(score > 20.0, score, tl.log(1.0 + tl.exp(score)))
+        p_w = p_w_content * tl.exp(log_tw)
+        p_w = tl.minimum(tl.maximum(p_w, 1e-4), 5000.0)
+        old_den = tl.load(den_acc_ptr + b * K + k)
+        new_den = old_den + p_w
+        # Only the first H-block of each (b, k) writes the updated denominator.
+        # NOTE(review): the other H-blocks of the same (b, k) load den_acc above
+        # with no ordering against this store; when num_h_blocks > 1 they could
+        # observe the already-updated value and add p_w twice -- confirm.
+        if h_block == 0:
+            tl.store(den_acc_ptr + b * K + k, new_den)
+        # Indices along H covered by this program; h_mask disables the tail.
+        offs_h = tl.arange(0, BLOCK_H)
+        h_idx = h_start + offs_h
+        h_mask = h_idx < H
+        k_val = tl.load(
+            k_ptr + b * stride_k_b + k * stride_k_k + h_idx * stride_k_h,
+            mask=h_mask,
+            other=0.0,
+        )
+        # x / (1 + |x|) keeps phi_base inside (-1, 1) before it is scaled by theta.
+        k_scaled = k_val * phase_scale
+        phi_base = k_scaled / (1.0 + tl.abs(k_scaled))
+        # Magnitude of this step's contribution to the accumulators.
+        kvw = k_val * p_w
+        sum_re = tl.zeros((BLOCK_H,), dtype=tl.float32)
+        sum_im = tl.zeros((BLOCK_H,), dtype=tl.float32)
+        # Reciprocal of the updated denominator, floored at 1e-4.
+        inv_den = 1.0 / tl.maximum(new_den, 1e-4)
+        scale = 1.0 / tl.sqrt(float(H))
+        offs_m = tl.arange(0, BLOCK_M)
+        # Walk the M axis in BLOCK_M-wide tiles; tiles are (BLOCK_H, BLOCK_M).
+        for m_start in range(0, M, BLOCK_M):
+            m_idx = m_start + offs_m
+            m_mask = m_idx < M
+            theta_base = k * stride_theta_k
+            theta_vals = tl.load(
+                theta_ptr + theta_base + h_idx[:, None] * stride_theta_h + m_idx[None, :] * stride_theta_m,
+                mask=h_mask[:, None] & m_mask[None, :],
+                other=0.0,
+            )
+            # Phase for every (h, m) pair of the tile.
+            phi = phi_base[:, None] * theta_vals
+            cos_phi = tl.cos(phi)
+            sin_phi = tl.sin(phi)
+            acc_base = b * stride_acc_b + k * stride_acc_k
+            old_re = tl.load(
+                re_acc_ptr + acc_base + h_idx[:, None] * stride_acc_h + m_idx[None, :] * stride_acc_m,
+                mask=h_mask[:, None] & m_mask[None, :],
+                other=0.0,
+            )
+            old_im = tl.load(
+                im_acc_ptr + acc_base + h_idx[:, None] * stride_acc_h + m_idx[None, :] * stride_acc_m,
+                mask=h_mask[:, None] & m_mask[None, :],
+                other=0.0,
+            )
+            # Accumulate kvw * (cos(phi) + i * sin(phi)) as separate real and
+            # imaginary parts, then write both accumulators back in place.
+            new_re = old_re + kvw[:, None] * cos_phi
+            new_im = old_im + kvw[:, None] * sin_phi
+            tl.store(
+                re_acc_ptr + acc_base + h_idx[:, None] * stride_acc_h + m_idx[None, :] * stride_acc_m,
+                new_re,
+                mask=h_mask[:, None] & m_mask[None, :],
+            )
+            tl.store(
+                im_acc_ptr + acc_base + h_idx[:, None] * stride_acc_h + m_idx[None, :] * stride_acc_m,
+                new_im,
+                mask=h_mask[:, None] & m_mask[None, :],
+            )
+            q_base = b * stride_q_b + k * stride_q_k
+            q_re_vals = tl.load(
+                q_re_ptr + q_base + h_idx[:, None] * stride_q_h + m_idx[None, :] * stride_q_m,
+                mask=h_mask[:, None] & m_mask[None, :],
+                other=0.0,
+            )
+            q_im_vals = tl.load(
+                q_im_ptr + q_base + h_idx[:, None] * stride_q_h + m_idx[None, :] * stride_q_m,
+                mask=h_mask[:, None] & m_mask[None, :],
+                other=0.0,
+            )
+            w_base = k * stride_w_k
+            w_vals = tl.load(
+                w_int_ptr + w_base + h_idx[:, None] * stride_w_h + m_idx[None, :] * stride_w_m,
+                mask=h_mask[:, None] & m_mask[None, :],
+                other=0.0,
+            )
+            # Normalise the updated accumulators by the updated denominator.
+            state_re = new_re * inv_den
+            state_im = new_im * inv_den
+            # state * conj(q), scaled by 1 / sqrt(H), as real and imaginary parts.
+            match_re = (state_re * q_re_vals + state_im * q_im_vals) * scale
+            match_im = (state_im * q_re_vals - state_re * q_im_vals) * scale
+            # Weight by w_int and reduce over the M axis of the tile.
+            sum_re += tl.sum(match_re * w_vals, axis=1)
+            sum_im += tl.sum(match_im * w_vals, axis=1)
+        # One output value per (b, k, h); out_im shares the out_re strides.
+        out_base = b * stride_out_b + k * stride_out_k
+        tl.store(out_re_ptr + out_base + h_idx * stride_out_h, sum_re, mask=h_mask)
+        tl.store(out_im_ptr + out_base + h_idx * stride_out_h, sum_im, mask=h_mask)
+def seqcond_step_triton(
+    k_val: torch.Tensor,
+    s_raw: torch.Tensor,
+    q_re: torch.Tensor,
+    q_im: torch.Tensor,
+    re_acc: torch.Tensor,
+    im_acc: torch.Tensor,
+    den_acc: torch.Tensor,
+    theta: torch.Tensor,
+    w_int: torch.Tensor,
+    phase_scale: torch.Tensor,
+    score_scale: torch.Tensor,
+    score_bias: torch.Tensor,
+    log_time_weight: torch.Tensor,
+    out_re_buffer: torch.Tensor | None = None,
+    out_im_buffer: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, K, H = k_val.shape
+    M = theta.shape[2]
+    K_q = q_re.shape[1]
+    assert K_q == K, (
+        f"Triton kernel requires n_rep==1 (K_q==K), got K_q={K_q}, K={K}. "
+        f"Use PyTorch path for n_rep>1."
+    )
+    def _prep_f32(t: torch.Tensor) -> torch.Tensor:
+        if t.dtype == torch.float32:
+            return t
+        return t.float()
+    def _prep_f32_contiguous(t: torch.Tensor) -> torch.Tensor:
+        if t.dtype != torch.float32:
+            t = t.float()
+        if not t.is_contiguous():
+            t = t.contiguous()
+        return t
+    k_val = _prep_f32(k_val)
+    s_raw = _prep_f32_contiguous(s_raw)
+    q_re = _prep_f32(q_re)
+    q_im = _prep_f32(q_im)
+    theta = _prep_f32(theta)
+    phase_scale = _prep_f32_contiguous(phase_scale)
+    score_scale = _prep_f32_contiguous(score_scale)
+    score_bias = _prep_f32_contiguous(score_bias)
+    log_time_weight = _prep_f32_contiguous(log_time_weight)
+    if w_int.dim() == 4:
+        w_int = w_int.squeeze(1)
+    w_int = _prep_f32(w_int)
+    if (
+        out_re_buffer is None
+        or out_re_buffer.shape != (B, K, H)
+        or out_re_buffer.device != k_val.device
+        or out_re_buffer.dtype != torch.float32
+    ):
+        out_re = torch.empty(B, K, H, device=k_val.device, dtype=torch.float32)
+    else:
+        out_re = out_re_buffer
+    if (
+        out_im_buffer is None
+        or out_im_buffer.shape != (B, K, H)
+        or out_im_buffer.device != k_val.device
+        or out_im_buffer.dtype != torch.float32
+    ):
+        out_im = torch.empty(B, K, H, device=k_val.device, dtype=torch.float32)
+    else:
+        out_im = out_im_buffer
+    common_args = (
+        k_val,
+        s_raw,
+        q_re,
+        q_im,
+        re_acc,
+        im_acc,
+        den_acc,
+        theta,
+        w_int,
+        phase_scale,
+        score_scale,
+        score_bias,
+        log_time_weight,
+        out_re,
+        out_im,
+        K,
+        H,
+        M,
+        k_val.stride(0),
+        k_val.stride(1),
+        k_val.stride(2),
+        re_acc.stride(0),
+        re_acc.stride(1),
+        re_acc.stride(2),
+        re_acc.stride(3),
+        theta.stride(0),
+        theta.stride(1),
+        theta.stride(2),
+        q_re.stride(0),
+        q_re.stride(1),
+        q_re.stride(2),
+        q_re.stride(3),
+        w_int.stride(0),
+        w_int.stride(1),
+        w_int.stride(2),
+        out_re.stride(0),
+        out_re.stride(1),
+        out_re.stride(2),
+    )
+    block_m, block_h = _select_seqcond_launch_config(H, M)
+    grid = (B * K * ((H + block_h - 1) // block_h),)
+    _seqcond_fully_fused_kernel_impl[grid](*common_args, BLOCK_M=block_m, BLOCK_H=block_h)
+    return out_re, out_im
+if TRITON_AVAILABLE:
+    def _select_rmsnorm_block_size(n_cols: int) -> int:
+        block = 1
+        while block < n_cols:
+            block *= 2
+        return min(block, 4096)
+    @triton.jit
+    def _gated_rmsnorm_kernel(
+        x_ptr,
+        residual_ptr,
+        weight_ptr,
+        out_ptr,
+        n_cols,
+        stride_x_row,
+        stride_residual_row,
+        stride_out_row,
+        epsilon,
+        BLOCK_N: tl.constexpr,
+    ):
+        # Gated RMS normalisation of one row per program:
+        #   gated = x * residual * sigmoid(residual)
+        #   out   = gated * rsqrt(mean(gated ** 2) + epsilon) * weight
+        # Column offsets are added directly to the row base, so elements within
+        # a row are addressed with unit stride. Only the first BLOCK_N columns
+        # are covered (there is no loop over column tiles), so the result is
+        # complete only when BLOCK_N >= n_cols.
+        row = tl.program_id(0)
+        offs = tl.arange(0, BLOCK_N)
+        mask = offs < n_cols
+        # Masked-out lanes load 0.0 and therefore add nothing to the sum below.
+        x = tl.load(x_ptr + row * stride_x_row + offs, mask=mask, other=0.0).to(tl.float32)
+        residual = tl.load(residual_ptr + row * stride_residual_row + offs, mask=mask, other=0.0).to(tl.float32)
+        weight = tl.load(weight_ptr + offs, mask=mask, other=0.0).to(tl.float32)
+        gated = x * (residual * tl.sigmoid(residual))
+        # Mean of squares over the true row width, not the padded block width.
+        variance = tl.sum(gated * gated, axis=0) / n_cols
+        inv_rms = tl.rsqrt(variance + epsilon)
+        out = gated * inv_rms * weight
+        tl.store(out_ptr + row * stride_out_row + offs, out, mask=mask)
+def gated_rmsnorm_triton(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    epsilon: float,
+    out_buffer: torch.Tensor | None = None,
+) -> torch.Tensor:
+    if not TRITON_AVAILABLE:
+        raise RuntimeError("Triton is not available")
+    if x.dim() != 2 or residual.dim() != 2:
+        raise ValueError(
+            f"gated_rmsnorm_triton expects 2D tensors, got x.shape={tuple(x.shape)} residual.shape={tuple(residual.shape)}"
+        )
+    if x.shape != residual.shape:
+        raise ValueError(
+            f"gated_rmsnorm_triton expects matching x/residual shapes, got {tuple(x.shape)} and {tuple(residual.shape)}"
+        )
+    if weight.dim() != 1 or weight.shape[0] != x.shape[1]:
+        raise ValueError(
+            f"gated_rmsnorm_triton expects weight.shape == ({x.shape[1]},), got {tuple(weight.shape)}"
+        )
+    def _prep_f32_contiguous(t: torch.Tensor) -> torch.Tensor:
+        if t.dtype != torch.float32:
+            t = t.float()
+        if not t.is_contiguous():
+            t = t.contiguous()
+        return t
+    x = _prep_f32_contiguous(x)
+    residual = _prep_f32_contiguous(residual)
+    weight = _prep_f32_contiguous(weight)
+    rows, n_cols = x.shape
+    if (
+        out_buffer is None
+        or out_buffer.shape != x.shape
+        or out_buffer.device != x.device
+        or out_buffer.dtype != torch.float32
+    ):
+        out = torch.empty_like(x, dtype=torch.float32)
+    else:
+        out = out_buffer
+    block_n = _select_rmsnorm_block_size(n_cols)
+    _gated_rmsnorm_kernel[(rows,)](
+        x,
+        residual,
+        weight,
+        out,
+        n_cols,
+        x.stride(0),
+        residual.stride(0),
+        out.stride(0),
+        epsilon,
+        BLOCK_N=block_n,
+    )
+    return out