"""
SeqCond model — self-contained HuggingFace implementation.
All model code is embedded here so that trust_remote_code=True works without
any dependency on the original seqcond package.
Architecture:
- Hybrid recurrent-transformer: every (seqcond_ratio+1)-th block (1-indexed)
is a standard Transformer decoder block; the rest are SeqCond blocks.
- SeqCond blocks use complex-exponential accumulators (den_acc, re_acc, im_acc)
for O(1) per-token autoregressive decoding.
- Transformer blocks use GQA with RoPE and KV-cache for autoregressive decoding.
"""
import math
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
from .configuration_seqcond import SeqCondConfig
# ---------------------------------------------------------------------------
# Optional Triton kernels (accelerate the SeqCond decode step; not required)
# ---------------------------------------------------------------------------
try:
from .triton_kernels import (
gated_rmsnorm_triton,
seqcond_step_triton,
TRITON_AVAILABLE,
)
except ImportError:
gated_rmsnorm_triton = None
TRITON_AVAILABLE = False
seqcond_step_triton = None
# ---------------------------------------------------------------------------
# Normalisation layers
# ---------------------------------------------------------------------------
class RMSNorm(nn.Module):
def __init__(self, hidden_size: int, epsilon: float = 1e-5):
super().__init__()
self.epsilon = epsilon
self.scale = nn.Parameter(torch.ones(hidden_size))
def forward(self, x: torch.Tensor) -> torch.Tensor:
orig = x.dtype
x = x.float()
x = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.epsilon)
return (x * self.scale.float()).to(orig)
class GatedRMSNorm(nn.Module):
"""RMSNorm with SiLU gating: rmsnorm(x * silu(residual))."""
def __init__(self, hidden_size: int, epsilon: float = 1e-6):
super().__init__()
self.epsilon = epsilon
self.weight = nn.Parameter(torch.ones(hidden_size))
def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
orig = x.dtype
x = x.float() * F.silu(residual.float())
x = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.epsilon)
return (x * self.weight.float()).to(orig)
# ---------------------------------------------------------------------------
# Rotary Position Embedding
# ---------------------------------------------------------------------------
def precompute_freqs(maxlen: int, head_dim: int) -> Tuple[torch.Tensor, torch.Tensor]:
half_d = head_dim // 2
pos = np.arange(maxlen)[:, None]
dim = np.arange(half_d)[None, :]
angles = pos * (1.0 / (10000 ** (dim / half_d)))
cos = torch.from_numpy(np.cos(angles).astype(np.float32))
sin = torch.from_numpy(np.sin(angles).astype(np.float32))
return cos, sin
def apply_rope(tensor: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
dim = tensor.shape[-1] // 2
cos = cos[..., :dim]
sin = sin[..., :dim]
x1, x2 = tensor[..., :dim], tensor[..., dim:]
return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1).view(tensor.shape)
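# Shape sketch (illustrative): precompute_freqs(maxlen, head_dim) returns cos/sin of shape
# (maxlen, head_dim // 2); SeqCondModel broadcasts them to (B, L, num_heads, head_dim // 2)
# before calling apply_rope on q/k of shape (B, L, num_heads, head_dim):
#
#   cos, sin = precompute_freqs(2048, 64)                               # both (2048, 32)
#   cos_b = cos[:L].unsqueeze(0).unsqueeze(2).expand(B, L, n_heads, -1)
#   sin_b = sin[:L].unsqueeze(0).unsqueeze(2).expand(B, L, n_heads, -1)
#   q_rot = apply_rope(q, cos_b, sin_b)                                 # same shape as q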
# ---------------------------------------------------------------------------
# Transformer decoder block (GQA + RoPE)
# ---------------------------------------------------------------------------
class RotarySelfAttention(nn.Module):
def __init__(
self,
d_model: int,
num_heads: int,
num_kv_heads: Optional[int] = None,
dropout: float = 0.0,
qk_norm: bool = False,
qk_norm_eps: float = 1e-6,
):
super().__init__()
self.d_model = d_model
self.num_heads = num_heads
self._num_kv_heads = num_kv_heads if num_kv_heads is not None else num_heads
self.num_groups = num_heads // self._num_kv_heads
self.head_dim = d_model // num_heads
self.dropout = dropout
self.qk_norm = qk_norm
self.qk_norm_eps = qk_norm_eps
self.q_proj = nn.Linear(d_model, d_model, bias=False)
self.k_proj = nn.Linear(d_model, self._num_kv_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(d_model, self._num_kv_heads * self.head_dim, bias=False)
self.out_proj = nn.Linear(d_model, d_model, bias=False)
def _repeat_kv(self, x: torch.Tensor) -> torch.Tensor:
if self.num_groups == 1:
return x
b, l = x.shape[:2]
extra = x.shape[2:]
x = x.view(b, l, self._num_kv_heads, 1, *extra[1:])
x = x.expand(b, l, self._num_kv_heads, self.num_groups, *extra[1:])
return x.reshape(b, l, self.num_heads, *extra[1:])
def forward(
self,
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
mask: Optional[torch.Tensor] = None,
return_state: bool = False,
):
b, l = x.shape[0], x.shape[1]
q = self.q_proj(x).reshape(b, l, self.num_heads, self.head_dim)
k = self.k_proj(x).reshape(b, l, self._num_kv_heads, self.head_dim)
v = self.v_proj(x).reshape(b, l, self._num_kv_heads, self.head_dim)
q = apply_rope(q, cos, sin)
cos_kv = cos[:, :, : self._num_kv_heads, :] if self._num_kv_heads < self.num_heads else cos
sin_kv = sin[:, :, : self._num_kv_heads, :] if self._num_kv_heads < self.num_heads else sin
k = apply_rope(k, cos_kv, sin_kv)
if self.qk_norm:
q_f = q.float(); k_f = k.float()
q = (q_f * torch.rsqrt(q_f.pow(2).mean(-1, keepdim=True) + self.qk_norm_eps)).to(q.dtype)
k = (k_f * torch.rsqrt(k_f.pow(2).mean(-1, keepdim=True) + self.qk_norm_eps)).to(k.dtype)
k_cache = k; v_cache = v
k = self._repeat_kv(k); v = self._repeat_kv(v)
scale = 1.0 / math.sqrt(self.head_dim)
scores = torch.einsum("blhd,bmhd->bhlm", q, k) * scale
causal = torch.tril(torch.ones(l, l, dtype=torch.bool, device=x.device)).unsqueeze(0).unsqueeze(0)
scores = torch.where(causal, scores, torch.full_like(scores, -1e4))
attn = F.softmax(scores.float(), dim=-1).to(v.dtype)
if self.dropout > 0 and self.training:
attn = F.dropout(attn, p=self.dropout)
out = torch.einsum("bhql,blhd->bqhd", attn, v).reshape(b, l, self.d_model).to(x.dtype)
if return_state:
return self.out_proj(out), (k_cache, v_cache)
return self.out_proj(out)
def step(
self,
x_t: torch.Tensor,
kv_cache: Tuple[torch.Tensor, torch.Tensor],
pos: torch.Tensor,
cos_t: torch.Tensor,
sin_t: torch.Tensor,
seq_len: Optional[int] = None,
) -> Tuple[torch.Tensor, Tuple]:
b = x_t.shape[0]
q = self.q_proj(x_t).reshape(b, 1, self.num_heads, self.head_dim)
k_new = self.k_proj(x_t).reshape(b, 1, self._num_kv_heads, self.head_dim)
v_new = self.v_proj(x_t).reshape(b, 1, self._num_kv_heads, self.head_dim)
q = apply_rope(q, cos_t, sin_t)
cos_kv = cos_t[:, :, : self._num_kv_heads, :] if self._num_kv_heads < self.num_heads else cos_t
sin_kv = sin_t[:, :, : self._num_kv_heads, :] if self._num_kv_heads < self.num_heads else sin_t
k_new = apply_rope(k_new, cos_kv, sin_kv)
if self.qk_norm:
q_f = q.float(); k_f = k_new.float()
q = (q_f * torch.rsqrt(q_f.pow(2).mean(-1, keepdim=True) + self.qk_norm_eps)).to(q.dtype)
k_new = (k_f * torch.rsqrt(k_f.pow(2).mean(-1, keepdim=True) + self.qk_norm_eps)).to(k_new.dtype)
k_cache, v_cache = kv_cache
pos_idx = pos.long().view(b, 1, 1, 1).expand(-1, 1, k_new.size(2), k_new.size(3))
k_cache.scatter_(1, pos_idx, k_new.to(k_cache.dtype))
v_cache.scatter_(1, pos_idx, v_new.to(v_cache.dtype))
if seq_len is not None:
k_slice, v_slice = k_cache[:, :seq_len], v_cache[:, :seq_len]; L = seq_len
else:
k_slice, v_slice = k_cache, v_cache; L = k_cache.shape[1]
k_r = self._repeat_kv(k_slice); v_r = self._repeat_kv(v_slice)
mask = torch.arange(L, device=k_cache.device).view(1, 1, 1, L) > pos.long().view(b, 1, 1, 1)
scale = 1.0 / math.sqrt(self.head_dim)
scores = torch.einsum("bqhd,bkhd->bhqk", q, k_r) * scale
scores = scores.masked_fill(mask, float("-inf"))
attn = F.softmax(scores.float(), dim=-1).to(v_r.dtype)
out = torch.einsum("bhqk,bkhd->bqhd", attn, v_r).reshape(b, self.d_model).to(x_t.dtype)
return self.out_proj(out), (k_cache, v_cache)
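    # Note on the step() cache layout above: SeqCondModel.prefill/init_state pre-allocate the
    # k/v caches as zero tensors of shape (B, maxlen, num_kv_heads, head_dim), and step()
    # writes the current token's key/value in place at index `pos`, so decoding needs no
    # reallocation and attends only over the first `seq_len` cache rows.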
class TransformerDecoderBlock(nn.Module):
def __init__(
self,
d_model: int,
num_heads: int,
d_ff: int,
num_kv_heads: Optional[int] = None,
dropout: float = 0.0,
norm_eps: float = 1e-6,
qk_norm: bool = False,
qk_norm_eps: float = 1e-6,
):
super().__init__()
self.norm1 = RMSNorm(d_model, epsilon=norm_eps)
self.attn = RotarySelfAttention(d_model, num_heads, num_kv_heads, dropout, qk_norm, qk_norm_eps)
self.norm2 = RMSNorm(d_model, epsilon=norm_eps)
self.ff_in = nn.Linear(d_model, 2 * d_ff, bias=True)
self.ff_out = nn.Linear(d_ff, d_model, bias=True)
self.dropout = dropout
def forward(self, x, cos, sin, mask=None, return_state=False):
y = self.norm1(x)
if return_state:
y, kv = self.attn(y, cos=cos, sin=sin, mask=mask, return_state=True)
else:
y = self.attn(y, cos=cos, sin=sin, mask=mask)
if self.dropout > 0 and self.training:
y = F.dropout(y, p=self.dropout)
x = x + y
y = self.norm2(x)
u, v = self.ff_in(y).chunk(2, dim=-1)
y = self.ff_out(F.silu(v) * u)
if self.dropout > 0 and self.training:
y = F.dropout(y, p=self.dropout)
out = x + y
return (out, kv) if return_state else out
def step(self, x_t, kv_cache, pos, cos_t, sin_t, seq_len=None):
y = self.norm1(x_t)
y, new_kv = self.attn.step(y, kv_cache, pos, cos_t, sin_t, seq_len=seq_len)
x_t = x_t + y
y = self.norm2(x_t)
u, v = self.ff_in(y).chunk(2, dim=-1)
return x_t + self.ff_out(F.silu(v) * u), new_kv
# ---------------------------------------------------------------------------
# SeqCond attention block
# ---------------------------------------------------------------------------
class SeqCondAttention(nn.Module):
def __init__(
self,
d_model: int,
num_heads: int = 12,
num_query_heads: int = 6,
num_anchor_heads: int = 0,
num_thetas: int = 1,
conv_kernel_size: int = 4,
expand_factor: int = 1,
out_expand_factor: int = 3,
dropout: float = 0.0,
maxlen: Optional[int] = None,
**kwargs,
):
super().__init__()
assert num_heads % num_query_heads == 0
self.d_model = d_model
self.K = num_heads
self.K_q = num_query_heads
self.n_rep = num_heads // num_query_heads
self.M = num_thetas
self.num_decay_heads = num_heads - num_anchor_heads
self.num_anchor_heads = num_anchor_heads
self.conv_kernel_size = conv_kernel_size
self.dropout_rate = dropout
self.maxlen = maxlen
d_inner = int(d_model * expand_factor)
self.H = max(1, d_inner // (self.K * self.M))
self.dim_memory = self.K * self.H
self.dim_query_head = self.H * self.M * 2
self.dim_query_total = self.K_q * self.dim_query_head
self.dim_expand = self.H * out_expand_factor
self.dim_swiglu_head = self.dim_expand * 2
self.dim_swiglu_total = self.K * self.dim_swiglu_head
self.dim_mem_total = self.dim_memory + self.K
self.dim_conv_total = self.dim_mem_total + self.dim_query_total
self.in_proj = nn.Linear(d_model, self.dim_conv_total, bias=False)
self.conv_weight = nn.Parameter(torch.empty(self.dim_conv_total, 1, conv_kernel_size))
nn.init.kaiming_normal_(self.conv_weight)
# Cached buffers (computed lazily)
self.register_buffer("_conv_kernel_t", None)
self.register_buffer("_theta_cached", None)
self.register_buffer("_w_int_cached", None)
self.register_buffer("_decay_slopes_cached", None)
self.register_buffer("_anchor_slopes_cached", None)
self.register_buffer("_phase_scale_b", None)
self.register_buffer("_score_scale_b", None)
self.register_buffer("_score_bias_b", None)
self._triton_out_re_buffer = None
self._triton_out_im_buffer = None
self._triton_norm_buffer = None
if self.M == 1:
init_theta = np.geomspace(0.001, 3.0, self.K).reshape(1, 1, self.K, 1, 1)
init_theta = np.tile(init_theta, (1, 1, 1, self.H, 1))
x = np.clip((init_theta - 0.001) / 2.999, 1e-4, 1 - 1e-4)
self.theta_raw = nn.Parameter(torch.from_numpy((np.log(x) - np.log(1 - x)).astype(np.float32)))
self.w_int_raw = nn.Parameter(torch.zeros(1, 1, self.K_q, self.n_rep, self.H, 1))
else:
init_vals = np.geomspace(0.001, 3.0, self.M).reshape(1, 1, 1, 1, self.M)
init_vals = np.tile(init_vals, (1, 1, self.K, self.H, 1))
self.theta_d_raw = nn.Parameter(torch.from_numpy(np.log(np.exp(init_vals) - 1.0 + 1e-4).astype(np.float32)))
self.w_int_raw = nn.Parameter(torch.zeros(1, 1, self.K_q, self.n_rep, self.H, self.M))
if self.num_decay_heads > 0:
self.decay_slopes = nn.Parameter(
torch.from_numpy(np.log(np.exp(np.geomspace(0.001, 0.1, self.num_decay_heads)) - 1).astype(np.float32))
)
if self.num_anchor_heads > 0:
self.anchor_slopes = nn.Parameter(
torch.from_numpy(np.log(np.exp(np.geomspace(0.01, 0.1, self.num_anchor_heads)) - 1).astype(np.float32))
)
self.score_scale = nn.Parameter(torch.ones(self.K))
self.score_bias = nn.Parameter(torch.zeros(self.K))
self.phase_scale = nn.Parameter(torch.ones(self.K))
self.gate_proj = nn.Linear(d_model, self.K * 2 * self.H, bias=False)
self.gated_norm = GatedRMSNorm(self.K * 2 * self.H)
self.W_readout = nn.Parameter(torch.empty(self.K, 2 * self.H, self.dim_swiglu_head))
nn.init.xavier_uniform_(self.W_readout)
self.out_proj = nn.Linear(self.dim_swiglu_total // 2, d_model, bias=False)
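    # Dimension bookkeeping for the projections above, with illustrative numbers (not taken
    # from any particular checkpoint): d_model=512, expand_factor=1, K=num_heads=8,
    # K_q=num_query_heads=4, M=num_thetas=1, out_expand_factor=3 gives
    #   H               = 512 // (8 * 1)   = 64
    #   dim_memory      = 8 * 64           = 512
    #   dim_query_head  = 64 * 1 * 2       = 128,   dim_query_total = 4 * 128 = 512
    #   dim_mem_total   = 512 + 8          = 520    (memory channels + per-head scores)
    #   dim_conv_total  = 520 + 512        = 1032   (all channels share one depthwise conv)
    #   dim_swiglu_head = 64 * 3 * 2       = 384,   dim_swiglu_total = 8 * 384 = 3072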
def forward(self, x: torch.Tensor, mask=None, return_state: bool = False):
B, L, D = x.shape
z_conv = self.in_proj(x)
z_conv_t = F.pad(z_conv.transpose(1, 2), (self.conv_kernel_size - 1, 0))
z_conv = F.silu(F.conv1d(z_conv_t, self.conv_weight, groups=self.dim_conv_total).transpose(1, 2))
z_mem = z_conv[..., : self.dim_mem_total]
q_raw = z_conv[..., self.dim_mem_total :]
k_val = z_mem[..., : self.dim_memory].reshape(B, L, self.K, self.H)
s_raw = z_mem[..., self.dim_memory :]
q_raw = q_raw.reshape(B, L, self.K_q, 1, self.H, self.M, 2)
q_re, q_im = q_raw[..., 0], q_raw[..., 1]
if self.M == 1:
theta = 0.001 + 2.999 * torch.sigmoid(self.theta_raw)
else:
theta_d = F.softplus(self.theta_d_raw) + 1e-4
theta_accum = torch.cumsum(theta_d, dim=-1)
theta = 0.001 + (theta_accum / theta_accum[..., -1:]) * 2.999
w_int = torch.exp(self.w_int_raw)
w_int = w_int / (w_int.sum(dim=-1, keepdim=True) + 1e-6)
pos = torch.arange(L, dtype=torch.float32, device=x.device)
log_w_list = []
if self.num_decay_heads > 0:
slopes = F.softplus(self.decay_slopes).view(1, 1, -1)
dist = torch.clamp((self.maxlen or L) - 1 - pos, min=0.0).view(1, L, 1)
log_w_list.append(-slopes * dist)
if self.num_anchor_heads > 0:
log_w_list.append(-F.softplus(self.anchor_slopes).view(1, 1, -1) * pos.view(1, L, 1))
log_tw = torch.cat(log_w_list, dim=2) if log_w_list else torch.zeros(1, L, self.K, device=x.device)
score_raw = self.score_scale.view(1, 1, -1) * s_raw.float() + self.score_bias.view(1, 1, -1)
p_w = (F.softplus(score_raw) * torch.exp(log_tw)).clamp(1e-4, 5000.0)
k_f32 = k_val.float().unsqueeze(-1)
p_w_b = p_w.unsqueeze(-1).unsqueeze(-1)
phase_scale_b = self.phase_scale.view(1, 1, self.K, 1, 1)
k_scaled = k_f32 * phase_scale_b
phi = (k_scaled / (1.0 + k_scaled.abs())) * theta
kvw = k_f32 * p_w_b
re = kvw * torch.cos(phi)
im = kvw * torch.sin(phi)
flat_size = self.K * self.H * self.M
stack = torch.cat([p_w.float(), re.reshape(B, L, -1), im.reshape(B, L, -1)], dim=-1)
cumsum = torch.cumsum(stack, dim=1)
den_acc = cumsum[..., : self.K]
re_acc = cumsum[..., self.K : self.K + flat_size].reshape(B, L, self.K, self.H, self.M)
im_acc = cumsum[..., self.K + flat_size :].reshape(B, L, self.K, self.H, self.M)
inv_den = (1.0 / torch.clamp(den_acc, min=1e-4)).unsqueeze(-1).unsqueeze(-1)
state_re_g = (re_acc * inv_den).reshape(B, L, self.K_q, self.n_rep, self.H, self.M)
state_im_g = (im_acc * inv_den).reshape(B, L, self.K_q, self.n_rep, self.H, self.M)
scale = 1.0 / (self.H ** 0.5)
match_re = ((state_re_g * q_re + state_im_g * q_im) * scale).float()
match_im = ((state_im_g * q_re - state_re_g * q_im) * scale).float()
out_re = ((match_re * w_int.float()).sum(dim=-1)).reshape(B, L, self.K, self.H).to(x.dtype)
out_im = ((match_im * w_int.float()).sum(dim=-1)).reshape(B, L, self.K, self.H).to(x.dtype)
out_complex = self.gated_norm(torch.cat([out_re, out_im], dim=-1).reshape(B, L, -1), self.gate_proj(x))
out_complex = out_complex.reshape(B, L, self.K, 2 * self.H)
y_raw = torch.einsum("blkf,kfn->blkn", out_complex, self.W_readout.to(out_complex.dtype))
y_val, y_gate = y_raw.chunk(2, dim=-1)
output = self.out_proj((y_val * torch.sigmoid(y_gate)).reshape(B, L, -1).to(x.dtype))
if return_state:
z_pre = self.in_proj(x)
buf_sz = self.conv_kernel_size - 1
conv_buf = z_pre[:, -buf_sz:] if L >= buf_sz else torch.cat([
torch.zeros(B, buf_sz - L, self.dim_conv_total, device=x.device, dtype=z_pre.dtype), z_pre], dim=1)
state = (
p_w.sum(dim=1),
re_acc[:, -1],
im_acc[:, -1],
torch.full((B,), L, dtype=torch.float32, device=x.device),
conv_buf,
)
return output, state
return output
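    # State tuple produced by forward(return_state=True) and consumed by step(); shapes match
    # SeqCondModel.init_state:
    #   den_acc  : (B, K)            running sum of the softplus scores p_w
    #   re_acc   : (B, K, H, M)      running sum of k * p_w * cos(phi)
    #   im_acc   : (B, K, H, M)      running sum of k * p_w * sin(phi)
    #   pos      : (B,)              number of tokens absorbed so far
    #   conv_buf : (B, conv_kernel_size - 1, dim_conv_total)   tail of in_proj outputs for the causal conv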
def step(self, x_t: torch.Tensor, state: Tuple, use_triton: bool = False) -> Tuple:
B, D = x_t.shape
den_acc, re_acc, im_acc, pos, conv_buffer = state
z_conv = self.in_proj(x_t)
if self._conv_kernel_t is None or self._conv_kernel_t.device != z_conv.device:
self._conv_kernel_t = self.conv_weight[:, 0, :].t().contiguous()
conv_input = torch.cat([conv_buffer, z_conv.unsqueeze(1)], dim=1)
z_conv_act = F.silu((conv_input * self._conv_kernel_t).sum(dim=1))
z_mem = z_conv_act[..., : self.dim_mem_total]
q_raw = z_conv_act[..., self.dim_mem_total :]
k_val = z_mem[..., : self.dim_memory].reshape(B, self.K, self.H)
s_raw = z_mem[..., self.dim_memory :]
q_raw = q_raw.reshape(B, self.K_q, 1, self.H, self.M, 2)
q_re, q_im = q_raw[..., 0], q_raw[..., 1]
if self._theta_cached is None:
if self.M == 1:
self._theta_cached = (0.001 + 2.999 * torch.sigmoid(self.theta_raw))[0, 0]
else:
theta_d = F.softplus(self.theta_d_raw) + 1e-4
theta_accum = torch.cumsum(theta_d, dim=-1)
self._theta_cached = (0.001 + (theta_accum / theta_accum[..., -1:]) * 2.999)[0, 0]
w = torch.exp(self.w_int_raw)
self._w_int_cached = w / (w.sum(dim=-1, keepdim=True) + 1e-6)
self._w_int_cached = self._w_int_cached[0, 0]
theta = self._theta_cached
w_int = self._w_int_cached
if self._decay_slopes_cached is None and self.num_decay_heads > 0:
self._decay_slopes_cached = F.softplus(self.decay_slopes).view(1, -1)
if self._anchor_slopes_cached is None and self.num_anchor_heads > 0:
self._anchor_slopes_cached = F.softplus(self.anchor_slopes).view(1, -1)
if self._score_scale_b is None:
self._score_scale_b = self.score_scale.view(1, -1)
self._score_bias_b = self.score_bias.view(1, -1)
self._phase_scale_b = self.phase_scale.view(1, self.K, 1, 1)
log_w_list = []
if self.num_decay_heads > 0:
dist = (self.maxlen or 2048) - 1 - pos.unsqueeze(-1)
log_w_list.append(-self._decay_slopes_cached * dist.clamp(min=0.0))
if self.num_anchor_heads > 0:
log_w_list.append(-self._anchor_slopes_cached * pos.unsqueeze(-1))
log_tw = torch.cat(log_w_list, dim=1) if log_w_list else torch.zeros(B, self.K, device=x_t.device)
if (
use_triton
and x_t.is_cuda
and self.n_rep == 1
and TRITON_AVAILABLE
and seqcond_step_triton is not None
):
if (
self._triton_out_re_buffer is None
or self._triton_out_re_buffer.shape != (B, self.K, self.H)
or self._triton_out_re_buffer.device != x_t.device
):
self._triton_out_re_buffer = torch.empty(
B, self.K, self.H, device=x_t.device, dtype=torch.float32
)
self._triton_out_im_buffer = torch.empty_like(
self._triton_out_re_buffer
)
out_re, out_im = seqcond_step_triton(
k_val,
s_raw,
q_re.squeeze(2),
q_im.squeeze(2),
re_acc,
im_acc,
den_acc,
theta,
w_int,
self.phase_scale,
self.score_scale,
self.score_bias,
log_tw,
out_re_buffer=self._triton_out_re_buffer,
out_im_buffer=self._triton_out_im_buffer,
)
out_complex = torch.cat([out_re, out_im], dim=-1)
else:
score_raw = self._score_scale_b * s_raw.float() + self._score_bias_b
p_w = (F.softplus(score_raw) * torch.exp(log_tw)).clamp(1e-4, 5000.0)
k_f32 = k_val.float().unsqueeze(-1)
k_scaled = k_f32 * self._phase_scale_b
phi = (k_scaled / (1.0 + k_scaled.abs())) * theta
kvw = k_f32 * p_w.unsqueeze(-1).unsqueeze(-1)
re = kvw * torch.cos(phi)
im = kvw * torch.sin(phi)
den_acc.add_(p_w); re_acc.add_(re); im_acc.add_(im)
inv_den = (1.0 / torch.clamp(den_acc, min=1e-4)).unsqueeze(-1).unsqueeze(-1)
state_re_g = (re_acc * inv_den).reshape(B, self.K_q, self.n_rep, self.H, self.M)
state_im_g = (im_acc * inv_den).reshape(B, self.K_q, self.n_rep, self.H, self.M)
scale = 1.0 / (self.H ** 0.5)
match_re = ((state_re_g * q_re + state_im_g * q_im) * scale).float()
match_im = ((state_im_g * q_re - state_re_g * q_im) * scale).float()
out_re = ((match_re * w_int.float()).sum(-1)).reshape(B, self.K, self.H).to(x_t.dtype)
out_im = ((match_im * w_int.float()).sum(-1)).reshape(B, self.K, self.H).to(x_t.dtype)
out_complex = torch.cat([out_re, out_im], dim=-1)
out_complex = out_complex.reshape(B, self.K, 2 * self.H)
out_complex_flat = out_complex.reshape(B, -1)
gate_for_norm = self.gate_proj(x_t)
if use_triton and x_t.is_cuda and gated_rmsnorm_triton is not None:
if (
self._triton_norm_buffer is None
or self._triton_norm_buffer.shape != out_complex_flat.shape
or self._triton_norm_buffer.device != x_t.device
):
self._triton_norm_buffer = torch.empty(
out_complex_flat.shape,
device=x_t.device,
dtype=torch.float32,
)
out_flat = gated_rmsnorm_triton(
out_complex_flat,
gate_for_norm,
self.gated_norm.weight,
self.gated_norm.epsilon,
out_buffer=self._triton_norm_buffer,
)
else:
out_flat = self.gated_norm(out_complex_flat, gate_for_norm)
out_complex = out_flat.to(x_t.dtype).reshape(B, self.K, 2 * self.H)
y_raw = torch.einsum("bkf,kfn->bkn", out_complex, self.W_readout.to(out_complex.dtype))
y_val, y_gate = y_raw.chunk(2, dim=-1)
out = self.out_proj((y_val * torch.sigmoid(y_gate)).reshape(B, -1).to(x_t.dtype))
pos.add_(1).clamp_(max=(self.maxlen or 2048) - 1)
if self.conv_kernel_size > 1:
if self.conv_kernel_size > 2:
conv_buffer[:, :-1, :].copy_(conv_buffer[:, 1:, :].clone())
conv_buffer[:, -1, :].copy_(z_conv)
return out, (den_acc, re_acc, im_acc, pos, conv_buffer)
class SeqCondBlock(nn.Module):
def __init__(self, d_model: int, norm_eps: float = 1e-6, **kwargs):
super().__init__()
self.norm = RMSNorm(d_model, epsilon=norm_eps)
self.attn = SeqCondAttention(d_model=d_model, **kwargs)
def forward(self, x, mask=None, return_state=False):
if return_state:
out, state = self.attn(self.norm(x), mask=mask, return_state=True)
return x + out, state
return x + self.attn(self.norm(x), mask=mask)
def step(self, x_t, state, use_triton=False):
out, new_state = self.attn.step(self.norm(x_t), state, use_triton=use_triton)
return x_t + out, new_state
# ---------------------------------------------------------------------------
# Core SeqCond language model
# ---------------------------------------------------------------------------
class SeqCondModel(nn.Module):
"""Core SeqCond model (no HF wrapper). Used internally by SeqCondForCausalLM."""
def __init__(self, config: SeqCondConfig):
super().__init__()
self.d_model = config.d_model
self.d_ff = config.d_ff
self.num_layers = config.num_layers
self.vocab_size = config.vocab_size
self.maxlen = config.maxlen
self.num_heads = config.num_heads
self.num_kv_heads = config.num_kv_heads if config.num_kv_heads is not None else config.num_heads
self.seqcond_ratio = config.seqcond_ratio
self.embedding = nn.Embedding(config.vocab_size, config.d_model)
self.use_positional_embedding = config.use_positional_embedding
if config.use_positional_embedding:
self.position_embedding = nn.Embedding(config.maxlen, config.d_model)
head_dim = config.d_model // config.num_heads
cos, sin = precompute_freqs(config.maxlen, head_dim)
self.register_buffer("cos_emb", cos)
self.register_buffer("sin_emb", sin)
self.blocks = nn.ModuleList()
self.block_types = []
for i in range(config.num_layers):
if (i + 1) % (config.seqcond_ratio + 1) == 0:
block = TransformerDecoderBlock(
d_model=config.d_model,
num_heads=config.num_heads,
d_ff=config.d_ff,
num_kv_heads=self.num_kv_heads,
dropout=config.dropout,
qk_norm=config.qk_norm,
qk_norm_eps=config.qk_norm_eps,
)
self.block_types.append("transformer")
else:
block = SeqCondBlock(
d_model=config.d_model,
num_heads=config.seqcond_heads,
num_query_heads=config.num_query_heads,
num_anchor_heads=config.num_anchor_heads,
num_thetas=config.num_thetas,
conv_kernel_size=config.conv_kernel_size,
expand_factor=config.expand_factor,
out_expand_factor=config.out_expand_factor,
dropout=config.dropout,
maxlen=config.maxlen,
)
self.block_types.append("seqcond")
self.blocks.append(block)
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
if config.tie_weights:
self.lm_head.weight = self.embedding.weight
def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
B, L = input_ids.shape
x = self.embedding(input_ids)
if self.use_positional_embedding:
x = x + self.position_embedding(torch.arange(L, device=input_ids.device))
cos = self.cos_emb[:L].unsqueeze(0).unsqueeze(2).expand(B, L, self.num_heads, -1)
sin = self.sin_emb[:L].unsqueeze(0).unsqueeze(2).expand(B, L, self.num_heads, -1)
for block, bt in zip(self.blocks, self.block_types):
x = block(x, cos, sin) if bt == "transformer" else block(x)
return self.lm_head(x)
def prefill(self, input_ids: torch.Tensor, return_all_logits: bool = False):
B, L = input_ids.shape
device = input_ids.device
x = self.embedding(input_ids)
if self.use_positional_embedding:
x = x + self.position_embedding(torch.arange(L, device=device))
cos = self.cos_emb[:L].unsqueeze(0).unsqueeze(2).expand(B, L, self.num_heads, -1)
sin = self.sin_emb[:L].unsqueeze(0).unsqueeze(2).expand(B, L, self.num_heads, -1)
states = []
for block, bt in zip(self.blocks, self.block_types):
if bt == "transformer":
x, kv = block(x, cos, sin, return_state=True)
k, v = kv
k_cache = torch.zeros(B, self.maxlen, self.num_kv_heads, self.d_model // self.num_heads, device=device, dtype=k.dtype)
v_cache = torch.zeros_like(k_cache)
k_cache[:, :L] = k; v_cache[:, :L] = v
states.append((k_cache, v_cache))
else:
x, state = block(x, return_state=True)
states.append(state)
logits = self.lm_head(x)
if return_all_logits:
return logits, states
return logits[:, -1:, :], states
def init_state(self, batch_size: int, device: torch.device) -> List:
states = []
for block, bt in zip(self.blocks, self.block_types):
if bt == "transformer":
k = torch.zeros(batch_size, self.maxlen, self.num_kv_heads, self.d_model // self.num_heads, device=device)
states.append((k, torch.zeros_like(k)))
else:
a = block.attn
states.append((
torch.zeros(batch_size, a.K, device=device),
torch.zeros(batch_size, a.K, a.H, a.M, device=device),
torch.zeros(batch_size, a.K, a.H, a.M, device=device),
torch.zeros(batch_size, device=device),
torch.zeros(batch_size, a.conv_kernel_size - 1, a.dim_conv_total, device=device),
))
return states
def step(self, token_id: torch.Tensor, states: List, pos=None, seq_len=None, use_triton=False):
B = token_id.size(0)
if pos is None:
for state, bt in zip(states, self.block_types):
if bt == "seqcond":
pos = state[3]; break
if pos is None:
pos = torch.zeros(B, device=token_id.device, dtype=torch.long)
x = self.embedding(token_id).squeeze(1)
pos = pos.clamp(max=self.maxlen - 1)
if self.use_positional_embedding:
x = x + torch.index_select(self.position_embedding.weight, 0, pos.long())
pos_idx = pos.long()
cos_t = torch.index_select(self.cos_emb, 0, pos_idx).unsqueeze(1).unsqueeze(1).expand(B, 1, self.num_heads, -1)
sin_t = torch.index_select(self.sin_emb, 0, pos_idx).unsqueeze(1).unsqueeze(1).expand(B, 1, self.num_heads, -1)
new_states = []
for block, bt, state in zip(self.blocks, self.block_types, states):
if bt == "transformer":
x, ns = block.step(x, state, pos, cos_t, sin_t, seq_len=seq_len)
else:
x, ns = block.step(x, state, use_triton=use_triton)
new_states.append(ns)
return self.lm_head(x), new_states
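    # Minimal greedy decode loop using prefill()/step() directly (a sketch of what
    # SeqCondForCausalLM.generate() does, minus sampling and CUDA-graph acceleration;
    # `core` is the inner SeqCondModel, i.e. SeqCondForCausalLM.model):
    #
    #   logits, states = core.prefill(input_ids)             # logits: (B, 1, vocab)
    #   token = logits[:, -1].argmax(-1, keepdim=True)        # (B, 1)
    #   seq_len = input_ids.size(1)
    #   for _ in range(max_new_tokens):
    #       seq_len += 1
    #       logits, states = core.step(token, states, seq_len=seq_len)   # logits: (B, vocab)
    #       token = logits.argmax(-1, keepdim=True)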
# ---------------------------------------------------------------------------
# HuggingFace wrapper
# ---------------------------------------------------------------------------
class SeqCondPreTrainedModel(PreTrainedModel):
config_class = SeqCondConfig
base_model_prefix = "model"
supports_gradient_checkpointing = False
def _init_weights(self, module):
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight, std=0.02)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, std=0.02)
class SeqCondForCausalLM(SeqCondPreTrainedModel):
"""
SeqCond causal language model, HuggingFace-compatible.
Supports:
- Standard HF forward() for training / perplexity evaluation.
- Custom generate() using state-based O(1) decoding.
- generate_batch() for batched generation with per-sample early stopping.
- precompute() / use_cuda_graph=True for CUDA-graph-accelerated decoding.
"""
_CUDA_GRAPH_SEQ_LENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
def __init__(self, config: SeqCondConfig):
super().__init__(config)
self.model = SeqCondModel(config)
self.post_init()
# CUDA graph state
self._cg_graphs: dict = {}
self._cg_logits: dict = {}
self._cg_token: Optional[torch.Tensor] = None
self._cg_states: Optional[list] = None
self._cg_use_triton: bool = False
self._cg_ready: bool = False # True after precompute() has been called
# ------------------------------------------------------------------
# CUDA graph helpers
# ------------------------------------------------------------------
def _cg_get_seq_len(self, pos: int) -> int:
for s in self._CUDA_GRAPH_SEQ_LENS:
if s >= pos + 1:
return s
return self._CUDA_GRAPH_SEQ_LENS[-1]
def _cg_copy_states(self, src, dst):
for s, d in zip(src, dst):
for st, dt in zip(s, d):
dt.copy_(st)
def _cg_capture(self, seq_len: int):
saved = self.model.init_state(1, device=self._cg_token.device)
self._cg_copy_states(self._cg_states, saved)
stream = torch.cuda.Stream()
stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(stream):
for _ in range(3):
self.model.step(self._cg_token, self._cg_states,
seq_len=seq_len, use_triton=self._cg_use_triton)
torch.cuda.current_stream().wait_stream(stream)
self._cg_copy_states(saved, self._cg_states)
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
logits, _ = self.model.step(self._cg_token, self._cg_states,
seq_len=seq_len, use_triton=self._cg_use_triton)
self._cg_copy_states(saved, self._cg_states)
self._cg_graphs[seq_len] = graph
self._cg_logits[seq_len] = logits
@torch.no_grad()
def precompute(self, max_seq_len: int = 2048, use_triton: bool = False):
"""Pre-capture CUDA graphs up to max_seq_len. Call once after loading."""
if not torch.cuda.is_available():
return
if self._cg_use_triton != use_triton:
self._cg_graphs = {}
self._cg_logits = {}
self._cg_use_triton = use_triton
device = next(self.parameters()).device
self._cg_token = torch.zeros((1, 1), dtype=torch.long, device=device)
self._cg_states = self.model.init_state(1, device=device)
for s in self._CUDA_GRAPH_SEQ_LENS:
if s > max_seq_len:
break
self._cg_capture(s)
self._cg_ready = True
print(f"Pre-captured {len(self._cg_graphs)} CUDA graphs (triton={use_triton}).")
def get_input_embeddings(self):
return self.model.embedding
def set_input_embeddings(self, value):
self.model.embedding = value
def get_output_embeddings(self):
return self.model.lm_head
def set_output_embeddings(self, value):
self.model.lm_head = value
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
**kwargs,
) -> CausalLMOutputWithPast:
"""
Standard forward pass (used for training / perplexity).
Note: attention_mask is accepted for API compatibility but is not used
in the forward pass — SeqCond is always causal.
"""
logits = self.model(input_ids)
loss = None
if labels is not None:
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = F.cross_entropy(
shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1),
)
return CausalLMOutputWithPast(loss=loss, logits=logits)
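    # Perplexity sketch (illustrative): forward() shifts logits/labels internally, so the
    # input ids can be passed as labels directly:
    #
    #   out = model(input_ids=ids, labels=ids)
    #   ppl = torch.exp(out.loss)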
@staticmethod
def _detect_triton() -> bool:
try:
import triton # noqa: F401
return True
except ImportError:
return False
@torch.no_grad()
def generate(
self,
input_ids: torch.LongTensor,
max_new_tokens: int = 1024,
temperature: float = 0.15,
top_p: float = 0.9,
top_k: int = 50,
repetition_penalty: float = 1.1,
eos_token_id: Optional[int] = None,
acceleration: str = "auto",
use_triton: Optional[bool] = None,
use_cuda_graph: Optional[bool] = None,
**kwargs,
) -> torch.LongTensor:
"""
Autoregressive generation with state-based O(1) decoding.
Args:
acceleration: One of ``"auto"`` (default), ``"cuda_graph"``,
``"triton"`` (cuda_graph + triton), or ``"none"``.
``"auto"`` uses CUDA graphs when a GPU is available, and adds
Triton kernels automatically if the triton package is installed.
Explicit ``use_triton`` / ``use_cuda_graph`` kwargs override this.
Returns the full sequence (prompt + generated tokens) as a LongTensor.
"""
# ------------------------------------------------------------------
# Resolve acceleration mode
# ------------------------------------------------------------------
on_cuda = torch.cuda.is_available() and input_ids.device.type == "cuda"
if acceleration == "auto":
_use_cuda_graph = on_cuda
_use_triton = on_cuda and self._detect_triton()
elif acceleration == "triton":
_use_cuda_graph = on_cuda
_use_triton = on_cuda
elif acceleration == "cuda_graph":
_use_cuda_graph = on_cuda
_use_triton = False
else: # "none"
_use_cuda_graph = False
_use_triton = False
# Legacy kwargs override
if use_cuda_graph is not None:
_use_cuda_graph = use_cuda_graph and on_cuda
if use_triton is not None:
_use_triton = use_triton and on_cuda
# Lazy precompute on first generate() call
if _use_cuda_graph and not self._cg_ready:
self.precompute(max_seq_len=2048, use_triton=_use_triton)
elif _use_cuda_graph and self._cg_use_triton != _use_triton:
self.precompute(max_seq_len=2048, use_triton=_use_triton)
use_triton = _use_triton
use_cuda_graph = _use_cuda_graph
if eos_token_id is None:
eos_token_id = self.config.eos_token_id
device = input_ids.device
B = input_ids.size(0)
# Prefill
logits, states = self.model.prefill(input_ids)
logits = logits.squeeze(1) # (B, vocab)
generated = input_ids.tolist()
finished = [False] * B
token_buf = torch.zeros((B, 1), dtype=torch.long, device=device)
seq_len = input_ids.size(1)
# CUDA graph: sync prefill states into static buffer once before decode loop
if use_cuda_graph and torch.cuda.is_available() and B == 1:
if self._cg_token is None:
self._cg_use_triton = use_triton
self._cg_token = torch.zeros((1, 1), dtype=torch.long, device=device)
self._cg_states = self.model.init_state(1, device=device)
self._cg_copy_states(states, self._cg_states)
states = self._cg_states
for _ in range(max_new_tokens):
# Temperature scaling
if temperature > 0:
ls = logits / temperature
else:
ls = logits.clone()
            # Repetition penalty (sign-aware, as in the CTRL formulation: previously seen
            # tokens are pushed towards lower probability whether their logit is positive
            # or negative)
            if repetition_penalty != 1.0:
                for bi, toks in enumerate(generated):
                    for t in set(toks):
                        if 0 <= t < self.config.vocab_size:
                            if ls[bi, t] > 0:
                                ls[bi, t] /= repetition_penalty
                            else:
                                ls[bi, t] *= repetition_penalty
# Sampling
if temperature == 0:
next_tokens = torch.argmax(ls, dim=-1)
else:
if top_k > 0:
kth = torch.topk(ls, top_k, dim=-1).values[:, -1:]
ls = ls.masked_fill(ls < kth, float("-inf"))
if top_p < 1.0:
sorted_ls, sorted_idx = torch.sort(ls, dim=-1, descending=True)
cum_probs = torch.cumsum(F.softmax(sorted_ls, dim=-1), dim=-1)
sorted_remove = cum_probs > top_p
sorted_remove[:, 1:] = sorted_remove[:, :-1].clone()
sorted_remove[:, 0] = False
remove = torch.zeros_like(sorted_remove)
remove.scatter_(1, sorted_idx, sorted_remove)
ls = ls.masked_fill(remove, float("-inf"))
probs = F.softmax(ls, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)
for bi in range(B):
tok = next_tokens[bi].item()
generated[bi].append(tok)
if eos_token_id is not None and tok == eos_token_id:
finished[bi] = True
token_buf[bi, 0] = tok
if all(finished):
break
seq_len += 1
if use_cuda_graph and torch.cuda.is_available() and B == 1:
cg_sl = self._cg_get_seq_len(seq_len - 1)
if cg_sl not in self._cg_graphs:
self._cg_capture(cg_sl)
self._cg_token.copy_(token_buf)
self._cg_graphs[cg_sl].replay()
logits = self._cg_logits[cg_sl]
else:
logits, states = self.model.step(token_buf, states, seq_len=seq_len, use_triton=use_triton)
max_len = max(len(g) for g in generated)
pad_id = self.config.pad_token_id or 0
out = torch.full((B, max_len), pad_id, dtype=torch.long, device=device)
for bi, g in enumerate(generated):
out[bi, : len(g)] = torch.tensor(g, dtype=torch.long, device=device)
return out
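    # Example (illustrative): single-prompt generation with explicit acceleration control.
    #
    #   out_ids = model.generate(ids, max_new_tokens=64, acceleration="none")        # portable
    #   out_ids = model.generate(ids, max_new_tokens=64, acceleration="cuda_graph")  # GPU only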
@torch.no_grad()
def generate_batch(
self,
input_ids_list: List[torch.LongTensor],
max_new_tokens: int = 1024,
temperature: float = 0.7,
eos_token_id: Optional[int] = None,
use_triton: bool = False,
) -> List[List[int]]:
"""
Batched generation: each prompt is prefilled independently, then
decoded in lockstep with per-sample early stopping.
Args:
input_ids_list: List of 1D LongTensors, one per prompt.
Returns:
List of generated token id lists (completion only, EOS stripped).
"""
if eos_token_id is None:
eos_token_id = self.config.eos_token_id
device = input_ids_list[0].device
B = len(input_ids_list)
# Per-sample prefill
all_logits, all_states = [], []
for ids in input_ids_list:
lg, st = self.model.prefill(ids.unsqueeze(0))
all_logits.append(lg.squeeze(1))
all_states.append(st)
logits = torch.cat(all_logits, dim=0)
# Stack states
num_blocks = len(all_states[0])
states = [
tuple(torch.cat([s[i][j] for s in all_states], dim=0) for j in range(len(all_states[0][i])))
for i in range(num_blocks)
]
generated = [[] for _ in range(B)]
finished = [False] * B
active_map = list(range(B))
token_buf = torch.zeros((B, 1), dtype=torch.long, device=device)
seq_len = max(ids.size(0) for ids in input_ids_list)
for _ in range(max_new_tokens):
B_cur = len(active_map)
if B_cur == 0:
break
if temperature == 0:
next_tokens = torch.argmax(logits, dim=-1)
else:
probs = F.softmax(logits / temperature, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)
newly_done = set()
for bi in range(B_cur):
oi = active_map[bi]
tok = next_tokens[bi].item()
generated[oi].append(tok)
if eos_token_id is not None and tok == eos_token_id:
finished[oi] = True
newly_done.add(bi)
else:
token_buf[bi, 0] = tok
if all(finished):
break
if newly_done:
keep = [bi for bi in range(B_cur) if bi not in newly_done]
if not keep:
break
keep_idx = torch.tensor(keep, device=device)
token_buf = token_buf[keep_idx].contiguous()
states = [tuple(s[keep_idx].contiguous() for s in st) for st in states]
active_map = [active_map[bi] for bi in keep]
seq_len += 1
logits, states = self.model.step(token_buf, states, seq_len=seq_len, use_triton=use_triton)
results = []
for toks in generated:
if toks and toks[-1] == eos_token_id:
toks = toks[:-1]
results.append(toks)
return results
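    # Example (illustrative): batched decoding with per-sample early stopping.
    #
    #   prompts = [tok(p, return_tensors="pt").input_ids[0].to(device) for p in texts]
    #   completions = model.generate_batch(prompts, max_new_tokens=128, temperature=0.7)
    #   texts_out = [tok.decode(c) for c in completions]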