Kernels
Jangwoong Kim commited on
Commit
5adea7d
·
unverified ·
2 Parent(s): 79a877a · 536f0b2

Merge pull request #22 from MotifTechnologies/jangwoong/mla-rope-fa4-port

Browse files
benchmarks/bench_mla_rope.yaml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Kubeflow TrainJob that runs the MLA RoPE micro-benchmark
# (benchmarks/run_cases.py --case mla_rope) on one node. Results are written
# to the shared `mair` PVC under benchmarks/results/mla_rope/<timestamp>.
# NOTE(review): indentation reconstructed from a flattened scrape — verify
# nesting (esp. targetJobs placement) against the applied manifest.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: jangwoong-mla-rope-bench
  namespace: kbm-g-np-motif
spec:
  managedBy: trainer.kubeflow.org/trainjob-controller
  podTemplateOverrides:
    - spec:
        containers:
          - name: node
            volumeMounts:
              - mountPath: /dev/shm   # RAM-backed shm for torch/NCCL
                name: shm
              - mountPath: /mair      # shared volume: source tree + results
                name: mair
        volumes:
          - emptyDir:
              medium: Memory
              sizeLimit: 64Gi
            name: shm
          - name: mair
            persistentVolumeClaim:
              claimName: mair
      targetJobs:
        - name: node
  runtimeRef:
    apiGroup: trainer.kubeflow.org
    kind: ClusterTrainingRuntime
    name: torch-distributed
  suspend: false
  trainer:
    args:
      - /bin/bash
      - '-c'
      - |
        set -e
        ACTIVATIONPATH=/mair/team-sys/jangwoong/activation
        DATESTAMP=$(date +'%y_%m_%d_%H_%M')
        SAVE_PATH=$ACTIVATIONPATH/benchmarks/results/mla_rope/${DATESTAMP}
        mkdir -p $SAVE_PATH

        pip install triton pandas

        # Build activation from local source (copy to /tmp to avoid NFS race)
        mkdir -p /tmp/activation_src && rm -rf /tmp/activation_src/* && \
        rsync -a --exclude=build $ACTIVATIONPATH/ /tmp/activation_src/ && \
        pip install --no-build-isolation /tmp/activation_src 2>&1 | tail -50

        # Smoke-check that both fused entry points import before benchmarking.
        python -c "import activation; print('fused_q_rope_inplace:', activation.fused_q_rope_inplace); print('fused_kv_split_rope_cat:', activation.fused_kv_split_rope_cat)"

        nvidia-smi | tee $SAVE_PATH/nvidia_smi.txt

        echo "=== MLA RoPE benchmark ==="
        cd $ACTIVATIONPATH/benchmarks
        CUDA_VISIBLE_DEVICES=0 python run_cases.py --case mla_rope --dtype bf16 \
          --save-path $SAVE_PATH/bench 2>&1 | tee $SAVE_PATH/bench.log

        echo "=== Done. Results at: $SAVE_PATH ==="
        exit 0
    env:
      - name: PYTHONUNBUFFERED
        value: '1'
      - name: PYTORCH_ALLOC_CONF
        value: expandable_segments:True
      - name: CUDA_LAUNCH_BLOCKING
        value: '0'
      - name: OMP_NUM_THREADS
        value: '1'
    image: ghcr.io/motiftechnologies/llm-training:v0.1.8
    numNodes: 1
    numProcPerNode: 1
    resourcesPerNode:
      limits:
        cpu: '96'
        memory: 1024Gi
        # NOTE(review): the script pins CUDA_VISIBLE_DEVICES=0, so 8 GPUs may
        # be over-requested for this job — confirm whether full-node
        # allocation is intentional (exclusive-node scheduling).
        nvidia.com/gpu: '8'
      requests:
        cpu: '96'
        memory: 1024Gi
        nvidia.com/gpu: '8'
benchmarks/cases/mla_rope.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MLA RoPE case: fused (activation Triton) vs vanilla (PyTorch native).
2
+
3
+ MLA head dims are fixed by motif3 spec (H_q=80, H_kv=16, D_nope=128,
4
+ D_rope=64, D_v=128); the benchmark's only sweep axes are (bs, sl).
5
+ The framework's ``dim`` axis is a dummy here — pass 0 in configs.
6
+ """
7
+
8
+ import torch
9
+ from torch import nn
10
+
11
+ import activation
12
+
13
+ from common.diff_engine import DiffCase
14
+
15
# ---- MLA shapes (motif3_seq) -----------------------------------------------
# Fixed by the motif3 model spec (per the module docstring): 80 query heads,
# 16 kv heads, 128-dim non-rotary (nope) section, 64-dim rotary (rope)
# section, 128-dim value section. Only (bs, sl) are swept by the benchmark.
H_Q, H_KV = 80, 16
D_NOPE, D_ROPE, D_V = 128, 64, 128
D_QK = D_NOPE + D_ROPE  # 192
19
+
20
+
21
+ # ---- reference (PyTorch native) --------------------------------------------
22
+ def _precompute_freqs_cis(dim, end, theta=10000.0):
23
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: dim // 2].float() / dim))
24
+ t = torch.arange(end, dtype=torch.float32)
25
+ freqs = torch.outer(t, freqs)
26
+ return torch.polar(torch.ones_like(freqs), freqs)
27
+
28
+
29
+ def _apply_rotary_emb_single(x, freqs_cis):
30
+ x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
31
+ freqs_cis = freqs_cis[: x_.shape[1]].view(1, x_.shape[1], 1, x_.shape[3])
32
+ out = torch.view_as_real(x_ * freqs_cis).flatten(3)
33
+ return out.type_as(x)
34
+
35
+
36
+ def _reorder(qk, rope_dim):
37
+ B, S = qk.shape[0], qk.shape[1]
38
+ qk = qk.view(B, S, -1, rope_dim // 2, 2).transpose(3, 4)
39
+ return qk.reshape(B, S, -1, rope_dim)
40
+
41
+
42
def _vanilla(q, kv_latent, k_pe, freqs_cis):
    """PyTorch-native reference path; returns (q_total, k_full, v).

    q's trailing D_ROPE lanes are roped and reordered to contiguous
    [real..., imag...] format; kv_latent is split into k_nope/v; the
    head-shared k_pe is roped once and broadcast across the H_KV heads.
    """
    q_nope, q_pe = torch.split(q, [D_NOPE, D_ROPE], dim=-1)
    q_pe = _reorder(_apply_rotary_emb_single(q_pe, freqs_cis), D_ROPE)
    q_total = torch.cat([q_nope, q_pe], dim=-1)
    # k_pe is head-shared: unsqueeze to H=1 before roping.
    k_pe_roped = _reorder(_apply_rotary_emb_single(k_pe.unsqueeze(2), freqs_cis), D_ROPE)
    k_nope, v = torch.split(kv_latent, [D_NOPE, D_V], dim=-1)
    # expand (not repeat): zero-copy broadcast of the roped k_pe to H_KV heads.
    k_full = torch.cat([k_nope, k_pe_roped.expand(-1, -1, H_KV, -1)], dim=-1)
    return q_total, k_full, v
50
+
51
+
52
def _fused(q, kv_latent, k_pe, freqs_cis):
    """Fused path using the activation Triton kernels; returns (q_total, k_full, v).

    Must produce the same outputs as _vanilla (up to bf16 rounding on the
    q rope section).
    """
    # In-place rope on q's trailing D_ROPE lanes, output in contiguous format.
    q_total = activation.fused_q_rope_inplace(q, freqs_cis, D_NOPE, D_ROPE)
    # k_pe RoPE stays PyTorch native (head-shared, too small for custom kernel)
    k_pe_roped = _reorder(_apply_rotary_emb_single(k_pe.unsqueeze(2), freqs_cis), D_ROPE)
    # Fused split(kv_latent) + broadcast(k_pe_roped) + cat → (k_full, v).
    k_full, v = activation.fused_kv_split_rope_cat(kv_latent, k_pe_roped, D_NOPE, D_V, D_ROPE)
    return q_total, k_full, v
58
+
59
+
60
class _VanillaModule(nn.Module):
    """nn.Module wrapper around the reference path for the diff engine."""

    def forward(self, q, kv_latent, k_pe, freqs_cis):
        return _vanilla(q, kv_latent, k_pe, freqs_cis)
63
+
64
+
65
class _FusedModule(nn.Module):
    """nn.Module wrapper around the fused Triton path for the diff engine."""

    def forward(self, q, kv_latent, k_pe, freqs_cis):
        return _fused(q, kv_latent, k_pe, freqs_cis)
68
+
69
+
70
class MLARoPE(DiffCase):
    """DiffCase comparing the fused Triton MLA RoPE path against PyTorch native.

    Head dims are fixed module-level constants; only (bs, sl) vary.
    """

    # Framework calls build_inputs(bs, sl, dim, dtype, eps) — dim unused.
    def build_inputs(self, bs, sl, hidden, dtype, eps):
        # NOTE(review): tensors are created on the default device; presumably
        # the diff engine moves them to CUDA before the fused path — confirm.
        # 0.5 scale keeps values well inside bf16 range.
        return {
            "q": (torch.randn(bs, sl, H_Q, D_QK, dtype=dtype) * 0.5).requires_grad_(True),
            "kv_latent": (torch.randn(bs, sl, H_KV, D_NOPE + D_V, dtype=dtype) * 0.5).requires_grad_(True),
            "k_pe": (torch.randn(bs, sl, D_ROPE, dtype=dtype) * 0.5).requires_grad_(True),
            "freqs_cis": _precompute_freqs_cis(D_ROPE, sl),  # complex64
        }

    def make_naive(self, I):
        # PyTorch-native reference module.
        return _VanillaModule()

    def make_cuda(self, I):
        # Fused (activation Triton) module.
        return _FusedModule()

    def forward(self, obj, I):
        # fused_q_rope_inplace needs non-leaf q; wrap both paths for fairness
        q_in = I["q"] * 1.0
        kv_in = I["kv_latent"] * 1.0
        kpe_in = I["k_pe"] * 1.0
        return obj(q_in, kv_in, kpe_in, I["freqs_cis"])

    def grad_inputs(self, I):
        # Leaves whose .grad the engine compares after backward.
        return [I["q"], I["kv_latent"], I["k_pe"]]


# Module-level singleton discovered by run_cases.py as `mod.CASE`.
CASE = MLARoPE()
benchmarks/run_cases.py CHANGED
@@ -62,7 +62,8 @@ def main():
62
  ap = argparse.ArgumentParser()
63
  ap.add_argument(
64
  "--case",
65
- choices=["rms", "add_rms", "poly", "mul_poly", "grouped_mul_poly"],
 
66
  required=True)
67
  ap.add_argument("--plot", action="store_true")
68
  ap.add_argument(
@@ -95,12 +96,26 @@ def main():
95
  case: DiffCase = mod.CASE
96
 
97
  # Correctness checks across multiple configs
98
- for bs, sl, hid in [(2, 128, 4096), (8, 4096, 1280), (1, 32768, 1280)]:
 
 
 
 
 
 
 
 
 
99
  print(
100
  f"Checking correctness: bs={bs}, sl={sl}, D={hid} "
101
  f"(N={bs*sl})...",
102
  end=" ")
103
- calculate_diff(case, batch_size=bs, seq_len=sl, hidden_size=hid)
 
 
 
 
 
104
  print("✅")
105
 
106
  for dtype_name, dtype in dtypes:
@@ -109,6 +124,7 @@ def main():
109
  print(f"{'=' * 60}\n")
110
 
111
  save_dir = os.path.join(args.save_path, args.case, dtype_name)
 
112
  is_grouped = args.case == "grouped_mul_poly"
113
 
114
  if args.plot:
@@ -118,6 +134,8 @@ def main():
118
  dim = [1280]
119
  elif "poly" in args.case:
120
  dim = [8192, 16384]
 
 
121
  else:
122
  dim = [2048, 4096]
123
  configs = list(
@@ -170,6 +188,8 @@ def main():
170
  dim = [1280]
171
  elif "poly" in args.case:
172
  dim = [8192, 16384]
 
 
173
  else:
174
  dim = [2048, 4096]
175
  configs = list(
 
62
  ap = argparse.ArgumentParser()
63
  ap.add_argument(
64
  "--case",
65
+ choices=["rms", "add_rms", "poly", "mul_poly", "grouped_mul_poly",
66
+ "mla_rope"],
67
  required=True)
68
  ap.add_argument("--plot", action="store_true")
69
  ap.add_argument(
 
96
  case: DiffCase = mod.CASE
97
 
98
  # Correctness checks across multiple configs
99
+ # NOTE: calculate_diff positionally calls build_inputs(hidden_size, bs, sl);
100
+ # bench framework positionally calls build_inputs(bs, sl, dim). These
101
+ # disagree — rms-style cases don't care (all 3 axes are flat dims), but
102
+ # mla_rope does. We match the bench convention in cases/mla_rope.py, so
103
+ # we swap arg names at the correctness call site below for that case.
104
+ if args.case == "mla_rope":
105
+ cfgs = [(1, 1024, 0), (4, 4096, 0), (8, 4096, 0)] # (bs, sl, dummy)
106
+ else:
107
+ cfgs = [(2, 128, 4096), (8, 4096, 1280), (1, 32768, 1280)]
108
+ for bs, sl, hid in cfgs:
109
  print(
110
  f"Checking correctness: bs={bs}, sl={sl}, D={hid} "
111
  f"(N={bs*sl})...",
112
  end=" ")
113
+ if args.case == "mla_rope":
114
+ # Swap so positional (hidden_size, batch_size, seq_len) maps to
115
+ # our build_inputs(bs, sl, dim) as (bs, sl, dummy).
116
+ calculate_diff(case, batch_size=sl, seq_len=hid, hidden_size=bs)
117
+ else:
118
+ calculate_diff(case, batch_size=bs, seq_len=sl, hidden_size=hid)
119
  print("✅")
120
 
121
  for dtype_name, dtype in dtypes:
 
124
  print(f"{'=' * 60}\n")
125
 
126
  save_dir = os.path.join(args.save_path, args.case, dtype_name)
127
+ os.makedirs(save_dir, exist_ok=True)
128
  is_grouped = args.case == "grouped_mul_poly"
129
 
130
  if args.plot:
 
134
  dim = [1280]
135
  elif "poly" in args.case:
136
  dim = [8192, 16384]
137
+ elif args.case == "mla_rope":
138
+ dim = [0] # MLA head dims are fixed; dim axis is a dummy
139
  else:
140
  dim = [2048, 4096]
141
  configs = list(
 
188
  dim = [1280]
189
  elif "poly" in args.case:
190
  dim = [8192, 16384]
191
+ elif args.case == "mla_rope":
192
+ dim = [0] # MLA head dims are fixed; dim axis is a dummy
193
  else:
194
  dim = [2048, 4096]
195
  configs = list(
tests/test_mla_rope_grad.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Numerical parity test: activation MLA RoPE kernels vs PyTorch reference.
2
+
3
+ The activation package exposes two Triton kernels for Motif3 MLA attention:
4
+ * fused_q_rope_inplace — in-place RoPE on q's rope section
5
+ * fused_kv_split_rope_cat — split kv_latent + register-broadcast k_pe to H heads + cat
6
+
7
+ This test runs both the fused path and a pure-PyTorch reference over identical
8
+ inputs (forward + backward) and compares all outputs and input gradients.
9
+
10
+ Self-contained: the reference RoPE implementation lives in this file (no
11
+ upstream model code dependency).
12
+ """
13
+
14
+ import pytest
15
+ import torch
16
+
17
+ import activation
18
+
19
+ from .utils import assert_close
20
+
21
+
22
# Realistic motif3_seq per-GPU shapes (B=local_batch_size, H_q/H_kv per MLA spec).
SHAPES = [
    # (B, S, H_q, H_kv, D_nope, D_rope, D_v)
    (8, 4096, 80, 16, 128, 64, 128),
]
# bf16 only — the dtype the kernels are tuned for (see module docstring).
DTYPES = [torch.bfloat16]
SEEDS = [0]
29
+
30
+
31
+ # ------------------------------------------------------------------ reference
32
+
33
+ def _precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
34
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
35
+ t = torch.arange(end, dtype=torch.float32)
36
+ freqs = torch.outer(t, freqs)
37
+ return torch.polar(torch.ones_like(freqs), freqs) # complex64
38
+
39
+
40
+ def _apply_rotary_emb_single(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
41
+ """[B, S, H, D] interleaved → rotated, in interleaved layout."""
42
+ x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
43
+ freqs_cis = freqs_cis[: x_.shape[1]].view(1, x_.shape[1], 1, x_.shape[3])
44
+ out = torch.view_as_real(x_ * freqs_cis).flatten(3)
45
+ return out.type_as(x)
46
+
47
+
48
+ def _reorder_headdim_elements_rope(qk: torch.Tensor, B: int, S: int, rope_dim: int) -> torch.Tensor:
49
+ """Interleaved [r0,i0,r1,i1,...] → contiguous [r0,r1,...,i0,i1,...]."""
50
+ qk = qk.view(B, S, -1, rope_dim // 2, 2)
51
+ qk = qk.transpose(3, 4)
52
+ return qk.reshape(B, S, -1, rope_dim)
53
+
54
+
55
def vanilla_path(q, kv_latent, k_pe, freqs_cis, B, S, H_kv, D_nope, D_rope, D_v):
    """Pure-PyTorch reference; returns (q_total, k_full, v)."""
    # Q: rope only the trailing D_rope lanes, then interleaved → contiguous.
    q_nope, q_pe = torch.split(q, [D_nope, D_rope], dim=-1)
    q_pe = _apply_rotary_emb_single(q_pe, freqs_cis)
    q_pe = _reorder_headdim_elements_rope(q_pe, B, S, D_rope)
    q_total = torch.cat([q_nope, q_pe], dim=-1)
    # k_pe (head-shared, H=1)
    k_pe_4d = k_pe.unsqueeze(2)
    k_pe_roped = _apply_rotary_emb_single(k_pe_4d, freqs_cis)
    k_pe_roped = _reorder_headdim_elements_rope(k_pe_roped, B, S, D_rope)
    # KV split + expand + cat (expand = zero-copy broadcast to H_kv heads)
    k_nope, v = torch.split(kv_latent, [D_nope, D_v], dim=-1)
    k_full = torch.cat([k_nope, k_pe_roped.expand(-1, -1, H_kv, -1)], dim=-1)
    return q_total, k_full, v
69
+
70
+
71
def fused_path(q, kv_latent, k_pe, freqs_cis, B, S, H_kv, D_nope, D_rope, D_v):
    """Fused path under test; must match vanilla_path's (q_total, k_full, v)."""
    q_total = activation.fused_q_rope_inplace(q, freqs_cis, D_nope, D_rope)
    # k_pe RoPE stays PyTorch native (head-shared; standalone Triton kernel was
    # launch-bound on B200, no measurable win — see PR #22).
    k_pe_4d = k_pe.unsqueeze(2)
    k_pe_roped = _apply_rotary_emb_single(k_pe_4d, freqs_cis)
    k_pe_roped = _reorder_headdim_elements_rope(k_pe_roped, B, S, D_rope)
    # Fused split + head-broadcast + cat in one kernel.
    k_full, v = activation.fused_kv_split_rope_cat(
        kv_latent, k_pe_roped, D_nope, D_v, D_rope
    )
    return q_total, k_full, v
82
+
83
+
84
+ # ------------------------------------------------------------------ harness
85
+
86
def _run_with_grad(path_fn, q, kv_latent, k_pe, freqs_cis, **shape_kwargs):
    """Run one path forward + backward on fresh leaf copies of the inputs.

    Returns (q_total, k_full, v, grad_q, grad_kv_latent, grad_k_pe), all
    detached, so vanilla and fused runs can be compared tensor-by-tensor.
    """
    # Inputs come in as leaves; thread through a no-op so the in-place fused_q
    # kernel sees a non-leaf (mirrors the real model where q is a Linear output).
    q_leaf, kv_leaf, kpe_leaf = (
        q.clone().detach().requires_grad_(True),
        kv_latent.clone().detach().requires_grad_(True),
        k_pe.clone().detach().requires_grad_(True),
    )
    q_in, kv_in, kpe_in = q_leaf * 1.0, kv_leaf * 1.0, kpe_leaf * 1.0

    q_total, k_full, v = path_fn(q_in, kv_in, kpe_in, freqs_cis, **shape_kwargs)
    # Scalar loss over all three outputs so every input receives gradient;
    # .float() keeps the reduction in fp32.
    loss = (q_total.float() ** 2).sum() + (k_full.float() ** 2).sum() + (v.float() ** 2).sum()
    loss.backward()

    return (
        q_total.detach(), k_full.detach(), v.detach(),
        q_leaf.grad.detach(), kv_leaf.grad.detach(), kpe_leaf.grad.detach(),
    )
104
+
105
+
106
+ # ------------------------------------------------------------------ test
107
+
108
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
def test_mla_rope_fused_vs_reference(shape, dtype, seed):
    """Forward + backward parity: fused Triton path vs PyTorch reference."""
    B, S, H_q, H_kv, D_nope, D_rope, D_v = shape
    D_qk = D_nope + D_rope
    device = "cuda"

    torch.manual_seed(seed)
    freqs_cis = _precompute_freqs_cis(D_rope, S).to(device)

    # 0.5 scale keeps magnitudes modest for bf16.
    q = (torch.randn(B, S, H_q, D_qk, device=device, dtype=dtype) * 0.5)
    kv_latent = (torch.randn(B, S, H_kv, D_nope + D_v, device=device, dtype=dtype) * 0.5)
    k_pe = (torch.randn(B, S, D_rope, device=device, dtype=dtype) * 0.5)

    kw = dict(B=B, S=S, H_kv=H_kv, D_nope=D_nope, D_rope=D_rope, D_v=D_v)
    van_q, van_k, van_v, van_gq, van_gkv, van_gkpe = _run_with_grad(
        vanilla_path, q, kv_latent, k_pe, freqs_cis, **kw
    )
    our_q, our_k, our_v, our_gq, our_gkv, our_gkpe = _run_with_grad(
        fused_path, q, kv_latent, k_pe, freqs_cis, **kw
    )

    # Forward outputs: small bf16 jitter expected on the q rope rotation
    # (Triton fp32 accum vs inductor fp32 complex_mul order).
    assert_close(our_q.float(), van_q.float(), atol=1e-2, rtol=1e-2)
    # KV path is bit-exact (just slice + register broadcast + store).
    assert_close(our_k.float(), van_k.float(), atol=0.0, rtol=0.0)
    assert_close(our_v.float(), van_v.float(), atol=0.0, rtol=0.0)

    # Input grads.
    assert_close(our_gq.float(), van_gq.float(), atol=1e-2, rtol=1e-2)
    assert_close(our_gkv.float(), van_gkv.float(), atol=0.0, rtol=0.0)
    assert_close(our_gkpe.float(), van_gkpe.float(), atol=0.0, rtol=0.0)
torch-ext/activation/__init__.py CHANGED
@@ -2,6 +2,10 @@ import torch
2
 
3
  from . import layers, parallel_style
4
  from ._ops import ops
 
 
 
 
5
  from .grouped_poly_norm import fused_mul_grouped_poly_norm
6
  from .poly_norm import FusedMulPolyNormFunction, PolyNormFunction
7
  from .rms_norm import FusedAddRMSNormFunction, RMSNormFunction
@@ -50,6 +54,8 @@ __all__ = [
50
  "fused_mul_grouped_poly_norm",
51
  "rms_norm",
52
  "fused_add_rms_norm",
 
 
53
  "layers",
54
  "parallel_style",
55
  "ops",
 
2
 
3
  from . import layers, parallel_style
4
  from ._ops import ops
5
+ from .fused_rope import (
6
+ fused_kv_split_rope_cat,
7
+ fused_q_rope_inplace,
8
+ )
9
  from .grouped_poly_norm import fused_mul_grouped_poly_norm
10
  from .poly_norm import FusedMulPolyNormFunction, PolyNormFunction
11
  from .rms_norm import FusedAddRMSNormFunction, RMSNormFunction
 
54
  "fused_mul_grouped_poly_norm",
55
  "rms_norm",
56
  "fused_add_rms_norm",
57
+ "fused_q_rope_inplace",
58
+ "fused_kv_split_rope_cat",
59
  "layers",
60
  "parallel_style",
61
  "ops",
torch-ext/activation/fused_rope.py ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fused MLA RoPE kernels for Motif3 GDLAttention.
2
+
3
+ Applies RoPE to the input tensor and outputs in contiguous format
4
+ [real..., imag...] so no reorder_headdim_elements_rope is needed.
5
+
6
+ Registered as torch custom_op for torch.compile compatibility (no graph break).
7
+
8
+ Reference: Megatron-LM fused_mla_yarn_rope_apply.py
9
+ """
10
+
11
+ import torch
12
+ import triton
13
+ import triton.language as tl
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Static configs per kernel — no autotune.
17
+ #
18
+ # Picked from a full-sweep dump (TRITON_PRINT_AUTOTUNING=1) on motif3_seq,
19
+ # B=8, S=4096, n_layers=4, 8×B200, EP=8 (plurality across 8 ranks; see
20
+ # PR description for details). These are the shapes the kernels see at
21
+ # the current prod workload; hard-coding avoids autotune warm-up cost and
22
+ # tie-break jitter between near-equivalent configs.
23
+ #
24
+ # If model config changes materially (different H, D, batch-size/seq-len
25
+ # regime), re-run the dump and update these. Autotune key used to be
26
+ # (rope_dim, head_num); same shape invariants still apply at runtime.
27
+ # ---------------------------------------------------------------------------
28
+
29
# BLOCK_H, num_warps, num_stages per kernel. H values denote head_num
# (q is per-head H=80, kv fused expands head-shared k_pe to H_kv=16 in motif3_seq).
# NOTE(review): the kv fwd kernel iterates kv_latent's H_kv=16 heads (the bwd
# line below and the MLA case both use H_kv=16), so its "H=80" label looked
# like a typo — corrected to H_kv=16. BLOCK_H=32 > 16 is legal (tail masked).
_CFG_KV_ROPE_FWD = dict(BLOCK_H=32, num_warps=8, num_stages=2)  # kv fused, H_kv=16
_CFG_KV_ROPE_BWD = dict(BLOCK_H=16, num_warps=4, num_stages=2)  # kv bwd; BLOCK_H must cover H_kv=16 (single program per token)
_CFG_Q_ROPE_INPLACE_FWD = dict(BLOCK_H=8, num_warps=1, num_stages=2)  # q in-place, H=80
_CFG_Q_ROPE_BWD = dict(BLOCK_H=4, num_warps=2, num_stages=2)  # q bwd, H=80
35
+
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Phase 2: Fused KV split + RoPE + expand + cat kernel
40
+ # Fuses: split(kv_latent, [k_dim, v_dim]) + RoPE(k_pe) + expand + cat → key, value
41
+ # Reference: Megatron rotary_fwd_kv_kernel
42
+ # ---------------------------------------------------------------------------
43
+
44
@triton.jit
def _kv_rope_fwd_kernel(
    KV,       # [B*S, H, k_nope_dim + v_dim]
    K_PE,     # [B*S, 1, rope_dim] (already RoPE'd, contiguous format)
    O_KEY,    # [B*S, H, k_nope_dim + rope_dim] output
    O_VALUE,  # [B*S, H, v_dim] output
    rope_dim: tl.constexpr,
    k_nope_dim: tl.constexpr,
    v_dim: tl.constexpr,
    head_num: tl.constexpr,
    stride_kv_token,
    stride_kv_head,
    stride_pe_token,
    stride_k_token,
    stride_k_head,
    stride_v_token,
    stride_v_head,
    BLOCK_H: tl.constexpr,
):
    """key = [kv_latent nope | k_pe broadcast over heads]; value = kv_latent v.

    Grid: (num_tokens, cdiv(head_num, BLOCK_H)). Pure data movement — the
    RoPE itself was applied to K_PE before the launch.
    """
    pid_token = tl.program_id(0)
    pid_hblock = tl.program_id(1)

    KV_ptr = KV + pid_token * stride_kv_token + pid_hblock * BLOCK_H * stride_kv_head
    K_ptr = O_KEY + pid_token * stride_k_token + pid_hblock * BLOCK_H * stride_k_head
    V_ptr = O_VALUE + pid_token * stride_v_token + pid_hblock * BLOCK_H * stride_v_head

    h_off = tl.arange(0, BLOCK_H)[:, None]
    # Guard the tail head-block when head_num is not a multiple of BLOCK_H.
    mask = (pid_hblock * BLOCK_H + h_off) < head_num

    # Read k_nope from KV
    k_nope_off = h_off * stride_kv_head + tl.arange(0, k_nope_dim)[None, :]
    k_nope = tl.load(KV_ptr + k_nope_off, mask=mask)

    # Read v from KV
    v_off = h_off * stride_kv_head + k_nope_dim + tl.arange(0, v_dim)[None, :]
    v = tl.load(KV_ptr + v_off, mask=mask)

    # Read k_pe (shared across all heads, already RoPE'd)
    # K_PE is [B*S, 1, rope_dim], broadcast to all heads
    pe_ptr = K_PE + pid_token * stride_pe_token
    k_pe = tl.load(pe_ptr + tl.arange(0, rope_dim)[None, :])
    k_pe = k_pe.broadcast_to(BLOCK_H, rope_dim)  # register-level broadcast

    # Write key = [k_nope | k_pe]
    k_nope_out = h_off * stride_k_head + tl.arange(0, k_nope_dim)[None, :]
    tl.store(K_ptr + k_nope_out, k_nope, mask=mask)
    k_pe_out = h_off * stride_k_head + k_nope_dim + tl.arange(0, rope_dim)[None, :]
    tl.store(K_ptr + k_pe_out, k_pe, mask=mask)

    # Write value
    v_out = h_off * stride_v_head + tl.arange(0, v_dim)[None, :]
    tl.store(V_ptr + v_out, v, mask=mask)
96
+
97
+
98
@triton.jit
def _kv_rope_bwd_kernel(
    DO_KEY,    # [B*S, H, k_nope_dim + rope_dim] grad_key  (in)
    DO_VALUE,  # [B*S, H, v_dim]                 grad_value(in)
    DKV,       # [B*S, H, k_nope_dim + v_dim]    grad_kv_latent (out)
    DKPE,      # [B*S, 1, rope_dim]              grad_k_pe (out, bf16)
    rope_dim: tl.constexpr,
    k_nope_dim: tl.constexpr,
    v_dim: tl.constexpr,
    head_num: tl.constexpr,
    stride_dk_token, stride_dk_head,
    stride_dv_token, stride_dv_head,
    stride_dkv_token, stride_dkv_head,
    stride_dkpe_token,
    BLOCK_H: tl.constexpr,
):
    """Reverse of _kv_rope_fwd_kernel.
    Forward did: key_nope = slice(kv_latent), key_rope = broadcast(k_pe, H),
    value = slice(kv_latent).
    Backward therefore: grad_kv_latent_nope = grad_key_nope, grad_kv_latent_v = grad_value,
    grad_k_pe = sum(grad_key_rope, dim=H).
    Single program per token; BLOCK_H must cover head_num (H_kv=16 in MLA).
    """
    pid_token = tl.program_id(0)

    DK_ptr = DO_KEY + pid_token * stride_dk_token
    DV_ptr = DO_VALUE + pid_token * stride_dv_token
    DKV_ptr = DKV + pid_token * stride_dkv_token

    h_off = tl.arange(0, BLOCK_H)[:, None]
    mask = h_off < head_num

    # 1. grad_key nope → grad_kv_latent nope section
    nope_in = h_off * stride_dk_head + tl.arange(0, k_nope_dim)[None, :]
    nope_out = h_off * stride_dkv_head + tl.arange(0, k_nope_dim)[None, :]
    nope_data = tl.load(DK_ptr + nope_in, mask=mask)
    tl.store(DKV_ptr + nope_out, nope_data, mask=mask)

    # 2. grad_value → grad_kv_latent v section
    v_in = h_off * stride_dv_head + tl.arange(0, v_dim)[None, :]
    v_out = h_off * stride_dkv_head + k_nope_dim + tl.arange(0, v_dim)[None, :]
    v_data = tl.load(DV_ptr + v_in, mask=mask)
    tl.store(DKV_ptr + v_out, v_data, mask=mask)

    # 3. grad_k_pe = sum over H of grad_key rope section
    # Accumulate in fp32 for precision across H heads.
    # other=0.0 so masked (past-head_num) lanes don't pollute the sum.
    rope_in = h_off * stride_dk_head + k_nope_dim + tl.arange(0, rope_dim)[None, :]
    rope_grads = tl.load(DK_ptr + rope_in, mask=mask, other=0.0).to(tl.float32)
    summed = tl.sum(rope_grads, axis=0)  # [rope_dim], fp32
    pe_out = pid_token * stride_dkpe_token + tl.arange(0, rope_dim)
    tl.store(DKPE + pe_out, summed.to(DKPE.dtype.element_ty))
149
+
150
+
151
@torch.library.custom_op("motif::kv_rope_fwd", mutates_args=())
def _kv_rope_fwd(
    kv_latent: torch.Tensor,  # [B*S, H, k_nope_dim + v_dim]
    k_pe: torch.Tensor,       # [B*S, 1, rope_dim] (already RoPE'd)
    k_nope_dim: int,
    v_dim: int,
    rope_dim: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Launch the fused split + head-broadcast + cat forward kernel.

    Returns (key [B*S, H, k_nope_dim + rope_dim], value [B*S, H, v_dim]).
    Registered as a custom op so torch.compile traces it without a graph break.
    """
    assert kv_latent.stride(-1) == 1 and k_pe.stride(-1) == 1, (
        "fused_rope kernel requires last-dim unit stride"
    )
    # MLA convention: k_pe is head-shared (single head broadcast to all Q heads
    # via register-level broadcast inside the kernel). GQA/MQA variants that
    # give k_pe its own head dim are not supported by this kernel.
    assert k_pe.shape[1] == 1, f"k_pe must be head-shared (shape[1]==1), got {k_pe.shape}"
    B_S, H, _ = kv_latent.shape
    key = kv_latent.new_empty(B_S, H, k_nope_dim + rope_dim)
    value = kv_latent.new_empty(B_S, H, v_dim)
    # 2D grid: one program per (token, head-block).
    grid = lambda META: (B_S, triton.cdiv(H, META["BLOCK_H"]))
    _kv_rope_fwd_kernel[grid](
        kv_latent, k_pe, key, value,
        rope_dim, k_nope_dim, v_dim, H,
        kv_latent.stride(0), kv_latent.stride(1),
        k_pe.stride(0),
        key.stride(0), key.stride(1),
        value.stride(0), value.stride(1),
        **_CFG_KV_ROPE_FWD,
    )
    return key, value
180
+
181
+
182
@_kv_rope_fwd.register_fake
def _kv_rope_fwd_fake(kv_latent, k_pe, k_nope_dim, v_dim, rope_dim):
    # Shape/dtype-only implementation for meta tracing (torch.compile / FX).
    B_S, H, _ = kv_latent.shape
    key = kv_latent.new_empty(B_S, H, k_nope_dim + rope_dim)
    value = kv_latent.new_empty(B_S, H, v_dim)
    return key, value
188
+
189
+
190
@torch.library.custom_op("motif::kv_rope_bwd", mutates_args=())
def _kv_rope_bwd(
    grad_key: torch.Tensor,    # [B*S, H, k_nope_dim + rope_dim]
    grad_value: torch.Tensor,  # [B*S, H, v_dim]
    k_nope_dim: int,
    v_dim: int,
    rope_dim: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Launch the fused backward kernel.

    Returns (grad_kv_latent [B*S, H, k_nope_dim + v_dim],
    grad_k_pe [B*S, 1, rope_dim] = head-sum of grad_key's rope section).
    """
    assert grad_key.stride(-1) == 1 and grad_value.stride(-1) == 1, (
        "fused_rope kernel requires last-dim unit stride"
    )
    B_S, H, _ = grad_key.shape
    # grad_kv_latent layout matches forward input: [nope | v]
    grad_kv_latent = grad_key.new_empty(B_S, H, k_nope_dim + v_dim)
    # grad_k_pe: head-shared, matches forward input shape
    grad_k_pe = grad_key.new_empty(B_S, 1, rope_dim)
    # Single program per token; BLOCK_H (=16) must be >= H.
    grid = (B_S,)
    _kv_rope_bwd_kernel[grid](
        grad_key, grad_value, grad_kv_latent, grad_k_pe,
        rope_dim, k_nope_dim, v_dim, H,
        grad_key.stride(0), grad_key.stride(1),
        grad_value.stride(0), grad_value.stride(1),
        grad_kv_latent.stride(0), grad_kv_latent.stride(1),
        grad_k_pe.stride(0),
        **_CFG_KV_ROPE_BWD,
    )
    return grad_kv_latent, grad_k_pe
218
+
219
+
220
@_kv_rope_bwd.register_fake
def _kv_rope_bwd_fake(grad_key, grad_value, k_nope_dim, v_dim, rope_dim):
    # Shape/dtype-only implementation for meta tracing (torch.compile / FX).
    B_S, H, _ = grad_key.shape
    return (
        grad_key.new_empty(B_S, H, k_nope_dim + v_dim),
        grad_key.new_empty(B_S, 1, rope_dim),
    )
227
+
228
+
229
class FusedKVRoPE(torch.autograd.Function):
    """Autograd wrapper: flattens (B, S) into tokens and dispatches to the
    fused kv custom ops for forward and backward."""

    @staticmethod
    def forward(ctx, kv_latent, k_pe, k_nope_dim, v_dim, rope_dim):
        # kv_latent: [B, S, H, k_nope_dim + v_dim]
        # k_pe: [B, S, 1, rope_dim] (already RoPE'd)
        B, S, H, D = kv_latent.shape
        key_3d, value_3d = _kv_rope_fwd(
            kv_latent.reshape(B * S, H, D),
            k_pe.reshape(B * S, 1, rope_dim),
            k_nope_dim, v_dim, rope_dim,
        )
        # Backward needs only dims/shape — no tensors saved.
        ctx.k_nope_dim = k_nope_dim
        ctx.v_dim = v_dim
        ctx.rope_dim = rope_dim
        ctx.shape = (B, S, H)
        return key_3d.view(B, S, H, k_nope_dim + rope_dim), value_3d.view(B, S, H, v_dim)

    @staticmethod
    def backward(ctx, grad_key, grad_value):
        B, S, H = ctx.shape
        k_nope_dim = ctx.k_nope_dim
        v_dim = ctx.v_dim
        rope_dim = ctx.rope_dim

        # Single Triton kernel does: nope copy + v copy + head-sum of rope section.
        # Replaces (slice + cat + sum) inductor path.
        grad_kv_latent_3d, grad_k_pe_3d = _kv_rope_bwd(
            grad_key.contiguous().reshape(B * S, H, k_nope_dim + rope_dim),
            grad_value.contiguous().reshape(B * S, H, v_dim),
            k_nope_dim, v_dim, rope_dim,
        )
        grad_kv_latent = grad_kv_latent_3d.view(B, S, H, k_nope_dim + v_dim)
        grad_k_pe = grad_k_pe_3d.view(B, S, 1, rope_dim)
        # Nones for the three int args (k_nope_dim, v_dim, rope_dim).
        return grad_kv_latent, grad_k_pe, None, None, None
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # Q RoPE backward kernel — used by FusedQRoPEInplace.backward (below).
267
+ # Out-of-place: reads contiguous grad_out, writes interleaved grad_in; nope
268
+ # gradient is copied through unchanged.
269
+ # ---------------------------------------------------------------------------
270
+
271
@triton.jit
def _q_rope_bwd_kernel(
    DO,   # [B*S, H, nope_dim + rope_dim] grad (contiguous rope fmt)
    DQ,   # [B*S, H, nope_dim + rope_dim] grad output (interleaved rope fmt)
    COS,  # [S, rope_dim // 2]
    SIN,  # [S, rope_dim // 2]
    nope_dim: tl.constexpr,
    rope_dim: tl.constexpr,
    head_num: tl.constexpr,
    seq_len,
    stride_do_token,
    stride_do_head,
    stride_dq_token,
    stride_dq_head,
    BLOCK_H: tl.constexpr,
):
    """Adjoint of the q RoPE: inverse-rotate the rope section's gradient.

    Per pair with angle θ: dx1 = d_real·cosθ + d_imag·sinθ,
    dx2 = −d_real·sinθ + d_imag·cosθ, read from contiguous layout and
    stored interleaved (stride-2). The nope grad is copied through as-is.
    """
    HALF: tl.constexpr = rope_dim // 2

    pid_token = tl.program_id(0)
    pid_hblock = tl.program_id(1)

    # Tokens are flattened B*S; mod recovers the in-sequence position.
    pos = pid_token % seq_len

    DO_ptr = DO + pid_token * stride_do_token + pid_hblock * BLOCK_H * stride_do_head
    DQ_ptr = DQ + pid_token * stride_dq_token + pid_hblock * BLOCK_H * stride_dq_head

    h_off = tl.arange(0, BLOCK_H)[:, None]
    mask = (pid_hblock * BLOCK_H + h_off) < head_num

    # Copy nope grad as-is
    nope_off_in = h_off * stride_do_head + tl.arange(0, nope_dim)[None, :]
    nope_grad = tl.load(DO_ptr + nope_off_in, mask=mask)
    nope_off_out = h_off * stride_dq_head + tl.arange(0, nope_dim)[None, :]
    tl.store(DQ_ptr + nope_off_out, nope_grad, mask=mask)

    # Inverse RoPE: contiguous → interleaved
    cos = tl.load(COS + pos * HALF + tl.arange(0, HALF))
    sin = tl.load(SIN + pos * HALF + tl.arange(0, HALF))
    cos = cos.expand_dims(0).broadcast_to(BLOCK_H, HALF)
    sin = sin.expand_dims(0).broadcast_to(BLOCK_H, HALF)

    # Contiguous layout: [real... | imag...] halves of the rope section.
    real_off = h_off * stride_do_head + nope_dim + tl.arange(0, HALF)[None, :]
    imag_off = real_off + HALF
    d_real = tl.load(DO_ptr + real_off, mask=mask).to(tl.float32)
    d_imag = tl.load(DO_ptr + imag_off, mask=mask).to(tl.float32)

    # fp32 math; implicit downcast to DQ's dtype on store.
    dx1 = d_real * cos + d_imag * sin
    dx2 = -d_real * sin + d_imag * cos

    # Interleaved layout: pair k lives at offsets (2k, 2k+1).
    x1_off = h_off * stride_dq_head + nope_dim + tl.arange(0, HALF)[None, :] * 2
    x2_off = x1_off + 1
    tl.store(DQ_ptr + x1_off, dx1, mask=mask)
    tl.store(DQ_ptr + x2_off, dx2, mask=mask)
325
+
326
@torch.library.custom_op("motif::q_rope_bwd", mutates_args=())
def _q_rope_bwd(
    grad_out: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    nope_dim: int,
    rope_dim: int,
    seq_len: int,
) -> torch.Tensor:
    """Out-of-place Q RoPE backward launcher.

    Takes `grad_out` with its rope section in contiguous format and returns a
    fresh tensor of the same shape whose rope section is in interleaved format
    (matching the Q input layout); the nope section's grad passes through.
    """
    assert grad_out.stride(-1) == 1, "fused_rope kernel requires last-dim unit stride"
    num_tokens, num_heads, _ = grad_out.shape
    grad_in = torch.empty_like(grad_out)

    # One program per (token, head-block); BLOCK_H comes from the config dict.
    def grid(meta):
        return (num_tokens, triton.cdiv(num_heads, meta["BLOCK_H"]))

    _q_rope_bwd_kernel[grid](
        grad_out,
        grad_in,
        cos,
        sin,
        nope_dim,
        rope_dim,
        num_heads,
        seq_len,
        grad_out.stride(0),
        grad_out.stride(1),
        grad_in.stride(0),
        grad_in.stride(1),
        **_CFG_Q_ROPE_BWD,
    )
    return grad_in
347
+
348
+
349
@_q_rope_bwd.register_fake
def _q_rope_bwd_fake(grad_out, cos, sin, nope_dim, rope_dim, seq_len):
    # Fake (meta) implementation for tracing/compile: the real op returns a
    # new tensor with grad_out's shape/dtype/device.
    return torch.empty_like(grad_out)
352
+
353
+
354
# ---------------------------------------------------------------------------
# In-place Q RoPE kernel (eliminates cat + nope copy)
# Modifies q[..., nope_dim:] from interleaved → contiguous format IN-PLACE.
# nope section [..., :nope_dim] is untouched.
# Forward: in-place on Q (mutates)
# Backward: out-of-place via existing _q_rope_bwd_kernel (nope grad copy + inverse rope)
# ---------------------------------------------------------------------------

@triton.jit
def _q_rope_inplace_fwd_kernel(
    Q,    # [B*S, H, nope_dim + rope_dim] modified in-place on [..., nope_dim:]
    COS,  # [S, rope_dim // 2]
    SIN,  # [S, rope_dim // 2]
    nope_dim: tl.constexpr,
    rope_dim: tl.constexpr,
    head_num: tl.constexpr,
    seq_len,
    stride_q_token,
    stride_q_head,
    BLOCK_H: tl.constexpr,
):
    # Number of (real, imag) pairs per head.
    HALF: tl.constexpr = rope_dim // 2

    # Grid: axis 0 = flattened token index over B*S, axis 1 = head block.
    pid_token = tl.program_id(0)
    pid_hblock = tl.program_id(1)

    # Position within the sequence (token axis is B*S flattened, hence mod S).
    pos = pid_token % seq_len
    cos = tl.load(COS + pos * HALF + tl.arange(0, HALF))
    sin = tl.load(SIN + pos * HALF + tl.arange(0, HALF))
    cos = cos.expand_dims(0).broadcast_to(BLOCK_H, HALF)
    sin = sin.expand_dims(0).broadcast_to(BLOCK_H, HALF)

    Q_ptr = Q + pid_token * stride_q_token + pid_hblock * BLOCK_H * stride_q_head

    h_off = tl.arange(0, BLOCK_H)[:, None]
    # Mask off heads beyond head_num in the last (partial) head block.
    mask = (pid_hblock * BLOCK_H + h_off) < head_num

    # Read rope section interleaved: [r0,i0,r1,i1,...]
    x1_off = h_off * stride_q_head + nope_dim + tl.arange(0, HALF)[None, :] * 2
    x2_off = x1_off + 1
    x1 = tl.load(Q_ptr + x1_off, mask=mask).to(tl.float32)
    x2 = tl.load(Q_ptr + x2_off, mask=mask).to(tl.float32)

    # Rotation by +pos; computed in fp32 for accuracy.
    out_real = x1 * cos - x2 * sin
    out_imag = x1 * sin + x2 * cos

    # Write back to SAME rope section in contiguous format: [r0..r31, i0..i31]
    # In-place is safe here: each program reads and writes only its own
    # (token, head-block) slice of the rope section before storing.
    real_off = h_off * stride_q_head + nope_dim + tl.arange(0, HALF)[None, :]
    imag_off = real_off + HALF
    tl.store(Q_ptr + real_off, out_real, mask=mask)
    tl.store(Q_ptr + imag_off, out_imag, mask=mask)
405
+
406
+
407
@torch.library.custom_op("motif::q_rope_inplace_fwd", mutates_args=("q",))
def _q_rope_inplace_fwd(
    q: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    nope_dim: int,
    rope_dim: int,
    seq_len: int,
) -> None:
    """In-place Q RoPE forward launcher.

    Mutates `q` on [..., nope_dim:] (interleaved → contiguous rope format);
    returns nothing by design, as declared via `mutates_args=("q",)` on the
    custom_op.
    """
    assert q.stride(-1) == 1, "fused_rope kernel requires last-dim unit stride"
    num_tokens, num_heads, _ = q.shape

    # One program per (token, head-block); BLOCK_H comes from the config dict.
    def grid(meta):
        return (num_tokens, triton.cdiv(num_heads, meta["BLOCK_H"]))

    _q_rope_inplace_fwd_kernel[grid](
        q,
        cos,
        sin,
        nope_dim,
        rope_dim,
        num_heads,
        seq_len,
        q.stride(0),
        q.stride(1),
        **_CFG_Q_ROPE_INPLACE_FWD,
    )
427
+
428
+
429
class FusedQRoPEInplace(torch.autograd.Function):
    """Autograd wrapper around the in-place Q RoPE kernel.

    Forward mutates q[..., nope_dim:] in place (interleaved → contiguous rope
    format); backward is out-of-place via `_q_rope_bwd`.
    """

    @staticmethod
    def forward(ctx, q, cos, sin, nope_dim, rope_dim, seq_len):
        # q: [B, S, H, nope_dim + rope_dim] mutated in-place on [..., nope_dim:]
        B, S, H, D = q.shape
        assert D == nope_dim + rope_dim
        # Require full contiguity so that `reshape(B*S, H, D)` is guaranteed to
        # be a view — otherwise reshape silently copies and the in-place mutation
        # would not reach the original `q` that `ctx.mark_dirty` targets.
        assert q.is_contiguous(), "FusedQRoPEInplace requires contiguous q"
        _q_rope_inplace_fwd(q.reshape(B * S, H, D), cos, sin, nope_dim, rope_dim, seq_len)
        ctx.mark_dirty(q)
        ctx.save_for_backward(cos, sin)
        ctx.nope_dim = nope_dim
        ctx.rope_dim = rope_dim
        ctx.seq_len = seq_len
        ctx.shape = (B, S, H, D)
        # Return the mutated `q` itself (not a new tensor) so autograd edges
        # flow through; the underlying op is declared `-> None` (in-place).
        return q

    @staticmethod
    def backward(ctx, grad_out):
        # grad_out: [B, S, H, nope_dim + rope_dim] rope section in contiguous grad fmt
        # Produce grad_in: same shape, rope section in interleaved grad fmt (matches Q input layout)
        cos, sin = ctx.saved_tensors
        B, S, H, D = ctx.shape
        # Reuse existing Q backward kernel (copies nope grad + inverse-ropes pe section)
        grad_in_3d = _q_rope_bwd(
            grad_out.contiguous().reshape(B * S, H, D),
            cos, sin, ctx.nope_dim, ctx.rope_dim, ctx.seq_len,
        )
        # Gradient only for `q`; cos/sin and the three int args get None.
        return grad_in_3d.view(B, S, H, D), None, None, None, None, None
462
+
463
+
464
def fused_q_rope_inplace(q, freqs_cis, nope_dim, rope_dim):
    """In-place fused Q RoPE. Modifies q[..., nope_dim:] from interleaved → contiguous format.

    Equivalent to — and replacing — the three-step sequence:
        q_nope, q_pe = split(q, [nope_dim, rope_dim])
        q_pe = fused_apply_rope(q_pe, freqs_cis)
        q_total = cat([q_nope, q_pe])

    Saves the cat copy (~415 µs/layer for Motif3) compared to out-of-place variants.

    Args:
        q: [B, S, H, nope_dim + rope_dim] from wq_b output. Will be mutated.
        freqs_cis: [max_seq_len, rope_dim//2] complex64

    Returns:
        Same tensor `q`, now with rope section in contiguous format.
    """
    seq_len = q.shape[1]
    # Split the complex frequencies into real cos/sin tables for the kernel.
    freqs = freqs_cis[:seq_len]
    cos, sin = freqs.real.contiguous(), freqs.imag.contiguous()
    return FusedQRoPEInplace.apply(q, cos, sin, nope_dim, rope_dim, seq_len)
485
+
486
+
487
def fused_kv_split_rope_cat(kv_latent, k_pe, k_nope_dim, v_dim, rope_dim):
    """Fused KV split + k_pe expand + cat. No graph break.

    Replaces:
        k_nope, v = split(kv_latent, [k_nope_dim, v_dim])
        k_full = cat([k_nope, k_pe.expand(-1,-1,H,-1)])

    Args:
        kv_latent: [B, S, H, k_nope_dim + v_dim]
        k_pe: [B, S, 1, rope_dim] (already RoPE'd, contiguous format)
        k_nope_dim: width of the no-positional-encoding key section.
        v_dim: width of the value section.
        rope_dim: width of the RoPE'd key section shared across heads.

    Returns:
        key: [B, S, H, k_nope_dim + rope_dim]
        value: [B, S, H, v_dim]
    """
    # Delegates to the FusedKVRoPE autograd Function (defined elsewhere in
    # this module).
    return FusedKVRoPE.apply(kv_latent, k_pe, k_nope_dim, v_dim, rope_dim)