kernels-bot committed on
Commit
b3e9a67
·
verified ·
1 Parent(s): 1b1ada2

Uploaded using `kernel-builder`.

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. build/torch-cuda/__init__.py +24 -0
  2. build/torch-cuda/_ops.py +8 -0
  3. build/torch-cuda/_ops_compat.py +10 -0
  4. build/torch-cuda/enums.py +30 -0
  5. build/torch-cuda/functional/__init__.py +554 -0
  6. build/torch-cuda/functional/backward.py +682 -0
  7. build/torch-cuda/functional/forward.py +238 -0
  8. build/torch-cuda/functional/grouped_gemm.py +0 -0
  9. build/torch-cuda/functional/moe_config.py +581 -0
  10. build/torch-cuda/functional/reduction_over_k_gather.py +164 -0
  11. build/torch-cuda/functional/tile_scheduler.py +91 -0
  12. build/torch-cuda/functional/topk_softmax.py +195 -0
  13. build/torch-cuda/functional/triton_kernels/__init__.py +351 -0
  14. build/torch-cuda/functional/triton_kernels/bitmatrix.py +147 -0
  15. build/torch-cuda/functional/utils.py +25 -0
  16. build/torch-cuda/jit.py +159 -0
  17. build/torch-cuda/metadata.json +10 -0
  18. build/torch-cuda/moe.py +368 -0
  19. build/torch-cuda/quack/__init__.py +8 -0
  20. build/torch-cuda/quack/_ops_compat.py +4 -0
  21. build/torch-cuda/quack/activation.py +524 -0
  22. build/torch-cuda/quack/autotuner.py +369 -0
  23. build/torch-cuda/quack/broadcast_utils.py +29 -0
  24. build/torch-cuda/quack/compile_utils.py +19 -0
  25. build/torch-cuda/quack/copy_utils.py +614 -0
  26. build/torch-cuda/quack/cute_dsl_ptxas.py +151 -0
  27. build/torch-cuda/quack/cute_dsl_utils.py +104 -0
  28. build/torch-cuda/quack/fast_math.py +80 -0
  29. build/torch-cuda/quack/gemm.py +194 -0
  30. build/torch-cuda/quack/gemm_act.py +510 -0
  31. build/torch-cuda/quack/gemm_config.py +95 -0
  32. build/torch-cuda/quack/gemm_dact.py +215 -0
  33. build/torch-cuda/quack/gemm_default_epi.py +259 -0
  34. build/torch-cuda/quack/gemm_interface.py +1058 -0
  35. build/torch-cuda/quack/gemm_sm100.py +0 -0
  36. build/torch-cuda/quack/gemm_sm90.py +2070 -0
  37. build/torch-cuda/quack/gemm_symmetric.py +330 -0
  38. build/torch-cuda/quack/gemm_wrapper_utils.py +317 -0
  39. build/torch-cuda/quack/layout_utils.py +295 -0
  40. build/torch-cuda/quack/pipeline.py +324 -0
  41. build/torch-cuda/quack/reduce.py +279 -0
  42. build/torch-cuda/quack/reduction_base.py +83 -0
  43. build/torch-cuda/quack/sm100_utils.py +62 -0
  44. build/torch-cuda/quack/sm90_utils.py +157 -0
  45. build/torch-cuda/quack/sort/__init__.py +1 -0
  46. build/torch-cuda/quack/sort/bitonic_sort.py +129 -0
  47. build/torch-cuda/quack/sort/generate_sorting_networks.py +326 -0
  48. build/torch-cuda/quack/sort/sorting_networks.py +120 -0
  49. build/torch-cuda/quack/sort/utils.py +31 -0
  50. build/torch-cuda/quack/tensormap_manager.py +115 -0
build/torch-cuda/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ from functools import lru_cache
6
+
7
+ __version__ = "0.1.1"
8
+
9
+ from .enums import KernelBackendMoE
10
+
11
+ from .moe import MoE
12
+ from .functional import (
13
+ enable_quack_gemm,
14
+ moe_general_routing_inputs,
15
+ moe_TC_softmax_topk_layer,
16
+ )
17
+
18
+ __all__ = [
19
+ "KernelBackendMoE",
20
+ "MoE",
21
+ "enable_quack_gemm",
22
+ "moe_general_routing_inputs",
23
+ "moe_TC_softmax_topk_layer",
24
+ ]
build/torch-cuda/_ops.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ ops = torch.ops._sonic_moe_57a1b31
3
+
4
+ def add_op_namespace_prefix(op_name: str):
5
+ """
6
+ Prefix op by namespace.
7
+ """
8
+ return f"_sonic_moe_57a1b31::{op_name}"
build/torch-cuda/_ops_compat.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compatibility helpers for op namespacing in source and built layouts."""
2
+
3
+ try:
4
+ from ._ops import add_op_namespace_prefix as _generated_add_op_namespace_prefix
5
+ except ImportError:
6
+ def _generated_add_op_namespace_prefix(name: str) -> str:
7
+ return name if "::" in name else f"sonicmoe::{name}"
8
+
9
+ def add_op_namespace_prefix(name: str) -> str:
10
+ return _generated_add_op_namespace_prefix(name)
build/torch-cuda/enums.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ from enum import Enum
6
+
7
+
8
+ LIBRARY_NAME = "sonicmoe"
9
+ TENSORMAP = "tensormap"
10
+
11
+
12
+ class KernelBackendMoE(Enum):
13
+ scattermoe = "scattermoe"
14
+ torch = "torch"
15
+ sonicmoe = "sonicmoe"
16
+
17
+
18
+ class ActivationType(Enum):
19
+ SWIGLU = "swiglu"
20
+ GEGLU = "geglu"
21
+ REGLU = "reglu"
22
+
23
+ RELU_SQ = "relu_sq"
24
+ RELU = "relu"
25
+ GELU = "gelu"
26
+ SILU = "silu"
27
+
28
+
29
+ def is_glu(activation_type: ActivationType):
30
+ return activation_type in [ActivationType.SWIGLU, ActivationType.REGLU, ActivationType.GEGLU]
build/torch-cuda/functional/__init__.py ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ import os
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from ..quack.gemm_interface import gemm
10
+
11
+ from ..enums import ActivationType, is_glu
12
+ from ..quack_utils import gemm_dgated, gemm_gated
13
+ from .backward import (
14
+ _down_projection_backward_act,
15
+ _down_projection_backward_weight,
16
+ _softmax_topk_bwd,
17
+ _token_broadcast_backward,
18
+ _up_projection_backward_act,
19
+ _up_projection_backward_weight,
20
+ )
21
+ from .forward import _down_projection_forward, _router_forward, _softmax_topk_fwd, _up_projection_forward
22
+ from .triton_kernels import TC_topk_router_metadata_triton, general_routing_router_metadata_triton
23
+ from .utils import enable_quack_gemm, is_using_quack_gemm
24
+
25
+
26
+ class TC_Softmax_Topk_Router_Function(torch.autograd.Function):
27
+ @staticmethod
28
+ def forward(ctx, router_logits: torch.Tensor, E: int, K: int) -> tuple[torch.Tensor, torch.Tensor]:
29
+ T = router_logits.size(0)
30
+
31
+ # change this to router_logits.dtype (bfloat16) increase another 5 tflops at fwd at the cost of numerical accuracy
32
+ topk_router_score = torch.empty(T, K, dtype=torch.float32, device=router_logits.device)
33
+ topk_router_indices = torch.empty(T, K, dtype=torch.int32, device=router_logits.device)
34
+
35
+ _softmax_topk_fwd(router_logits, topk_router_score, topk_router_indices, E, K)
36
+
37
+ ctx.save_for_backward(topk_router_score, topk_router_indices)
38
+ ctx.E = E
39
+ ctx.dtype = router_logits.dtype
40
+
41
+ return topk_router_score, topk_router_indices
42
+
43
+ @staticmethod
44
+ def backward(ctx, dtopk_score: torch.Tensor, _: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
45
+ T, K = dtopk_score.size()
46
+
47
+ topk_router_score, topk_router_indices = ctx.saved_tensors
48
+ dlogits = torch.zeros(T, ctx.E, dtype=ctx.dtype, device=topk_router_score.device)
49
+
50
+ _softmax_topk_bwd(dlogits, None, dtopk_score, topk_router_score, topk_router_indices, K)
51
+
52
+ return dlogits, None, None
53
+
54
+
55
+ class _UpProjection(torch.autograd.Function):
56
+ @staticmethod
57
+ def forward(
58
+ ctx,
59
+ x: torch.Tensor,
60
+ w1: torch.Tensor,
61
+ b1: torch.Tensor | None,
62
+ expert_frequency_offset: torch.Tensor,
63
+ total_expert_freq: int,
64
+ K: int,
65
+ stream_id: int,
66
+ x_gather_idx: torch.Tensor,
67
+ s_scatter_idx: torch.Tensor,
68
+ s_reverse_scatter_idx: torch.Tensor,
69
+ num_activated_expert_per_token_offset: torch.Tensor,
70
+ is_varlen_K: bool,
71
+ activation_type: ActivationType,
72
+ is_inference_mode_enabled: bool,
73
+ ) -> torch.Tensor:
74
+ T, H = x.shape
75
+ I, H, E = w1.shape
76
+ is_glu_activation = is_glu(activation_type)
77
+ if is_glu_activation:
78
+ I //= 2
79
+ TK = total_expert_freq
80
+
81
+ if is_using_quack_gemm():
82
+ assert not torch.compiler.is_compiling()
83
+ assert is_glu_activation, "QuACK GEMM does not support non GLU activation yet"
84
+ z, y1 = gemm_gated(
85
+ x,
86
+ w1.permute(2, 1, 0),
87
+ activation="swiglu",
88
+ cu_seqlens_m=expert_frequency_offset,
89
+ A_idx=x_gather_idx,
90
+ dynamic_scheduler=False,
91
+ )
92
+ else:
93
+ z = torch.empty(TK, (2 * I if is_glu_activation else I), dtype=x.dtype, device=x.device)
94
+ y1 = torch.empty(TK, I, dtype=x.dtype, device=x.device)
95
+ _up_projection_forward(
96
+ x=x,
97
+ w1=w1,
98
+ z=z,
99
+ y1=y1,
100
+ b1=b1,
101
+ expert_frequency_offset=expert_frequency_offset,
102
+ expert_schedule_order=None,
103
+ x_gather_idx=x_gather_idx,
104
+ stream_id=stream_id,
105
+ activation_type=activation_type.value,
106
+ is_glu_activation=is_glu_activation,
107
+ is_inference_mode_enabled=is_inference_mode_enabled,
108
+ )
109
+
110
+ ctx.T = T
111
+ ctx.TK = TK
112
+ ctx.E = E
113
+ ctx.K = K
114
+ ctx.H = H
115
+ ctx.I = I
116
+ ctx.is_varlen_K = is_varlen_K
117
+ ctx.is_glu_activation = is_glu_activation
118
+ ctx.stream_id = stream_id
119
+
120
+ ctx.save_for_backward(
121
+ x,
122
+ w1,
123
+ b1,
124
+ expert_frequency_offset,
125
+ x_gather_idx,
126
+ s_scatter_idx,
127
+ s_reverse_scatter_idx,
128
+ num_activated_expert_per_token_offset,
129
+ )
130
+
131
+ ctx.mark_non_differentiable(y1)
132
+ ctx.set_materialize_grads(False)
133
+
134
+ return y1, z
135
+
136
+ @staticmethod
137
+ def backward(ctx, _: None, dz: torch.Tensor):
138
+ is_compiling = torch.compiler.is_compiling()
139
+
140
+ if not is_compiling:
141
+ assert _ is None
142
+
143
+ T = ctx.T
144
+ TK = ctx.TK
145
+ E = ctx.E
146
+ K = ctx.K
147
+ H = ctx.H
148
+ is_glu_activation = ctx.is_glu_activation
149
+ is_varlen_K = ctx.is_varlen_K
150
+ stream_id = ctx.stream_id
151
+
152
+ (
153
+ x,
154
+ w1,
155
+ b1,
156
+ expert_frequency_offset,
157
+ x_gather_idx,
158
+ s_scatter_idx,
159
+ s_reverse_scatter_idx,
160
+ num_activated_expert_per_token_offset,
161
+ ) = ctx.saved_tensors
162
+
163
+ dw1 = torch.empty_like(w1)
164
+ db1 = None if b1 is None else torch.empty_like(b1)
165
+
166
+ if is_using_quack_gemm():
167
+ assert not is_compiling
168
+
169
+ gemm(
170
+ x.T,
171
+ dz,
172
+ out=dw1.permute(2, 1, 0),
173
+ cu_seqlens_k=expert_frequency_offset,
174
+ A_idx=x_gather_idx,
175
+ batch_idx_permute=None,
176
+ dynamic_scheduler=False,
177
+ )
178
+ dx_expanded = gemm(dz, w1.permute(2, 0, 1), cu_seqlens_m=expert_frequency_offset, dynamic_scheduler=False)
179
+ else:
180
+ dx_expanded = torch.empty(TK, H, dtype=dz.dtype, device=dz.device)
181
+
182
+ _up_projection_backward_act(
183
+ w1=w1,
184
+ dx_expanded=dx_expanded,
185
+ dz=dz,
186
+ db1=db1,
187
+ expert_frequency_offset=expert_frequency_offset,
188
+ expert_schedule_order=None,
189
+ x_gather_idx=x_gather_idx,
190
+ s_scatter_idx=s_scatter_idx,
191
+ is_glu_activation=is_glu_activation,
192
+ stream_id=stream_id,
193
+ )
194
+
195
+ _up_projection_backward_weight(
196
+ x=x,
197
+ dw1=dw1,
198
+ dz=dz,
199
+ expert_frequency_offset=expert_frequency_offset,
200
+ expert_schedule_order=None,
201
+ x_gather_idx=x_gather_idx,
202
+ is_glu_activation=is_glu_activation,
203
+ stream_id=stream_id,
204
+ )
205
+
206
+ dx_reduced = torch.empty(T, H, dtype=dz.dtype, device=dz.device)
207
+
208
+ _token_broadcast_backward(
209
+ dx_reduced=dx_reduced,
210
+ dx_expanded=dx_expanded,
211
+ s_reverse_scatter_idx=s_reverse_scatter_idx,
212
+ num_activated_expert_per_token_offset=num_activated_expert_per_token_offset,
213
+ varlen_K_max=(E if is_varlen_K else K),
214
+ H=H,
215
+ is_varlen_K=is_varlen_K,
216
+ )
217
+
218
+ return dx_reduced, dw1, db1, *[None] * 12
219
+
220
+
221
+ class _DownProjection(torch.autograd.Function):
222
+ @staticmethod
223
+ def forward(
224
+ ctx,
225
+ y1: torch.Tensor,
226
+ z: torch.Tensor,
227
+ w2: torch.Tensor,
228
+ b2: torch.Tensor | None,
229
+ topk_scores: torch.Tensor,
230
+ expert_frequency_offset: torch.Tensor,
231
+ T: int,
232
+ K: int,
233
+ stream_id: int,
234
+ x_gather_idx: torch.Tensor,
235
+ s_scatter_idx: torch.Tensor,
236
+ s_reverse_scatter_idx: torch.Tensor,
237
+ num_activated_expert_per_token_offset: torch.Tensor,
238
+ is_varlen_K: bool,
239
+ activation_type: ActivationType,
240
+ ) -> torch.Tensor:
241
+ TK = y1.size(0)
242
+ H, I, E = w2.shape
243
+
244
+ if is_using_quack_gemm():
245
+ assert not torch.compiler.is_compiling()
246
+
247
+ assert b2 is None
248
+ y2 = gemm(y1, w2.permute(2, 1, 0), cu_seqlens_m=expert_frequency_offset)
249
+ else:
250
+ y2 = torch.empty(TK, H, dtype=y1.dtype, device=y1.device)
251
+ _down_projection_forward(
252
+ w2=w2,
253
+ y1=y1,
254
+ y2=y2,
255
+ b2=b2,
256
+ expert_frequency_offset=expert_frequency_offset,
257
+ expert_schedule_order=None,
258
+ x_gather_idx=x_gather_idx,
259
+ stream_id=stream_id,
260
+ )
261
+
262
+ o = torch.empty(T, H, device=z.device, dtype=z.dtype)
263
+ topk_scores = topk_scores.flatten()
264
+
265
+ _router_forward(
266
+ y2=y2,
267
+ o=o,
268
+ topk_scores=topk_scores,
269
+ s_reverse_scatter_idx=s_reverse_scatter_idx,
270
+ num_activated_expert_per_token_offset=num_activated_expert_per_token_offset,
271
+ varlen_K_max=(E if is_varlen_K else K),
272
+ H=H,
273
+ is_varlen_K=is_varlen_K,
274
+ )
275
+
276
+ ctx.T = T
277
+ ctx.K = K
278
+ ctx.is_varlen_K = is_varlen_K
279
+ ctx.activation_type = activation_type
280
+ ctx.stream_id = stream_id
281
+
282
+ ctx.save_for_backward(
283
+ z,
284
+ w2,
285
+ b2,
286
+ topk_scores,
287
+ expert_frequency_offset,
288
+ x_gather_idx,
289
+ s_scatter_idx,
290
+ s_reverse_scatter_idx,
291
+ )
292
+
293
+ return o
294
+
295
+ @staticmethod
296
+ def backward(ctx, dout: torch.Tensor):
297
+ T = ctx.T
298
+ K = ctx.K
299
+ stream_id = ctx.stream_id
300
+ is_varlen_K = ctx.is_varlen_K
301
+ activation_type = ctx.activation_type
302
+
303
+ (
304
+ z,
305
+ w2,
306
+ b2,
307
+ topk_scores,
308
+ expert_frequency_offset,
309
+ x_gather_idx,
310
+ s_scatter_idx,
311
+ s_reverse_scatter_idx,
312
+ ) = ctx.saved_tensors
313
+
314
+ dw2 = torch.empty_like(w2)
315
+ db2 = None if b2 is None else torch.empty_like(b2)
316
+ dz = torch.empty_like(z)
317
+
318
+ if is_using_quack_gemm():
319
+ assert not torch.compiler.is_compiling()
320
+ assert is_glu(activation_type), "QuACK GEMM does not support non GLU activation yet"
321
+
322
+ s = topk_scores[s_scatter_idx]
323
+ _, y1s, ds = gemm_dgated(
324
+ dout,
325
+ w2.permute(2, 0, 1),
326
+ PreAct=z,
327
+ activation="swiglu",
328
+ dx_out=dz,
329
+ colvec_scale=s,
330
+ colvec_reduce=True,
331
+ cu_seqlens_m=expert_frequency_offset,
332
+ A_idx=x_gather_idx,
333
+ dynamic_scheduler=False,
334
+ )
335
+ gemm(
336
+ dout.T,
337
+ y1s,
338
+ out=dw2.permute(2, 0, 1),
339
+ cu_seqlens_k=expert_frequency_offset,
340
+ A_idx=x_gather_idx,
341
+ batch_idx_permute=None,
342
+ dynamic_scheduler=False,
343
+ )
344
+
345
+ ds = ds[s_reverse_scatter_idx]
346
+ else:
347
+ ds = torch.empty_like(topk_scores)
348
+
349
+ I = w2.size(1)
350
+ TK = x_gather_idx.size(0)
351
+
352
+ y1s = torch.empty(TK, I, dtype=z.dtype, device=z.device)
353
+ is_glu_activation = is_glu(activation_type)
354
+
355
+ _down_projection_backward_act(
356
+ dout=dout,
357
+ z=z,
358
+ w2=w2,
359
+ dz=dz,
360
+ ds=ds,
361
+ b2=b2,
362
+ db2=db2,
363
+ y1s=y1s,
364
+ topk_scores=topk_scores,
365
+ expert_frequency_offset=expert_frequency_offset,
366
+ expert_schedule_order=None,
367
+ x_gather_idx=x_gather_idx,
368
+ s_scatter_idx=s_scatter_idx,
369
+ is_glu_activation=is_glu_activation,
370
+ activation_type=activation_type.value,
371
+ stream_id=stream_id,
372
+ )
373
+
374
+ _down_projection_backward_weight(
375
+ dout=dout,
376
+ y1s=y1s,
377
+ dw2=dw2,
378
+ expert_frequency_offset=expert_frequency_offset,
379
+ expert_schedule_order=None,
380
+ x_gather_idx=x_gather_idx,
381
+ stream_id=stream_id,
382
+ )
383
+
384
+ # TC top-K routing
385
+ if not is_varlen_K:
386
+ ds = ds.view(T, K)
387
+
388
+ return None, dz, dw2, db2, ds, *[None] * 10
389
+
390
+
391
def moe_TC_softmax_topk_layer(
    x: torch.Tensor,
    router_w: torch.Tensor,
    w1: torch.Tensor,
    b1: torch.Tensor | None,
    w2: torch.Tensor,
    b2: torch.Tensor | None,
    K: int,
    stream_id: int,
    activation_type: ActivationType | str = ActivationType.SWIGLU,
    is_inference_mode_enabled: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Full MoE layer (router -> up projection -> down projection) using the
    fused softmax top-K routing kernel.

    Args:
        x: (T, H) input tokens.
        router_w: (E, H) router weight; logits are x @ router_w.T.
        w1 / b1: up-projection weight and optional bias.
        w2 / b2: down-projection weight and optional bias.
        K: experts activated per token.
        stream_id: CUDA stream handle forwarded to the kernels.
        activation_type: activation between the projections; a plain string is
            coerced to ActivationType.
        is_inference_mode_enabled: forwarded to the up-projection kernel.

    Returns:
        (o, router_logits, expert_frequency): layer output (T, H), raw router
        logits, and the per-expert token counts filled by the metadata kernel.
    """
    assert ((b1 is None) and (b2 is None)) or (
        (b1 is not None) and (b2 is not None)
    ), "b1 and b2 has to be None or not None at the same time!"
    E = router_w.size(0)
    router_logits = F.linear(x, router_w)
    topk_scores, topk_indices = TC_Softmax_Topk_Router_Function.apply(router_logits, E, K)

    T, K = topk_indices.size()
    TK = T * K
    device = topk_indices.device

    # Routing metadata buffers, filled in-place by the triton kernel below.
    s_scatter_idx = torch.empty(TK, dtype=torch.int32, device=device)
    s_reverse_scatter_idx = torch.empty(TK, dtype=torch.int32, device=device)
    expert_frequency = torch.empty(E, dtype=torch.int32, device=device)
    expert_frequency_offset = torch.empty(E + 1, dtype=torch.int32, device=device)
    x_gather_idx = torch.empty(TK, dtype=torch.int32, device=device)

    TC_topk_router_metadata_triton(
        topk_indices, E, expert_frequency, expert_frequency_offset, x_gather_idx, s_scatter_idx, s_reverse_scatter_idx
    )

    T = x.size(0)

    # Fix: use isinstance instead of `type(...) == str` so str subclasses are
    # accepted as well (idiomatic type check).
    if isinstance(activation_type, str):
        activation_type = ActivationType(activation_type)

    y1, z = _UpProjection.apply(
        x,
        w1,
        b1,
        expert_frequency_offset,
        T * K,
        K,
        stream_id,
        x_gather_idx,
        s_scatter_idx,
        s_reverse_scatter_idx,
        None,
        False,  # is_varlen_K
        activation_type,
        is_inference_mode_enabled,
    )

    o = _DownProjection.apply(
        y1,
        z,
        w2,
        b2,
        topk_scores,
        expert_frequency_offset,
        T,
        K,
        stream_id,
        x_gather_idx,
        s_scatter_idx,
        s_reverse_scatter_idx,
        None,
        False,  # is_varlen_K
        activation_type,
    )

    return o, router_logits, expert_frequency
465
+
466
+
467
+ # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
468
+ # Weight format requirements:
469
+ # - w1_weight: Shape (2*I, H, E), stride order (2, 0, 1), must be interleaved [gate_row0, up_row0, gate_row1, up_row1, ...]
470
+ # - w2_weight: Shape (H, I, E), stride order (2, 0, 1)
471
+
472
+
473
+ # We assume token_indices is already SORTED ascendingly !!!
474
+ # and len(token_indices) = len(expert_indices) = len(router_scores)
475
+ # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
476
+ def moe_general_routing_inputs(
477
+ x: torch.Tensor,
478
+ router_scores: torch.Tensor,
479
+ token_indices: torch.Tensor,
480
+ expert_indices: torch.Tensor,
481
+ w1: torch.Tensor,
482
+ b1: torch.Tensor | None,
483
+ w2: torch.Tensor,
484
+ b2: torch.Tensor | None,
485
+ E: int,
486
+ stream_id: int,
487
+ activation_type: ActivationType,
488
+ is_inference_mode_enabled: bool = False,
489
+ ) -> tuple[torch.Tensor, torch.Tensor]:
490
+ assert ((b1 is None) and (b2 is None)) or (
491
+ (b1 is not None) and (b2 is not None)
492
+ ), "b1 and b2 has to be None or not None at the same time!"
493
+
494
+ T = x.size(0)
495
+ TK = router_scores.size(0)
496
+ E = w2.size(-1)
497
+ device = router_scores.device
498
+
499
+ s_scatter_idx = torch.empty(TK, dtype=torch.int32, device=device)
500
+ s_reverse_scatter_idx = torch.empty(TK, dtype=torch.int32, device=device)
501
+ expert_frequency = torch.empty(E, dtype=torch.int32, device=device)
502
+ expert_frequency_offset = torch.empty(E + 1, dtype=torch.int32, device=device)
503
+ x_gather_idx = torch.empty(TK, dtype=torch.int32, device=device)
504
+ num_activated_expert_per_token_offset = torch.empty(T + 1, dtype=torch.int32, device=device)
505
+
506
+ general_routing_router_metadata_triton(
507
+ token_indices,
508
+ expert_indices,
509
+ T,
510
+ E,
511
+ expert_frequency,
512
+ expert_frequency_offset,
513
+ x_gather_idx,
514
+ s_scatter_idx,
515
+ s_reverse_scatter_idx,
516
+ num_activated_expert_per_token_offset,
517
+ )
518
+
519
+ y1, z = _UpProjection.apply(
520
+ x,
521
+ w1,
522
+ b1,
523
+ expert_frequency_offset,
524
+ TK,
525
+ None, # K, not needed
526
+ stream_id,
527
+ x_gather_idx,
528
+ s_scatter_idx,
529
+ s_reverse_scatter_idx,
530
+ num_activated_expert_per_token_offset,
531
+ True, # is_varlen_K
532
+ activation_type,
533
+ is_inference_mode_enabled,
534
+ )
535
+
536
+ o = _DownProjection.apply(
537
+ y1,
538
+ z,
539
+ w2,
540
+ b2,
541
+ router_scores,
542
+ expert_frequency_offset,
543
+ T,
544
+ None, # K, not needed
545
+ stream_id,
546
+ x_gather_idx,
547
+ s_scatter_idx,
548
+ s_reverse_scatter_idx,
549
+ num_activated_expert_per_token_offset,
550
+ True, # is_varlen_K
551
+ activation_type,
552
+ )
553
+
554
+ return o, expert_frequency
build/torch-cuda/functional/backward.py ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ from typing import Optional
6
+
7
+ import cuda.bindings.driver as cuda
8
+ import cutlass.cute as cute
9
+ import torch
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ from .._ops_compat import add_op_namespace_prefix
14
+ from ..enums import LIBRARY_NAME, TENSORMAP, ActivationType
15
+ from ..utils import ceil_divide, convert_torch_tensor_to_cute_tensor, get_powers_of_2
16
+ from .moe_config import (
17
+ HopperWgmma_MoE_Down_proj_ActGrad_Bwd,
18
+ HopperWgmma_MoE_Down_proj_WeightGrad_Bwd,
19
+ HopperWgmma_MoE_Up_proj_ActGrad_Bwd,
20
+ HopperWgmma_MoE_Up_proj_WeightGrad_Bwd,
21
+ )
22
+ from .reduction_over_k_gather import token_gather_and_sum_varlen_K_triton
23
+
24
+
25
+ def _get_autotune_configs_for_db2_and_ds() -> list[triton.Config]:
26
+ configs = []
27
+ for BLOCK_TK in get_powers_of_2(4, 32):
28
+ configs.append(triton.Config({"BLOCK_TK": BLOCK_TK}, num_warps=8, num_stages=4))
29
+ return configs
30
+
31
+
32
+ @triton.autotune(
33
+ configs=_get_autotune_configs_for_db2_and_ds(),
34
+ key=["H", "E"],
35
+ )
36
+ @triton.jit
37
+ def db2_and_ds_kernel(
38
+ dout_ptr, # (T, H)
39
+ s_ptr, # (TK,)
40
+ new_ds_partial_ptr, # (TK, n_h_blocks)
41
+ old_ds_partial_ptr, # (TK, OLD_DS_PARTIAL_N)
42
+ b2_ptr, # (E, H),
43
+ db2_ptr, # (E, H),
44
+ x_gather_idx_ptr, # (TK,), maps grouped -> token index
45
+ s_scatter_idx_ptr, # (TK,), maps grouped -> scatter index
46
+ expert_offset_ptr, # (E+1,), offsets in grouped layout
47
+ H: tl.constexpr,
48
+ E: tl.constexpr,
49
+ OLD_DS_PARTIAL_N: tl.constexpr,
50
+ BLOCK_H: tl.constexpr, # Block size for H dimension
51
+ BLOCK_TK: tl.constexpr, # Block size for token dimension
52
+ BLOCK_OLD_DS_PARTIAL_N: tl.constexpr,
53
+ ):
54
+ Eidx = tl.program_id(0) # expert id
55
+ Hidx = tl.program_id(1) # h-block id
56
+ NUM_H_BLOCKS: tl.constexpr = tl.num_programs(1)
57
+
58
+ # Hidden dimension indices for this block
59
+ h_offsets = Hidx * BLOCK_H + tl.arange(0, BLOCK_H)
60
+ h_mask = h_offsets < H
61
+
62
+ E_count_start = tl.load(expert_offset_ptr + Eidx)
63
+ E_count_end = tl.load(expert_offset_ptr + Eidx + 1)
64
+ n_tokens = E_count_end - E_count_start
65
+
66
+ b2 = tl.load(b2_ptr + Eidx * H + h_offsets, mask=h_mask, other=0.0).to(tl.float32)
67
+
68
+ db2_acc = tl.zeros([BLOCK_H], dtype=tl.float32)
69
+
70
+ # Process tokens in blocks of BLOCK_TK
71
+ for block_start in tl.range(0, n_tokens, BLOCK_TK):
72
+ # Token offsets within this block
73
+ tk_offsets = block_start + tl.arange(0, BLOCK_TK)
74
+ tk_mask = tk_offsets < n_tokens
75
+ tk_grouped = E_count_start + tk_offsets
76
+
77
+ # Gather token indices: [BLOCK_TK]
78
+ token_indices = tl.load(x_gather_idx_ptr + tk_grouped, mask=tk_mask, other=0).to(tl.uint32)
79
+
80
+ # Get scatter indices: [BLOCK_TK]
81
+ scatter_indices = tl.load(s_scatter_idx_ptr + tk_grouped, mask=tk_mask, other=0).to(tl.uint32)
82
+
83
+ s = tl.load(s_ptr + scatter_indices, mask=tk_mask, other=0.0).to(tl.float32)
84
+
85
+ # Gather dout: [BLOCK_TK, BLOCK_H]
86
+ dout_offsets = token_indices[:, None] * H + h_offsets[None, :]
87
+ dout_mask = tk_mask[:, None] & h_mask[None, :]
88
+ dout = tl.load(dout_ptr + dout_offsets, mask=dout_mask, other=0.0).to(tl.float32)
89
+
90
+ # Accumulate db2: sum over tokens of (dout * s)
91
+ db2_acc += tl.sum(dout * s[:, None], axis=0) # Sum over BLOCK_TK dimension
92
+
93
+ # Compute ds: dot(dout, b2) for this H-block
94
+ ds_partial = tl.sum(dout * b2[None, :], axis=1) # [BLOCK_TK]
95
+
96
+ # On first H-block, add old_ds_partial.sum(dim=1)
97
+ if Hidx == 0:
98
+ n_offsets = tl.arange(0, BLOCK_OLD_DS_PARTIAL_N)
99
+ old_ds_partial_offsets = scatter_indices[:, None] * OLD_DS_PARTIAL_N + n_offsets[None, :]
100
+ old_ds_partial_mask = tk_mask[:, None] & (n_offsets[None, :] < OLD_DS_PARTIAL_N)
101
+ old_ds_partial_vals = tl.load(
102
+ old_ds_partial_ptr + old_ds_partial_offsets, mask=old_ds_partial_mask, other=0.0
103
+ ).to(tl.float32)
104
+ ds_partial += tl.sum(old_ds_partial_vals, axis=1)
105
+
106
+ tl.store(new_ds_partial_ptr + scatter_indices * NUM_H_BLOCKS + Hidx, ds_partial, mask=tk_mask)
107
+
108
+ tl.store(db2_ptr + Eidx * H + h_offsets, db2_acc, mask=h_mask)
109
+
110
+
111
+ def _get_autotune_configs_for_db1() -> list[triton.Config]:
112
+ configs = []
113
+ for BLOCK_TK in get_powers_of_2(4, 128):
114
+ for BLOCK_I in get_powers_of_2(64, 4096):
115
+ if 4096 <= BLOCK_I * BLOCK_TK <= 16384:
116
+ configs.append(triton.Config({"BLOCK_I": BLOCK_I, "BLOCK_TK": BLOCK_TK}, num_warps=8, num_stages=4))
117
+ return configs
118
+
119
+
120
+ def _prune_triton_autotune_config(configs, nargs, **kw):
121
+ pruned_configs = []
122
+ for c in configs:
123
+ if c.kwargs["BLOCK_I"] <= triton.next_power_of_2(nargs["I"]):
124
+ pruned_configs.append(c)
125
+ return pruned_configs
126
+
127
+
128
+ @triton.autotune(
129
+ configs=_get_autotune_configs_for_db1(),
130
+ key=["I", "E"],
131
+ prune_configs_by={"early_config_prune": _prune_triton_autotune_config},
132
+ )
133
+ @triton.jit
134
+ def db1_kernel(
135
+ dz_ptr, # (T, H)
136
+ db1_ptr, # (E, H),
137
+ expert_offset_ptr, # (E+1,), offsets in grouped layout
138
+ I: tl.constexpr,
139
+ E: tl.constexpr,
140
+ BLOCK_I: tl.constexpr, # Block size for H dimension
141
+ BLOCK_TK: tl.constexpr, # Block size for token dimension
142
+ ):
143
+ Eidx = tl.program_id(0) # expert id
144
+
145
+ E_count_start = tl.load(expert_offset_ptr + Eidx).to(tl.int64)
146
+ E_count_end = tl.load(expert_offset_ptr + Eidx + 1).to(tl.int64)
147
+ n_tokens = E_count_end - E_count_start
148
+
149
+ NUM_I_BLOCKS: tl.constexpr = triton.cdiv(I, BLOCK_I)
150
+ for Iidx in tl.static_range(0, NUM_I_BLOCKS, 1):
151
+ i_offsets = Iidx * BLOCK_I + tl.arange(0, BLOCK_I)
152
+ i_mask = i_offsets < I
153
+
154
+ db1_acc = tl.zeros([BLOCK_I], dtype=tl.float32)
155
+
156
+ # Process tokens in blocks of BLOCK_TK
157
+ for block_start in tl.range(0, n_tokens, BLOCK_TK):
158
+ # Token offsets within this block
159
+ tk_offsets = block_start + tl.arange(0, BLOCK_TK)
160
+ tk_mask = tk_offsets < n_tokens
161
+ tk_grouped = E_count_start + tk_offsets
162
+
163
+ dz_offsets = tk_grouped[:, None] * I + i_offsets[None, :]
164
+ dz_mask = tk_mask[:, None] & i_mask[None, :]
165
+ dz = tl.load(dz_ptr + dz_offsets, mask=dz_mask, other=0.0).to(tl.float32)
166
+
167
+ db1_acc += tl.sum(dz, axis=0) # Sum over BLOCK_TK dimension
168
+
169
+ db1_offsets = Eidx.to(tl.int64) * I + i_offsets
170
+ tl.store(db1_ptr + db1_offsets, db1_acc, mask=i_mask)
171
+
172
+
173
+ @triton.jit
174
+ def _colsum_smallN_kernel(
175
+ y_ptr, # *mut T, shape [M]
176
+ x_ptr, # *const T, shape [M, N]
177
+ stride_xm: tl.constexpr,
178
+ stride_xn: tl.constexpr, # strides of X
179
+ stride_y: tl.constexpr, # stride of Y (usually 1)
180
+ N: tl.constexpr, # sizes
181
+ BLOCK_N: tl.constexpr, # tile size along N
182
+ ):
183
+ row = tl.program_id(0)
184
+
185
+ # assume BLOCK_N >= N
186
+ offs = tl.arange(0, BLOCK_N)
187
+ mask = offs < N
188
+ # Load a tile from the row; cast to fp32 for the reduction
189
+ x = tl.load(x_ptr + row * stride_xm + offs * stride_xn, mask=mask, other=0).to(tl.float32)
190
+ # Reduce this tile to a scalar and add
191
+ acc = tl.sum(x, axis=0)
192
+
193
+ # Store the row-sum (cast back to y dtype)
194
+ tl.store(y_ptr + row * stride_y, acc)
195
+
196
+
197
+ @torch.library.custom_op(add_op_namespace_prefix("_up_projection_backward_act"), mutates_args={"dx_expanded", "db1"})
198
+ def _up_projection_backward_act(
199
+ w1: torch.Tensor,
200
+ dx_expanded: torch.Tensor,
201
+ dz: torch.Tensor,
202
+ db1: torch.Tensor | None,
203
+ expert_frequency_offset: torch.Tensor,
204
+ expert_schedule_order: torch.Tensor | None,
205
+ x_gather_idx: torch.Tensor,
206
+ s_scatter_idx: torch.Tensor,
207
+ is_glu_activation: bool,
208
+ stream_id: int,
209
+ ) -> None:
210
+ I, H, E = w1.size()
211
+ if is_glu_activation:
212
+ I //= 2
213
+
214
+ # db1 computation
215
+ if db1 is not None:
216
+ db1_kernel[(E,)](dz, db1, expert_frequency_offset, (2 * I if is_glu_activation else I), E)
217
+
218
+ mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
219
+ mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
220
+ mS_scatter = convert_torch_tensor_to_cute_tensor(s_scatter_idx, (0,), 0, 4, 1, stream=stream_id)
221
+ mDz = convert_torch_tensor_to_cute_tensor(dz, (0, 1), 1, 16, 8, stream=stream_id)
222
+ mDx_expanded = convert_torch_tensor_to_cute_tensor(dx_expanded, (0, 1), 1, 16, 8, stream=stream_id)
223
+ mW1_trans = convert_torch_tensor_to_cute_tensor(w1.permute(1, 0, 2), (2, 1, 0), 0, 16, 8, stream=stream_id)
224
+
225
+ if expert_schedule_order is None:
226
+ mE_permute_order = None
227
+ else:
228
+ mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
229
+ current_stream = cuda.CUstream(stream_id)
230
+
231
+ compile_dx_key = ("dx", E, H, I, is_glu_activation, dx_expanded.dtype)
232
+ if compile_dx_key not in _up_projection_backward_act.compile_cache:
233
+ dx_module = HopperWgmma_MoE_Up_proj_ActGrad_Bwd(E, H, I, is_glu_activation)
234
+ tensormaps = [dx_module.module.generate_tensormap(None, None, None) for _ in range(2)]
235
+ _up_projection_backward_act.compile_cache[compile_dx_key] = cute.compile(
236
+ dx_module,
237
+ mDz,
238
+ mW1_trans,
239
+ mDx_expanded,
240
+ mE_offset,
241
+ mX_gather,
242
+ mS_scatter,
243
+ tensormaps,
244
+ mE_permute_order,
245
+ current_stream,
246
+ )
247
+ _up_projection_backward_act.compile_cache[f"dx-{TENSORMAP}"] = tensormaps
248
+
249
+ dx_tensormaps = _up_projection_backward_act.compile_cache[f"dx-{TENSORMAP}"]
250
+ _up_projection_backward_act.compile_cache[compile_dx_key](
251
+ mDz,
252
+ mW1_trans,
253
+ mDx_expanded,
254
+ mE_offset,
255
+ mX_gather,
256
+ mS_scatter,
257
+ dx_tensormaps,
258
+ mE_permute_order,
259
+ current_stream,
260
+ )
261
+
262
+
263
+ _up_projection_backward_act.compile_cache = {}
264
+
265
+
266
+ @torch.library.custom_op(add_op_namespace_prefix("_up_projection_backward_weight"), mutates_args={"dw1"})
267
+ def _up_projection_backward_weight(
268
+ x: torch.Tensor,
269
+ dw1: torch.Tensor,
270
+ dz: torch.Tensor,
271
+ expert_frequency_offset: torch.Tensor,
272
+ expert_schedule_order: torch.Tensor | None,
273
+ x_gather_idx: torch.Tensor,
274
+ is_glu_activation: bool,
275
+ stream_id: int,
276
+ ) -> None:
277
+ I, H, E = dw1.size()
278
+ if is_glu_activation:
279
+ I //= 2
280
+
281
+ x = x.detach()
282
+
283
+ mDz_trans = convert_torch_tensor_to_cute_tensor(dz.T, (1, 0), 0, 16, 8, stream=stream_id)
284
+ mDw1_trans = convert_torch_tensor_to_cute_tensor(dw1.permute(1, 0, 2), (2, 1, 0), 0, 16, 8, stream=stream_id)
285
+
286
+ mX_trans = convert_torch_tensor_to_cute_tensor(x.T, (1, 0), 0, 16, 8, stream=stream_id)
287
+ mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
288
+ mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
289
+
290
+ if expert_schedule_order is None:
291
+ mE_permute_order = None
292
+ else:
293
+ mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
294
+ current_stream = cuda.CUstream(stream_id)
295
+
296
+ compile_dw1_key = ("dw1", E, H, I, is_glu_activation, x.dtype)
297
+ if compile_dw1_key not in _up_projection_backward_weight.compile_cache:
298
+ dw1_module = HopperWgmma_MoE_Up_proj_WeightGrad_Bwd(E, H, I, is_glu_activation)
299
+ tensormaps = [dw1_module.module.generate_tensormap(None, None, None) for _ in range(1)]
300
+ _up_projection_backward_weight.compile_cache[compile_dw1_key] = cute.compile(
301
+ dw1_module,
302
+ mX_trans,
303
+ mDz_trans,
304
+ mDw1_trans,
305
+ mE_offset,
306
+ mX_gather,
307
+ tensormaps,
308
+ mE_permute_order,
309
+ current_stream,
310
+ )
311
+ _up_projection_backward_weight.compile_cache[f"dw1-{TENSORMAP}"] = tensormaps
312
+
313
+ dw1_tensormaps = _up_projection_backward_weight.compile_cache[f"dw1-{TENSORMAP}"]
314
+ _up_projection_backward_weight.compile_cache[compile_dw1_key](
315
+ mX_trans,
316
+ mDz_trans,
317
+ mDw1_trans,
318
+ mE_offset,
319
+ mX_gather,
320
+ dw1_tensormaps,
321
+ mE_permute_order,
322
+ current_stream,
323
+ )
324
+
325
+
326
+ _up_projection_backward_weight.compile_cache = {}
327
+
328
+
329
+ @torch.library.custom_op(add_op_namespace_prefix("_down_projection_backward_act"), mutates_args={"dz", "ds", "db2", "y1s"})
330
+ def _down_projection_backward_act(
331
+ dout: torch.Tensor,
332
+ z: torch.Tensor,
333
+ w2: torch.Tensor,
334
+ dz: torch.Tensor,
335
+ ds: torch.Tensor,
336
+ b2: torch.Tensor | None,
337
+ db2: torch.Tensor | None,
338
+ y1s: torch.Tensor,
339
+ topk_scores: torch.Tensor,
340
+ expert_frequency_offset: torch.Tensor,
341
+ expert_schedule_order: torch.Tensor | None,
342
+ x_gather_idx: torch.Tensor,
343
+ s_scatter_idx: torch.Tensor,
344
+ is_glu_activation: bool,
345
+ activation_type: str,
346
+ stream_id: int,
347
+ ) -> None:
348
+ H, I, E = w2.size()
349
+ TK = x_gather_idx.size(0)
350
+
351
+ dout = dout.detach()
352
+ w2 = w2.detach()
353
+ topk_scores = topk_scores.detach()
354
+
355
+ mDout = convert_torch_tensor_to_cute_tensor(dout, (0, 1), 1, 16, 8, stream=stream_id)
356
+ mW2_trans = convert_torch_tensor_to_cute_tensor(w2.permute(1, 0, 2), (2, 1, 0), 0, 16, 8, stream=stream_id)
357
+ mS = convert_torch_tensor_to_cute_tensor(topk_scores, (0,), 0, 4, 1, stream=stream_id)
358
+ if is_glu_activation:
359
+ mDz_kernel_input = convert_torch_tensor_to_cute_tensor(
360
+ dz.view(torch.float32), (0, 1), 1, 16, 8, stream=stream_id
361
+ )
362
+ mZ_kernel_input = convert_torch_tensor_to_cute_tensor(
363
+ z.view(torch.float32), (0, 1), 1, 16, 8, stream=stream_id
364
+ )
365
+ else:
366
+ mDz_kernel_input = convert_torch_tensor_to_cute_tensor(dz.detach(), (0, 1), 1, 16, 8, stream=stream_id)
367
+ mZ_kernel_input = convert_torch_tensor_to_cute_tensor(z.detach(), (0, 1), 1, 16, 8, stream=stream_id)
368
+
369
+ mY1S = convert_torch_tensor_to_cute_tensor(y1s, (0, 1), 1, 16, 8, stream=stream_id)
370
+ mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
371
+ mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
372
+ mS_scatter = convert_torch_tensor_to_cute_tensor(s_scatter_idx, (0,), 0, 4, 1, stream=stream_id)
373
+
374
+ if expert_schedule_order is None:
375
+ mE_permute_order = None
376
+ else:
377
+ mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
378
+ current_stream = cuda.CUstream(stream_id)
379
+ ds_partial = None
380
+
381
+ compile_dz_key = ("dz", E, H, I, z.dtype, activation_type)
382
+ if compile_dz_key not in _down_projection_backward_act.compile_cache:
383
+ # I don't know why but this sync appears to fix a mysterious initialization bug??
384
+ torch.cuda.synchronize()
385
+ dz_module = HopperWgmma_MoE_Down_proj_ActGrad_Bwd(E, H, I, ActivationType(activation_type))
386
+ tensormaps = [dz_module.module.generate_tensormap(None, None, None) for _ in range(3)]
387
+
388
+ ds_partial_N = max(ceil_divide(I, dz_module.module.tile_shape_mnk[1]), 1)
389
+ ds_partial = torch.empty(TK, ds_partial_N, dtype=torch.float32, device=topk_scores.device)
390
+ mDS_partial = convert_torch_tensor_to_cute_tensor(ds_partial, (0, 1), 1, 4, 1, stream=stream_id)
391
+
392
+ _down_projection_backward_act.compile_cache["ds_partial_N"] = ds_partial_N
393
+ _down_projection_backward_act.compile_cache[compile_dz_key] = cute.compile(
394
+ dz_module,
395
+ mDout,
396
+ mW2_trans,
397
+ mZ_kernel_input,
398
+ mDz_kernel_input,
399
+ mY1S,
400
+ mS,
401
+ mDS_partial,
402
+ mE_offset,
403
+ mX_gather,
404
+ mS_scatter,
405
+ tensormaps,
406
+ mE_permute_order,
407
+ current_stream,
408
+ )
409
+ _down_projection_backward_act.compile_cache[f"dz-{TENSORMAP}"] = tensormaps
410
+
411
+ if ds_partial is None:
412
+ ds_partial_N = _down_projection_backward_act.compile_cache["ds_partial_N"]
413
+ ds_partial = torch.empty(TK, ds_partial_N, dtype=torch.float32, device=topk_scores.device)
414
+ mDS_partial = convert_torch_tensor_to_cute_tensor(ds_partial, (0, 1), 1, 4, 1, stream=stream_id)
415
+
416
+ dz_tensormaps = _down_projection_backward_act.compile_cache[f"dz-{TENSORMAP}"]
417
+ _down_projection_backward_act.compile_cache[compile_dz_key](
418
+ mDout,
419
+ mW2_trans,
420
+ mZ_kernel_input,
421
+ mDz_kernel_input,
422
+ mY1S,
423
+ mS,
424
+ mDS_partial,
425
+ mE_offset,
426
+ mX_gather,
427
+ mS_scatter,
428
+ dz_tensormaps,
429
+ mE_permute_order,
430
+ current_stream,
431
+ )
432
+
433
+ if db2 is None:
434
+ # we don't need to update ds
435
+ if ds_partial.size(1) == 1:
436
+ ds.copy_(ds_partial.view(-1).to(dtype=ds.dtype))
437
+ elif ds_partial.size(1) <= 32:
438
+ ds.copy_(ds_partial.sum(dim=-1, dtype=ds.dtype))
439
+ else:
440
+ M, N = ds_partial.size()
441
+
442
+ _colsum_smallN_kernel[M,](
443
+ y_ptr=ds,
444
+ x_ptr=ds_partial,
445
+ stride_xm=ds_partial.stride(0),
446
+ stride_xn=ds_partial.stride(1),
447
+ stride_y=1,
448
+ N=N,
449
+ BLOCK_N=triton.next_power_of_2(N),
450
+ )
451
+ else:
452
+ # db2 and ds update
453
+ BLOCK_H = min(triton.next_power_of_2(H), 2048)
454
+ NUM_H_BLOCKS = triton.cdiv(H, BLOCK_H)
455
+
456
+ new_ds_partial = torch.empty(TK, NUM_H_BLOCKS, device=ds.device, dtype=torch.float32)
457
+
458
+ db2_and_ds_kernel[(E, NUM_H_BLOCKS)](
459
+ dout,
460
+ topk_scores,
461
+ new_ds_partial,
462
+ ds_partial,
463
+ b2,
464
+ db2,
465
+ x_gather_idx,
466
+ s_scatter_idx,
467
+ expert_frequency_offset,
468
+ H,
469
+ E,
470
+ ds_partial_N,
471
+ BLOCK_H=BLOCK_H,
472
+ BLOCK_OLD_DS_PARTIAL_N=triton.next_power_of_2(ds_partial_N),
473
+ )
474
+
475
+ if NUM_H_BLOCKS == 1:
476
+ ds.copy_(new_ds_partial.view(-1).to(dtype=ds.dtype))
477
+ else:
478
+ ds.copy_(new_ds_partial.sum(dim=-1, dtype=ds.dtype))
479
+
480
+
481
+ _down_projection_backward_act.compile_cache = {}
482
+
483
+
484
+ @torch.library.custom_op(add_op_namespace_prefix("_down_projection_backward_weight"), mutates_args={"dw2"})
485
+ def _down_projection_backward_weight(
486
+ dout: torch.Tensor,
487
+ y1s: torch.Tensor,
488
+ dw2: torch.Tensor,
489
+ expert_frequency_offset: torch.Tensor,
490
+ expert_schedule_order: torch.Tensor | None,
491
+ x_gather_idx: torch.Tensor,
492
+ stream_id: int,
493
+ ) -> None:
494
+ H, I, E = dw2.size()
495
+
496
+ mDout_trans = convert_torch_tensor_to_cute_tensor(dout.T, (1, 0), 0, 16, 8, stream=stream_id)
497
+ mDw2 = convert_torch_tensor_to_cute_tensor(dw2, (2, 0, 1), 1, 16, 8, stream=stream_id)
498
+ mY1S_trans = convert_torch_tensor_to_cute_tensor(y1s.T, (1, 0), 0, 16, 8, stream=stream_id)
499
+ mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
500
+ mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)
501
+
502
+ if expert_schedule_order is None:
503
+ mE_permute_order = None
504
+ else:
505
+ mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
506
+ current_stream = cuda.CUstream(stream_id)
507
+
508
+ compile_dw2_key = ("dw2", E, H, I, dw2.dtype)
509
+ if compile_dw2_key not in _down_projection_backward_weight.compile_cache:
510
+ dw2_module = HopperWgmma_MoE_Down_proj_WeightGrad_Bwd(E, H, I)
511
+ tensormaps = [dw2_module.module.generate_tensormap(None, None, None) for _ in range(1)]
512
+ _down_projection_backward_weight.compile_cache[compile_dw2_key] = cute.compile(
513
+ dw2_module,
514
+ mDout_trans,
515
+ mY1S_trans,
516
+ mDw2,
517
+ mE_offset,
518
+ mX_gather,
519
+ tensormaps,
520
+ mE_permute_order,
521
+ current_stream,
522
+ )
523
+ _down_projection_backward_weight.compile_cache[f"dw2-{TENSORMAP}"] = tensormaps
524
+
525
+ dw2_tensormaps = _down_projection_backward_weight.compile_cache[f"dw2-{TENSORMAP}"]
526
+ _down_projection_backward_weight.compile_cache[compile_dw2_key](
527
+ mDout_trans, mY1S_trans, mDw2, mE_offset, mX_gather, dw2_tensormaps, mE_permute_order, current_stream
528
+ )
529
+
530
+
531
+ _down_projection_backward_weight.compile_cache = {}
532
+
533
+
534
+ @torch.library.custom_op(add_op_namespace_prefix("_token_broadcast_backward"), mutates_args={"dx_reduced"})
535
+ def _token_broadcast_backward(
536
+ dx_reduced: torch.Tensor,
537
+ dx_expanded: torch.Tensor,
538
+ s_reverse_scatter_idx: torch.Tensor,
539
+ num_activated_expert_per_token_offset: Optional[torch.Tensor],
540
+ varlen_K_max: int,
541
+ H: int,
542
+ is_varlen_K: bool,
543
+ ) -> None:
544
+ if num_activated_expert_per_token_offset is None:
545
+ assert not is_varlen_K, "`num_activated_expert_per_token_offset` as None requires fixed top-K routing"
546
+ token_gather_and_sum_varlen_K_triton(
547
+ dx_expanded,
548
+ None,
549
+ dx_reduced,
550
+ s_reverse_scatter_idx,
551
+ num_activated_expert_per_token_offset,
552
+ dx_reduced.size(0),
553
+ varlen_K_max,
554
+ H,
555
+ is_varlen_K,
556
+ )
557
+
558
+
559
+ @triton.jit
560
+ def _softmax_bwd_scatter_small_kernel(
561
+ dlogits_ptr,
562
+ dlogits_full_ptr,
563
+ score_ptr,
564
+ dscore_ptr,
565
+ idx_ptr,
566
+ stride_dm: tl.constexpr,
567
+ stride_dn: tl.constexpr,
568
+ stride_sm: tl.constexpr,
569
+ stride_sn: tl.constexpr,
570
+ stride_gm: tl.constexpr,
571
+ stride_gk: tl.constexpr,
572
+ stride_im: tl.constexpr,
573
+ stride_ik: tl.constexpr,
574
+ K: tl.constexpr,
575
+ BLOCK_K: tl.constexpr,
576
+ dlogits_is_none: tl.constexpr,
577
+ ):
578
+ row = tl.program_id(axis=0)
579
+
580
+ # tl.assume(K <= BLOCK_K)
581
+ k_offs = tl.arange(0, BLOCK_K)
582
+ k_mask = k_offs < K
583
+
584
+ idx = tl.load(idx_ptr + row * stride_im + k_offs * stride_ik, mask=k_mask, other=0).to(tl.int32)
585
+ s_sel = tl.load(score_ptr + row * stride_sm + k_offs * stride_sn, mask=k_mask, other=0).to(tl.float32)
586
+ g_sel = tl.load(dscore_ptr + row * stride_gm + k_offs * stride_gk, mask=k_mask, other=0).to(tl.float32)
587
+
588
+ # dot = sum_j g_j * y_j over selected columns
589
+ dot = tl.sum(g_sel * s_sel, axis=0)
590
+
591
+ # scatter-only: dx[idx] += y_sel * (g_sel - dot)
592
+ add_vals = s_sel * (g_sel - dot)
593
+
594
+ indices = row * stride_dm + idx * stride_dn
595
+ if not dlogits_is_none:
596
+ add_vals += tl.load(dlogits_ptr + indices, mask=k_mask)
597
+ tl.store(dlogits_full_ptr + indices, add_vals, mask=k_mask)
598
+
599
+
600
+ @torch.library.custom_op(add_op_namespace_prefix("_softmax_topk_bwd"), mutates_args={"dlogits_full"})
601
+ def _softmax_topk_bwd(
602
+ dlogits_full: torch.Tensor,
603
+ dlogits: Optional[torch.Tensor],
604
+ dtopk_score: torch.Tensor,
605
+ topk_router_score: torch.Tensor,
606
+ topk_router_indices: torch.Tensor,
607
+ K: int,
608
+ ) -> None:
609
+ T = dtopk_score.shape[0]
610
+
611
+ _softmax_bwd_scatter_small_kernel[T,](
612
+ dlogits,
613
+ dlogits_full,
614
+ topk_router_score,
615
+ dtopk_score,
616
+ topk_router_indices,
617
+ dlogits_full.stride(0),
618
+ dlogits_full.stride(1),
619
+ topk_router_score.stride(0),
620
+ topk_router_score.stride(1),
621
+ dtopk_score.stride(0),
622
+ dtopk_score.stride(1),
623
+ topk_router_indices.stride(0),
624
+ topk_router_indices.stride(1),
625
+ K,
626
+ triton.next_power_of_2(K),
627
+ (dlogits is None),
628
+ )
629
+
630
+
631
+ @triton.jit
632
+ def _topk_bwd_scatter_small_kernel(
633
+ dlogits_full_ptr,
634
+ dscore_ptr,
635
+ idx_ptr,
636
+ stride_dm: tl.constexpr,
637
+ stride_dn: tl.constexpr,
638
+ stride_gm: tl.constexpr,
639
+ stride_gk: tl.constexpr,
640
+ stride_im: tl.constexpr,
641
+ stride_ik: tl.constexpr,
642
+ K: tl.constexpr,
643
+ BLOCK_K: tl.constexpr,
644
+ ):
645
+ row = tl.program_id(axis=0)
646
+
647
+ # tl.assume(K <= BLOCK_K)
648
+ k_offs = tl.arange(0, BLOCK_K)
649
+ k_mask = k_offs < K
650
+
651
+ idx = tl.load(idx_ptr + row * stride_im + k_offs * stride_ik, mask=k_mask, other=0).to(tl.int32)
652
+ g_sel = tl.load(dscore_ptr + row * stride_gm + k_offs * stride_gk, mask=k_mask, other=0).to(tl.float32)
653
+
654
+ # scatter-only: dx[idx] += y_sel * (g_sel - dot)
655
+ add_vals = g_sel
656
+
657
+ indices = row * stride_dm + idx * stride_dn
658
+ tl.store(dlogits_full_ptr + indices, add_vals, mask=k_mask)
659
+
660
+
661
+ @torch.library.custom_op(add_op_namespace_prefix("_topk_bwd"), mutates_args={"dlogits_full"})
662
+ def _topk_bwd(
663
+ dlogits_full: torch.Tensor,
664
+ dtopk_values: torch.Tensor,
665
+ topk_indices: torch.Tensor,
666
+ K: int,
667
+ ) -> None:
668
+ T = dtopk_values.shape[0]
669
+
670
+ _topk_bwd_scatter_small_kernel[T,](
671
+ dlogits_full,
672
+ dtopk_values,
673
+ topk_indices,
674
+ dlogits_full.stride(0),
675
+ dlogits_full.stride(1),
676
+ dtopk_values.stride(0),
677
+ dtopk_values.stride(1),
678
+ topk_indices.stride(0),
679
+ topk_indices.stride(1),
680
+ K,
681
+ triton.next_power_of_2(K),
682
+ )
build/torch-cuda/functional/forward.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ import cuda.bindings.driver as cuda
6
+ import cutlass.cute as cute
7
+ import torch
8
+ import triton
9
+ import triton.language as tl
10
+ from cutlass.cute.runtime import from_dlpack
11
+ from ..quack.cute_dsl_utils import torch2cute_dtype_map
12
+
13
+ from ..enums import LIBRARY_NAME, TENSORMAP, ActivationType
14
+ from .._ops_compat import add_op_namespace_prefix
15
+ from ..utils import convert_torch_tensor_to_cute_tensor
16
+ from .moe_config import HopperWgmma_MoE_Down_proj_Fwd, HopperWgmma_MoE_Up_proj_Fwd
17
+ from .reduction_over_k_gather import token_gather_and_sum_varlen_K_triton
18
+ from .topk_softmax import TopK_Softmax
19
+
20
+
@torch.library.custom_op(add_op_namespace_prefix("_topk_fwd"), mutates_args={"values", "indices"})
def _topk_fwd(
    x: torch.Tensor, k: int, values: torch.Tensor, indices: torch.Tensor, require_softmax_fusion: bool = True
) -> None:
    """Top-k forward pass.
    Args:
        x: Input tensor of shape (M, N)
        k: Number of top elements to return
        values: Output buffer of shape (M, k), written in place
        indices: Output buffer of shape (M, k), written in place
        require_softmax_fusion: If True, the kernel also applies softmax to
            the selected values.
    """
    N = x.size(1)

    input_dtype = torch2cute_dtype_map[x.dtype]
    output_dtype = torch2cute_dtype_map[values.dtype]
    # Wrap as CuTe tensors; mode 0 (the row count) stays dynamic so one
    # compiled kernel serves any batch size.
    convert_from_dlpack = lambda tensor: (
        from_dlpack(tensor.detach(), assumed_align=16).mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
    )

    x_tensor, values_tensor, indices_tensor = [convert_from_dlpack(tensor) for tensor in (x, values, indices)]
    current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
    compile_key = (input_dtype, output_dtype, N, k, require_softmax_fusion)
    if compile_key not in _topk_fwd.compile_cache:
        topk_op = TopK_Softmax(input_dtype, output_dtype, N, k, require_softmax_fusion)
        _topk_fwd.compile_cache[compile_key] = cute.compile(
            topk_op, x_tensor, values_tensor, indices_tensor, current_stream
        )
    _topk_fwd.compile_cache[compile_key](x_tensor, values_tensor, indices_tensor, current_stream)


# Per-process memoization of compiled top-k kernels.
_topk_fwd.compile_cache = {}
+
53
+
@torch.library.custom_op(add_op_namespace_prefix("_up_projection_forward"), mutates_args={"z", "y1"})
def _up_projection_forward(
    x: torch.Tensor,
    w1: torch.Tensor,
    z: torch.Tensor,
    y1: torch.Tensor,
    b1: torch.Tensor | None,
    expert_frequency_offset: torch.Tensor,
    expert_schedule_order: torch.Tensor,
    x_gather_idx: torch.Tensor,
    stream_id: int,
    activation_type: str,
    is_glu_activation: bool,
    is_inference_mode_enabled: bool = False,
) -> None:
    """MoE up-projection forward (in-place op).

    Grouped GEMM with token gather: writes the pre-activation Z = X @ W1
    (+ b1) into ``z`` and the activated output into ``y1``.  Compiled
    kernels are memoized on ``_up_projection_forward.compile_cache``.
    """
    I, H, E = w1.size()
    if is_glu_activation:
        # GLU weights stack gate and up halves; logical I is half.
        I //= 2

    mX = convert_torch_tensor_to_cute_tensor(x.detach(), (0, 1), 1, 16, 8, stream=stream_id)
    mW1 = convert_torch_tensor_to_cute_tensor(w1.detach(), (2, 0, 1), 1, 16, 8, stream=stream_id)
    mZ = convert_torch_tensor_to_cute_tensor(z, (0, 1), 1, 16, 8, stream=stream_id)
    mY1 = convert_torch_tensor_to_cute_tensor(y1, (0, 1), 1, 16, 8, stream=stream_id)
    mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
    mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)

    if expert_schedule_order is None:
        mE_permute_order = None
    else:
        mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)

    if b1 is None:
        mB1 = None
    else:
        mB1 = convert_torch_tensor_to_cute_tensor(b1.detach(), (0, 1), 1, 16, 8, stream=stream_id)

    current_stream = cuda.CUstream(stream_id)

    compile_w1_key = (E, H, I, (b1 is None), x.dtype, activation_type, is_inference_mode_enabled)
    if compile_w1_key not in _up_projection_forward.compile_cache:
        w1_module = HopperWgmma_MoE_Up_proj_Fwd(
            E, H, I, activation_type=ActivationType(activation_type), inference_mode=is_inference_mode_enabled
        )
        tensormaps = [w1_module.module.generate_tensormap(None, None, None) for _ in range(2)]
        _up_projection_forward.compile_cache[compile_w1_key] = cute.compile(
            w1_module,
            mX,
            mW1,
            mZ,
            mY1,
            mB1,
            mE_offset,
            mX_gather,
            tensormaps[0],
            tensormaps[1],
            mE_permute_order,
            current_stream,
        )
        # NOTE(review): the tensormaps are cached under the single shared
        # TENSORMAP key, so compiling a second distinct config overwrites
        # the first config's tensormaps — confirm they are interchangeable.
        _up_projection_forward.compile_cache[TENSORMAP] = tensormaps

    w1_tensormaps = _up_projection_forward.compile_cache[TENSORMAP]
    _up_projection_forward.compile_cache[compile_w1_key](
        mX,
        mW1,
        mZ,
        mY1,
        mB1,
        mE_offset,
        mX_gather,
        w1_tensormaps[0],
        w1_tensormaps[1],
        mE_permute_order,
        current_stream,
    )


# Per-process memoization of compiled kernels and tensormap scratch buffers.
_up_projection_forward.compile_cache = {}
+
132
+
@torch.library.custom_op(add_op_namespace_prefix("_down_projection_forward"), mutates_args={"y2"})
def _down_projection_forward(
    w2: torch.Tensor,
    y1: torch.Tensor,
    y2: torch.Tensor,
    b2: torch.Tensor | None,
    expert_frequency_offset: torch.Tensor,
    expert_schedule_order: torch.Tensor,
    x_gather_idx: torch.Tensor,
    stream_id: int,
) -> None:
    """MoE down-projection forward (in-place op).

    Grouped GEMM: writes Y2 = Y1 @ W2 (+ b2) into ``y2``.  Compiled kernels
    are memoized on ``_down_projection_forward.compile_cache``.
    """
    H, I, E = w2.size()

    mW2 = convert_torch_tensor_to_cute_tensor(w2.detach(), (2, 0, 1), 1, 16, 8, stream=stream_id)
    mY1 = convert_torch_tensor_to_cute_tensor(y1.detach(), (0, 1), 1, 16, 8, stream=stream_id)
    mY2 = convert_torch_tensor_to_cute_tensor(y2, (0, 1), 1, 16, 8, stream=stream_id)
    mE_offset = convert_torch_tensor_to_cute_tensor(expert_frequency_offset, (0,), 0, 4, 1, stream=stream_id)
    mX_gather = convert_torch_tensor_to_cute_tensor(x_gather_idx, (0,), 0, 4, 1, stream=stream_id)

    if expert_schedule_order is None:
        mE_permute_order = None
    else:
        mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)

    if b2 is None:
        mB2 = None
    else:
        mB2 = convert_torch_tensor_to_cute_tensor(b2.detach(), (0, 1), 1, 16, 8, stream=stream_id)

    current_stream = cuda.CUstream(stream_id)

    compile_w2_key = (E, H, I, (b2 is None), w2.dtype)
    if compile_w2_key not in _down_projection_forward.compile_cache:
        w2_module = HopperWgmma_MoE_Down_proj_Fwd(E, H, I)
        tensormaps = [w2_module.module.generate_tensormap(None, None, None) for _ in range(1)]
        _down_projection_forward.compile_cache[compile_w2_key] = cute.compile(
            w2_module, mY1, mW2, mY2, mB2, mE_offset, mX_gather, tensormaps[0], mE_permute_order, current_stream
        )
        # NOTE(review): tensormaps cached under the single shared TENSORMAP
        # key across all compile keys — confirm interchangeable.
        _down_projection_forward.compile_cache[TENSORMAP] = tensormaps

    w2_tensormaps = _down_projection_forward.compile_cache[TENSORMAP]
    _down_projection_forward.compile_cache[compile_w2_key](
        mY1, mW2, mY2, mB2, mE_offset, mX_gather, w2_tensormaps[0], mE_permute_order, current_stream
    )


# Per-process memoization of compiled kernels and tensormap scratch buffers.
_down_projection_forward.compile_cache = {}
+
181
+
@torch.library.custom_op(add_op_namespace_prefix("_router_forward"), mutates_args={"o"})
def _router_forward(
    y2: torch.Tensor,
    o: torch.Tensor,
    topk_scores: torch.Tensor,
    s_reverse_scatter_idx: torch.Tensor,
    num_activated_expert_per_token_offset: torch.Tensor,
    varlen_K_max: int,
    H: int,
    is_varlen_K: bool,
) -> None:
    """Combine step of the MoE forward: gather each token's K expert outputs
    from ``y2``, scale them by ``topk_scores``, and sum into ``o`` (in place).
    """
    token_gather_and_sum_varlen_K_triton(
        y2,
        topk_scores,
        o,
        s_reverse_scatter_idx,
        num_activated_expert_per_token_offset,
        o.size(0),
        varlen_K_max,
        H,
        is_varlen_K,
    )
+
@triton.jit
def _softmax_fwd_small_kernel(
    logits_ptr, stride_lm: tl.constexpr, stride_ln: tl.constexpr, K: tl.constexpr, BLOCK_K: tl.constexpr
):
    """In-place numerically-stable softmax over a small row (K <= BLOCK_K).

    One program per row; the row is loaded, shifted by its max, exponentiated,
    normalized in fp32, and stored back over the input.
    """
    row = tl.program_id(axis=0)

    # tl.assume(K <= BLOCK_K)
    k_offs = tl.arange(0, BLOCK_K)
    k_mask = k_offs < K

    # load full row (all columns) in one go (N is small)
    x = tl.load(logits_ptr + row * stride_lm + k_offs * stride_ln, mask=k_mask, other=-float("inf")).to(tl.float32)
    x = x - tl.max(x, axis=0)  # subtract row max for numerical stability
    ex = tl.exp(x)
    y = ex / tl.sum(ex, axis=0)

    tl.store(logits_ptr + row * stride_lm + k_offs * stride_ln, y, mask=k_mask)
+
@torch.library.custom_op(
    add_op_namespace_prefix("_softmax_topk_fwd"), mutates_args={"topk_router_score", "topk_router_indices"}
)
def _softmax_topk_fwd(
    router_logits: torch.Tensor, topk_router_score: torch.Tensor, topk_router_indices: torch.Tensor, E: int, K: int
) -> None:
    """Router forward: select top-K logits per token and softmax-normalize
    the selected values, writing scores/indices in place.

    Uses the fused CuTe top-k+softmax kernel when the shape is supported,
    otherwise falls back to plain torch ``topk`` + ``softmax``.
    """
    # T = router_logits.shape[0]
    if E <= 4096 and K <= 16 and E % 8 == 0:
        # fast topk-softmax fusion that covers most common MoE configs
        _topk_fwd(router_logits, K, topk_router_score, topk_router_indices, require_softmax_fusion=True)
    else:
        # Fallback: softmax over the selected values only, computed in fp32.
        topk_results = router_logits.topk(K, dim=-1)
        topk_router_score.copy_(topk_results.values.softmax(dim=-1, dtype=torch.float32).to(topk_router_score.dtype))
        topk_router_indices.copy_(topk_results.indices.to(topk_router_indices.dtype))
build/torch-cuda/functional/grouped_gemm.py ADDED
The diff for this file is too large to render. See raw diff
 
build/torch-cuda/functional/moe_config.py ADDED
@@ -0,0 +1,581 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ import math
6
+ from dataclasses import dataclass
7
+
8
+ import cuda.bindings.driver as cuda
9
+ import cutlass
10
+ import cutlass.cute as cute
11
+ import torch
12
+ from cutlass import const_expr
13
+ from ..quack.tile_scheduler import RasterOrderOption
14
+
15
+ from ..enums import ActivationType, is_glu
16
+ from .grouped_gemm import HopperWgmma_MoE_kernel
17
+
18
+
19
+ LIBRARY_NAME = "cutedsl_kernels"
20
+
21
+
22
+ def ceil_div(a: int, b: int):
23
+ return int(math.ceil(a / b))
24
+
25
+
@dataclass
class HopperGEMMConfig:
    """Static tile/scheduling configuration for a Hopper WGMMA grouped GEMM."""

    # CTA tile shape (M, N, K).
    tile_shape_mnk: cutlass.Constexpr[cute.Shape] = (128, 256, 64)
    # NOTE(review): despite the "mnk" suffix this is a 2-tuple; callers
    # append the K cluster dim as 1 (see `(*cluster_shape_mnk, 1)`) — confirm.
    cluster_shape_mnk: cutlass.Constexpr[cute.Shape] = (2, 1)
    # Epilogue sub-tile size.
    epi_tile_size: cutlass.Constexpr[int] = 32
    ## assume we always use persistent kernel
    # is_persistent: cutlass.Constexpr[bool] = True
    # Ping-pong warp-group scheduling (vs cooperative).
    is_pingpong: cutlass.Constexpr[bool] = False
    # CTA rasterization order for L2 locality.
    raster_order: RasterOrderOption = RasterOrderOption.Heuristic
    L2_group_size: int = 8
    # Initial number of D-epilogue pipeline stages.
    initial_d_epi_stage: cutlass.Constexpr[int] = 4
+
39
+ class HopperWgmma_MoE_Up_proj_Fwd:
40
    def __init__(self, E: int, H: int, I: int, activation_type: ActivationType, inference_mode=False):
        """Build the fused up-projection forward grouped-GEMM for a Hopper MoE layer.

        Args:
            E: number of experts.
            H: hidden (model) dimension.
            I: intermediate (FFN) dimension per expert.
            activation_type: activation fused into the GEMM epilogue.
            inference_mode: if True, selects larger epilogue tiles tuned for inference.
        """
        super().__init__()
        is_glu_activation = is_glu(activation_type)
        # GLU variants accept I % 64 == 0; non-GLU requires I % 128 == 0.
        if is_glu_activation:
            assert (
                H % 64 == 0 and H >= 512 and I % 64 == 0
            ), f"{LIBRARY_NAME} only supports GLU MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
        else:
            assert (
                H % 64 == 0 and H >= 512 and I % 128 == 0
            ), f"{LIBRARY_NAME} only supports non-GLU MoE with H % 64 == 0 (H >= 512) and I % 128 == 0"
        # TODO: this assertion does not mean that the MoE impl prohibits such config.
        # Instead, we just do not search for the best configs manually yet for small-shaped MoE
        if (I >= 128 and is_glu_activation) or (I >= 256 and not is_glu_activation):
            # Large intermediate dim: cooperative (non-pingpong) schedule, 2x1 cluster.
            up_config = HopperGEMMConfig(
                tile_shape_mnk=(128, 256, 64),
                cluster_shape_mnk=(2, 1),
                epi_tile_size=(32 if not inference_mode else 64),
                is_pingpong=False,
                initial_d_epi_stage=2,
                raster_order=RasterOrderOption.AlongM,
            )
        elif (I == 64 and is_glu_activation) or (I == 128 and not is_glu_activation):
            # Smallest supported intermediate dim: pingpong schedule, single-CTA cluster.
            up_config = HopperGEMMConfig(
                tile_shape_mnk=(192, 128, 64),
                cluster_shape_mnk=(1, 1),
                epi_tile_size=(32 if not inference_mode else 64),
                is_pingpong=True,
                initial_d_epi_stage=8,
                raster_order=RasterOrderOption.AlongM,
            )
        else:
            raise NotImplementedError()

        # Exactly one of the flags below is set; each is forwarded to the kernel
        # as a compile-time switch selecting the fused epilogue activation.
        compute_swiglu = False
        compute_geglu = False
        compute_reglu = False

        compute_relu_sq = False
        compute_silu = False
        compute_relu = False
        compute_gelu = False

        if activation_type == ActivationType.SWIGLU:
            compute_swiglu = True
        elif activation_type == ActivationType.GEGLU:
            compute_geglu = True
        elif activation_type == ActivationType.REGLU:
            compute_reglu = True

        elif activation_type == ActivationType.RELU_SQ:
            compute_relu_sq = True
        elif activation_type == ActivationType.RELU:
            compute_relu = True
        elif activation_type == ActivationType.SILU:
            compute_silu = True
        elif activation_type == ActivationType.GELU:
            compute_gelu = True

        else:
            raise NotImplementedError(f"Activation function {activation_type} not supported yet!")

        # is_A_gather=True: rows of the A operand (tokens) are gathered through
        # an index tensor so each expert sees only its routed tokens.
        self.module = HopperWgmma_MoE_kernel(
            E,
            cutlass.Float32,
            up_config.tile_shape_mnk,
            (*up_config.cluster_shape_mnk, 1),
            pingpong=up_config.is_pingpong,
            is_persistent=True,
            compute_swiglu=compute_swiglu,
            compute_reglu=compute_reglu,
            compute_geglu=compute_geglu,
            compute_relu_sq=compute_relu_sq,
            compute_relu=compute_relu,
            compute_silu=compute_silu,
            compute_gelu=compute_gelu,
            is_A_gather=True,
            epi_tile_size=up_config.epi_tile_size,
            initial_d_epi_stage=up_config.initial_d_epi_stage,
            inference_mode=inference_mode,
        )
        # Occupancy bound for the persistent grid, in clusters.
        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
            up_config.cluster_shape_mnk[0] * up_config.cluster_shape_mnk[1]
        )
        self.current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
126
    @cute.jit
    def __call__(
        self, mX, mW1, mZ, mY1, mB1, mE_offset, mX_gather, mD_tensormap, mY1_tensormap, mE_permute_order, stream
    ):
        """Launch the up-projection forward kernel.

        The underlying grouped-GEMM kernel has one shared signature across all
        MoE variants; slots unused by this projection are passed as None so the
        traced call site stays static.
        """
        return self.module(
            mX,
            mW1,
            None,
            mB1,
            mZ,
            mY1,
            None,
            None,
            mE_offset,
            mX_gather,
            None,
            None,
            None,
            None,
            None,
            mD_tensormap,
            mY1_tensormap,
            None,
            mE_permute_order,
            const_expr(self.max_active_clusters),
            stream,
        )
+
154
+
155
class HopperWgmma_MoE_Down_proj_Fwd:
    """Down-projection forward grouped GEMM (Y2 = Y1 @ W2 + B2) for a Hopper MoE layer.

    No activation is fused (the activation is applied in the up projection),
    and the A operand is read contiguously (is_A_gather=False) since Y1 is
    already expert-permuted.
    """

    def __init__(self, E: int, H: int, I: int):
        super().__init__()
        assert (
            H % 64 == 0 and H >= 512 and I % 64 == 0
        ), f"{LIBRARY_NAME} only supports MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
        # Hand-tuned configs keyed on the intermediate dim I (the GEMM K dim here).
        if I >= 1024:
            down_config = HopperGEMMConfig(
                tile_shape_mnk=(128, 256, 64),
                cluster_shape_mnk=(2, 1),
                epi_tile_size=32,
                is_pingpong=False,
                initial_d_epi_stage=4,
                raster_order=RasterOrderOption.AlongN,
            )
        elif I >= 256:
            down_config = HopperGEMMConfig(
                tile_shape_mnk=(128, 192, 64),
                cluster_shape_mnk=(2, 1),
                # Prefer a 96-wide epilogue tile when H divides evenly by 96.
                epi_tile_size=(96 if H % 96 == 0 else 64),
                is_pingpong=True,
                initial_d_epi_stage=5,
                raster_order=RasterOrderOption.AlongN,
            )
        elif I >= 64:
            down_config = HopperGEMMConfig(
                tile_shape_mnk=(128, 192, 64),
                cluster_shape_mnk=(1, 2),
                epi_tile_size=64,
                is_pingpong=True,
                initial_d_epi_stage=8,
                raster_order=RasterOrderOption.AlongN,
            )
        else:
            raise NotImplementedError()

        self.module = HopperWgmma_MoE_kernel(
            E,
            cutlass.Float32,
            down_config.tile_shape_mnk,
            (*down_config.cluster_shape_mnk, 1),
            pingpong=down_config.is_pingpong,
            is_persistent=True,
            compute_swiglu=False,
            is_A_gather=False,
            epi_tile_size=down_config.epi_tile_size,
            initial_d_epi_stage=down_config.initial_d_epi_stage,
        )
        # Occupancy bound for the persistent grid, in clusters.
        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
            down_config.cluster_shape_mnk[0] * down_config.cluster_shape_mnk[1]
        )

    @cute.jit
    def __call__(self, mY1, mW2, mY2, mB2, mE_offset, mX_gather, mD_tensormap, mE_permute_order, stream):
        # we are not really using mX_gather in the Grouped GEMM,
        # but CuTe-DSL compiler disallows dynamic flow so we still need to pass this argument
        return self.module(
            mY1,
            mW2,
            None,
            mB2,
            mY2,
            None,
            None,
            None,
            mE_offset,
            mX_gather,
            None,
            None,
            None,
            None,
            None,
            mD_tensormap,
            None,
            None,
            mE_permute_order,
            const_expr(self.max_active_clusters),
            stream,
        )
+
235
+
236
class HopperWgmma_MoE_Down_proj_ActGrad_Bwd:
    """Backward grouped GEMM through the down projection with fused activation gradient.

    Computes dZ (gradient w.r.t. the up-projection output), a partial dS
    (routing-score gradient) and Y1S in a single fused kernel
    (compute_dz_and_partial_ds_and_y1s=True).
    """

    def __init__(self, E: int, H: int, I: int, activation_type: ActivationType):
        super().__init__()
        is_glu_activation = is_glu(activation_type)
        # GLU variants accept I % 64 == 0; non-GLU requires I % 128 == 0.
        if is_glu_activation:
            assert (
                H % 64 == 0 and H >= 512 and I % 64 == 0
            ), f"{LIBRARY_NAME} only supports GLU MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
        else:
            assert (
                H % 64 == 0 and H >= 512 and I % 128 == 0
            ), f"{LIBRARY_NAME} only supports non-GLU MoE with H % 64 == 0 (H >= 512) and I % 128 == 0"

        # heavy register pressure due to pingpong + heavy epilogue
        # effectively no alternatives to this config
        dz_partial_ds_config = HopperGEMMConfig(
            tile_shape_mnk=(128, 128, 64),
            cluster_shape_mnk=(2, 1),
            epi_tile_size=32,
            initial_d_epi_stage=4,
            is_pingpong=True,
            raster_order=RasterOrderOption.Heuristic,
        )

        # Exactly one of the flags below is set; each is forwarded to the kernel
        # as a compile-time switch selecting which activation gradient to fuse.
        compute_swiglu = False
        compute_geglu = False
        compute_reglu = False

        compute_relu_sq = False
        compute_silu = False
        compute_relu = False
        compute_gelu = False

        if activation_type == ActivationType.SWIGLU:
            compute_swiglu = True
        elif activation_type == ActivationType.GEGLU:
            compute_geglu = True
        elif activation_type == ActivationType.REGLU:
            compute_reglu = True

        elif activation_type == ActivationType.RELU_SQ:
            compute_relu_sq = True
        elif activation_type == ActivationType.RELU:
            compute_relu = True
        elif activation_type == ActivationType.SILU:
            compute_silu = True
        elif activation_type == ActivationType.GELU:
            compute_gelu = True

        else:
            raise NotImplementedError(f"Activation function {activation_type} not supported yet!")

        self.module = HopperWgmma_MoE_kernel(
            E,
            cutlass.Float32,
            dz_partial_ds_config.tile_shape_mnk,
            (*dz_partial_ds_config.cluster_shape_mnk, 1),
            pingpong=dz_partial_ds_config.is_pingpong,
            is_persistent=True,
            compute_swiglu=compute_swiglu,
            compute_reglu=compute_reglu,
            compute_geglu=compute_geglu,
            compute_relu_sq=compute_relu_sq,
            compute_relu=compute_relu,
            compute_silu=compute_silu,
            compute_gelu=compute_gelu,
            compute_dz_and_partial_ds_and_y1s=True,
            is_A_gather=True,
            epi_tile_size=dz_partial_ds_config.epi_tile_size,
            initial_d_epi_stage=dz_partial_ds_config.initial_d_epi_stage,
        )
        # Occupancy bound for the persistent grid, in clusters.
        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
            dz_partial_ds_config.cluster_shape_mnk[0] * dz_partial_ds_config.cluster_shape_mnk[1]
        )

    @cute.jit
    def __call__(
        self,
        mDout,
        mW2_trans,
        mZ_FP32_if_GLU_else_BF16,
        mDz_FP32_if_GLU_else_BF16,
        mY1S,
        mS,
        mDS_partial,
        mE_offset,
        mX_gather,
        mS_scatter,
        tensormaps,
        mE_permute_order,
        stream,
    ):
        """Launch the fused dZ / partial-dS / Y1S backward kernel.

        Unused slots of the shared kernel signature are passed as None so the
        traced call site stays static.
        """
        return self.module(
            mDout,
            mW2_trans,
            mZ_FP32_if_GLU_else_BF16,
            None,
            mDz_FP32_if_GLU_else_BF16,
            mY1S,
            mS,
            mDS_partial,
            mE_offset,
            mX_gather,
            None,
            mS_scatter,
            None,
            None,
            tensormaps[0],
            tensormaps[1],
            tensormaps[2],
            None,
            mE_permute_order,
            const_expr(self.max_active_clusters),
            stream,
        )
+
352
+
353
class HopperWgmma_MoE_Down_proj_WeightGrad_Bwd:
    """Weight-gradient grouped GEMM for the down projection (dW2 = Dout^T @ Y1S).

    Uses compute_weight_gradient=True so the kernel accumulates per-expert
    weight gradients; token rows are gathered (is_A_gather=True).
    """

    def __init__(self, E: int, H: int, I: int):
        super().__init__()
        assert (
            H % 64 == 0 and H >= 512 and I % 64 == 0
        ), f"{LIBRARY_NAME} only supports MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"

        # Hand-tuned configs keyed on the intermediate dim I.
        if I >= 128:
            dw2_config = HopperGEMMConfig(
                tile_shape_mnk=(128, 256, 64),
                cluster_shape_mnk=(2, 1),
                epi_tile_size=16,
                is_pingpong=False,
                initial_d_epi_stage=6,
                raster_order=RasterOrderOption.AlongN,
            )
        elif I == 64:
            dw2_config = HopperGEMMConfig(
                tile_shape_mnk=(64, 192, 64),
                cluster_shape_mnk=(2, 1),
                epi_tile_size=32,
                is_pingpong=True,
                initial_d_epi_stage=6,
                raster_order=RasterOrderOption.AlongN,
            )
        else:
            raise NotImplementedError()

        self.module = HopperWgmma_MoE_kernel(
            E,
            cutlass.Float32,
            dw2_config.tile_shape_mnk,
            (*dw2_config.cluster_shape_mnk, 1),
            pingpong=dw2_config.is_pingpong,
            is_persistent=True,
            compute_swiglu=False,
            compute_weight_gradient=True,
            compute_dz_and_partial_ds_and_y1s=False,
            is_A_gather=True,
            epi_tile_size=dw2_config.epi_tile_size,
            initial_d_epi_stage=dw2_config.initial_d_epi_stage,
        )
        # Occupancy bound for the persistent grid, in clusters.
        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
            dw2_config.cluster_shape_mnk[0] * dw2_config.cluster_shape_mnk[1]
        )

    @cute.jit
    def __call__(self, mDout_trans, mY1S_trans, mDw2, mE_offset, mX_gather, tensormaps, mE_permute_order, stream):
        """Launch the dW2 weight-gradient kernel.

        Unused slots of the shared kernel signature are passed as None so the
        traced call site stays static.
        """
        return self.module(
            mDout_trans,
            mY1S_trans,
            None,
            None,
            mDw2,
            None,
            None,
            None,
            mE_offset,
            mX_gather,
            None,
            None,
            None,
            tensormaps[0],
            None,
            None,
            None,
            None,
            mE_permute_order,
            const_expr(self.max_active_clusters),
            stream,
        )
+
425
+
426
class HopperWgmma_MoE_Up_proj_ActGrad_Bwd:
    """Backward grouped GEMM through the up projection (dX_expanded = dZ @ W1^T).

    Rows are scattered back to token order via mS_scatter; no activation is
    fused here (the activation gradient is handled in the down-proj backward).
    """

    def __init__(self, E: int, H: int, I: int, is_glu_activation: bool):
        super().__init__()
        # GLU variants accept I % 64 == 0; non-GLU requires I % 128 == 0.
        if is_glu_activation:
            assert (
                H % 64 == 0 and H >= 512 and I % 64 == 0
            ), f"{LIBRARY_NAME} only supports GLU MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
        else:
            assert (
                H % 64 == 0 and H >= 512 and I % 128 == 0
            ), f"{LIBRARY_NAME} only supports non-GLU MoE with H % 64 == 0 (H >= 512) and I % 128 == 0"

        if (I >= 512 and is_glu_activation) or (I >= 1024 and not is_glu_activation):
            dx_config = HopperGEMMConfig(
                tile_shape_mnk=(128, 256, 64),
                cluster_shape_mnk=(2, 1),
                epi_tile_size=32,
                is_pingpong=False,
                initial_d_epi_stage=4,
                raster_order=RasterOrderOption.AlongN,
            )
        elif (I >= 64 and is_glu_activation) or (I >= 128 and not is_glu_activation):
            dx_config = HopperGEMMConfig(
                tile_shape_mnk=(128, 192, 64),
                cluster_shape_mnk=(2, 1),
                epi_tile_size=64,
                is_pingpong=True,
                initial_d_epi_stage=8,
                raster_order=RasterOrderOption.AlongN,
            )
        else:
            raise NotImplementedError()

        # NOTE(review): dx_config.initial_d_epi_stage is set above but not
        # forwarded to the kernel here, unlike the other projection classes —
        # confirm whether the kernel default is intended for this path.
        self.module = HopperWgmma_MoE_kernel(
            E,
            cutlass.Float32,
            dx_config.tile_shape_mnk,
            (*dx_config.cluster_shape_mnk, 1),
            pingpong=dx_config.is_pingpong,
            is_persistent=True,
            compute_swiglu=False,
            compute_dz_and_partial_ds_and_y1s=False,
            is_A_gather=False,
            epi_tile_size=dx_config.epi_tile_size,
        )

        # Occupancy bound for the persistent grid, in clusters.
        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
            dx_config.cluster_shape_mnk[0] * dx_config.cluster_shape_mnk[1]
        )
        self.current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

    @cute.jit
    def __call__(
        self, mDz, mW1_trans, mDx_expanded, mE_offset, mX_gather, mS_scatter, tensormaps, mE_permute_order, stream
    ):
        """Launch the dX backward kernel.

        Unused slots of the shared kernel signature are passed as None so the
        traced call site stays static.
        """
        return self.module(
            mDz,
            mW1_trans,
            None,
            None,
            mDx_expanded,
            None,
            None,
            None,
            mE_offset,
            mX_gather,
            None,
            mS_scatter,
            None,
            None,
            None,
            tensormaps[0],
            tensormaps[1],
            None,
            mE_permute_order,
            const_expr(self.max_active_clusters),
            stream,
        )
+
505
+
506
class HopperWgmma_MoE_Up_proj_WeightGrad_Bwd:
    """Weight-gradient grouped GEMM for the up projection (dW1^T = X^T @ dZ).

    Uses compute_weight_gradient=True with gathered token rows
    (is_A_gather=True) to accumulate per-expert W1 gradients.
    """

    def __init__(self, E: int, H: int, I: int, is_glu_activation: bool):
        super().__init__()
        # GLU variants accept I % 64 == 0; non-GLU requires I % 128 == 0.
        if is_glu_activation:
            assert (
                H % 64 == 0 and H >= 512 and I % 64 == 0
            ), f"{LIBRARY_NAME} only supports GLU MoE with H % 64 == 0 (H >= 512) and I % 64 == 0"
        else:
            assert (
                H % 64 == 0 and H >= 512 and I % 128 == 0
            ), f"{LIBRARY_NAME} only supports non-GLU MoE with H % 64 == 0 (H >= 512) and I % 128 == 0"

        if (I >= 128 and is_glu_activation) or (I >= 256 and not is_glu_activation):
            dw1_config = HopperGEMMConfig(
                tile_shape_mnk=(128, 256, 64),
                cluster_shape_mnk=(2, 1),
                epi_tile_size=16,
                is_pingpong=False,
                initial_d_epi_stage=6,
                raster_order=RasterOrderOption.Heuristic,
            )
        elif (I == 64 and is_glu_activation) or (I == 128 and not is_glu_activation):
            dw1_config = HopperGEMMConfig(
                tile_shape_mnk=(256, 128, 64),
                cluster_shape_mnk=(2, 1),
                epi_tile_size=16,
                is_pingpong=False,
                initial_d_epi_stage=6,
                raster_order=RasterOrderOption.AlongN,
            )
        else:
            raise NotImplementedError()

        # NOTE(review): dw1_config.initial_d_epi_stage is set above but not
        # forwarded to the kernel here, unlike the down-proj weight-grad class —
        # confirm whether the kernel default is intended for this path.
        self.module = HopperWgmma_MoE_kernel(
            E,
            cutlass.Float32,
            dw1_config.tile_shape_mnk,
            (*dw1_config.cluster_shape_mnk, 1),
            pingpong=dw1_config.is_pingpong,
            is_persistent=True,
            compute_swiglu=False,
            compute_weight_gradient=True,
            compute_dz_and_partial_ds_and_y1s=False,
            is_A_gather=True,
            epi_tile_size=dw1_config.epi_tile_size,
        )

        # Occupancy bound for the persistent grid, in clusters.
        self.max_active_clusters = cutlass.utils.HardwareInfo().get_max_active_clusters(
            dw1_config.cluster_shape_mnk[0] * dw1_config.cluster_shape_mnk[1]
        )

    @cute.jit
    def __call__(self, mX_trans, mDz_trans, mDw1_trans, mE_offset, mX_gather, tensormaps, mE_permute_order, stream):
        """Launch the dW1 weight-gradient kernel.

        Unused slots of the shared kernel signature are passed as None so the
        traced call site stays static.
        """
        return self.module(
            mX_trans,
            mDz_trans,
            None,
            None,
            mDw1_trans,
            None,
            None,
            None,
            mE_offset,
            mX_gather,
            None,
            None,
            None,
            tensormaps[0],
            None,
            None,
            None,
            None,
            mE_permute_order,
            const_expr(self.max_active_clusters),
            stream,
        )
build/torch-cuda/functional/reduction_over_k_gather.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+ import triton
9
+ import triton.language as tl
10
+
11
+ from ..utils import get_powers_of_2
12
+
13
+
14
+ ### This triton impl is equivalent as the cute-dsl impl shown above,
15
+ # and also achieves similar memory bandwidth on H100 for large K and H.
16
+ # However, for small K and H, this impl is better by autotuning so we use it as the default.
17
def _get_triton_autotune_configs() -> list[triton.Config]:
    """Enumerate candidate (BLOCK_H, BLOCK_K, num_warps) tilings for autotuning.

    Tiles larger than 32K elements are excluded to bound register/shared-memory
    pressure. Configurations are yielded with num_warps varying fastest.
    """
    return [
        triton.Config({"BLOCK_H": block_h, "BLOCK_K": block_k}, num_warps=warps, num_stages=4)
        for block_h in get_powers_of_2(256, 4096)
        for block_k in get_powers_of_2(1, 128)
        for warps in (4, 8)
        if block_k * block_h <= 32768
    ]
27
+
28
+
29
def _prune_triton_autotune_config(configs, nargs, **kw):
    """Drop autotune configs whose tile is useless for the actual problem size.

    A config survives when its BLOCK_H/BLOCK_K do not exceed the padded problem
    dims and the tile is not too small (at least min(H * MAX_K, 1024) elements).
    If pruning would remove everything, the full config list is returned so the
    autotuner always has candidates.
    """
    h_cap = triton.next_power_of_2(kw["H"])
    k_cap = triton.next_power_of_2(kw["MAX_K"])
    min_tile_elems = min(kw["H"] * kw["MAX_K"], 1024)

    survivors = [
        cfg
        for cfg in configs
        if cfg.kwargs["BLOCK_H"] <= h_cap
        and cfg.kwargs["BLOCK_K"] <= k_cap
        and min_tile_elems <= cfg.kwargs["BLOCK_H"] * cfg.kwargs["BLOCK_K"]
    ]
    return survivors if survivors else configs
47
+
48
+
49
# Autotuned over tile shapes; the key includes the structural flags so each
# (H, MAX_K, weighted-or-not, varlen-or-not) combination tunes independently.
@triton.autotune(
    configs=_get_triton_autotune_configs(),
    key=["H", "MAX_K", "w_is_None", "is_varlen_K"],
    prune_configs_by={"early_config_prune": _prune_triton_autotune_config},
)
@triton.jit
def token_gather_sum_kernel(
    x_ptr,  # (Mtotal, H)
    w_ptr,  # (Mtotal,)
    M_perm_ptr,  # (Mtotal,) int32
    M_offset_ptr,  # (T+1,) int32
    out_ptr,  # (T, H)
    T,
    H: tl.constexpr,
    MAX_K: tl.constexpr,  # max experts per token (exact K when not varlen)
    K: tl.constexpr,  # actual number of experts per token
    # strides
    stride_xM: tl.constexpr,
    stride_xH: tl.constexpr,
    stride_outT: tl.constexpr,
    stride_outH: tl.constexpr,
    # tile sizes
    BLOCK_H: tl.constexpr,
    BLOCK_K: tl.constexpr,
    w_is_None: tl.constexpr,  # True -> unweighted sum (skip w loads)
    is_varlen_K: tl.constexpr,  # True -> per-token K from M_offset
):
    # One program per output token: out[t, :] = sum over the token's routed
    # rows of x[M_perm[row], :] (optionally weighted by w[row]).
    pid_t = tl.program_id(axis=0)
    t_idx = pid_t.to(tl.uint32)

    # Segment [Ms, Me) of M_perm/w belonging to this token.
    if is_varlen_K:
        Ms = tl.load(M_offset_ptr + t_idx).to(tl.uint32)
        Me = tl.load(M_offset_ptr + t_idx + 1).to(tl.uint32)
        K_this_token = Me - Ms  # actual K for this token
    else:
        # Fixed K: segments are uniform, no offset table read needed.
        Ms = MAX_K * t_idx
        K_this_token: tl.constexpr = MAX_K

    # Outer loop over H tiles
    for h_tile in tl.static_range(triton.cdiv(H, BLOCK_H)):
        h_idx = (h_tile * BLOCK_H + tl.arange(0, BLOCK_H)).to(tl.uint32)  # [BLOCK_H]
        m_h = h_idx < H

        # Initialize accumulator for this H tile (fp32 regardless of x dtype)
        acc = tl.zeros([BLOCK_H], dtype=tl.float32)  # [BLOCK_H]

        # Inner loop over K tiles
        for k_tile in tl.range(tl.cdiv(K_this_token, BLOCK_K)):
            k_offset = k_tile * BLOCK_K

            k_idx = (k_offset + tl.arange(0, BLOCK_K)).to(tl.uint32)  # [BLOCK_K]

            # Mask for valid K indices
            m_k = k_idx < K_this_token  # [BLOCK_K]

            # Absolute positions into M_perm and w
            m_abs = Ms + k_idx  # [BLOCK_K]

            # Gather permuted indices
            perm_idx = tl.load(M_perm_ptr + m_abs, mask=m_k, other=0).to(tl.uint32)  # [BLOCK_K]

            # Load x values: [BLOCK_K, BLOCK_H]
            x_ptrs = x_ptr + perm_idx[:, None] * stride_xM + h_idx[None, :] * stride_xH
            x_mask = m_k[:, None] & m_h[None, :]
            x_vals = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)

            # Reduce along K dimension and add to accumulator
            if w_is_None:
                acc += tl.sum(x_vals, axis=0)  # [BLOCK_H]
            else:
                # other=0.0 makes masked lanes contribute nothing to the sum.
                w_vals = tl.load(w_ptr + m_abs, mask=m_k, other=0.0).to(tl.float32)  # [BLOCK_K]
                acc += tl.sum(x_vals * w_vals[:, None], axis=0)  # [BLOCK_H]

        # Store final result for this H tile (only once!)
        out_ptrs = out_ptr + t_idx * stride_outT + h_idx * stride_outH
        tl.store(out_ptrs, acc, mask=m_h)
126
+
127
+
128
def token_gather_and_sum_varlen_K_triton(
    x: torch.Tensor,  # (Mtotal, H)
    w: Optional[torch.Tensor],  # (Mtotal,)
    out: torch.Tensor,  # (T, H)
    M_perm: torch.Tensor,  # (Mtotal,) int32
    M_offset: torch.Tensor,  # (T+1,) int32, variable K per token
    T: int,
    MAX_K: int,  # maximum K across all tokens
    H: int,
    is_varlen_K: bool,
):
    """
    1D parallelization over T, with iterative accumulation over K tiles and H tiles.
    Supports variable K per token.

    out[i, :] = sum_{j=0..K[i]-1} x[M_perm[M_offset[i] + j], :] * w[M_offset[i] + j]

    where K[i] = M_offset[i+1] - M_offset[i] can vary per token.

    When w is None the sum is unweighted; when is_varlen_K is False the kernel
    assumes a fixed K == MAX_K per token and ignores M_offset. BLOCK_H/BLOCK_K
    are chosen by the autotuner on the kernel.
    """

    # 1D grid over T only
    token_gather_sum_kernel[(T,)](
        x,
        w,
        M_perm,
        M_offset,
        out,
        T=T,
        H=H,
        MAX_K=MAX_K,
        stride_xM=x.stride(0),
        stride_xH=x.stride(1),
        stride_outT=out.stride(0),
        stride_outH=out.stride(1),
        w_is_None=(w is None),
        is_varlen_K=is_varlen_K,
    )
build/torch-cuda/functional/tile_scheduler.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ from __future__ import annotations
6
+
7
+ import cutlass
8
+ import cutlass.cute as cute
9
+ from cutlass import Boolean, Int32, const_expr
10
+ from ..quack.pipeline import PipelineStateWAdvance
11
+ from ..quack.tile_scheduler import TileScheduler, VarlenMTileScheduler
12
+
13
+
14
class SonicMoETileScheduler(TileScheduler):
    """TileScheduler specialization for SonicMoE grouped GEMMs.

    Differs from the quack base scheduler in how the initial linear work index
    is derived (linearized cluster id for non-persistent launches, blockIdx.z
    for persistent ones) and in supporting non-destructive work prefetch for
    persistent kernels.
    """

    @staticmethod
    @cute.jit
    def create(
        params: TileScheduler.Params,
        tile_count: cute.Tensor | None = None,
        scheduler_pipeline: cutlass.pipeline.PipelineAsync | None = None,
        is_scheduler_warp: bool | Boolean = False,
        *,
        loc=None,
        ip=None,
    ) -> SonicMoETileScheduler:
        """is_scheduler_warp should only be true for one warp in the whole cluster"""
        stages = 0
        if const_expr(not params.is_persistent):
            # Non-persistent launch: each cluster handles one work item,
            # identified by its linearized cluster id.
            cidx, cidy, _ = cute.arch.cluster_idx()
            cdimx, _, _ = cute.arch.cluster_dim()
            cluster_id = cidx + cidy * cdimx
            current_work_linear_idx = Int32(cluster_id)
        else:
            # Persistent launch: work items are striped across blockIdx.z.
            _, _, bidz = cute.arch.block_idx()
            current_work_linear_idx = Int32(bidz)
        if const_expr(params.tile_count_semaphore is not None):
            # Dynamic scheduling: tile_count buffer + pipeline must be supplied.
            assert tile_count is not None
            assert scheduler_pipeline is not None
            stages = const_expr(cute.size(tile_count))
        return SonicMoETileScheduler(
            current_work_linear_idx,
            Int32(0),  # num_tiles_executed
            tile_count,
            scheduler_pipeline,
            # Only the designated scheduler warp advances the pipeline state.
            PipelineStateWAdvance(stages, Int32(0), Int32(0), Int32(1 if is_scheduler_warp else 0)),
            params,
            loc=loc,
            ip=ip,
        )

    def prefetch_next_work(self, *, advance_count: int = 1, loc=None, ip=None):
        """Peek at the work item `advance_count` steps ahead without consuming it.

        The linear index is advanced temporarily, the corresponding tile
        coordinate is computed, then the index is restored.
        """
        old_current_work_linear_idx = self._current_work_linear_idx
        if const_expr(self.params.is_persistent):
            # Persistent kernels stride through work by the number of resident
            # clusters (gridDim.z).
            num_persistent_clusters = cute.arch.grid_dim()[2]
            self._current_work_linear_idx += advance_count * Int32(num_persistent_clusters)
        # NOTE(review): in the non-persistent case the index is not advanced,
        # so this returns the *current* work item — presumably prefetch is only
        # meaningful for persistent launches; confirm with callers.
        future_tile_coord_mnkl = self.get_current_work()
        self._current_work_linear_idx = old_current_work_linear_idx
        return future_tile_coord_mnkl
60
+
61
class SonicMoEVarlenMTileScheduler(VarlenMTileScheduler, SonicMoETileScheduler):
    """Variable-M tile scheduler for SonicMoE (per-expert row counts vary).

    Combines VarlenMTileScheduler's batch-aware work mapping with the SonicMoE
    pipeline-state handling; always derives the initial index from blockIdx.z
    (persistent-style launch).
    """

    @staticmethod
    @cute.jit
    def create(
        params: VarlenMTileScheduler.Params,
        tile_count: cute.Tensor | None = None,
        scheduler_pipeline: cutlass.pipeline.PipelineAsync | None = None,
        is_scheduler_warp: bool | Boolean = False,
        *,
        loc=None,
        ip=None,
    ) -> SonicMoEVarlenMTileScheduler:
        stages = 0
        _, _, bidz = cute.arch.block_idx()
        current_work_linear_idx = Int32(bidz)
        if const_expr(params.tile_count_semaphore is not None):
            # Dynamic scheduling: tile_count buffer + pipeline must be supplied.
            assert tile_count is not None
            assert scheduler_pipeline is not None
            stages = const_expr(cute.size(tile_count))
        return SonicMoEVarlenMTileScheduler(
            current_work_linear_idx,
            Int32(0),  # num_tiles_executed
            Int32(0),  # current_batch_idx
            Int32(0),  # num_work_idx_before_cur_batch
            tile_count,
            scheduler_pipeline,
            # Only the designated scheduler warp advances the pipeline state.
            PipelineStateWAdvance(stages, Int32(0), Int32(0), Int32(1 if is_scheduler_warp else 0)),
            params,
            loc=loc,
            ip=ip,
        )
build/torch-cuda/functional/topk_softmax.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ # this impl is adapted from QuACK's topk https://github.com/Dao-AILab/quack/blob/main/quack/topk.py
6
+ import math
7
+ from typing import Type
8
+
9
+ import cuda.bindings.driver as cuda
10
+ import cutlass
11
+ import cutlass.cute as cute
12
+ from ..quack import utils
13
+ from cutlass import const_expr
14
+ from ..quack.sort.bitonic_sort import bitonic_topk
15
+ from triton import next_power_of_2
16
+
17
+ from ..utils import domain_offset_i64
18
+
19
+
20
class TopK_Softmax:
    """Fused row-wise top-k (+ optional softmax over the k values) CuTe-DSL kernel.

    For each row of an (M, N) input, selects the k largest values and their
    column indices via a bitonic top-k, optionally applies softmax over the
    selected values, and writes (values, indices) out. Indices are encoded
    into the low bits of the fp32 values so the sort carries them along;
    ties are broken toward the earlier column.
    """

    def __init__(
        self,
        input_dtype: Type[cutlass.Numeric],
        output_dtype: Type[cutlass.Numeric],
        N: int,
        k: int,
        require_softmax_fusion: bool = True,
    ):
        """Configure the kernel for a fixed row width N and selection size k.

        Args:
            input_dtype: element type of the input matrix.
            output_dtype: element type of the output values.
            N: row width (number of candidates per row).
            k: number of top elements to select per row.
            require_softmax_fusion: if True, softmax is applied to the k values.
        """
        self.input_dtype = input_dtype
        self.output_dtype = output_dtype
        self.N = N
        # 128-bit vectorized copies: elements per copy depends on dtype width.
        self.input_vecsize = 128 // input_dtype.width
        self.output_vecsize = 128 // output_dtype.width
        self.k = k
        self.next_power_of_2_N = next_power_of_2(N)
        self.next_power_of_2_K = next_power_of_2(k)
        assert k <= 128 and k <= N
        assert N <= 4096 and N % 8 == 0
        assert input_dtype.width <= output_dtype.width, "input bitwidth must <= output bitwidth"

        self.require_softmax_fusion = require_softmax_fusion

    def _calculate_threads_per_row(self):
        """Pick how many threads cooperate on one row."""
        # we want num_elems_per_thread >= self.k
        # and each thread can handle at most 64 elements
        N = self.next_power_of_2_N
        num_threads_per_row = max(min(N // self.k, 32, N // 64), 1)
        return num_threads_per_row

    def _get_tv_layout(self, vecsize):
        """Build the (tile, thread-value layout) pair for a given vector width."""
        N = self.next_power_of_2_N
        num_threads = 128 if N <= 16384 else 256
        threads_per_row = self._calculate_threads_per_row()
        cols_per_block = num_threads // threads_per_row
        num_blocks_N = cute.ceil_div(min(N, 16384) // vecsize, threads_per_row)
        tiler_mn = (cols_per_block, vecsize * num_blocks_N * threads_per_row)
        tv_layout = cute.make_layout(
            ((threads_per_row, cols_per_block), (vecsize, num_blocks_N)),
            stride=(
                (vecsize * cols_per_block, 1),
                (cols_per_block, cols_per_block * vecsize * threads_per_row),
            ),
        )
        return tiler_mn, tv_layout

    @cute.jit
    def __call__(
        self,
        mX: cute.Tensor,
        mValues: cute.Tensor,
        mIndices: cute.Tensor,
        stream: cuda.CUstream,
    ):
        """Launch the kernel: mValues/mIndices[i, :] = top-k of mX[i, :]."""
        assert mX.element_type == self.input_dtype
        assert mValues.element_type == self.output_dtype
        assert mIndices.element_type == cutlass.Int32
        input_tiler_mn, input_tv_layout = self._get_tv_layout(self.input_vecsize)
        output_tiler_mn, output_tv_layout = self._get_tv_layout(self.output_vecsize)

        num_threads = cute.size(input_tv_layout, mode=[0])
        self.kernel(mX, mValues, mIndices, input_tv_layout, input_tiler_mn, output_tv_layout, output_tiler_mn).launch(
            grid=[cute.ceil_div(mX.shape[0], input_tiler_mn[0]), 1, 1],
            block=[num_threads, 1, 1],
            stream=stream,
        )

    @cute.kernel
    def kernel(
        self,
        mX: cute.Tensor,
        mValues: cute.Tensor,
        mIndices: cute.Tensor,
        input_tv_layout: cute.Layout,
        input_tiler_mn: cute.Shape,
        output_tv_layout: cute.Layout,
        output_tiler_mn: cute.Shape,
    ):
        tidx, _, _ = cute.arch.thread_idx()
        bidx, _, _ = cute.arch.block_idx()

        shape = mX.shape
        idX = cute.make_identity_tensor(shape)
        # slice for CTAs
        # We use domain_offset_i64 to deal with tensors larger than 2^31 elements
        mX = domain_offset_i64((bidx * input_tiler_mn[0], 0), mX)
        gX = cute.local_tile(mX, input_tiler_mn, (0, 0))
        cX = cute.local_tile(idX, input_tiler_mn, (bidx, 0))

        # declare the atoms which will be used later for memory copy
        copy_atom_load_X = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gX.element_type, num_bits_per_copy=128)
        thr_copy_X = cute.make_tiled_copy(copy_atom_load_X, input_tv_layout, input_tiler_mn).get_slice(tidx)
        tXgX = thr_copy_X.partition_S(gX)
        tXcX = thr_copy_X.partition_S(cX)[(0, None), None, None]

        # allocate fragments for gmem->rmem
        tXrX = cute.make_rmem_tensor_like(tXgX)

        is_even_N = const_expr(shape[1] == input_tiler_mn[1])
        # Predicate loads along N when the row is not tile-aligned or padded.
        tXpX = (
            utils.predicate_k(thr_copy_X.partition_S(cX), limit=shape[1])
            if const_expr((not is_even_N) or (self.N != self.next_power_of_2_N))
            else None
        )
        if tXcX[0][0] < shape[0]:
            cute.copy(copy_atom_load_X, tXgX, tXrX, pred=tXpX)
        tXrX_f32 = cute.make_rmem_tensor(tXrX.shape, cutlass.Float32)
        tXrX_f32.store(tXrX.load().to(cutlass.Float32))

        # Encode the indices into the bottom bits of values.
        log_N = int(math.log2(self.next_power_of_2_N))
        idx_mask = const_expr((1 << log_N) - 1)
        input_vecsize = cutlass.const_expr(input_tv_layout.shape[1][0])
        tXrX_u32 = cute.recast_tensor(tXrX_f32, cutlass.Uint32)
        # Encode indices into the last log_N bits of tXrX_u32
        for i in cutlass.range(cute.size(tXrX_u32), unroll_full=True):
            # tXcX only keeps track of the indices for every @vecsize elements
            col_idx = cutlass.Uint32(tXcX[i // input_vecsize][1] + i % input_vecsize)
            # If positive, invert the bits of the index, so that if there's a tie,
            # indices coming from a earlier column will win.
            encoded_idx = ~col_idx if tXrX_f32[i] >= 0 else col_idx
            # Mask to keep only the last log_N bits of the encoded index
            encoded_idx = encoded_idx & idx_mask
            # Clear the last log_N bits and set them to our encoded index
            tXrX_u32[i] = (tXrX_u32[i] & ~idx_mask) | encoded_idx

        # Fill OOB values with -inf for top-k
        if const_expr((not is_even_N) or (self.N != self.next_power_of_2_N)):
            utils.fill_oob(tXrX_f32, tXpX, -tXrX_f32.element_type.inf)

        threads_per_row = input_tv_layout.shape[0][0]
        topk_vals = bitonic_topk(tXrX_f32, self.next_power_of_2_K, warp_width=threads_per_row)

        # Extract indices and clean values
        topk_vals_u32 = cute.recast_tensor(topk_vals, cutlass.Uint32)
        topk_indices = cute.make_rmem_tensor(self.k, cutlass.Int32)
        for i in cutlass.range_constexpr(self.k):
            # Extract the encoded index from the last log_N bits
            encoded_idx = topk_vals_u32[i] & idx_mask
            # Check if original value was positive by looking at the cleaned value
            topk_vals_u32[i] = topk_vals_u32[i] & ~idx_mask  # Clear last log_N bits
            # If positive, we need to invert the bits back to get original index
            col_idx = ~encoded_idx if topk_vals[i] >= 0 else encoded_idx
            topk_indices[i] = cutlass.Int32(col_idx & idx_mask)

        if const_expr(self.require_softmax_fusion):
            # Numerically stable softmax over the k selected values.
            topk_vals_max = -cutlass.Float32.inf
            for i in cutlass.range_constexpr(self.k):
                topk_vals_max = cute.arch.fmax(topk_vals[i], topk_vals_max)

            # FIX: the exp-sum accumulator must be Float32; it was previously
            # initialized as cutlass.Int32(0.0) even though it accumulates
            # cute.math.exp outputs in [0, 1].
            topk_exp_sum = cutlass.Float32(0.0)
            for i in cutlass.range_constexpr(self.k):
                topk_vals[i] = cute.math.exp(topk_vals[i] - topk_vals_max)
                topk_exp_sum = topk_exp_sum + topk_vals[i]

            for i in cutlass.range_constexpr(self.k):
                topk_vals[i] = topk_vals[i] / topk_exp_sum

        # Convert cleaned values to output type
        topk_vals_out = cute.make_rmem_tensor_like(topk_indices, mValues.element_type)
        for i in cutlass.range_constexpr(self.k):
            topk_vals_out[i] = topk_vals[i].to(mValues.element_type)

        row = tXcX[0][0]
        # Only the 1st thread in this row writes the top-k values and indices
        output_vecsize = cutlass.const_expr(output_tv_layout.shape[1][0])
        if row < shape[0] and tXcX[0][1] == 0:
            # Vectorized write
            elems_per_store = const_expr(math.gcd(output_vecsize, self.k))
            mValues_store = cute.tiled_divide(mValues[row, None], (elems_per_store,))
            mIndices_store = cute.tiled_divide(mIndices[row, None], (elems_per_store,))
            topk_vals_out_store = cute.tiled_divide(topk_vals_out, (elems_per_store,))
            topk_indices_store = cute.tiled_divide(topk_indices, (elems_per_store,))
            for i in cutlass.range_constexpr(cute.size(topk_vals_out_store.shape, [1])):
                cute.autovec_copy(topk_vals_out_store[None, i], mValues_store[None, i])
                cute.autovec_copy(topk_indices_store[None, i], mIndices_store[None, i])
build/torch-cuda/functional/triton_kernels/__init__.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
+ from ..._ops_compat import add_op_namespace_prefix
8
+ from .bitmatrix import _bitmatrix_metadata_compute_stage1, _bitmatrix_metadata_compute_stage2, _keyed_add
9
+
10
+
11
@triton.jit
def _compute_col_partial_sum_kernel(
    topk_indices_ptr,  # [T, K] int: expert id chosen for each (token, k-slot)
    partial_sum_ptr,  # [E, n_tiles] int32 output: per-expert, per-tile counts
    T,  # number of tokens
    E: tl.constexpr,  # number of experts
    n_tiles,  # number of token tiles (= launch grid size)
    TOKENS_PER_TILE: tl.constexpr,
    K_POW2: tl.constexpr,  # next_power_of_2(K)
    K: tl.constexpr,  # actual number of experts per token
    E_POW2: tl.constexpr,  # next_power_of_2(E)
):
    # One CTA per tile. Tile `t` covers tokens [t * TOKENS_PER_TILE, (t+1) * TOKENS_PER_TILE).
    # Produces partial_sum[e, tile_id] = number of entries in this tile routed to expert e.
    # Layout: partial_sum is [E, n_tiles] (row-major), so partial_sum[e, t] = partial_sum_ptr + e * n_tiles + t.
    # Caller transposes to [n_tiles, E] before passing to stage1/stage2.
    tile_id = tl.program_id(0)

    # Zero this tile's column in partial_sum[*, tile_id].
    # Chunked by E_POW2 to keep vector width a power of 2.
    for e_start in tl.static_range(0, E, E_POW2):
        e_offs = e_start + tl.arange(0, E_POW2)
        tl.store(
            partial_sum_ptr + e_offs * n_tiles + tile_id,
            tl.zeros([E_POW2], tl.int32),
            mask=e_offs < E,
        )

    # Load expert ids for this tile: shape [TOKENS_PER_TILE, K_POW2].
    # Tokens beyond T and k-slots beyond K are masked out (other=-1).
    tok_offs = tile_id * TOKENS_PER_TILE + tl.arange(0, TOKENS_PER_TILE)
    k_offs = tl.arange(0, K_POW2)
    tok_mask = tok_offs < T

    load_mask = tok_mask[:, None] & (k_offs[None, :] < K)
    safe_k = tl.minimum(k_offs, K - 1)  # avoid OOB when k_offs >= K
    expert_ids = tl.load(
        topk_indices_ptr + tok_offs[:, None] * K + safe_k[None, :],
        mask=load_mask,
        other=-1,
    )

    # Flatten to [TOKENS_PER_TILE * K_POW2] and histogram into partial_sum.
    # safe_experts remaps masked (-1) entries to expert 0 (harmless: flat_mask=False).
    flat_experts = tl.reshape(expert_ids, [TOKENS_PER_TILE * K_POW2])
    flat_mask = tl.reshape(load_mask, [TOKENS_PER_TILE * K_POW2])
    safe_experts = tl.where(flat_mask, flat_experts, 0)

    # Atomic because lanes within this CTA may hit the same expert bin;
    # cross-CTA conflicts cannot occur since each CTA owns column tile_id.
    tl.atomic_add(
        partial_sum_ptr + safe_experts * n_tiles + tile_id,
        tl.full([TOKENS_PER_TILE * K_POW2], 1, dtype=tl.int32),
        mask=flat_mask,
    )
64
+
65
+
66
@torch.library.custom_op(
    add_op_namespace_prefix("triton_kernels__TC_topk_router_metadata"),
    mutates_args={
        "expert_frequency",
        "expert_frequency_offset",
        "x_gather_idx",
        "s_scatter_idx",
        "s_reverse_scatter_idx",
    },
)
def TC_topk_router_metadata_triton(
    topk_router_indices: torch.Tensor,
    E: int,
    expert_frequency: torch.Tensor,
    expert_frequency_offset: torch.Tensor,
    x_gather_idx: torch.Tensor,
    s_scatter_idx: torch.Tensor,
    s_reverse_scatter_idx: torch.Tensor,
) -> None:
    """Build MoE routing metadata from dense [T, K] top-k expert indices (in place).

    Fills:
      - expert_frequency[e]: number of (token, k-slot) entries routed to expert e
      - expert_frequency_offset: exclusive prefix sums of expert_frequency,
        with the total T*K written at index E
      - s_reverse_scatter_idx[entry]: position of each entry in expert-sorted order
      - s_scatter_idx[pos]: inverse permutation of s_reverse_scatter_idx
      - x_gather_idx[pos]: source token index for each expert-sorted position
    """
    T, K = topk_router_indices.size()
    TK = T * K
    device = topk_router_indices.device
    E_POW2 = triton.next_power_of_2(E)
    K_POW2 = triton.next_power_of_2(K)
    TOKENS_PER_BLOCK = 1024 // K_POW2
    n_tiles = triton.cdiv(T, TOKENS_PER_BLOCK)

    # -- Kernel 1: tiled histogram --------------------------------------------
    # col_partial_sum_trans[E, n_tiles]: raw per-expert-per-tile counts.
    # Stored transposed so each CTA writes to its own column (tile_id), avoiding
    # cross-CTA write conflicts. Transposed back to [n_tiles, E] for stage1/stage2.
    col_partial_sum_trans = torch.empty(E, n_tiles, dtype=torch.int32, device=device)
    _compute_col_partial_sum_kernel[(n_tiles,)](
        topk_router_indices,
        col_partial_sum_trans,
        T,
        E,
        n_tiles,
        TOKENS_PER_TILE=TOKENS_PER_BLOCK,
        K_POW2=K_POW2,
        K=K,
        E_POW2=E_POW2,
    )

    expert_frequency.copy_(col_partial_sum_trans.sum(dim=1, dtype=torch.int32))
    col_partial_sum = col_partial_sum_trans.T  # [n_tiles, E]

    # -- Kernel 2: stage1 -----------------------------------------------------
    # - For each expert e (pid < E): convert col_partial_sum[*, e] from raw
    #   counts to exclusive prefix sums over tiles in-place.
    # - pid == E: write the exclusive cumsum of expert_frequency into
    #   expert_frequency_offset[0:E]; pid == E + 1 writes the total TK at index E.
    _bitmatrix_metadata_compute_stage1[(E + 2,)](
        expert_frequency,
        expert_frequency_offset,
        E,
        col_partial_sum,
        n_tiles,
        TK,
        BLOCK_M=128,
        BLOCK_N=E_POW2,
    )

    # -- Kernel 3: stage2 -----------------------------------------------------
    # For each tile: sort entries by expert, compute output positions, scatter.
    _bitmatrix_metadata_compute_stage2[(n_tiles,)](
        s_scatter_idx,
        s_reverse_scatter_idx,
        x_gather_idx,
        topk_router_indices,
        T,
        col_partial_sum,
        n_tiles,
        expert_frequency_offset[:E],
        K_POW2=K_POW2,
        TOKENS_PER_BLOCK=TOKENS_PER_BLOCK,
        K=K,
    )
145
+
146
+
147
+ # ── general_routing_router_metadata_triton --- Kernel 1: tiled histogram over flat selected_E ────────────────────────────
148
@triton.jit
def _general_compute_col_partial_sum_kernel(
    selected_E_ptr,  # [TK] int: expert id per routed entry (flat)
    partial_sum_ptr,  # [E, n_tiles], column-major per tile
    TK,  # total number of routed entries
    E: tl.constexpr,
    n_tiles,
    BLOCK_SIZE: tl.constexpr,  # flat entries covered per tile
    E_POW2: tl.constexpr,  # next_power_of_2(E)
):
    # One CTA per tile of BLOCK_SIZE flat entries. Produces
    # partial_sum[e, tile_id] = number of entries in this tile routed to expert e.
    tile_id = tl.program_id(0)

    # Zero this tile's column in partial_sum[*, tile_id].
    for e_start in tl.static_range(0, E, E_POW2):
        e_offs = e_start + tl.arange(0, E_POW2)
        tl.store(
            partial_sum_ptr + e_offs * n_tiles + tile_id,
            tl.zeros([E_POW2], tl.int32),
            mask=e_offs < E,
        )

    # Load expert ids for this tile (flat indexing into selected_E).
    offs = tile_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < TK
    expert_ids = tl.load(selected_E_ptr + offs, mask=mask, other=-1)

    # Masked (OOB) lanes are remapped to expert 0; harmless since mask=False.
    safe_experts = tl.where(mask, expert_ids, 0)
    tl.atomic_add(
        partial_sum_ptr + safe_experts * n_tiles + tile_id,
        tl.full([BLOCK_SIZE], 1, dtype=tl.int32),
        mask=mask,
    )
180
+
181
+
182
+ # ── general_routing_router_metadata_triton --- Kernel 3: sort entries by expert within each tile, scatter ────────────────
183
@triton.jit
def _general_metadata_compute_stage2(
    s_scatter_idx_ptr,  # [TK] output: inverse permutation (output pos -> entry)
    s_reverse_scatter_idx_ptr,  # [TK] output: entry -> output pos
    x_gather_idx_ptr,  # [TK] output: output pos -> source token index
    selected_E_ptr,  # [TK] int: expert id per entry
    sorted_selected_T_ptr,  # [TK] int: token index per entry
    TK,
    partial_sum_ptr,  # [n_tiles, E] with strides (1, n_tiles)
    n_tiles,
    expert_offs_ptr,  # [E] global start offset of each expert's output segment
    BLOCK_SIZE: tl.constexpr,
):
    # One CTA per tile of BLOCK_SIZE flat entries; sorts this tile's entries by
    # expert and scatters them to their global expert-sorted positions.
    # The 16/16 bit packing below requires BLOCK_SIZE (and expert ids) < 2**16.
    tl.static_assert(BLOCK_SIZE <= 32768)

    pid_m = tl.program_id(0)
    offs_local = tl.arange(0, BLOCK_SIZE)
    offs_global = pid_m * BLOCK_SIZE + offs_local
    mask = offs_global < TK

    # Load expert id for each entry in this tile.
    # OOB lanes load other=-1, which wraps to 0xFFFF... as uint32.
    expert = tl.load(selected_E_ptr + offs_global, mask=mask, other=-1).to(tl.uint32)

    # Pack (expert, local_offset) into uint32 and sort by expert.
    # Upper 16 bits = expert id, lower 16 bits = pre-sort local offset.
    kv_pairs = tl.sort(((expert << 16) | offs_local).to(tl.uint32), 0)
    expert = kv_pairs >> 16
    mask = expert != 0xFFFF  # drop OOB lanes, which sorted to the end

    # Segmented scan for within-expert rank.
    scan_input = (kv_pairs & 0xFFFF0000) | 0x00000001
    inclusive_run_lengths = tl.associative_scan(scan_input, 0, _keyed_add)
    within_expert_rank = (inclusive_run_lengths - 1) & 0xFFFF

    # Output position = expert_offs[e] + partial_sum[tile, e] + within_expert_rank.
    s_reverse_scatter_val = tl.load(partial_sum_ptr + pid_m + expert * n_tiles, mask=mask)
    s_reverse_scatter_val += tl.load(expert_offs_ptr + expert, mask=mask)
    s_reverse_scatter_val += within_expert_rank

    # Recover pre-sort entry index and look up the token index.
    presort_offs = kv_pairs & 0xFFFF
    entry_idx = pid_m * BLOCK_SIZE + presort_offs
    token_idx = tl.load(sorted_selected_T_ptr + entry_idx, mask=mask)

    tl.store(s_reverse_scatter_idx_ptr + entry_idx, s_reverse_scatter_val, mask=mask)
    tl.store(s_scatter_idx_ptr + s_reverse_scatter_val, entry_idx, mask=mask)
    tl.store(x_gather_idx_ptr + s_reverse_scatter_val, token_idx, mask=mask)
230
+
231
+
232
+ # ── general_routing_router_metadata_triton --- Kernel 4: parallel binary search for token offset ─────────────────────────
233
+ # Since sorted_selected_T is sorted ascending, num_activated_expert_per_token_offset[t]
234
+ # is exactly searchsorted_left(sorted_selected_T, t): the index of the first entry
235
+ # with token index >= t. We compute this via parallel binary search over T+1 queries,
236
+ # replacing the PyTorch bincount + cumsum path.
237
@triton.jit
def _token_offset_searchsorted_kernel(
    sorted_T_ptr,  # [TK] int32, sorted ascending
    offset_ptr,  # [T+1] int32, output
    T,  # number of tokens
    TK,  # length of sorted_T
    BLOCK_SIZE: tl.constexpr,
    N_ITERS: tl.constexpr,  # ceil(log2(TK + 1)), controls binary search depth
):
    # offset[t] = searchsorted_left(sorted_T, t): index of the first entry with
    # token index >= t. One lane per query t, O(log TK) loads each.
    pid = tl.program_id(0)
    t_offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = t_offs <= T  # T+1 total values: offset[0], ..., offset[T]

    t_vals = t_offs.to(tl.int32)

    # Binary search: find smallest i such that sorted_T[i] >= t_vals
    lo = tl.zeros([BLOCK_SIZE], dtype=tl.int32)
    hi = tl.full([BLOCK_SIZE], TK, dtype=tl.int32)

    # Fixed iteration count keeps all lanes in lockstep; the invariant
    # lo <= answer <= hi shrinks by half each step.
    for _ in tl.static_range(0, N_ITERS):
        mid = (lo + hi) >> 1
        # When mid >= TK, treat the value as +inf (>= any t), so hi = mid.
        safe_mid = tl.where(mid < TK, mid, 0)
        val = tl.load(sorted_T_ptr + safe_mid, mask=mask & (TK > 0), other=T)
        go_right = (val < t_vals) & (mid < TK)
        lo = tl.where(go_right, mid + 1, lo)
        hi = tl.where(go_right, hi, mid)

    tl.store(offset_ptr + t_offs, lo, mask=mask)
266
+
267
+
268
@torch.library.custom_op(
    add_op_namespace_prefix("triton_kernels__general_routing_router_metadata"),
    mutates_args={
        "expert_frequency",
        "expert_frequency_offset",
        "x_gather_idx",
        "s_scatter_idx",
        "s_reverse_scatter_idx",
        "num_activated_expert_per_token_offset",
    },
)
def general_routing_router_metadata_triton(
    sorted_selected_T: torch.Tensor,
    selected_E: torch.Tensor,
    T: int,
    E: int,
    expert_frequency: torch.Tensor,
    expert_frequency_offset: torch.Tensor,
    x_gather_idx: torch.Tensor,
    s_scatter_idx: torch.Tensor,
    s_reverse_scatter_idx: torch.Tensor,
    num_activated_expert_per_token_offset: torch.Tensor,
) -> None:
    """Build MoE routing metadata from flat (token, expert) routing pairs (in place).

    sorted_selected_T / selected_E are parallel [TK] arrays: entry i routes token
    sorted_selected_T[i] to expert selected_E[i]. In addition to the outputs of
    TC_topk_router_metadata_triton, this fills
    num_activated_expert_per_token_offset[0..T] with per-token entry offsets.
    """
    TK = selected_E.size(0)
    device = selected_E.device
    E_POW2 = triton.next_power_of_2(E)
    BLOCK_SIZE = 1024
    n_tiles = triton.cdiv(TK, BLOCK_SIZE)

    # -- Kernel 1: tiled histogram ----------------------------------------
    # Transposed [E, n_tiles] layout: each CTA owns one column (tile_id).
    col_partial_sum_trans = torch.empty(E, n_tiles, dtype=torch.int32, device=device)
    _general_compute_col_partial_sum_kernel[(n_tiles,)](
        selected_E,
        col_partial_sum_trans,
        TK,
        E,
        n_tiles,
        BLOCK_SIZE=BLOCK_SIZE,
        E_POW2=E_POW2,
    )

    expert_frequency.copy_(col_partial_sum_trans.sum(dim=1, dtype=torch.int32))
    col_partial_sum = col_partial_sum_trans.T  # [n_tiles, E], strides (1, n_tiles)

    # -- Kernel 2: stage1 -------------------------------------------------
    # Per-expert exclusive tile prefix sums + expert offsets (see bitmatrix.py).
    _bitmatrix_metadata_compute_stage1[(E + 2,)](
        expert_frequency,
        expert_frequency_offset,
        E,
        col_partial_sum,
        n_tiles,
        TK,
        BLOCK_M=128,
        BLOCK_N=E_POW2,
    )

    # -- Kernel 3: stage2 -------------------------------------------------
    # Sort entries by expert within each tile and scatter to global positions.
    _general_metadata_compute_stage2[(n_tiles,)](
        s_scatter_idx,
        s_reverse_scatter_idx,
        x_gather_idx,
        selected_E,
        sorted_selected_T,
        TK,
        col_partial_sum,
        n_tiles,
        expert_frequency_offset[:E],
        BLOCK_SIZE=BLOCK_SIZE,
    )

    # -- Kernel 4: num_activated_expert_per_token_offset via searchsorted --
    # sorted_selected_T is sorted ascending, so offset[t] = searchsorted_left(sorted_T, t).
    # Parallel binary search: each lane handles one token index, O(log TK) work.
    N_ITERS = max(1, math.ceil(math.log2(TK + 1)))
    TOKEN_BLOCK = 1024
    n_token_blocks = triton.cdiv(T + 1, TOKEN_BLOCK)
    _token_offset_searchsorted_kernel[(n_token_blocks,)](
        sorted_selected_T,
        num_activated_expert_per_token_offset,
        T,
        TK,
        BLOCK_SIZE=TOKEN_BLOCK,
        N_ITERS=N_ITERS,
    )
build/torch-cuda/functional/triton_kernels/bitmatrix.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import triton
2
+ import triton.language as tl
3
+
4
+
5
+ # https://github.com/triton-lang/triton/blob/434aecbe933af6a8d49595d4197bfc3df7618748/python/triton_kernels/triton_kernels/tensor_details/bitmatrix.py#L33
6
@triton.jit
def _keyed_add(x, y):
    # Segmented-add combiner for tl.associative_scan. Each uint32 operand packs
    # a 16-bit segment key (upper bits) and a 16-bit running count (lower bits).
    # we keep the key in the upper 16 bits of a uint32:
    key_mask: tl.constexpr = 0xFFFF0000

    kx = x & key_mask
    ky = y & key_mask
    # Same key: add counts (x + y - kx keeps a single copy of the shared key).
    # Key boundary: restart the running count from y.
    z = tl.where(kx == ky, x + y - kx, y)
    return z
15
+
16
+
17
+ # Adapted from https://github.com/triton-lang/triton/blob/434aecbe933af6a8d49595d4197bfc3df7618748/python/triton_kernels/triton_kernels/tensor_details/bitmatrix.py#L44
18
@triton.jit
def _bitmatrix_metadata_compute_stage1(
    expert_freq_ptr,  # [E] int32: total entries routed to each expert
    expert_freq_offs_ptr,  # [E + 1] int32 output: exclusive offsets + total
    E: tl.constexpr,
    partial_sum_ptr,  # [n_tiles, E] view with strides (1, n_tiles)
    n_tiles,
    TK,  # total number of routed entries
    BLOCK_M: tl.constexpr,  # chunk size for iterating over tiles per expert
    BLOCK_N: tl.constexpr,  # chunk size for iterating over experts in cumsum
):
    # Assumes grid size == E + 2: pids 0..E-1 handle one expert each,
    # pid == E builds the expert offsets, pid == E + 1 writes the total.

    pid = tl.program_id(0)
    if pid < E:
        # convert partial_sum[e, *] from raw counts to exclusive prefix
        # sums over tiles. After this kernel, partial_sum[e, t] =
        # number of entries for expert e in tiles 0..t-1.

        # This is read by stage2 to locate each entry's position within expert e's contiguous output segment.
        expert_partial_sum_ptr = partial_sum_ptr + pid * n_tiles
        curr_sum = 0
        for start in range(0, n_tiles, BLOCK_M):
            offs = start + tl.arange(0, BLOCK_M)
            tile_counts = tl.load(expert_partial_sum_ptr + offs, mask=offs < n_tiles, other=0)
            excl_cumsum = tl.cumsum(tile_counts, 0) - tile_counts + curr_sum
            curr_sum += tl.sum(tile_counts, 0)
            tl.store(expert_partial_sum_ptr + offs, excl_cumsum, mask=offs < n_tiles)
    elif pid == E:
        # Exclusive prefix sum of per-expert total counts -> expert_freq_offs[e].
        # expert_freq[e] = total entries routed to expert e.
        # expert_freq_offs[e] = sum of expert_freq[0..e-1] = global start of expert e.
        curr_sum = 0
        for start in tl.static_range(0, E, BLOCK_N):
            offs = start + tl.arange(0, BLOCK_N)
            expert_freq = tl.load(expert_freq_ptr + offs, mask=offs < E, other=0)
            excl_cumsum = tl.cumsum(expert_freq, 0) - expert_freq + curr_sum
            curr_sum += tl.sum(expert_freq, 0)
            tl.store(expert_freq_offs_ptr + offs, excl_cumsum, mask=offs < E)
    elif pid == E + 1:
        # expert_freq_offs[E] = TK (total number of entries)
        tl.store(expert_freq_offs_ptr + E, TK)
60
+
61
+
62
+ # Adapted from https://github.com/triton-lang/triton/blob/434aecbe933af6a8d49595d4197bfc3df7618748/python/triton_kernels/triton_kernels/tensor_details/bitmatrix.py#L44
63
@triton.jit
def _bitmatrix_metadata_compute_stage2(
    s_scatter_idx_ptr,  # [T*K] output: inverse permutation (output pos -> entry)
    s_reverse_scatter_idx_ptr,  # [T*K] output: entry -> output pos
    x_gather_idx_ptr,  # [T*K] output: output pos -> source token index
    topk_indices_ptr,  # [T, K] int: expert id per (token, k-slot)
    T,
    partial_sum_ptr,  # [n_tiles, E] exclusive per-tile prefix sums (after stage1)
    n_tiles,
    expert_offs_ptr,  # [E] global start offset of each expert's output segment
    K_POW2: tl.constexpr,  # padded K, == BLOCK_SIZE / TOKENS_PER_BLOCK
    K: tl.constexpr,  # actual experts per token
    TOKENS_PER_BLOCK: tl.constexpr,  # tokens per tile
):
    # One CTA per tile, same tiling as _compute_col_partial_sum_kernel.
    # For each entry (token t, k-slot k) in this tile:
    #   s_reverse_scatter_idx[entry_idx] = output position in expert-sorted order
    #   s_scatter_idx[output_pos] = entry_idx (inverse permutation)
    #   x_gather_idx[output_pos] = token index (= entry_idx // K)
    #
    # Output position = expert_offs[e] (global start of expert e)
    #   + partial_sum[tile, e] (entries for e in earlier tiles, after stage1)
    #   + within_expert_rank (position within this tile's group for e)
    BLOCK_SIZE: tl.constexpr = TOKENS_PER_BLOCK * K_POW2
    IS_POW2_K: tl.constexpr = K == K_POW2  # fast path: no padding waste
    # 16/16 bit packing below requires local offsets and expert ids < 2**16.
    tl.static_assert(BLOCK_SIZE <= 32768)

    pid_m = tl.program_id(0)
    offs_local = tl.arange(0, BLOCK_SIZE)  # position within this tile's flat [TOKENS_PER_BLOCK*K_POW2] space
    offs_global = pid_m * BLOCK_SIZE + offs_local
    mask = offs_global < T * K_POW2

    # Load expert id for each slot. IS_POW2_K fast path reads topk_indices as a
    # flat 1D array (no padding gaps). Non-pow2 path reads 2D with k_slot masking.
    if IS_POW2_K:
        expert = tl.load(topk_indices_ptr + offs_global, mask=mask, other=-1).to(tl.uint32)
    else:
        token_i_local = offs_local // K_POW2
        k_slot = offs_local % K_POW2
        token_i_global = pid_m * TOKENS_PER_BLOCK + token_i_local
        load_mask = mask & (k_slot < K)
        safe_k = tl.minimum(k_slot, K - 1)
        expert = tl.load(
            topk_indices_ptr + token_i_global * K + safe_k,
            mask=load_mask,
            other=-1,
        ).to(tl.uint32)

    # Pack (expert, presort_offs) into a uint32 kv pair and sort by expert.
    # Upper 16 bits = expert id (sort key), lower 16 bits = pre-sort local offset.
    # Invalid slots have expert=0xffff (from other=-1 cast to uint32 >> 16).
    kv_pairs = tl.sort(((expert << 16) | offs_local).to(tl.uint32), 0)
    expert = kv_pairs >> 16
    mask = expert != 0xFFFF  # exclude padding/OOB slots

    # Segmented scan to compute within-expert rank (0-based exclusive count).
    # scan_input packs expert id in upper 16 bits and count=1 in lower 16 bits.
    # _keyed_add resets the count at each expert boundary.
    scan_input = (kv_pairs & 0xFFFF0000) | 0x00000001
    inclusive_run_lengths = tl.associative_scan(scan_input, 0, _keyed_add)
    within_expert_rank = (inclusive_run_lengths - 1) & 0xFFFF  # exclusive = inclusive - 1

    # Output position for this entry in the expert-sorted output array.
    # partial_sum layout after stage1: [n_tiles, E], stride (1, n_tiles).
    # So partial_sum[pid_m, expert] = partial_sum_ptr + pid_m*1 + expert*n_tiles.
    s_reverse_scatter_idx = tl.load(partial_sum_ptr + pid_m + expert * n_tiles, mask=mask)
    s_reverse_scatter_idx += tl.load(expert_offs_ptr + expert, mask=mask)
    s_reverse_scatter_idx += within_expert_rank

    if IS_POW2_K:
        # presort_offs == offs_local before sort; entry_idx is the flat index into
        # topk_router_indices.view(-1), i.e. token * K + k_slot.
        presort_offs = kv_pairs & 0xFFFF
        entry_idx = pid_m * BLOCK_SIZE + presort_offs
        tl.store(s_reverse_scatter_idx_ptr + entry_idx, s_reverse_scatter_idx, mask=mask)
        tl.store(s_scatter_idx_ptr + s_reverse_scatter_idx, entry_idx, mask=mask)
        tl.store(x_gather_idx_ptr + s_reverse_scatter_idx, entry_idx // K_POW2, mask=mask)
    else:
        # presort_offs is in K_POW2-padded space; convert to unpadded entry_idx.
        presort_offs = kv_pairs & 0xFFFF
        token_i_global_s = pid_m * TOKENS_PER_BLOCK + presort_offs // K_POW2
        entry_idx = token_i_global_s * K + presort_offs % K_POW2
        tl.store(s_reverse_scatter_idx_ptr + entry_idx, s_reverse_scatter_idx, mask=mask)
        tl.store(s_scatter_idx_ptr + s_reverse_scatter_idx, entry_idx, mask=mask)
        tl.store(x_gather_idx_ptr + s_reverse_scatter_idx, token_i_global_s, mask=mask)
build/torch-cuda/functional/utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ import os
6
+ from contextlib import contextmanager
7
+
8
+
9
+ _IS_USING_QUACK_GEMM = os.getenv("USE_QUACK_GEMM", "0") == "1"
10
+
11
+
12
@contextmanager
def enable_quack_gemm(enable: bool = True):
    """Temporarily toggle the QuACK GEMM backend flag within a ``with`` block.

    Args:
        enable: value the flag takes inside the block. Defaults to True.

    The previous value is restored on exit — including when the body raises,
    which the previous implementation did not guarantee (a raising body would
    leak the override into subsequent calls).
    """
    global _IS_USING_QUACK_GEMM

    previous_value = _IS_USING_QUACK_GEMM
    _IS_USING_QUACK_GEMM = enable

    try:
        yield
    finally:
        # try/finally so an exception inside the block cannot leave the
        # global flag stuck at the overridden value.
        _IS_USING_QUACK_GEMM = previous_value
22
+
23
+
24
def is_using_quack_gemm() -> bool:
    """Return True when the QuACK GEMM backend is currently enabled.

    Reflects the module-level flag, seeded from the USE_QUACK_GEMM env var and
    temporarily overridable via enable_quack_gemm().
    """
    return _IS_USING_QUACK_GEMM
build/torch-cuda/jit.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ import inspect
6
+ import os
7
+ from shutil import rmtree
8
+ from typing import Callable
9
+ from uuid import uuid4
10
+
11
+ import torch
12
+ from torch.utils.cpp_extension import load as load_cpp_extension
13
+
14
+
15
+ _CPP_MODULE_PREFIX = "sonicmoe"
16
+ _GLOBAL_RANK = int(os.getenv("RANK", 0))
17
+ _WORLD_SIZE = int(os.getenv("WORLD_SIZE", 1))
18
+
19
+ _ALL_COMPILED_MODULES = {}
20
+
21
+
22
@torch.compiler.disable
def _get_cpp_function(function_name: str, module_name: str, source_files: list[str], build_directory: str) -> Callable:
    """Compile (or fetch from the in-process cache) a C++/CUDA extension and
    return the named function from it.

    With torch.distributed initialized, rank 0 compiles first and the other
    ranks attach to the same build directory after a barrier, so they reuse
    rank 0's artifacts instead of racing on the compiler. Without distributed
    init, each process builds independently; when WORLD_SIZE > 1 the build goes
    into a uuid-suffixed directory that is removed after loading, to avoid
    on-disk collisions between processes.
    """
    module_name = f"{_CPP_MODULE_PREFIX}_{module_name}"

    extra_cflags = ["-O3", "-Wall", "-shared", "-fPIC", "-fdiagnostics-color"]
    extra_cuda_cflags = ["-O3", "-lineinfo"]
    extra_include_paths = [
        os.path.dirname(__file__),  # sonicmoe/include
        os.path.dirname(os.path.dirname(__file__)) + "/cutlass/include",  # cutlass
        os.path.dirname(os.path.dirname(__file__)) + "/cutlass/tools/util/include",  # cutlass
    ]

    # Cache hit: the extension was already loaded in this process.
    module = _ALL_COMPILED_MODULES.get(module_name, None)

    if module is None:
        if torch.distributed.is_initialized():
            os.makedirs(build_directory, exist_ok=True)

            # Rank 0 compiles; everyone else waits at the barrier below.
            if _GLOBAL_RANK == 0:
                module = load_cpp_extension(
                    module_name,
                    sources=source_files,
                    with_cuda=True,
                    extra_cflags=extra_cflags,
                    extra_cuda_cflags=extra_cuda_cflags,
                    extra_include_paths=extra_include_paths,
                    build_directory=build_directory,
                    verbose=True,
                )

            torch.distributed.barrier()

            # Non-zero ranks load the already-built artifacts (verbose=False).
            if _GLOBAL_RANK != 0:
                module = load_cpp_extension(
                    module_name,
                    sources=source_files,
                    with_cuda=True,
                    extra_cflags=extra_cflags,
                    extra_cuda_cflags=extra_cuda_cflags,
                    extra_include_paths=extra_include_paths,
                    build_directory=build_directory,
                    verbose=False,
                )
        else:
            # No distributed init but multiple processes (e.g. launcher spawned
            # workers before init): isolate each build in a unique directory.
            if _WORLD_SIZE > 1:
                build_directory = os.path.join(build_directory, str(uuid4()))

            os.makedirs(build_directory, exist_ok=True)

            module = load_cpp_extension(
                module_name,
                sources=source_files,
                with_cuda=True,
                extra_cflags=extra_cflags,
                extra_cuda_cflags=extra_cuda_cflags,
                extra_include_paths=extra_include_paths,
                build_directory=build_directory,
                verbose=True,
            )

            # The uuid-suffixed artifacts are no longer needed once loaded.
            if _WORLD_SIZE > 1:
                rmtree(build_directory, ignore_errors=True)

        _ALL_COMPILED_MODULES[module_name] = module

    return getattr(module, function_name)
88
+
89
+
90
+ def cpp_jit(
91
+ function_name: str | None = None,
92
+ extra_source_files: list[str] = [],
93
+ build_directory: str | None = None,
94
+ depth: int = 0,
95
+ ) -> Callable:
96
+ """wrapper to compile C++/CUDA source code at runtime.
97
+
98
+ Args:
99
+ function_name (str | None, optional): name of the function to expose from the C++ file, the python function
100
+ name should match the funcion name in the C++ file if this is not specified. Defaults to None.
101
+ extra_source_files (list[str], optional): any extra files to use for compilation, by default it scans the
102
+ directory of the python stub file. Defaults to [].
103
+ build_directory (str | None, optional): directory in which to place the build artifacts. Defaults to None.
104
+ depth (int, optional): number of times dirname is called to get the build path. Defaults to 2.
105
+
106
+ Returns:
107
+ Callable: returns the wrapped function that can be used to call the C++ functions from python
108
+ """
109
+ cpp_function = None
110
+ args_spec = None
111
+
112
+ source_files = []
113
+ source_files.extend(extra_source_files)
114
+
115
+ calling_filename = inspect.stack()[1].filename
116
+ calling_directory = os.path.dirname(calling_filename)
117
+
118
+ for dirname, _, filenames in os.walk(calling_directory):
119
+ filenames = [os.path.join(dirname, f) for f in filenames]
120
+ filenames = filter(lambda f: os.path.splitext(f)[1] in [".cu", ".cpp"], filenames)
121
+ source_files.extend(filenames)
122
+
123
+ if build_directory is None:
124
+ module_name = calling_directory
125
+ for _ in range(depth):
126
+ module_name = os.path.dirname(module_name)
127
+ module_name = os.path.basename(module_name)
128
+
129
+ build_directory = os.path.join(os.path.dirname(os.path.dirname(__file__)), "build", module_name)
130
+
131
+ def _run(*args, **kwargs):
132
+ nonlocal cpp_function
133
+
134
+ if cpp_function is None:
135
+ cpp_function = _get_cpp_function(
136
+ function_name=_run.__name__,
137
+ module_name=module_name,
138
+ source_files=source_files,
139
+ build_directory=build_directory,
140
+ )
141
+
142
+ full_args = []
143
+ full_args.extend(args)
144
+ for variable_name in args_spec.args[len(args) :]:
145
+ full_args.append(kwargs[variable_name])
146
+
147
+ return cpp_function(*full_args)
148
+
149
+ def _wrapper(function: Callable) -> Callable:
150
+ nonlocal args_spec
151
+ args_spec = inspect.getfullargspec(function)
152
+
153
+ _run.__doc__ = function.__doc__
154
+ _run.__name__ = function.__name__ if function_name is None else function_name
155
+ _run.__signature__ = inspect.signature(function)
156
+
157
+ return _run
158
+
159
+ return _wrapper
build/torch-cuda/metadata.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "Apache-2.0",
4
+ "python-depends": [
5
+ "nvidia-cutlass-dsl"
6
+ ],
7
+ "backend": {
8
+ "type": "cuda"
9
+ }
10
+ }
build/torch-cuda/moe.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ********************************************************************************
2
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Xinle Cheng, Ion Stoica, Tri Dao
3
+ # ********************************************************************************
4
+
5
+ from typing import Callable
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from .enums import ActivationType, KernelBackendMoE, is_glu
12
+ from .functional import moe_TC_softmax_topk_layer
13
+
14
+
15
+ try:
16
+ from xma.modules.moe import scattered_experts
17
+
18
+ _IS_XMA_AVAILABLE = True
19
+ except ImportError:
20
+ _IS_XMA_AVAILABLE = False
21
+
22
+
23
+ def _swiglu(x: torch.Tensor) -> torch.Tensor:
24
+ u = x[..., 1::2]
25
+ g = x[..., ::2]
26
+ return u * F.silu(g)
27
+
28
+
29
+ def _geglu(x: torch.Tensor) -> torch.Tensor:
30
+ u = x[..., 1::2]
31
+ g = x[..., ::2]
32
+ return (F.gelu(g.to(dtype=torch.float32)) * u).to(dtype=g.dtype)
33
+
34
+
35
+ def _gelu(x: torch.Tensor) -> torch.Tensor:
36
+ return F.gelu(x.to(dtype=torch.float32)).to(dtype=x.dtype)
37
+
38
+
39
+ def _reglu(x: torch.Tensor) -> torch.Tensor:
40
+ u = x[..., 1::2]
41
+ g = x[..., ::2]
42
+ return (F.relu(g) * u).to(dtype=g.dtype)
43
+
44
+
45
+ def _relu(x: torch.Tensor) -> torch.Tensor:
46
+ return F.relu(x)
47
+
48
+
49
+ def _relu_sq(x: torch.Tensor) -> torch.Tensor:
50
+ return F.relu(x) ** 2
51
+
52
+
53
+ def _silu(x: torch.Tensor) -> torch.Tensor:
54
+ return F.silu(x)
55
+
56
+
57
class Experts(nn.Module):
    """A bank of per-expert linear layers for MoE.

    weight[e] is an (out_features, in_features) matrix applied to the tokens
    routed to expert e; bias[e] (optional) is its (out_features,) bias.
    """

    def __init__(
        self, num_experts: int, in_features: int, out_features: int, add_bias: bool = True, std: float | None = None
    ) -> None:
        super().__init__()

        # One (out_features, in_features) weight matrix per expert.
        self.weight = nn.Parameter(torch.empty(num_experts, out_features, in_features))

        self.bias = None
        if add_bias:
            self.bias = nn.Parameter(torch.empty(num_experts, out_features))

        # Std-dev used by reset_parameters for normal weight init.
        # NOTE(review): the default None is passed straight to nn.init.normal_,
        # which requires a float — confirm callers always supply std.
        self.std = std

        self.num_experts = num_experts
        self.in_features = in_features
        self.out_features = out_features

        self.reset_parameters()

    def up_projection_scattermoe_forward(
        self,
        input: torch.Tensor,
        num_experts_per_token: int | None = None,
        sorted_expert_idxs: torch.Tensor | None = None,
        sorted_scattered_idxs: torch.Tensor | None = None,
        expert_offsets: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Up-projection via xma's scattered_experts kernel.

        Takes scattered (token-order) input and emits expert-grouped output
        (grouped_in=False, grouped_out=True). Bias is unsupported; requires the
        accelerated-model-architectures (xma) package.
        """
        assert self.bias is None

        if not _IS_XMA_AVAILABLE:
            raise ImportError(
                "install accelerated-model-architectures from https://github.com/open-lm-engine/accelerated-model-architectures"
            )

        # scattered_experts expects (E, in, out); weight is stored (E, out, in).
        input = scattered_experts(
            inputs=input,
            expert_weights=self.weight.permute(0, 2, 1),
            k=num_experts_per_token,
            sorted_expert_idxs=sorted_expert_idxs,
            sorted_scattered_idxs=sorted_scattered_idxs,
            expert_offsets=expert_offsets,
            gates=None,
            grouped_in=False,
            grouped_out=True,
        )

        return input

    def down_projection_scattermoe_forward(
        self,
        input: torch.Tensor,
        num_experts_per_token: int | None = None,
        sorted_expert_idxs: torch.Tensor | None = None,
        sorted_scattered_idxs: torch.Tensor | None = None,
        expert_offsets: torch.Tensor | None = None,
        gates: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Down-projection via xma's scattered_experts kernel.

        Takes expert-grouped input and emits scattered output, optionally
        applying per-entry router gates (grouped_in=True, grouped_out=False).
        Bias is unsupported; requires the xma package.
        """
        assert self.bias is None

        if not _IS_XMA_AVAILABLE:
            raise ImportError(
                "install accelerated-model-architectures from https://github.com/open-lm-engine/accelerated-model-architectures"
            )

        input = scattered_experts(
            inputs=input,
            expert_weights=self.weight.permute(0, 2, 1),
            k=num_experts_per_token,
            sorted_expert_idxs=sorted_expert_idxs,
            sorted_scattered_idxs=sorted_scattered_idxs,
            expert_offsets=expert_offsets,
            gates=gates,
            grouped_in=True,
            grouped_out=False,
        )

        return input

    def torch_forward(
        self, input: torch.Tensor, expert_frequency: torch.Tensor | None, return_list: bool = False
    ) -> list[torch.Tensor] | torch.Tensor:
        """Reference (pure PyTorch) grouped forward.

        A tensor `input` is split along dim 0 into per-expert chunks of size
        expert_frequency[e]; a list input is assumed to be pre-split (and
        expert_frequency must then be None). Returns a list of per-expert
        outputs, or their dim-0 concatenation when return_list is False.
        """
        if isinstance(input, torch.Tensor):
            input = input.split(expert_frequency.tolist(), dim=0)
        else:
            assert expert_frequency is None

        input = [
            F.linear(input[i], self.weight[i], None if self.bias is None else self.bias[i])
            for i in range(self.num_experts)
        ]

        if not return_list:
            input = torch.cat(input, dim=0)

        return input

    def extra_repr(self):
        """Shown in repr(module) alongside the class name."""
        return "num_experts={}, in_features={}, out_features={}".format(
            self.num_experts, self.in_features, self.out_features
        )

    @torch.no_grad()
    def reset_parameters(self) -> None:
        """Initialize weights ~ N(0, std) and zero the biases (if present)."""
        nn.init.normal_(self.weight, mean=0, std=self.std)
        if hasattr(self, "bias") and self.bias is not None:
            self.bias.zero_()
164
+
165
+
166
class MoE(nn.Module):
    """Mixture-of-Experts MLP with a linear router and top-k expert dispatch.

    Supports three compute backends: a fused `sonicmoe` kernel, the
    `scattermoe` kernels, and a pure-PyTorch reference path.
    """

    def __init__(
        self,
        num_experts: int,
        num_experts_per_tok: int,
        hidden_size: int,
        intermediate_size: int,
        activation_function: ActivationType,
        add_bias: bool,
        std: float,
    ) -> None:
        super().__init__()

        self.num_experts = num_experts
        self.top_k = num_experts_per_tok

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size

        # Router produces one logit per expert; no bias.
        self.router = nn.Linear(in_features=self.hidden_size, out_features=num_experts, bias=False)

        self.activation_function = activation_function

        # Up projection doubles the width for GLU-style activations (gate + up halves).
        self.c_fc = Experts(
            num_experts=num_experts,
            in_features=self.hidden_size,
            out_features=2 * self.intermediate_size if is_glu(activation_function) else self.intermediate_size,
            add_bias=add_bias,
            std=std,
        )

        self.c_proj = Experts(
            num_experts=num_experts,
            in_features=self.intermediate_size,
            out_features=self.hidden_size,
            add_bias=add_bias,
            std=std,
        )

        # NOTE(review): the CUDA stream handle is captured once at construction
        # time and later handed to the fused kernel — confirm the module is
        # always run on the same stream it was built on.
        self.stream_id = torch.cuda.current_stream().cuda_stream

    def forward(
        self,
        hidden_states: torch.Tensor,
        kernel_backend_moe: KernelBackendMoE = KernelBackendMoE.sonicmoe,
        is_inference_mode: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Run routing + experts; returns (output, aux_loss).

        `aux_loss` is None in inference mode. Output has the same shape as the
        input `hidden_states`.
        """
        original_shape = hidden_states.shape

        # hidden_states -> (batch_size, query_length, hidden_size)
        hidden_states = hidden_states.view(-1, self.hidden_size)

        # presumably 32768 is the fused kernel's expert-count limit — TODO confirm
        if kernel_backend_moe == KernelBackendMoE.sonicmoe and self.num_experts <= 32768:
            # Fused path: routing, both projections, activation and gather in one call.
            hidden_states, router_logits, expert_frequency = moe_TC_softmax_topk_layer(
                hidden_states,
                self.router.weight,
                self.c_fc.weight.permute(1, 2, 0),
                self.c_fc.bias,
                self.c_proj.weight.permute(1, 2, 0),
                self.c_proj.bias,
                self.top_k,
                self.stream_id,
                self.activation_function,
                is_inference_mode or not self.training,
            )
        else:
            # hidden_states -> (total_q, hidden_size)
            router_logits, router_weights, selected_experts = self._compute_routing_weights(hidden_states)

            # router_logits -> (total_q, num_experts)
            # router_weights -> (total_q, top_k)
            # selected_experts -> (total_q, top_k)

            hidden_states, expert_frequency = self._compute_experts(
                hidden_states,
                router_weights,
                selected_experts,
                kernel_backend_moe=kernel_backend_moe,
            )

        hidden_states = hidden_states.view(original_shape)

        # hidden_states -> (batch_size, query_length, hidden_size)

        if is_inference_mode:
            aux_loss = None
        else:
            aux_loss = self._compute_switch_loss(
                logits=router_logits,
                probs=F.softmax(router_logits, dim=-1, dtype=torch.float32),
                expert_frequency=expert_frequency,
            )

        return hidden_states, aux_loss

    # copied from https://github.com/open-lm-engine/lm-engine/blob/1447883df709727839bbbb367ce727fa56962a6a/lm_engine/hf_models/modeling_utils/mlp_blocks/moe.py#L432-L455
    # NOTE we don't do all_reduce here for expert frequency for simplicity across data parallel workers
    def _compute_switch_loss(
        self, logits: torch.Tensor, probs: torch.Tensor, expert_frequency: torch.Tensor
    ) -> torch.Tensor:
        """Switch-Transformer style load-balancing loss: num_experts times the
        dot product of the L1-normalized mean router probabilities and the
        L1-normalized per-expert assignment counts."""
        logits = logits.view(-1, logits.size(-1))
        probs = probs.view(-1, probs.size(-1))

        num_experts = logits.size(1)
        acc_probs = probs.sum(0)

        expert_frequency = expert_frequency.float()

        aux_loss = num_experts * (F.normalize(acc_probs, p=1, dim=0) * F.normalize(expert_frequency, p=1, dim=0)).sum()

        return aux_loss

    def _compute_routing_weights(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return (router_logits, top-k softmax weights, top-k expert indices).

        Softmax is applied over the selected top-k logits only (in float32),
        then cast back to the activation dtype.
        """
        # hidden_states -> (total_q, hidden_size)
        router_logits = self.router(hidden_states)
        # router_logits -> (total_q, num_experts)

        router_weights, selected_experts = self._get_topk(router_logits)

        # router_weights -> (total_q, top_k)
        # selected_experts -> (total_q, top_k)

        router_weights = F.softmax(router_weights.float(), dim=-1)
        router_weights = router_weights.type_as(hidden_states)

        return router_logits, router_weights, selected_experts

    def _compute_experts(
        self,
        hidden_states: torch.Tensor,
        router_weights: torch.Tensor,
        selected_experts: torch.Tensor,
        kernel_backend_moe: KernelBackendMoE,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Dispatch tokens to their selected experts and combine gated outputs.

        Returns (combined hidden states of shape (total_q, hidden_size),
        int32 per-expert token counts).
        """
        selected_experts = selected_experts.flatten()

        with torch.no_grad():
            # Stable grouping of token slots by expert id.
            sorted_expert_idxs, sorted_scattered_idxs = selected_experts.sort()

            expert_frequency = selected_experts.bincount(minlength=self.num_experts).to(torch.int32)
            expert_offsets = expert_frequency.cumsum(-1).to(torch.int32)

        act_func = {
            ActivationType.SWIGLU: _swiglu,
            ActivationType.GEGLU: _geglu,
            ActivationType.REGLU: _reglu,
            ActivationType.GELU: _gelu,
            ActivationType.RELU: _relu,
            ActivationType.SILU: _silu,
            ActivationType.RELU_SQ: _relu_sq,
        }[self.activation_function]

        T = hidden_states.size(0)

        if kernel_backend_moe == KernelBackendMoE.scattermoe:
            hidden_states = self.c_fc.up_projection_scattermoe_forward(
                input=hidden_states,
                num_experts_per_token=self.top_k,
                sorted_expert_idxs=sorted_expert_idxs,
                sorted_scattered_idxs=sorted_scattered_idxs,
                expert_offsets=expert_offsets,
            )
            hidden_states = act_func(hidden_states)
            # num_experts_per_token=1 here because the input is already grouped
            # (one row per (token, expert) slot); gates apply the routing weights.
            hidden_states = self.c_proj.down_projection_scattermoe_forward(
                input=hidden_states,
                num_experts_per_token=1,
                sorted_expert_idxs=sorted_expert_idxs,
                sorted_scattered_idxs=sorted_scattered_idxs,
                expert_offsets=expert_offsets,
                gates=router_weights,
            )
        elif kernel_backend_moe == KernelBackendMoE.torch:
            # sort and group input tokens according to expert assignment
            fan_in_index = sorted_scattered_idxs // self.top_k

            # gather the gate values for grouped input tokens
            router_weights = router_weights.flatten()
            batch_gates = router_weights[sorted_scattered_idxs]

            hidden_states = hidden_states[fan_in_index]

            hidden_states = self.c_fc.torch_forward(
                input=hidden_states, expert_frequency=expert_frequency, return_list=True
            )

            hidden_states = [act_func(i) for i in hidden_states]
            hidden_states = self.c_proj.torch_forward(input=hidden_states, expert_frequency=None, return_list=False)

            hidden_states = hidden_states * batch_gates.unsqueeze(-1)
            # NOTE(review): the accumulator is float32 while hidden_states may be
            # a lower-precision dtype; index_add requires matching dtypes —
            # confirm this path is only exercised with float32 activations.
            zeros = torch.zeros((T, self.hidden_size), dtype=torch.float32, device=hidden_states.device)
            hidden_states = zeros.index_add(0, fan_in_index, hidden_states)
        else:
            raise ValueError(f"unexpected kernel_backend_moe ({kernel_backend_moe})")

        return hidden_states, expert_frequency

    def _get_topk(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Top-k values/indices along the last dim; uses max() for k == 1 to
        avoid the heavier topk kernel."""
        if self.top_k == 1:
            x, indices = x.max(dim=-1, keepdim=True)
        else:
            x, indices = x.topk(self.top_k, dim=-1)

        return x, indices
build/torch-cuda/quack/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
"""Package initializer: exposes the version and optionally patches the CUTE DSL ptxas path."""

__version__ = "0.2.5"

import os

# When the user points at a custom ptxas binary, install the patch at import time.
if "CUTE_DSL_PTXAS_PATH" in os.environ:
    from . import cute_dsl_ptxas

    cute_dsl_ptxas.patch()
build/torch-cuda/quack/_ops_compat.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .._ops_compat import add_op_namespace_prefix
2
+
3
def add_quack_op_namespace_prefix(name: str) -> str:
    """Qualify *name* under the ``quack__`` sub-namespace, then apply the shared op prefix."""
    qualified = "quack__" + name
    return add_op_namespace_prefix(qualified)
build/torch-cuda/quack/activation.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ import math
4
+ from typing import Tuple
5
+
6
+ import cutlass.cute as cute
7
+ from cutlass import Float32, Boolean, const_expr
8
+ from cutlass.cutlass_dsl import T, dsl_user_op
9
+ from cutlass._mlir.dialects import llvm
10
+
11
+ from . import utils as utils
12
+
13
+
14
+ F32_or_F32x2 = Float32 | Tuple[Float32, Float32]
15
+
16
+
17
@dsl_user_op
def tanh(a: float | Float32, *, loc=None, ip=None) -> Float32:
    """Hardware-approximate tanh via the ``tanh.approx.f32`` PTX instruction.

    Faster than a polynomial expansion but not correctly rounded; used as the
    building block for sigmoid/silu/gelu below.
    """
    return Float32(
        llvm.inline_asm(
            T.f32(),
            [Float32(a).ir_value(loc=loc, ip=ip)],
            "tanh.approx.f32 $0, $1;",
            "=f,f",
            has_side_effects=False,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    )
30
+
31
+
32
@dsl_user_op
def sigmoid(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """sigmoid(x) = 0.5 + 0.5 * tanh(0.5 * x), for a scalar or a packed f32x2 pair."""
    if const_expr(not isinstance(x, tuple)):
        # return 0.5 + 0.5 * cute.math.tanh(0.5 * x, fastmath=True)
        return 0.5 + 0.5 * tanh(0.5 * x)
    else:
        # Packed path: one FMUL.f32x2, two MUFU.TANH, one packed FFMA.
        x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
        tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
        return utils.fma_packed_f32x2(tanh_x_half, (0.5, 0.5), (0.5, 0.5))
41
+
42
+
43
@dsl_user_op
def dsigmoid_from_output(out: Float32, dout: Float32, *, loc=None, ip=None) -> Float32:
    """Sigmoid backward from the forward *output*: dout * out * (1 - out),
    rearranged as dout * (out - out*out) to lower to FMUL + FFMA."""
    # return dout * out * (1.0 - out)
    return dout * (out - out * out)
47
+
48
+
49
@dsl_user_op
def relu(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """max(x, 0) for a scalar or a packed f32x2 pair."""
    if const_expr(not isinstance(x, tuple)):
        return cute.arch.fmax(x, Float32(0.0))
    else:
        return cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0))
55
+
56
+
57
@dsl_user_op
@cute.jit
def drelu(
    x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
    """ReLU backward + forward recompute.

    Returns (dx, relu(x)) where dx = dout if x > 0 else 0.
    """
    if const_expr(not isinstance(x, tuple)):
        x_pos = Boolean(x > 0)
        return dout if x_pos else Float32(0.0), cute.arch.fmax(x, Float32(0.0))
    else:
        x0_pos = Boolean(x[0] > 0)
        x1_pos = Boolean(x[1] > 0)
        dx = (dout[0] if x0_pos else Float32(0.0), dout[1] if x1_pos else Float32(0.0))
        return dx, relu(x)
70
+
71
+
72
@dsl_user_op
def relu_sq(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """ReLU squared: max(x, 0) * x (equals relu(x)**2 since the factor is 0 or x)."""
    if const_expr(not isinstance(x, tuple)):
        return cute.arch.fmax(x, Float32(0.0)) * x
    else:
        relu_x = (cute.arch.fmax(x[0], Float32(0.0)), cute.arch.fmax(x[1], Float32(0.0)))
        return utils.mul_packed_f32x2(relu_x, x)
79
+
80
+
81
@dsl_user_op
@cute.jit
def drelu_sq(
    x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
    """
    ReLU squared backward pass: computes gradient w.r.t. x and recomputes forward
    Given: relu_sq_out = max(x, 0) * x, and dout = grad w.r.t. relu_sq_out
    Returns: (dx, relu_sq_out) where:
        - dx = dout * 2 * x if x > 0, else 0
        - relu_sq_out = max(x, 0) * x
    """
    if const_expr(not isinstance(x, tuple)):
        relu_x = relu(x)
        relu_sq_out = relu_x * x
        # Derivative: d/dx[max(x,0) * x] = 2*x if x > 0, else 0
        # (dout * relu_x is 0 whenever x <= 0, so no explicit predicate is needed)
        dx = 2.0 * (dout * relu_x)
        return dx, relu_sq_out
    else:
        # Packed f32x2 path mirrors the scalar computation.
        relu_x = relu(x)
        relu_sq_out = utils.mul_packed_f32x2(relu_x, x)
        dx = utils.mul_packed_f32x2((2.0, 2.0), utils.mul_packed_f32x2(dout, relu_x))
        return dx, relu_sq_out
104
+
105
+
106
@dsl_user_op
def gelu_tanh_approx(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """
    gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
            = 0.5 * x * (1 + tanh(x * (0.797885 + 0.0356774 * x * x)))
    """
    # Constants are folded at trace time (plain Python floats).
    sqrt_2_over_pi = math.sqrt(2 / math.pi)  # ~0.797885
    sqrt_2_over_pi_coeff = 0.044715 * sqrt_2_over_pi  # ~0.0356774
    if const_expr(not isinstance(x, tuple)):
        return 0.5 * (
            x
            # Currently cute.math.tanh(x, fastmath=True) generates very slow code
            # * (1 + cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x)), fastmath=True))
            * (1.0 + tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * (x * x))))
        )
    else:
        # Packed f32x2 path: same polynomial argument, element-wise tanh.
        x_sq = utils.mul_packed_f32x2(x, x)
        x_sq_scaled = utils.fma_packed_f32x2(
            x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
        )
        z = utils.mul_packed_f32x2(x, x_sq_scaled)
        tanh_z = (tanh(z[0]), tanh(z[1]))
        x_tanh_z = utils.fma_packed_f32x2(tanh_z, x, x)
        return utils.mul_packed_f32x2((0.5, 0.5), x_tanh_z)
130
+
131
+
132
@dsl_user_op
def dgelu_tanh_approx(
    x: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2]:
    """
    GELU tanh approximation backward pass: computes gradient w.r.t. x and recomputes forward
    Given: gelu_out = 0.5 * x * (1 + tanh(x * (c1 + c2 * x^2))), and dout = grad w.r.t. gelu_out
    Returns: (dx, gelu_out)

    Derivative uses the chain rule:
    d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
    where z = x * (c1 + c2 * x^2), dz/dx = c1 + 3 * c2 * x^2
    and sech^2(z) = 1 - tanh^2(z)
    """
    sqrt_2_over_pi = math.sqrt(2 / math.pi)  # c1 ~0.797885
    sqrt_2_over_pi_coeff = 0.044715 * sqrt_2_over_pi  # c2 ~0.0356774
    sqrt_2_over_pi_coeff_3 = 3.0 * sqrt_2_over_pi_coeff  # c3 ~0.01070322

    if const_expr(not isinstance(x, tuple)):
        # Compute z = x * (c1 + c2 * x^2)
        x_sq = x * x
        # tanh_z = cute.math.tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq), fastmath=True)
        tanh_z = tanh(x * (sqrt_2_over_pi + sqrt_2_over_pi_coeff * x_sq))
        half_tanh_z_plus_one = 0.5 + 0.5 * tanh_z
        gelu_out = x * half_tanh_z_plus_one

        # Compute gradient
        # sech^2(z) = 1 - tanh^2(z)
        sech2_z = 1 - tanh_z * tanh_z
        # dz/dx = c1 + 3 * c2 * x^2
        dz_dx = sqrt_2_over_pi + sqrt_2_over_pi_coeff_3 * x_sq
        # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
        dgelu = half_tanh_z_plus_one + x * (0.5 * (sech2_z * dz_dx))

        dx = dout * dgelu
        return dx, gelu_out
    else:
        # Packed f32x2 path: identical math, two-lane packed mul/fma helpers.
        # Compute z = x * (c1 + c2 * x^2)
        x_sq = utils.mul_packed_f32x2(x, x)
        x_sq_scaled = utils.fma_packed_f32x2(
            x_sq, (sqrt_2_over_pi_coeff, sqrt_2_over_pi_coeff), (sqrt_2_over_pi, sqrt_2_over_pi)
        )
        z = utils.mul_packed_f32x2(x, x_sq_scaled)
        tanh_z = (tanh(z[0]), tanh(z[1]))
        half_tanh_z_plus_one = utils.fma_packed_f32x2(tanh_z, (0.5, 0.5), (0.5, 0.5))
        gelu_out = utils.mul_packed_f32x2(x, half_tanh_z_plus_one)

        # Compute gradient
        # sech^2(z) = 1 - tanh^2(z)
        sech2_z = utils.fma_packed_f32x2(tanh_z, (-tanh_z[0], -tanh_z[1]), (1.0, 1.0))
        # dz/dx = c1 + 3 * c2 * x^2
        dz_dx = utils.fma_packed_f32x2(
            x_sq, (sqrt_2_over_pi_coeff_3, sqrt_2_over_pi_coeff_3), (sqrt_2_over_pi, sqrt_2_over_pi)
        )
        # d/dx[gelu(x)] = 0.5 * (1 + tanh(z)) + 0.5 * x * sech^2(z) * dz/dx
        sech2_dz_dx = utils.mul_packed_f32x2(sech2_z, dz_dx)
        x_sech2_dz_dx = utils.mul_packed_f32x2(x, sech2_dz_dx)
        dgelu = utils.fma_packed_f32x2(x_sech2_dz_dx, (0.5, 0.5), half_tanh_z_plus_one)

        dx = utils.mul_packed_f32x2(dout, dgelu)
        return dx, gelu_out
193
+
194
+
195
@dsl_user_op
@cute.jit
def softplus(x: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """softplus(x) = log(1 + exp(x)), switching to the identity for x > 20
    where the two agree to float32 precision (and exp would overflow)."""
    if const_expr(not isinstance(x, tuple)):
        use_linear = Boolean(x > 20.0)
        return (
            cute.math.log(Float32(cute.math.exp(x, fastmath=True)) + 1.0, fastmath=True)
            if not use_linear
            else x
        )
    else:
        # Packed path computes exp via base-2: scale by log2(e), exponentiate, then
        # log2(...) * ln(2) to get back a natural log.
        # NOTE(review): the scalar branch calls cute.math.exp on the raw x, while
        # this branch calls cute.math.exp on x * log2(e) — that is only consistent
        # if fastmath exp lowers to ex2 on the pre-scaled input; confirm this is
        # not meant to be cute.math.exp2.
        log2_e = math.log2(math.e)
        x_log2e = utils.mul_packed_f32x2(x, (log2_e, log2_e))
        x_exp = (cute.math.exp(x_log2e[0], fastmath=True), cute.math.exp(x_log2e[1], fastmath=True))
        x_exp_p1 = utils.add_packed_f32x2(x_exp, (1.0, 1.0))
        log_x_exp_p1 = (
            cute.math.log2(x_exp_p1[0], fastmath=True),
            cute.math.log2(x_exp_p1[1], fastmath=True),
        )
        ln2 = math.log(2.0)
        softplus_x = utils.mul_packed_f32x2(log_x_exp_p1, (ln2, ln2))
        use_linear_0 = Boolean(x[0] > 20.0)
        use_linear_1 = Boolean(x[1] > 20.0)
        return (
            softplus_x[0] if not use_linear_0 else x[0],
            softplus_x[1] if not use_linear_1 else x[1],
        )
222
+
223
+
224
@dsl_user_op
@cute.jit
def dsoftplus_from_output(out: Float32, dout: Float32, *, loc=None, ip=None) -> Float32:
    """Softplus backward from the forward *output*: dout * (1 - exp(-out)).

    In the linear region (out > 20, where forward returned x directly) the
    gradient is just dout.
    """
    use_linear = Boolean(out > 20.0)
    # dx = dout * (1.0 - cute.math.exp(-out, fastmath=True)) if not use_linear else dout
    dx = dout - dout * cute.math.exp(-out, fastmath=True)
    return dx if not use_linear else dout
231
+
232
+
233
@dsl_user_op
def silu(x: F32_or_F32x2, *, already_halved: bool = False, loc=None, ip=None) -> F32_or_F32x2:
    """
    silu(x) = x * sigmoid(x) = x * (1 + tanh(x / 2)) / 2 = (0.5 * x) * tanh(0.5 * x) + (0.5 * x)
    This compiles down to 3 SASS instructions: FMUL to get 0.5 * x, MUFU.TANH, and FFMA.

    `already_halved` lets a caller that has precomputed 0.5 * x skip the FMUL.
    """
    if const_expr(not isinstance(x, tuple)):
        x_half = 0.5 * x if const_expr(not already_halved) else x
        # return x_half * cute.math.tanh(x_half, fastmath=True) + x_half
        return x_half * tanh(x_half) + x_half
    else:
        x_half = utils.mul_packed_f32x2((0.5, 0.5), x) if const_expr(not already_halved) else x
        tanh_x_half = (tanh(x_half[0]), tanh(x_half[1]))
        return utils.fma_packed_f32x2(x_half, tanh_x_half, x_half)
247
+
248
+
249
@dsl_user_op
def swiglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """SwiGLU forward: silu(x) * y, where x is the gate and y the up projection."""
    if const_expr(not isinstance(x, tuple)):
        return silu(x) * y
    else:
        return utils.mul_packed_f32x2(silu(x), y)
255
+
256
+
257
@dsl_user_op
def dswiglu(
    x: F32_or_F32x2,
    y: F32_or_F32x2,
    dout: F32_or_F32x2,
    *,
    already_halved: bool = False,
    loc=None,
    ip=None,
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    SwiGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: swiglu_out = silu(x) * y, and dout = grad w.r.t. swiglu_out
    Returns: (dx, dy, swiglu_out) where dx = dout * y * d_silu(x), dy = dout * silu(x)

    d_silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))

    This has been optimized to use fewer instructions (i.e. we expand things out
    to use FFMA instead of FADD and FMUL).

    When `already_halved` is set, x is actually 0.5 * x and the tanh-based
    identities below absorb the halving for free.
    """
    if const_expr(not isinstance(x, tuple)):
        # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(0.5 * x))
        # FMUL, MUFU.TANH, then FFMA
        if const_expr(not already_halved):
            sigmoid_x = sigmoid(x)
            silu_x = x * sigmoid_x  # FMUL
        else:
            tanh_x = tanh(x)  # MUFU.TANH
            sigmoid_x = 0.5 * tanh_x + 0.5  # FFMA
            silu_x = x * tanh_x + x  # FFMA
        silu_x_dout = silu_x * dout  # FMUL
        # d_silu(x) * dout
        # = sigmoid_x * (1 + x * (1 - sigmoid_x)) * dout
        # = (sigmoid_x + sigmoid_x * x * (1 - sigmoid_x)) * dout
        # = (sigmoid_x + silu_x * (1 - sigmoid_x)) * dout
        # = (sigmoid_x + silu_x - silu_x * sigmoid_x) * dout
        # = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
        d_silu_x_dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x_dout  # FFMA, FFMA
        dx = d_silu_x_dout * y  # FMUL
        dy = silu_x_dout
        swiglu_out = silu_x * y  # FMUL
        # Overall it's 1 MUFU.TANH, 5 FMUL, 3 FFMA
        return dx, dy, swiglu_out
    else:
        # Packed f32x2 path: same algebra on two lanes at a time.
        # Compute sigmoid(x) and silu(x)
        if const_expr(not already_halved):
            sigmoid_x = sigmoid(x)
            silu_x = utils.mul_packed_f32x2(x, sigmoid_x)
        else:
            tanh_x = (tanh(x[0]), tanh(x[1]))
            sigmoid_x = utils.fma_packed_f32x2(tanh_x, (0.5, 0.5), (0.5, 0.5))
            silu_x = utils.fma_packed_f32x2(x, tanh_x, x)
        silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
        # d_silu(x) * dout = (sigmoid_x - silu_x * sigmoid_x) * dout + silu_x * dout
        sigmoid_x_minus_silu_x_sigmoid_x = utils.fma_packed_f32x2(
            sigmoid_x, (-silu_x[0], -silu_x[1]), sigmoid_x
        )
        d_silu_x_dout = utils.fma_packed_f32x2(sigmoid_x_minus_silu_x_sigmoid_x, dout, silu_x_dout)
        dx = utils.mul_packed_f32x2(d_silu_x_dout, y)
        dy = silu_x_dout
        swiglu_out = utils.mul_packed_f32x2(silu_x, y)
        return dx, dy, swiglu_out
319
+
320
+
321
@dsl_user_op
def swiglu_oai(
    x: F32_or_F32x2, y: F32_or_F32x2, alpha: float = 1.702, *, loc=None, ip=None
) -> F32_or_F32x2:
    """The swiglu variant used in gpt-oss, which has a scaling factor on x and bias of 1 to y.
    https://github.com/openai/gpt-oss/blob/7be9334950053a888e24887a57dac797a17d6e00/gpt_oss/torch/model.py#L249
    x * sigmoid(alpha * x) * (y + 1)
    Compile down to FMUL, FMUL, TANH, FFMA, FFMA
    """
    # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
    if const_expr(not isinstance(x, tuple)):
        x_half = 0.5 * x
        # silu_x = x_half * cute.math.tanh(alpha * x_half, fastmath=True) + x_half
        silu_x = x_half * tanh(alpha * x_half) + x_half
        # silu_x * (y + 1) expanded to a single FFMA: silu_x * y + silu_x
        return silu_x * y + silu_x
    else:
        x_half = utils.mul_packed_f32x2((0.5, 0.5), x)
        alpha_x_half = utils.mul_packed_f32x2((alpha, alpha), x_half)
        tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
        silu_x = utils.fma_packed_f32x2(x_half, tanh_alpha_x_half, x_half)
        return utils.fma_packed_f32x2(silu_x, y, silu_x)
342
+
343
+
344
@dsl_user_op
def dswiglu_oai(
    x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, alpha: float = 1.702, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    Swiglu OAI backward pass: computes gradients w.r.t. x and y
    Given: swiglu_oai_out = x * sigmoid(alpha * x) * (y + 1), and dout = grad w.r.t. swiglu_oai_out
    Returns: (dx, dy, swiglu_oai_out)

    Derivative of x * sigmoid(alpha * x) w.r.t. x:
    d/dx[x * sigmoid(alpha * x)] = sigmoid(alpha * x) + alpha * x * sigmoid(alpha * x) * (1 - sigmoid(alpha * x))
    """
    if const_expr(not isinstance(x, tuple)):
        # Compute sigmoid(alpha * x) using tanh: sigmoid(z) = 0.5 * (1 + tanh(z/2))
        alpha_x_half = (0.5 * alpha) * x  # FMUL
        # MUFU.TANH, then FFMA
        # sigmoid_alpha_x = 0.5 + 0.5 * cute.math.tanh(alpha_x_half, fastmath=True)
        sigmoid_alpha_x = 0.5 + 0.5 * tanh(alpha_x_half)
        silu_x = x * sigmoid_alpha_x  # FMUL
        silu_x_dout = silu_x * dout  # FMUL
        # FFMA, FFMA, FMUL
        d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
        dx = d_silu_x_dout * y + d_silu_x_dout  # FFMA, instead of multiply by y + 1
        dy = silu_x_dout
        swiglu_out = silu_x * y + silu_x  # FFMA, instead of multiply by y + 1
        # Overall it's 1 MUFU.TANH, 4 FMUL, 5 FFMA
        return dx, dy, swiglu_out
    else:
        # Packed f32x2 path: same algebra, two lanes per helper call.
        # Compute sigmoid(alpha * x)
        alpha_x_half = utils.mul_packed_f32x2(((0.5 * alpha), (0.5 * alpha)), x)
        tanh_alpha_x_half = (tanh(alpha_x_half[0]), tanh(alpha_x_half[1]))
        sigmoid_alpha_x = utils.fma_packed_f32x2(tanh_alpha_x_half, (0.5, 0.5), (0.5, 0.5))
        silu_x = utils.mul_packed_f32x2(x, sigmoid_alpha_x)
        silu_x_dout = utils.mul_packed_f32x2(silu_x, dout)
        # d_silu_x_dout = (sigmoid_alpha_x + alpha * (silu_x - silu_x * sigmoid_alpha_x)) * dout
        silu_x_minus_product = utils.fma_packed_f32x2(
            silu_x, (-sigmoid_alpha_x[0], -sigmoid_alpha_x[1]), silu_x
        )
        sigmoid_plus_alpha_diff = utils.fma_packed_f32x2(
            (alpha, alpha), silu_x_minus_product, sigmoid_alpha_x
        )
        d_silu_x_dout = utils.mul_packed_f32x2(sigmoid_plus_alpha_diff, dout)
        dx = utils.fma_packed_f32x2(d_silu_x_dout, y, d_silu_x_dout)
        dy = silu_x_dout
        swiglu_out = utils.fma_packed_f32x2(silu_x, y, silu_x)
        return dx, dy, swiglu_out
390
+
391
+
392
@dsl_user_op
def glu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """GLU: Gated Linear Unit
    glu(x, y) = sigmoid(x) * y
    Using tanh to compute sigmoid: sigmoid(x) = 0.5 * (1 + tanh(x/2))
    """
    if const_expr(not isinstance(x, tuple)):
        sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
        return sigmoid_x * y  # FMUL
    else:
        # Packed f32x2 path.
        sigmoid_x = sigmoid(x)
        return utils.mul_packed_f32x2(sigmoid_x, y)
404
+
405
+
406
@dsl_user_op
def dglu(
    x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    GLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: glu_out = sigmoid(x) * y, and dout = grad w.r.t. glu_out
    Returns: (dx, dy, glu_out) where:
        - dx = dout * y * sigmoid(x) * (1 - sigmoid(x))
        - dy = dout * sigmoid(x)
        - glu_out = sigmoid(x) * y
    """
    if const_expr(not isinstance(x, tuple)):
        # Compute sigmoid(x) using tanh: sigmoid(x) = 0.5 * (1 + tanh(x/2))
        sigmoid_x = sigmoid(x)  # FMUL, MUFU.TANH, then FFMA
        sigmoid_x_dout = sigmoid_x * dout  # FMUL
        glu_out = sigmoid_x * y  # FMUL
        # dx = y * sigmoid(x) * (1 - sigmoid(x)) * dout
        #    = y * (1 - sigmoid(x)) * sigmoid_x_dout
        #    = (y - y * sigmoid(x)) * sigmoid_x_dout
        #    = (y - glu_out) * sigmoid_x_dout
        dx = (y - glu_out) * sigmoid_x_dout  # FADD, FMUL
        dy = sigmoid_x_dout
        # Total: 1 MUFU.TANH, 4 FMUL, 1 FADD, 1 FFMA
        return dx, dy, glu_out
    else:
        # Packed f32x2 path mirrors the scalar rewrite (y - glu_out) * sigmoid_x_dout.
        sigmoid_x = sigmoid(x)
        sigmoid_x_dout = utils.mul_packed_f32x2(sigmoid_x, dout)
        glu_out = utils.mul_packed_f32x2(sigmoid_x, y)
        # dx = (y - glu_out) * sigmoid_x_dout
        y_minus_glu_out = utils.sub_packed_f32x2(y, glu_out)
        dx = utils.mul_packed_f32x2(y_minus_glu_out, sigmoid_x_dout)
        dy = sigmoid_x_dout
        return dx, dy, glu_out
440
+
441
+
442
@dsl_user_op
def reglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """ReGLU: ReLU Gated Linear Unit
    reglu(x, y) = relu(x) * y = max(x, 0) * y
    """
    if const_expr(not isinstance(x, tuple)):
        return cute.arch.fmax(x, Float32(0.0)) * y
    else:
        # Packed f32x2 path.
        relu_x = relu(x)
        return utils.mul_packed_f32x2(relu_x, y)
452
+
453
+
454
@dsl_user_op
@cute.jit
def dreglu(
    x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    ReGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: reglu_out = relu(x) * y, and dout = grad w.r.t. reglu_out
    Returns: (dx, dy, reglu_out) where:
        - dx = dout * y if x > 0, else 0
        - dy = dout * relu(x)
        - reglu_out = relu(x) * y
    """
    if const_expr(not isinstance(x, tuple)):
        x_pos = Boolean(x > 0)
        relu_x = cute.arch.fmax(x, Float32(0.0))
        dx = (dout * y) if x_pos else Float32(0.0)
        dy = dout * relu_x
        reglu_out = relu_x * y
        return dx, dy, reglu_out
    else:
        # Packed f32x2 path: per-lane predicates select the gradient.
        x0_pos = Boolean(x[0] > 0)
        x1_pos = Boolean(x[1] > 0)
        relu_x = relu(x)
        dout_y = utils.mul_packed_f32x2(dout, y)
        dx = ((dout_y[0] if x0_pos else Float32(0.0)), (dout_y[1] if x1_pos else Float32(0.0)))
        dy = utils.mul_packed_f32x2(dout, relu_x)
        reglu_out = utils.mul_packed_f32x2(relu_x, y)
        return dx, dy, reglu_out
483
+
484
+
485
@dsl_user_op
def geglu(x: F32_or_F32x2, y: F32_or_F32x2, *, loc=None, ip=None) -> F32_or_F32x2:
    """GeGLU: GELU Gated Linear Unit
    geglu(x, y) = gelu(x) * y
    Uses the tanh approximation of GELU
    """
    if const_expr(not isinstance(x, tuple)):
        return gelu_tanh_approx(x) * y
    else:
        # Packed f32x2 path.
        return utils.mul_packed_f32x2(gelu_tanh_approx(x), y)
495
+
496
+
497
@dsl_user_op
def dgeglu(
    x: F32_or_F32x2, y: F32_or_F32x2, dout: F32_or_F32x2, *, loc=None, ip=None
) -> Tuple[F32_or_F32x2, F32_or_F32x2, F32_or_F32x2]:
    """
    GeGLU backward pass: computes gradients w.r.t. x (gate) and y (up projection)
    Given: geglu_out = gelu(x) * y, and dout = grad w.r.t. geglu_out
    Returns: (dx, dy, geglu_out) where:
        - dx = dout * y * d_gelu(x)
        - dy = dout * gelu(x)
        - geglu_out = gelu(x) * y
    """
    if const_expr(not isinstance(x, tuple)):
        # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
        dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
        # Compute gradients for geglu
        dx = dgelu_x_dout * y
        dy = gelu_x * dout
        geglu_out = gelu_x * y
        return dx, dy, geglu_out
    else:
        # Packed f32x2 path: dgelu_tanh_approx handles tuples natively.
        # Reuse dgelu_tanh_approx to compute d_gelu(x) * dout and gelu(x)
        dgelu_x_dout, gelu_x = dgelu_tanh_approx(x, dout)
        # Compute gradients for geglu
        dx = utils.mul_packed_f32x2(dgelu_x_dout, y)
        dy = utils.mul_packed_f32x2(gelu_x, dout)
        geglu_out = utils.mul_packed_f32x2(gelu_x, y)
        return dx, dy, geglu_out
build/torch-cuda/quack/autotuner.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/triton-lang/triton/blob/main/python/triton/runtime/autotuner.py
2
+ # Copyright (C) 2025, Tri Dao.
3
+ from __future__ import annotations
4
+
5
+ import builtins
6
+ import os
7
+ import time
8
+ import inspect
9
+ import base64
10
+ import hashlib
11
+ import json
12
+ from pathlib import Path
13
+ from functools import cached_property, partial
14
+ from typing import Dict, Tuple, List, Optional, Any
15
+
16
+ import torch
17
+ from torch import Tensor
18
+
19
+ import triton
20
+
21
+ from . import __version__
22
+
23
+
24
+ PACKAGE_NAME = "quack"
25
+ VERSION = __version__
26
+
27
+
28
def get_home_dir():
    """Return the package home directory: $QUACK_HOME if set, else the user's home."""
    return os.getenv(f"{PACKAGE_NAME.upper()}_HOME", Path.home())
30
+
31
+
32
def default_cache_dir():
    """Return the default on-disk cache location, e.g. ~/.quack/cache."""
    return os.path.join(get_home_dir(), f".{PACKAGE_NAME}", "cache")
34
+
35
+
36
class FileCacheManager(triton.runtime.cache.FileCacheManager):
    """Triton file-cache manager redirected to this package's cache directory.

    Overrides the cache root with $QUACK_CACHE_DIR (or the package default)
    so autotune results do not mix with triton's own kernel cache.
    """

    def __init__(self, key):
        super().__init__(key)
        # Recompute cache_dir after the parent init: env override wins, empty
        # string falls back to the package default.
        self.cache_dir = (
            os.getenv(f"{PACKAGE_NAME.upper()}_CACHE_DIR", "").strip() or default_cache_dir()
        )
        if self.cache_dir:
            self.cache_dir = os.path.join(self.cache_dir, self.key)
            self.lock_path = os.path.join(self.cache_dir, "lock")
            os.makedirs(self.cache_dir, exist_ok=True)
        else:
            raise RuntimeError("Could not create or locate cache dir")
48
+
49
+
50
+ def _base32(key):
51
+ # Assume key is a hex string.
52
+ return base64.b32encode(bytes.fromhex(key)).decode("utf-8").rstrip("=")
53
+
54
+
55
class Autotuner:
    """Benchmarks a function over a set of AutotuneConfigs and caches the winner.

    Adapted from triton's runtime autotuner. Results are keyed on the values of
    the user-supplied ``key`` argument names plus the shapes/strides/dtypes of
    tensor arguments, and can optionally be persisted to disk.
    """

    def __init__(
        self,
        fn,
        key,
        configs,
        restore_value=None,
        prune_configs_by: Optional[Dict] = None,
        do_bench=None,
        cache_results=False,
    ):
        """
        :param prune_configs_by: a dict of functions that are used to prune configs, fields:
            'perf_model': performance model used to predicate running time with different configs, returns running time
            'top_k': number of configs to bench
            'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs:List[Config] as its input, and returns pruned configs.
        """
        if not configs:
            # No configs supplied: fall back to a single empty (default) config.
            self.configs = [AutotuneConfig()]
        else:
            self.configs = configs
        signature = inspect.signature(fn)
        self.keys = key
        # Maps the tuning key tuple -> best AutotuneConfig found so far.
        self.cache: Dict[Tuple, AutotuneConfig] = {}
        self.arg_names = list(signature.parameters.keys())
        # Disk caching can also be forced on via the environment.
        self.cache_results = (
            cache_results or os.getenv(f"{PACKAGE_NAME.upper()}_CACHE_AUTOTUNING", None) == "1"
        )

        self.restore_value = []
        if restore_value is not None:
            self.restore_value = list(restore_value)

        # pre/post hooks snapshot and restore in-place-mutated tensor args so
        # that repeated benchmark runs see identical inputs.
        if len(self.restore_value) > 0:

            def _pre_hook(kwargs):
                self.restore_copies = {name: kwargs[name].clone() for name in self.restore_value}

            self.pre_hook = _pre_hook
        else:
            self.pre_hook = None

        if len(self.restore_value) > 0:

            def _post_hook(kwargs, exception):
                for name in self.restore_value:
                    kwargs[name].copy_(self.restore_copies[name])
                self.restore_copies = {}

            self.post_hook = _post_hook
        else:
            self.post_hook = None

        self.perf_model = None
        self.configs_top_k = 1.0
        self.early_config_prune = None
        if prune_configs_by:
            self.perf_model = prune_configs_by.get("perf_model", self.perf_model)
            self.configs_top_k = prune_configs_by.get("top_k", self.configs_top_k)
            self.early_config_prune = prune_configs_by.get(
                "early_config_prune", self.early_config_prune
            )

        self.fn = fn
        self._do_bench = do_bench

    @cached_property
    def do_bench(self):
        # Lazily default to triton's do_bench with short warmup/rep times.
        if self._do_bench is None:
            return partial(triton.testing.do_bench, warmup=5, rep=25)
        return self._do_bench

    def _bench(self, *args, config, **meta):
        """Benchmark ``self.fn`` under one config; returns (median, p20, p80) times.

        Returns a list of infs if the config fails to run, so a bad config
        simply loses the min() instead of aborting tuning.
        """
        verbose = os.environ.get(f"{PACKAGE_NAME.upper()}_PRINT_AUTOTUNING", None) == "1"
        if verbose:
            print(f"Autotuning kernel {self.fn.__name__} with config {config}")

        # check for conflicts, i.e. meta-parameters both provided
        # as kwargs and by the autotuner
        conflicts = meta.keys() & config.kwargs.keys()
        if conflicts:
            raise ValueError(
                f"Conflicting meta-parameters: {', '.join(conflicts)}."
                " Make sure that you don't re-define auto-tuned symbols."
            )
        # augment meta-parameters with tunable ones
        current = dict(meta, **config.all_kwargs())
        full_nargs = {**self.nargs, **current}

        def kernel_call():
            if self.pre_hook is not None:
                self.pre_hook(full_nargs)
            try:
                self.fn.__call__(
                    *args,
                    **current,
                )
            except Exception as e:
                try:
                    if self.post_hook is not None:
                        self.post_hook(full_nargs, exception=e)
                finally:
                    # Throw exception raised by `self.fn.run`
                    raise

            if self.post_hook is not None:
                self.post_hook(full_nargs, exception=None)

        try:
            return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
        except Exception as e:
            if verbose:
                print(f"Autotuning failed with {e}")
            return [float("inf"), float("inf"), float("inf")]

    @torch.compiler.disable
    def check_disk_cache(self, tuning_key, configs, bench_fn):
        """Load timings for ``tuning_key`` from disk if present, else run ``bench_fn`` and persist.

        NOTE(review): the force-update env var is compared truthily, so
        QUACK_FORCE_CACHE_UPDATE="0" still forces an update — confirm intended.
        """
        if not tuning_key:
            # Empty key: nothing to key the cache on, just benchmark.
            bench_fn()
            return

        fn = self.fn
        config_str_list = [str(c) for c in configs]
        assert len(config_str_list) == len(set(config_str_list)), "Config strings must be unique"
        # Cache key covers package version, tuning key, and the config set.
        cache_key = [VERSION, str(tuning_key)] + config_str_list
        cache_key = hashlib.sha256("-".join(cache_key).encode("utf-8")).hexdigest()
        cache = FileCacheManager(_base32(cache_key))
        file_name = f"{fn.__name__[:150]}.autotune.json"
        path = cache.get_file(file_name)
        # There's an environment variable to force cache update
        if path and not os.environ.get(f"{PACKAGE_NAME.upper()}_FORCE_CACHE_UPDATE", False):
            str2config = {s: c for s, c in zip(config_str_list, configs)}
            with open(path, "r") as cached_configs:
                timings = json.load(cached_configs)["configs_timings"]
                # Configs were serialized as strings; map them back to objects.
                timings = {str2config[config]: timing for config, timing in timings}
            self.cache[tuning_key] = builtins.min(timings, key=timings.get)
            self.configs_timings = timings
            self.bench_time = 0
            return

        bench_fn()
        cache.put(
            json.dumps(
                {
                    "key": tuning_key,
                    "configs_timings": [
                        (str(config), timings) for config, timings in self.configs_timings.items()
                    ],
                }
            ),
            file_name,
            binary=False,
        )

    def __call__(self, *args, **kwargs):
        """Dispatch to ``self.fn`` with the best config, tuning first on a cache miss."""
        self.nargs = dict(zip(self.arg_names, args))
        used_cached_result = True
        if len(self.configs) > 1:
            all_args = {**self.nargs, **kwargs}
            _args = {k: v for (k, v) in all_args.items() if k in self.arg_names}
            # Need "str" to make it json-serializable
            key = [str(_args[key]) for key in self.keys if key in _args]
            for _, arg in _args.items():
                if isinstance(arg, Tensor):
                    key.append(str(arg.shape))
                    # If stride != 0, 1, we just cache it as 2
                    key.append(str([s if s in {0, 1} else 2 for s in arg.stride()]))
                    key.append(str(arg.dtype))
            key = tuple(key)
            if key not in self.cache:
                used_cached_result = False
                pruned_configs = self.prune_configs(kwargs)

                @torch.compiler.disable  # Don't want any tracing here
                def benchmark():
                    bench_start = time.time()
                    timings = {
                        config: self._bench(*args, config=config, **kwargs)
                        for config in pruned_configs
                    }
                    bench_end = time.time()
                    if os.getenv(f"{PACKAGE_NAME.upper()}_PRINT_AUTOTUNING", None) == "1":
                        for config, time_ in timings.items():
                            print(f"[{config}] -> {time_[0]:.3f}ms")
                    self.bench_time = bench_end - bench_start
                    self.cache[key] = builtins.min(timings, key=timings.get)
                    self.configs_timings = timings

                if self.cache_results:
                    self.check_disk_cache(key, pruned_configs, benchmark)
                else:
                    benchmark()

            config = self.cache[key]
        else:
            # Single config: nothing to tune.
            config = self.configs[0]
        self.best_config = config
        if (
            os.getenv(f"{PACKAGE_NAME.upper()}_PRINT_AUTOTUNING", None) == "1"
            and not used_cached_result
        ):
            print(
                f"{PACKAGE_NAME} autotuning for function {self.fn.__name__} finished after "
                f"{self.bench_time:.2f}s; best config selected: {self.best_config};"
            )
        ret = self.fn.__call__(
            *args,
            **kwargs,
            **config.all_kwargs(),
        )
        self.nargs = None
        return ret

    def prune_configs(self, kwargs: Dict) -> List[Any]:
        """Narrow the config list via ``early_config_prune`` and/or the perf model top-k."""
        pruned_configs = self.configs
        if self.early_config_prune:
            pruned_configs = self.early_config_prune(self.configs, self.nargs, **kwargs)
        if self.perf_model:
            top_k = self.configs_top_k
            if isinstance(top_k, float) and top_k <= 1.0:
                # Fractional top_k: interpret as a fraction of the full config list.
                top_k = int(len(self.configs) * top_k)
            elif not isinstance(top_k, int):
                # Slice index must be an integer
                raise TypeError(
                    "Error while pruning configs, top_k must be either 1) a float <= 1.0 or 2) an int"
                )

            if len(pruned_configs) > top_k:
                est_timing = {
                    config: self.perf_model(
                        **self.nargs,
                        **kwargs,
                        **config.all_kwargs(),
                    )
                    for config in pruned_configs
                }
                pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
        return pruned_configs
293
+
294
+
295
class AutotuneConfig:
    """
    An object that represents a possible kernel configuration for the auto-tuner to try.

    :ivar kwargs: a dictionary of meta-parameters to pass to the kernel as keyword arguments.
    :type kwargs: dict[Str, Any]
    """

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def __setstate__(self, state):
        # Unpickling support: restore kwargs, defaulting to an empty config.
        self.kwargs = state.get("kwargs", {})

    def all_kwargs(self):
        """Return every meta-parameter this config contributes to a kernel call."""
        return self.kwargs

    def __str__(self):
        res = []
        for k, v in self.kwargs.items():
            res.append(f"{k}: {v}")
        return ", ".join(res)

    def __hash__(self):
        # BUG FIX: previously `hash(tuple(*self.all_kwargs().items()))`, which
        # unpacks the items as positional args to tuple() and raises TypeError
        # for any config with more than one kwarg. Hash the ordered (key, value)
        # pairs instead.
        return hash(tuple(self.all_kwargs().items()))

    def __eq__(self, other):
        # BUG FIX: same tuple(*...) unpacking error as __hash__; also return
        # NotImplemented for foreign types instead of raising AttributeError.
        if not isinstance(other, AutotuneConfig):
            return NotImplemented
        return tuple(self.all_kwargs().items()) == tuple(other.all_kwargs().items())
325
+
326
+
327
def autotune(
    configs, key=None, prune_configs_by=None, restore_value=None, do_bench=None, cache_results=True
):
    """
    Decorator for auto-tuning a function.

    .. highlight:: python

    If the environment variable :code:`QUACK_PRINT_AUTOTUNING` is set to
    :code:`"1"`, we will print a message to stdout after autotuning each
    kernel, including the time spent autotuning and the best configuration.

    :param configs: a list of :code:`AutotuneConfig` objects
    :type configs: list[AutotuneConfig]
    :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
    :type key: list[str]
    :param prune_configs_by: a dict of functions that are used to prune configs, fields:
        'perf_model': performance model used to predicate running time with different configs, returns running time
        'top_k': number of configs to bench
        'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs.
    :param restore_value: a list of argument names whose value will be restored after evaluating any configs.
    :type restore_value: list[str]
    :param do_bench: a benchmark function to measure the time of each run.
    :type do_bench: lambda fn, quantiles
    :param cache_results: whether to cache autotune timings to disk. Defaults to True.
    :type cache_results: bool
    """
    # Fixes vs. previous version: the docstring was an f-string (f-strings are
    # not stored in __doc__), claimed the cache_results default was False while
    # the signature default is True, and had a `"type` typo for `:type`.
    if key is None:
        key = []

    def decorator(fn):
        return Autotuner(
            fn,
            key,
            configs,
            restore_value=restore_value,
            prune_configs_by=prune_configs_by,
            do_bench=do_bench,
            cache_results=cache_results,
        )

    return decorator
build/torch-cuda/quack/broadcast_utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+ from typing import Callable
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+ from cutlass import Float32, const_expr
7
+
8
+ from .layout_utils import make_acc_tensor_mn_view
9
+
10
+
11
@cute.jit
def vec_op(tCrC: cute.Tensor, tCrVec: cute.Tensor, op: Callable, is_colvec: bool) -> None:
    """Apply ``op`` between a register fragment and a broadcast vector, in place.

    The fragment is viewed in (M, N) form; ``tCrVec`` is broadcast along rows
    (is_colvec=True) or columns (is_colvec=False). Arithmetic is done in f32:
    non-f32 fragments are converted up front and written back at the end.
    """
    if const_expr(tCrC.element_type != Float32):  # Convert to f32
        tCrC_f32 = cute.make_fragment(tCrC.shape, Float32)
        tCrC_f32.store(tCrC.load().to(Float32))
    else:
        tCrC_f32 = tCrC
    # this happens to work for frgA layout too, not just acc layout
    tCrC_f32_mn = make_acc_tensor_mn_view(tCrC_f32)
    if const_expr(is_colvec):
        # One vector element per fragment row.
        assert cute.size(tCrC_f32_mn, mode=[0]) == cute.size(tCrVec)
        for r in cutlass.range(cute.size(tCrC_f32_mn, mode=[0]), unroll_full=True):
            tCrC_f32_mn[r, None].store(op(tCrC_f32_mn[r, None].load(), tCrVec[r]))
    else:
        # One vector element per fragment column.
        assert cute.size(tCrC_f32_mn, mode=[1]) == cute.size(tCrVec)
        for c in cutlass.range(cute.size(tCrC_f32_mn, mode=[1]), unroll_full=True):
            tCrC_f32_mn[None, c].store(op(tCrC_f32_mn[None, c].load(), tCrVec[c]))
    if const_expr(tCrC.element_type != Float32):  # Convert back to original dtype
        tCrC.store(tCrC_f32.load().to(tCrC.element_type))
build/torch-cuda/quack/compile_utils.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
+
3
+ from typing import Optional
4
+
5
+ import cutlass.cute as cute
6
+
7
+
8
def make_fake_tensor(dtype, shape, divisibility=1, leading_dim=-1) -> Optional[cute.Tensor]:
    """Build a fake cute tensor for compilation, or None when dtype is None.

    The leading (contiguous) dimension gets stride 1; every other dimension
    gets a fresh symbolic int64 stride carrying the given divisibility.
    """
    if dtype is None:
        return None
    if leading_dim < 0:
        # Normalize a negative index the way Python sequences do.
        leading_dim = len(shape) + leading_dim
    stride = tuple(
        1 if dim == leading_dim else cute.sym_int64(divisibility=divisibility)
        for dim in range(len(shape))
    )
    return cute.runtime.make_fake_tensor(
        dtype, shape, stride=stride, assumed_align=divisibility * dtype.width // 8
    )
build/torch-cuda/quack/copy_utils.py ADDED
@@ -0,0 +1,614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
+
3
+ import re
4
+ from typing import Optional, Type, Tuple, Callable
5
+
6
+ import cutlass
7
+ import cutlass.cute as cute
8
+
9
+ from cutlass import Int32, Boolean, const_expr
10
+ from cutlass.cute.nvgpu import cpasync, warpgroup
11
+ from cutlass.cutlass_dsl import dsl_user_op
12
+ import cutlass.pipeline
13
+
14
+
15
@dsl_user_op
def cvt_copy(
    tiled_copy: cute.TiledCopy,
    src: cute.Tensor,
    dst: cute.Tensor,
    *,
    pred: Optional[cute.Tensor] = None,
    retile: bool = False,
    loc=None,
    ip=None,
    **kwargs,
) -> None:
    """Copy ``src`` to ``dst``, converting element type first if they differ.

    ``src`` must be a register fragment: the conversion writes into a fresh
    rmem fragment of ``dst``'s dtype. With ``retile=True`` the (possibly
    converted) source is retiled through ``tiled_copy`` before the copy.
    """
    assert isinstance(src.iterator, cute.Pointer) and src.memspace == cute.AddressSpace.rmem
    if const_expr(src.element_type != dst.element_type):
        src_cvt = cute.make_fragment_like(src, dst.element_type)
        src_cvt.store(src.load().to(dst.element_type))
        src = src_cvt
    if const_expr(retile):
        src = tiled_copy.retile(src)
    cute.copy(tiled_copy, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
35
+
36
+
37
@dsl_user_op
def load_s2r(src: cute.Tensor, *, loc=None, ip=None) -> cute.Tensor:
    """Load ``src`` into a new register fragment (auto-vectorized copy) and return it."""
    dst = cute.make_fragment_like(src, src.element_type, loc=loc, ip=ip)
    cute.autovec_copy(src, dst, loc=loc, ip=ip)
    return dst
42
+
43
+
44
@dsl_user_op
def load_s2r_retile(
    tiled_copy: cute.TiledCopy,
    src: cute.Tensor,
    dst_shape: cute.Tensor | cute.Shape,
    *,
    loc=None,
    ip=None,
) -> cute.Tensor:
    """Copy ``src`` into a register fragment, retiled for ``tiled_copy``.

    ``dst_shape`` is either a shape (a new fragment is allocated) or an
    existing tensor to write into; the destination is returned either way.
    """
    # Will also accept dst_shape being a tensor, in which case we write into that tensor
    if const_expr(not isinstance(dst_shape, cute.Tensor)):
        dst = cute.make_fragment(dst_shape, src.element_type, loc=loc, ip=ip)
    else:
        dst = dst_shape
    cute.copy(tiled_copy, src, tiled_copy.retile(dst), loc=loc, ip=ip)
    return dst
60
+
61
+
62
@dsl_user_op
def get_copy_atom(
    dtype: Type[cutlass.Numeric], num_copy_elems: int, is_async: bool = False, *, loc=None, ip=None
) -> cute.CopyAtom:
    """Make a copy atom vectorized over ``num_copy_elems`` elements, capped at 128 bits.

    ``is_async=True`` selects the cp.async G2S op; otherwise a universal copy.
    """
    num_copy_bits = const_expr(min(128, num_copy_elems * dtype.width))
    copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
    return cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
69
+
70
+
71
@dsl_user_op
def copy(
    src: cute.Tensor,
    dst: cute.Tensor,
    *,
    pred: Optional[cute.Tensor] = None,
    is_async: bool = False,
    loc=None,
    ip=None,
    **kwargs,
) -> None:
    """Copy ``src`` to ``dst`` with a copy atom vectorized to the partitioned value width.

    NOTE(review): reads src.shape[0][0] as the per-thread vector length —
    assumes src is partitioned as ((elems, rest), ...); confirm at call sites.
    """
    num_copy_elems = src.shape[0][0]
    copy_atom = get_copy_atom(src.element_type, num_copy_elems, is_async)
    cute.copy(copy_atom, src, dst, pred=pred, loc=loc, ip=ip, **kwargs)
85
+
86
+
87
def tiled_copy_1d(
    dtype: Type[cutlass.Numeric], num_threads: int, num_copy_elems: int = 1, is_async: bool = False
) -> cute.TiledCopy:
    """Build a 1-D tiled copy: ``num_threads`` threads, each moving ``num_copy_elems`` elements."""
    num_copy_bits = num_copy_elems * dtype.width
    copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
    copy_atom = cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
    thr_layout = cute.make_layout(num_threads)
    val_layout = cute.make_layout(num_copy_elems)
    return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
96
+
97
+
98
def tiled_copy_2d(
    dtype: Type[cutlass.Numeric],
    threads_per_row: int,
    num_threads: int,
    num_copy_elems: int = 1,
    is_async: bool = False,
) -> cute.TiledCopy:
    """Build a 2-D tiled copy with a row-major thread arrangement.

    Threads are laid out as (num_threads // threads_per_row, threads_per_row)
    with the row dimension fastest; each thread copies ``num_copy_elems``
    contiguous elements along the row.
    """
    num_copy_bits = num_copy_elems * dtype.width
    copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
    copy_atom = cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
    assert num_threads % threads_per_row == 0
    thr_layout = cute.make_ordered_layout(
        (num_threads // threads_per_row, threads_per_row),
        order=(1, 0),
    )
    val_layout = cute.make_layout((1, num_copy_elems))
    return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
115
+
116
+
117
@cute.jit
def predicate_k(tAcA: cute.Tensor, limit: Int32) -> cute.Tensor:
    """Build a boolean predicate fragment masking out-of-bounds k coordinates.

    Only the k dimension is predicated (stride 0 over the mn mode), so the mn
    bound is expected to be handled by the caller with an "if".
    """
    # Only compute predicates for the "k" dimension. For the mn dimension, we will use "if"
    tApA = cute.make_fragment(
        cute.make_layout(
            (cute.size(tAcA, mode=[0, 1]), cute.size(tAcA, mode=[1]), cute.size(tAcA, mode=[2])),
            stride=(cute.size(tAcA, mode=[2]), 0, 1),
        ),
        Boolean,
    )
    for rest_v in cutlass.range_constexpr(tApA.shape[0]):
        for rest_k in cutlass.range_constexpr(tApA.shape[2]):
            # Compare the k coordinate (index [1]) of each element against the limit.
            tApA[rest_v, 0, rest_k] = cute.elem_less(tAcA[(0, rest_v), 0, rest_k][1], limit)
    return tApA
131
+
132
+
133
+ # def tiled_copy_2d(
134
+ # dtype: Type[cutlass.Numeric], major_mode_size: int, num_threads: int, is_async: bool = False
135
+ # ) -> cute.TiledCopy:
136
+ # num_copy_bits = math.gcd(major_mode_size, 128 // dtype.width) * dtype.width
137
+ # copy_elems = num_copy_bits // dtype.width
138
+ # copy_op = cpasync.CopyG2SOp() if is_async else cute.nvgpu.CopyUniversalOp()
139
+ # copy_atom = cute.make_copy_atom(copy_op, dtype, num_bits_per_copy=num_copy_bits)
140
+ # gmem_threads_per_row = major_mode_size // copy_elems
141
+ # assert num_threads % gmem_threads_per_row == 0
142
+ # thr_layout = cute.make_ordered_layout(
143
+ # (num_threads // gmem_threads_per_row, gmem_threads_per_row),
144
+ # order=(1, 0),
145
+ # )
146
+ # val_layout = cute.make_layout((1, copy_elems))
147
+ # return cute.make_tiled_copy_tv(copy_atom, thr_layout, val_layout)
148
+
149
+
150
def parse_swizzle_from_pointer(ptr: cute.Pointer) -> Tuple[int, int, int]:
    """Extract swizzle parameters from a pointer's swizzle_type.

    The swizzle_type string has the form '!cute.swizzle<"S<b,m,s>">' where
    b, m, s are the swizzle parameters (bits, base, shift).

    Returns:
        The (b, m, s) swizzle parameters as a tuple of ints.

    Raises:
        ValueError: If the swizzle_type string cannot be parsed
    """
    # Ideally there should be a better API to get swizzle parameters, but we'll just parse
    # the string here.
    swizzle_str = str(ptr.type.swizzle_type)
    # Extract the inner part "S<b,m,s>"
    match = re.search(r"S<(\d+),(\d+),(\d+)>", swizzle_str)
    if match:
        b, m, s = int(match.group(1)), int(match.group(2)), int(match.group(3))
        return b, m, s
    else:
        raise ValueError(f"Could not parse swizzle_type: {swizzle_str}")
172
+
173
+
174
def swizzle_int(ptr_int: Int32, b: int, m: int, s: int) -> Int32:
    """Apply an S<b, m, s> swizzle to a raw integer address.

    XORs the b "YYY" bits sitting above the m base bits (offset by shift s)
    down into the lower bits, matching cute's swizzle functor semantics.
    """
    yyy_msk = ((1 << b) - 1) << (m + s)
    return ptr_int ^ ((ptr_int & yyy_msk) >> s)
178
+
179
+
180
def swizzle_ptr(ptr: cute.Pointer):
    """Return a new plain pointer whose address has the pointer's own swizzle applied."""
    b, m, s = parse_swizzle_from_pointer(ptr)
    ptr_int = swizzle_int(ptr.toint(), b, m, s)
    return cute.make_ptr(ptr.dtype, ptr_int, ptr.memspace, assumed_align=ptr.alignment)
184
+
185
+
186
def as_position_independent_swizzle_tensor(tensor: cute.Tensor) -> cute.Tensor:
    """Move a pointer-level swizzle into the tensor's layout (composed layout).

    The resulting tensor has an unswizzled pointer and a layout that performs
    the swizzle, so slicing/partitioning stays position independent.
    """
    outer = tensor.layout
    width = tensor.element_type.width
    inner = cute.make_swizzle(*parse_swizzle_from_pointer(tensor.iterator))
    # Need to recast the swizzle from byte (e.g. <3, 4, 3> to element units (e.g. <3, 3, 3> for
    # for 16 bits and <3, 2, 3> for 32 bits)
    new_layout = cute.recast_layout(
        width, 8, cute.make_composed_layout(inner, 0, cute.recast_layout(8, width, outer))
    )
    # recast_ptr to remove the pointer swizzle
    return cute.make_tensor(cute.recast_ptr(tensor.iterator, dtype=tensor.element_type), new_layout)
197
+
198
+
199
def partition_D_position_independent(
    thr_copy: cute.core.ThrCopy, tensor: cute.Tensor
) -> cute.Tensor:
    """partition_D of a swizzled tensor, with the swizzle applied to the base pointer.

    Combines the swizzled base address of the ordinary partition with the
    layout of the position-independent (layout-swizzled) partition.
    """
    return cute.make_tensor(
        swizzle_ptr(thr_copy.partition_D(tensor).iterator),
        thr_copy.partition_D(as_position_independent_swizzle_tensor(tensor)).layout,
    )
206
+
207
+
208
def partition_S_position_independent(
    thr_copy: cute.core.ThrCopy, tensor: cute.Tensor
) -> cute.Tensor:
    """partition_S counterpart of partition_D_position_independent (source-side)."""
    return cute.make_tensor(
        swizzle_ptr(thr_copy.partition_S(tensor).iterator),
        thr_copy.partition_S(as_position_independent_swizzle_tensor(tensor)).layout,
    )
215
+
216
+
217
@dsl_user_op
def sm90_get_smem_load_op(
    layout_c: cutlass.utils.LayoutEnum,
    elem_ty_c: Type[cutlass.Numeric],
    *,
    loc=None,
    ip=None,
) -> cute.CopyAtom:
    """
    Selects the largest vectorized smem load atom available subject to constraint of gmem layout.

    Parameters:
    -----------
    layout_c : LayoutEnum
        The layout enum of the output tensor.

    elem_ty_c : Type[Numeric]
        The element type for the output tensor.

    Returns:
    --------
    A copy atom: ldmatrix (x4) for 16-bit types, universal copy otherwise.
    """

    if not isinstance(elem_ty_c, cutlass.cutlass_dsl.NumericMeta):
        raise TypeError(f"elem_ty_c must be a Numeric, but got {elem_ty_c}")
    is_m_major = layout_c.is_m_major_c()
    if elem_ty_c.width == 16:
        # 16-bit element: use ldmatrix with 4 matrices, transposed if m-major.
        return cute.make_copy_atom(
            cute.nvgpu.warp.LdMatrix8x8x16bOp(is_m_major, 4), elem_ty_c, loc=loc, ip=ip
        )
    else:
        # No ldmatrix variant applies; fall back to a universal copy.
        return cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), elem_ty_c, loc=loc, ip=ip)
250
+
251
+
252
def get_smem_store_atom(
    arch: cutlass.Constexpr[int], element_type: Type[cute.Numeric], transpose: bool = False
) -> cute.CopyAtom:
    """Pick the r2s store atom: stmatrix on SM90+ for 16-bit types, else universal copy.

    The universal fallback stores 2 elements per copy (1 when transposed).
    """
    if const_expr(arch < 90 or element_type.width != 16):
        return cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(),
            element_type,
            num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
        )
    else:
        return cute.make_copy_atom(
            cute.nvgpu.warp.StMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
            element_type,
        )
266
+
267
+
268
def get_smem_load_atom(
    arch: cutlass.Constexpr[int], element_type: Type[cute.Numeric], transpose: bool = False
) -> cute.CopyAtom:
    """Pick the s2r load atom: ldmatrix on SM90+ for 16-bit types, else universal copy.

    Mirrors get_smem_store_atom so loads and stores tile identically.
    """
    if const_expr(arch < 90 or element_type.width != 16):
        return cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(),
            element_type,
            num_bits_per_copy=(2 if not transpose else 1) * element_type.width,
        )
    else:
        return cute.make_copy_atom(
            cute.nvgpu.warp.LdMatrix8x8x16bOp(transpose=transpose, num_matrices=4),
            element_type,
        )
282
+
283
+
284
def get_smem_store_C(
    tiled_mma: cute.TiledMma,
    sC: cute.Tensor,
    tidx: Int32,
    arch: int,
    transpose: bool = False,
    position_independent=False,
) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
    """Set up a register->smem store for the C operand of ``tiled_mma``.

    Returns (copy_fn, thr_copy, tRS_sC): copy_fn(src, dst_idx) converts (if
    needed), retiles, and stores ``src`` into stage ``dst_idx`` of sC.
    """
    dtype = sC.element_type
    copy_atom = get_smem_store_atom(arch, dtype, transpose)
    tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
    thr_copy = tiled_copy.get_slice(tidx)
    if const_expr(not position_independent):
        tRS_sC = thr_copy.partition_D(sC)
    else:
        tRS_sC = partition_D_position_independent(thr_copy, sC)

    def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
        # Last mode of tRS_sC indexes the pipeline stage.
        cvt_copy(tiled_copy, src, tRS_sC[None, None, None, dst_idx], retile=True, **new_kwargs)

    return copy_fn, thr_copy, tRS_sC
305
+
306
+
307
def get_smem_load_C(
    tiled_mma: cute.TiledMma,
    sC: cute.Tensor,
    tidx: Int32,
    arch: int,
    transpose: bool = False,
    position_independent=False,
) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
    """Set up a smem->register load for the C operand of ``tiled_mma``.

    Returns (copy_fn, thr_copy, tSR_sC): copy_fn(src_idx) loads stage
    ``src_idx`` of sC into a fresh fragment shaped to match the store-side
    (r2s) partitioning, so loaded data lines up with what the store path wrote.
    """
    dtype = sC.element_type
    copy_atom = get_smem_load_atom(arch, dtype, transpose)
    tiled_copy = cute.make_tiled_copy_C(copy_atom, tiled_mma)
    thr_copy = tiled_copy.get_slice(tidx)
    if const_expr(not position_independent):
        tSR_sC = thr_copy.partition_S(sC)
    else:
        tSR_sC = partition_S_position_independent(thr_copy, sC)
    # Destination fragment shape comes from the store-side partitioning.
    copy_atom_RS = get_smem_store_atom(arch, dtype, transpose)
    thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
    tRS_shape = thr_copy_RS.partition_S(cute.make_identity_tensor(sC.shape[:2])).shape

    def copy_fn(src_idx: Int32, **new_kwargs):
        return load_s2r_retile(
            tiled_copy, tSR_sC[None, None, None, src_idx], dst_shape=tRS_shape, **new_kwargs
        )

    return copy_fn, thr_copy, tSR_sC
333
+
334
+
335
def get_smem_store_A(
    tiled_mma: cute.TiledMma, sA: cute.Tensor, tidx: Int32, arch: int, position_independent=False
) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
    """Set up a register->smem store for the A operand of ``tiled_mma``.

    Transposition is inferred from the MMA's A-operand major mode. Returns
    (copy_fn, thr_copy, tRS_sA) analogous to get_smem_store_C.
    """
    dtype = sA.element_type
    transpose = tiled_mma.op.a_major_mode == warpgroup.OperandMajorMode.MN
    copy_atom = get_smem_store_atom(arch, dtype, transpose)
    tiled_copy = cute.make_tiled_copy_A(copy_atom, tiled_mma)
    thr_copy = tiled_copy.get_slice(tidx)
    if const_expr(not position_independent):
        tRS_sA = thr_copy.partition_D(sA)
    else:
        tRS_sA = partition_D_position_independent(thr_copy, sA)

    def copy_fn(src: cute.Tensor, dst_idx: Int32, **new_kwargs):
        # Last mode of tRS_sA indexes the pipeline stage.
        cvt_copy(tiled_copy, src, tRS_sA[None, None, None, dst_idx], retile=True, **new_kwargs)

    return copy_fn, thr_copy, tRS_sA
352
+
353
+
354
def get_smem_load_A(
    tiled_mma: cute.TiledMma,
    sA: cute.Tensor,
    tidx: Int32,
    arch: int,
    with_dst_tensor: bool = False,
    position_independent=False,
) -> Tuple[Callable, cute.TiledCopy, cute.Tensor]:
    """Set up a smem->register load for the A operand of ``tiled_mma``.

    With ``with_dst_tensor=False`` the returned copy_fn(src_idx) allocates and
    returns a fragment shaped by partition_shape_A; with True it is
    copy_fn(src_idx, dst) writing into a caller-provided tensor.
    """
    dtype = sA.element_type
    transpose = tiled_mma.op.a_major_mode == warpgroup.OperandMajorMode.MN
    copy_atom = get_smem_load_atom(arch, dtype, transpose)
    tiled_copy = cute.make_tiled_copy_A(copy_atom, tiled_mma)
    thr_copy = tiled_copy.get_slice(tidx)
    if const_expr(not position_independent):
        tSR_sA = thr_copy.partition_S(sA)
    else:
        tSR_sA = partition_S_position_independent(thr_copy, sA)
    # NOTE(review): copy_atom_RS / thr_copy_RS are built but unused here (shape
    # comes from partition_shape_A) — possibly vestigial; confirm before removing.
    copy_atom_RS = get_smem_store_atom(arch, dtype, transpose)
    thr_copy_RS = cute.make_tiled_copy_C(copy_atom_RS, tiled_mma).get_slice(tidx)
    tRS_shape = tiled_mma.partition_shape_A(sA.shape[:2])

    def copy_fn(src_idx: Int32, **new_kwargs):
        return load_s2r_retile(
            tiled_copy, tSR_sA[None, None, None, src_idx], dst_shape=tRS_shape, **new_kwargs
        )

    def copy_fn_w_dst_tensor(src_idx: Int32, dst: cute.Tensor, **new_kwargs):
        return load_s2r_retile(tiled_copy, tSR_sA[None, None, None, src_idx], dst, **new_kwargs)

    return copy_fn if not with_dst_tensor else copy_fn_w_dst_tensor, thr_copy, tSR_sA
384
+
385
+
386
def tma_get_copy_fn(
    atom: cute.CopyAtom,
    cta_coord: cute.Coord,
    cta_layout: cute.Layout,
    src_tensor: cute.Tensor,
    dst_tensor: cute.Tensor,
    filter_zeros: bool = False,
    single_stage: bool = False,
    **kwargs,
) -> Callable:
    """Partition src/dst for TMA and return a copy closure.

    Direction (G2S vs S2G) is inferred from which tensor lives in smem; the
    last mode of each tensor is treated as the stage/rest index unless
    ``single_stage``. NOTE(review): despite the ``-> Callable`` annotation this
    returns a 3-tuple (copy_fn, s, g) — callers unpack all three.
    """
    src_is_smem = const_expr(
        isinstance(src_tensor.iterator, cute.Pointer)
        and src_tensor.memspace == cute.AddressSpace.smem
    )
    smem_tensor, gmem_tensor = (src_tensor, dst_tensor) if src_is_smem else (dst_tensor, src_tensor)
    group_rank_smem = const_expr(cute.rank(smem_tensor) - (1 if not single_stage else 0))
    group_rank_gmem = const_expr(cute.rank(gmem_tensor) - (1 if not single_stage else 0))
    # ((atom_v, rest_v), STAGE), ((atom_v, rest_v), RestK)
    s, g = cpasync.tma_partition(
        atom,
        cta_coord,
        cta_layout,
        cute.group_modes(smem_tensor, 0, group_rank_smem),
        cute.group_modes(gmem_tensor, 0, group_rank_gmem),
    )
    if const_expr(filter_zeros):
        s = cute.filter_zeros(s)
        g = cute.filter_zeros(g)
    src, dst = (s, g) if src_is_smem else (g, s)

    def copy_tma(src_idx, dst_idx, **new_kwargs):
        cute.copy(atom, src[None, src_idx], dst[None, dst_idx], **new_kwargs, **kwargs)

    def copy_tma_single_stage(**new_kwargs):
        cute.copy(atom, src, dst, **new_kwargs, **kwargs)

    return (copy_tma if const_expr(not single_stage) else copy_tma_single_stage), s, g
423
+
424
+
425
def tma_producer_copy_fn(copy: Callable, pipeline: cutlass.pipeline.PipelineAsync):
    """Bind a TMA copy closure to a producer pipeline.

    The returned closure derives the destination stage index and the TMA
    transaction barrier from the given producer pipeline state, then
    delegates to ``copy``.
    """

    def producer_copy(src_idx, producer_state: cutlass.pipeline.PipelineState, **new_kwargs):
        # Both the destination stage and the mbarrier come from the
        # producer-side pipeline state.
        barrier = pipeline.producer_get_barrier(producer_state)
        copy(
            src_idx=src_idx,
            dst_idx=producer_state.index,
            tma_bar_ptr=barrier,
            **new_kwargs,
        )

    return producer_copy
435
+
436
+
437
@cute.jit
def gather_m_get_copy_fn(
    thr_copy_A: cute.ThrCopy,
    mA: cute.Tensor,  # (whatever, K)
    sA: cute.Tensor,  # (tile_M, tile_N, STAGE)
    gsAIdx: cute.Tensor,  # (tile_M), either gmem or smem
    limit_m: Int32,
    limit_k: Int32,
) -> Callable:
    """Build a gmem->smem copy closure for A with row gathering along M.

    Row indices are read once from ``gsAIdx`` and cached in registers
    (out-of-bounds rows map to row 0, which is safe to load). The returned
    ``copy_fn(src_idx, dst_idx, pred=False)`` copies the ``src_idx``-th
    K-tile of the gathered rows into smem stage ``dst_idx``, optionally
    predicating the K dimension against ``limit_k``.
    """
    tile_shape_mk = (cute.size(sA, mode=[0]), cute.size(sA, mode=[1]))
    tAsA = thr_copy_A.partition_D(sA)
    # k-major
    assert tAsA.shape[2] == 1
    tAsA = cute.group_modes(cute.slice_(tAsA, (None, None, 0, None)), 0, 2)

    is_even_m_smem = tile_shape_mk[0] % thr_copy_A.tiler_mn[0].shape == 0
    if const_expr(not is_even_m_smem):
        limit_m = min(limit_m, tile_shape_mk[0])
    elems_per_load = cute.size(tAsA.shape[0][0])
    cA = cute.make_identity_tensor(tile_shape_mk)
    tAcA = thr_copy_A.partition_S(cA)
    t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
    # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
    # since we know that tAcA[m][0] = t0AcA[m][0] + tAcA[0][0].
    # This is so that when we do the comparison, t0AcA is known at compile time.
    limit_m = limit_m - tAcA[0][0]
    limit_k = limit_k - tAcA[0][1]
    # Read and cache indices for A
    rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
    cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
    # Per-row in-bounds predicate, computed once up front.
    tApA_m = cute.make_fragment(rows_per_thread, Boolean)
    for m in cutlass.range(rows_per_thread, unroll_full=True):
        tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
    # Gathered row indices, cached in registers for the lifetime of the closure.
    m_idx = cute.make_fragment(rows_per_thread, Int32)
    for m in cutlass.range(rows_per_thread, unroll_full=True):
        row_idx = tAcA[0, m, 0][0]
        if tApA_m[m]:
            m_idx[m] = gsAIdx[row_idx]
        else:
            m_idx[m] = 0  # It's ok to load row 0 in the case of OOB

    # Split the K mode into tiles of size tile_shape_mk[1] so src_idx selects a K-tile.
    mA_k = cute.logical_divide(mA, (None, tile_shape_mk[1]))

    def copy_fn(src_idx, dst_idx, pred: bool = False):
        tApA_k = None
        if const_expr(pred):
            # K-dimension predicate for the current K-tile.
            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
            limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
            for k in cutlass.range(cols_per_thread, unroll_full=True):
                tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
        mA_cur = mA_k[None, (None, src_idx)]
        for m in cutlass.range_constexpr(tAcA.shape[1]):
            # cute.tiled_divide(mA_cur[m_idx[m], None], (elems_per_load,)) would give shape
            # ((elems_per_load), thread_per_row)
            # But we actually want shape ((elems_per_load, 1), thread_per_row) to match tAsA
            # So we append 1s to the last dimension and then do tiled_divide, then slice.
            mA_row = cute.tiled_divide(
                cute.append_ones(mA_cur[m_idx[m], None], up_to_rank=2), (elems_per_load, 1)
            )[None, None, 0]
            if const_expr(is_even_m_smem) or tApA_m[m]:
                # There's only 1 load per row
                assert cute.size(tAcA.shape, mode=[2]) == 1
                ki = tAcA[0, 0, 0][1] // elems_per_load
                cute.copy(thr_copy_A, mA_row[None, ki], tAsA[(None, m), dst_idx], pred=tApA_k)

    return copy_fn
503
+
504
+
505
@cute.jit
def gather_k_get_copy_fn(
    thr_copy_A: cute.ThrCopy,
    mA: cute.Tensor,  # (tile_M, whatever)
    sA: cute.Tensor,  # (tile_M, tile_N, STAGE)
    gsAIdx: cute.Tensor,  # (tile_K, RestK), either gmem or smem
    limit_m: Int32,
    limit_k: Int32,
) -> Callable:
    """Build a gmem->smem copy closure for A with column gathering along K.

    Unlike the M-gather variant, the K indices change per K-tile, so they are
    prefetched per tile by one of two helpers: directly from gmem
    (``prefetch_from_gmem_fn``) when ``gsAIdx`` is a gmem tensor, or through
    a producer/consumer pipeline (``prefetch_from_smem_fn``) when the indices
    are staged in smem. Returns ``(copy_fn, prefetch_fn)``; ``copy_fn`` takes
    the ``(k_idx, tApA_k)`` pair produced by the prefetch function.
    """
    # Dispatch on where the K indices live; exactly one of gAIdx/sAIdx is set.
    gAIdx, sAIdx = None, None
    if const_expr(gsAIdx.memspace == cute.AddressSpace.gmem):
        gAIdx = gsAIdx
    else:
        assert gsAIdx.memspace == cute.AddressSpace.smem
        sAIdx = gsAIdx
    tile_shape_mk = (cute.size(sA, mode=[0]), cute.size(sA, mode=[1]))
    # (atom_v, CPY_M, 1, STAGE)
    tAsA = thr_copy_A.partition_D(sA)
    # m-major
    tAsA = cute.group_modes(tAsA, 0, 3)

    is_even_m_smem = tile_shape_mk[0] % thr_copy_A.tiler_mn[0].shape == 0
    if const_expr(not is_even_m_smem):
        limit_m = min(limit_m, tile_shape_mk[0])
    elems_per_load = cute.size(tAsA.shape[0][0])
    cA = cute.make_identity_tensor(tile_shape_mk)
    tAcA = thr_copy_A.partition_S(cA)
    t0AcA = thr_copy_A.get_slice(0).partition_S(cA)
    # Instead of comparing tAcA to limit_m, we instead compare t0AcA to limit_m - tAcA[0][0]
    # since we know that tAcA[m][0] = t0AcA[m][0] + tAcA[0][0].
    # This is so that when we do the comparison, t0AcA is known at compile time.
    limit_m = limit_m - tAcA[0][0]
    limit_k = limit_k - tAcA[0][1]
    # Read and cache indices for A
    rows_per_thread = const_expr(cute.size(tAcA.shape, mode=[1]))
    cols_per_thread = const_expr(cute.size(tAcA.shape, mode=[2]))
    # Per-row in-bounds predicate (M dimension), computed once.
    tApA_m = cute.make_fragment(rows_per_thread, Boolean)
    for m in cutlass.range(rows_per_thread, unroll_full=True):
        tApA_m[m] = t0AcA[0, m, 0][0] < limit_m
    threads_per_col = const_expr(thr_copy_A.tiler_mn[0].shape // elems_per_load)
    # This is very convoluted but idk a better way
    # for tile_M=128, flat_divide gives (8, 16, K),
    # then logical_divide gives ((8, 1), (8, 2), K).
    tidx = thr_copy_A.thr_idx
    tAmA = cute.logical_divide(
        cute.flat_divide(mA, (elems_per_load,)), (elems_per_load, threads_per_col)
    )[None, (tidx % threads_per_col, None), None]  # ((8, 1), 2, K)

    def prefetch_from_gmem_fn(src_idx, pred: bool = False) -> Tuple[cute.Tensor, cute.Tensor]:
        # Prefetch mAIdx early, even before smem is free
        tApA_k = None
        if const_expr(pred):
            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
            limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
            for k in cutlass.range(cols_per_thread, unroll_full=True):
                tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
        gAIdx_cur = gAIdx[None, src_idx]
        k_idx = cute.make_fragment(cols_per_thread, Int32)
        for k in cutlass.range(cols_per_thread):
            col_idx = tAcA[0, 0, k][1]
            if const_expr(not pred):
                k_idx[k] = gAIdx_cur[col_idx]
            else:
                if tApA_k[k]:
                    k_idx[k] = gAIdx_cur[col_idx]
                else:
                    # OOB columns get a sentinel; copy_fn still predicates them out.
                    k_idx[k] = -1
        return k_idx, tApA_k

    def prefetch_from_smem_fn(
        a_prefetch_pipeline, src_idx, dst_idx, a_prefetch_consumer_state, pred: bool = False
    ) -> Tuple[cute.Tensor, cute.Tensor]:
        tApA_k = None
        if const_expr(pred):
            tApA_k = cute.make_fragment(cols_per_thread, Boolean)
            limit_k_cur = limit_k - src_idx * tile_shape_mk[1]
            for k in cutlass.range(cols_per_thread, unroll_full=True):
                tApA_k[k] = t0AcA[0, 0, k][1] < limit_k_cur
        # Wait for the producer to have filled this stage of the index buffer.
        a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
        sAIdx_cur = sAIdx[None, dst_idx]
        k_idx = cute.make_fragment(cols_per_thread, Int32)
        for k in cutlass.range(cols_per_thread):
            col_idx = tAcA[0, 0, k][1]
            k_idx[k] = sAIdx_cur[col_idx]
        # Ensure the whole warp has read the indices before one lane releases the stage.
        cute.arch.sync_warp()
        with cute.arch.elect_one():
            a_prefetch_pipeline.consumer_release(a_prefetch_consumer_state)
        return k_idx, tApA_k

    def copy_fn(
        src_idx, dst_idx, k_idx_tApA_k: Tuple[cute.Tensor, cute.Tensor], pred: bool = False
    ):
        k_idx, tApA_k = k_idx_tApA_k
        tApA_k_pred = None
        if const_expr(pred):
            tApA_k_pred = cute.prepend_ones(tApA_k, up_to_rank=2)  # (1, cols_per_thread)
        for k in cutlass.range_constexpr(tAcA.shape[2]):
            # copy_A(tAmA[None, None, k_idx[k]], tAsA[(None, None, k), smem_idx], pred=cute.prepend_ones(tApA_m, up_to_rank=2))
            for m in cutlass.range_constexpr(tAcA.shape[1]):
                if tApA_m[m]:
                    cute.copy(
                        thr_copy_A,
                        tAmA[None, m, k_idx[k]],
                        tAsA[(None, m, k), dst_idx],
                        pred=None if const_expr(tApA_k_pred is None) else tApA_k_pred[None, k],
                    )

    return copy_fn, prefetch_from_gmem_fn if const_expr(
        gAIdx is not None
    ) else prefetch_from_smem_fn
build/torch-cuda/quack/cute_dsl_ptxas.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ System ptxas replacement for CUTLASS DSL.
3
+ Environment variables:
4
+ CUTE_DSL_PTXAS_PATH - Path to ptxas (e.g., /usr/local/cuda/bin/ptxas)
5
+ CUTE_DSL_PTXAS_VERBOSE - Set to 1 for verbose output
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import re
11
+ import ctypes
12
+ import subprocess
13
+ from pathlib import Path
14
+
15
+ import cutlass
16
+
17
+
18
+ CUTE_DSL_PTXAS_PATH = os.environ.get("CUTE_DSL_PTXAS_PATH", None)
19
+ VERBOSE = os.environ.get("CUTE_DSL_PTXAS_VERBOSE", "0") == "1"
20
+
21
+ _original_load_cuda_library = None
22
+ _user_wanted_ptx = False # True if user originally set CUTE_DSL_KEEP_PTX=1
23
+
24
+
25
def _log(msg):
    """Emit a diagnostic line to stderr when CUTE_DSL_PTXAS_VERBOSE=1."""
    if not VERBOSE:
        return
    print(f"[ptxas] {msg}", file=sys.stderr)
28
+
29
+
30
def _get_ptx(compiled_func) -> tuple[str, Path] | None:
    """Locate the dumped PTX for *compiled_func* and return ``(text, path)``.

    Scans CUTE_DSL_DUMP_DIR (defaulting to the current directory) for files
    matching ``*<function_name>*.ptx``, strips trailing NUL padding, and only
    accepts a file that looks like a complete module: it must contain an
    ``.entry`` directive and end with ``}``. Returns None when no usable PTX
    is found.
    """
    func_name = getattr(compiled_func, "function_name", None)
    if not func_name:
        return None

    search_root = Path(os.environ.get("CUTE_DSL_DUMP_DIR", Path.cwd()))
    for candidate in search_root.glob(f"*{func_name}*.ptx"):
        text = candidate.read_text().rstrip("\x00")
        looks_complete = ".entry " in text and text.rstrip().endswith("}")
        if looks_complete:
            _log(f"Found PTX: {candidate}")
            return text, candidate
    return None
43
+
44
+
45
def _compile_ptx(ptx_path: Path, ptx_content: str) -> bytes:
    """Compile the PTX at *ptx_path* to a cubin with the system ptxas.

    The target architecture is parsed from the PTX's ``.target`` directive
    (falling back to sm_90a). Returns the raw cubin bytes.

    Raises:
        RuntimeError: if ptxas exits with a non-zero status.
    """
    arch_match = re.search(r"\.target\s+(sm_\d+[a-z]?)", ptx_content)
    arch = "sm_90a" if arch_match is None else arch_match.group(1)

    # Persist the NUL-stripped content so ptxas reads the cleaned file.
    if ptx_path.read_text() != ptx_content:
        ptx_path.write_text(ptx_content)

    cubin_tmp = ptx_path.with_suffix(".cubin.tmp")
    try:
        assert CUTE_DSL_PTXAS_PATH is not None
        cmd = [CUTE_DSL_PTXAS_PATH, f"-arch={arch}", "-O3", "-o", str(cubin_tmp), str(ptx_path)]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"ptxas failed: {result.stderr}")

        cubin_data = cubin_tmp.read_bytes()
        _log(f"Compiled {ptx_path.name} -> {len(cubin_data)} bytes ({arch})")

        # Optionally keep the compiled cubin next to the PTX.
        if os.environ.get("CUTE_DSL_KEEP_CUBIN", "0") == "1":
            cubin_out = ptx_path.with_suffix(".cubin")
            cubin_out.write_bytes(cubin_data)
            _log(f"Saved: {cubin_out}")

        return cubin_data
    finally:
        # Always drop the temp file, even when compilation fails.
        cubin_tmp.unlink(missing_ok=True)
79
+
80
+
81
def _patched_load_cuda_library(self):
    """Replacement for _load_cuda_library that uses system ptxas.

    Falls back to the original (embedded-ptxas) implementation whenever any
    step fails: PTX lookup, ptxas compilation, cubin loading, or per-device
    registration.
    """

    result = _get_ptx(self)
    if not result:
        _log("PTX not found, falling back to embedded ptxas")
        return _original_load_cuda_library(self)

    ptx_content, ptx_path = result

    try:
        cubin = _compile_ptx(ptx_path, ptx_content)
    except Exception as e:
        _log(f"Compilation failed ({e}), falling back to embedded ptxas")
        return _original_load_cuda_library(self)

    # Load cubin
    import cuda.bindings.runtime as cuda_runtime

    err, library = cuda_runtime.cudaLibraryLoadData(cubin, None, None, 0, None, None, 0)
    if err != cuda_runtime.cudaError_t.cudaSuccess:
        _log(f"cudaLibraryLoadData failed ({err}), falling back to embedded ptxas")
        return _original_load_cuda_library(self)

    # Register kernels on all devices
    # NOTE(review): the three-pointer args layout (library handle, device id,
    # error out-param) presumably matches the ABI expected by the helper
    # returned from self._get_cuda_init_and_load() — confirm against the DSL.
    _, cuda_load_to_device = self._get_cuda_init_and_load()
    lib_ptr = ctypes.c_void_p(int(library))
    dev_id = ctypes.c_int32(0)
    err_val = ctypes.c_int32(0)
    args = (ctypes.c_void_p * 3)(
        ctypes.cast(ctypes.pointer(lib_ptr), ctypes.c_void_p),
        ctypes.cast(ctypes.pointer(dev_id), ctypes.c_void_p),
        ctypes.cast(ctypes.pointer(err_val), ctypes.c_void_p),
    )

    for dev in range(self.num_devices):
        # dev_id is read through the pointer packed above, so mutating the
        # ctypes value re-targets the same args tuple for each device.
        dev_id.value = dev
        cuda_load_to_device(args)
        if err_val.value != 0:
            _log("cuda_load_to_device failed, falling back to embedded ptxas")
            return _original_load_cuda_library(self)

    _log(f"Loaded kernel from {ptx_path.name}")

    # Delete PTX if user didn't originally want it kept
    if not _user_wanted_ptx:
        ptx_path.unlink(missing_ok=True)

    return [cuda_runtime.cudaLibrary_t(lib_ptr.value)]
130
+
131
+
132
def patch():
    """Install the system-ptxas hook. Call before compiling any kernels.

    Replaces ``CudaDialectJitCompiledFunction._load_cuda_library`` with a
    version that recompiles the dumped PTX using the ptxas named by
    CUTE_DSL_PTXAS_PATH instead of the DSL's embedded one.

    Raises:
        RuntimeError: if CUTE_DSL_PTXAS_PATH is unset or not an executable
            file, or if CUTE_DSL_KEEP_PTX is not set to "1" (the hook needs
            the DSL to dump PTX so it has something to recompile).
    """
    global _original_load_cuda_library, _user_wanted_ptx

    # Explicit raises instead of asserts: these are genuine runtime
    # configuration checks and asserts are stripped under ``python -O``.
    if CUTE_DSL_PTXAS_PATH is None:
        raise RuntimeError("CUTE_DSL_PTXAS_PATH is not set")
    if not os.path.isfile(CUTE_DSL_PTXAS_PATH) or not os.access(CUTE_DSL_PTXAS_PATH, os.X_OK):
        raise RuntimeError(f"ptxas not found: {CUTE_DSL_PTXAS_PATH}")

    # Track if user originally wanted PTX kept, so the patched loader knows
    # whether to delete the PTX file after loading.
    _user_wanted_ptx = os.environ.get("CUTE_DSL_KEEP_PTX", "0") == "1"
    # os.environ['CUTE_DSL_KEEP_PTX'] = '1'
    if os.environ.get("CUTE_DSL_KEEP_PTX", "0") != "1":
        raise RuntimeError("Require CUTE_DSL_KEEP_PTX=1 to use system's ptxas")

    cls = cutlass.cutlass_dsl.cuda_jit_executor.CudaDialectJitCompiledFunction
    _original_load_cuda_library = cls._load_cuda_library
    cls._load_cuda_library = _patched_load_cuda_library
    _log("Patch applied")
build/torch-cuda/quack/cute_dsl_utils.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Tuple
4
+ from functools import lru_cache
5
+ from dataclasses import dataclass, fields
6
+
7
+ import torch
8
+
9
+ try:
10
+ from triton.tools.disasm import extract
11
+ except ImportError:
12
+ extract = None
13
+
14
+ import cutlass
15
+ import cutlass.cute as cute
16
+ from cutlass import Int32, Int64, Float16, BFloat16, Float32
17
+ from cutlass.base_dsl.typing import JitArgument
18
+ from cutlass.cutlass_dsl import NumericMeta
19
+
20
+
21
+ StaticTypes = (cutlass.Constexpr, NumericMeta, int, bool, str, float, type(None))
22
+
23
+
24
+ load_cubin_module_data_og = cutlass.base_dsl.runtime.cuda.load_cubin_module_data
25
+ cute_compile_og = cute.compile
26
+
27
+
28
+ torch2cute_dtype_map = {
29
+ torch.float16: Float16,
30
+ torch.bfloat16: BFloat16,
31
+ torch.float32: Float32,
32
+ torch.int32: Int32,
33
+ torch.int64: Int64,
34
+ }
35
+
36
+
37
@lru_cache
def get_max_active_clusters(cluster_size):
    """Return the max number of concurrently active clusters of *cluster_size*.

    Cached: the underlying hardware query is repeated with the same argument
    across kernel launches, so memoizing avoids re-querying.
    """
    return cutlass.utils.HardwareInfo().get_max_active_clusters(cluster_size=cluster_size)
40
+
41
+
42
@lru_cache
def get_device_capacity(device: torch.device = None) -> Tuple[int, int]:
    """Return the CUDA compute capability (major, minor) of *device*, cached.

    None selects the current device per torch.cuda.get_device_capability.
    NOTE(review): the annotation should arguably be Optional[torch.device]
    since None is the default.
    """
    return torch.cuda.get_device_capability(device)
45
+
46
+
47
@dataclass
class ParamsBase:
    """Base dataclass implementing the DSL's MLIR value (de)serialization.

    Fields whose values are static (``StaticTypes``: constexprs, numeric
    metatypes, plain Python scalars/strings/None) are skipped during value
    extraction and passed through unchanged on reconstruction; all other
    fields are flattened to MLIR values via ``cutlass.extract_mlir_values``.
    """

    def __extract_mlir_values__(self):
        """Flatten all non-static fields into a single list of MLIR values.

        Also records, in self._values_pos, how many values each field
        contributed so __new_from_mlir_values__ can split the list back up.
        """
        all_fields = [getattr(self, field.name) for field in fields(self)]
        non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
        values, self._values_pos = [], []
        for obj in non_constexpr_fields:
            obj_values = cutlass.extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        """Rebuild an instance of the same class from a flat MLIR value list.

        Static fields are copied from self; each non-static field consumes
        the number of values recorded for it by __extract_mlir_values__.
        """
        all_fields = {field.name: getattr(self, field.name) for field in fields(self)}
        constexpr_fields = {n: f for n, f in all_fields.items() if isinstance(f, StaticTypes)}
        non_constexpr_fields = {
            n: f for n, f in all_fields.items() if not isinstance(f, StaticTypes)
        }
        for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
            non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
            values = values[n_items:]
        return self.__class__(**non_constexpr_fields, **constexpr_fields)
69
+
70
+
71
@dataclass
class ArgumentsBase(JitArgument):
    """Base dataclass adapting host-side kernel arguments to the JIT ABI.

    Like ParamsBase, static fields (``StaticTypes``) are excluded from the
    runtime argument stream; the remaining fields are asked for their C
    pointers / MLIR types field-by-field, delegating to each value's own
    protocol methods when present.
    """

    def __c_pointers__(self):
        """Collect C pointers from every non-static field that provides them."""
        all_fields = [getattr(self, field.name) for field in fields(self)]
        non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
        c_ptrs = []
        for obj in non_constexpr_fields:
            if hasattr(obj, "__c_pointers__"):
                c_ptrs.extend(obj.__c_pointers__())
        return c_ptrs

    def __get_mlir_types__(self):
        """Collect MLIR types from non-static fields.

        Fields without the protocol contribute zero types; the per-field
        counts are recorded in self._values_pos for reconstruction.
        """
        all_fields = [getattr(self, field.name) for field in fields(self)]
        non_constexpr_fields = [f for f in all_fields if not isinstance(f, StaticTypes)]
        types, self._values_pos = [], []
        for obj in non_constexpr_fields:
            if hasattr(obj, "__get_mlir_types__"):
                obj_types = obj.__get_mlir_types__()
                types.extend(obj_types)
                self._values_pos.append(len(obj_types))
            else:
                self._values_pos.append(0)
        return types

    def __new_from_mlir_values__(self, values):
        """Rebuild an instance from a flat MLIR value list (see ParamsBase)."""
        all_fields = {field.name: getattr(self, field.name) for field in fields(self)}
        constexpr_fields = {n: f for n, f in all_fields.items() if isinstance(f, StaticTypes)}
        non_constexpr_fields = {
            n: f for n, f in all_fields.items() if not isinstance(f, StaticTypes)
        }
        for (name, field), n_items in zip(non_constexpr_fields.items(), self._values_pos):
            non_constexpr_fields[name] = cutlass.new_from_mlir_values(field, values[:n_items])
            values = values[n_items:]
        return self.__class__(**non_constexpr_fields, **constexpr_fields)
build/torch-cuda/quack/fast_math.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Tuple
4
+ from dataclasses import dataclass
5
+
6
+ import cutlass
7
+ import cutlass.cute as cute
8
+ from cutlass import Int32, Uint32
9
+ from cutlass.cutlass_dsl import T, dsl_user_op
10
+ from cutlass._mlir.dialects import llvm
11
+
12
+ from .cute_dsl_utils import ParamsBase
13
+
14
+
15
@cute.jit
def clz(x: Int32) -> Int32:
    """Count leading zero bits of a 32-bit value (returns 32 for x == 0).

    Written as a full 32-iteration scan because the DSL does not support
    early exit from a range loop; the `done` flag makes only the first set
    bit win.
    """
    # for i in cutlass.range_constexpr(32):
    #     if (1 << (31 - i)) & x:
    #         return Int32(i)
    # return Int32(32)
    # Early exit is not supported yet
    res = Int32(32)
    done = False
    for i in cutlass.range(32):
        if ((1 << (31 - i)) & x) and not done:
            res = Int32(i)
            done = True
    return res
29
+
30
+
31
def find_log2(x: Int32) -> Int32:
    """Return ceil(log2(x)): floor(log2) via 31 - clz(x), +1 for non-powers of 2."""
    a: Int32 = Int32(31 - clz(x))
    return a + ((x & (x - 1)) != 0)  # Round up, add 1 if not a power of 2.
34
+
35
+
36
@dsl_user_op
def umulhi(a: Int32, b: Int32, *, loc=None, ip=None) -> Uint32:
    """Return the high 32 bits of the unsigned 32x32-bit product of a and b.

    Emitted as PTX ``mul.hi.u32`` via inline asm; marked side-effect free so
    the compiler may CSE/reorder it.
    """
    return Uint32(
        llvm.inline_asm(
            T.i32(),
            [Int32(a).ir_value(loc=loc, ip=ip), Int32(b).ir_value(loc=loc, ip=ip)],
            "mul.hi.u32 $0, $1, $2;",
            "=r,r,r",
            has_side_effects=False,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    )
49
+
50
+
51
@dataclass
class FastDivmod(ParamsBase):
    """Fast integer division/modulo by a runtime-constant divisor.

    Precomputes a magic multiplier and shift on the host (``create``) so the
    device-side ``div``/``divmod`` need only a high-half multiply and shift
    instead of an integer divide.
    """

    # Runtime divisor (the value we divide by).
    divisor: Int32
    # Magic multiplier: ceil(2^p / divisor) with p = 31 + ceil(log2(divisor)).
    multiplier: Uint32
    # Post-multiply shift: p - 32 (applied to the high 32 bits of the product).
    shift_right: Uint32

    # called by host
    @staticmethod
    def create(divisor: Int32) -> "FastDivmod":
        """Construct the FastDivmod object, in host code.
        This precomputes some values based on the divisor and is computationally expensive.
        """
        p = Uint32(31 + find_log2(divisor))
        divisor_u32 = Uint32(divisor)
        multiplier = Uint32(((cutlass.Uint64(1) << p) + divisor_u32 - 1) // divisor_u32)
        shift_right = Uint32(p - 32)
        return FastDivmod(divisor, multiplier, shift_right)

    @cute.jit
    def div(self, dividend: Int32) -> Int32:
        """Return dividend // divisor; divisor == 1 short-circuits to the input."""
        return (
            Int32(umulhi(dividend, self.multiplier) >> self.shift_right)
            if self.divisor != 1
            else dividend
        )

    def divmod(self, dividend: Int32) -> Tuple[Int32, Int32]:
        """Return (dividend // divisor, dividend % divisor) using one fast div."""
        quotient = self.div(dividend)
        remainder = dividend - quotient * self.divisor
        return quotient, remainder
build/torch-cuda/quack/gemm.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from functools import partial
3
+
4
+ from torch import Tensor
5
+
6
+ import cutlass.cute as cute
7
+ import cutlass.torch as cutlass_torch
8
+ from cutlass import Float32
9
+ from cutlass.cute.runtime import from_dlpack, make_ptr
10
+
11
+ from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
12
+ from .gemm_wrapper_utils import GemmWrapperBase
13
+ from .gemm_default_epi import GemmDefaultSm90, GemmDefaultSm100
14
+
15
+
16
def gemm(
    # (l, m, k) or (total_m, k) if varlen_m or (m, total_k) if varlen_k or (whatever, k) if gather_A_varlen_m or (m, whatever) if gather_A_varlen_k
    A: Tensor,
    B: Tensor,  # (l, n, k) or (n, total_k) if varlen_k
    D: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
    C: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
    tile_count_semaphore: Optional[Tensor],  # (1,)
    tile_M: int,
    tile_N: int,
    cluster_M: int,
    cluster_N: int,
    pingpong: bool = False,
    persistent: bool = True,
    max_swizzle_size: int = 8,
    rowvec_bias: Optional[Tensor] = None,  # (l, n)
    colvec_bias: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m
    alpha: float | Tensor = 1.0,
    beta: float | Tensor = 1.0,
    cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
    cu_seqlens_k: Optional[Tensor] = None,  # (l+1,) cumulative sum of k values for variable length
    A_idx: Optional[Tensor] = None,  # (total_m,) or (total_k,) indices for gather_A when varlen
    batch_idx_permute: Optional[Tensor] = None,  # (l,) permutation of batch indices for scheduler
    add_to_output: bool = False,
) -> None:
    """Run D = act(alpha * A @ B + beta * C + biases) on SM90/SM100, in place.

    Validates layouts for the chosen varlen/gather mode, selects the SM90 or
    SM100 kernel class from the device's compute capability, compiles the
    kernel on first use (memoized in ``gemm.compile_cache`` keyed by every
    behavior-affecting knob), then launches it on the current stream.
    Writes into D; returns None.
    """
    varlen = cu_seqlens_m is not None or cu_seqlens_k is not None
    assert not (cu_seqlens_m is not None and cu_seqlens_k is not None), (
        "Only one of cu_seqlens_m and cu_seqlens_k can be specified"
    )
    gather_A = A_idx is not None
    if gather_A:
        assert varlen, "gather_A requires varlen (cu_seqlens_m or cu_seqlens_k must be specified)"
        assert cluster_N == 1, "gather_A requires cluster_N=1"
    if varlen:
        assert persistent, "varlen requires persistent=True"
    if add_to_output:
        assert cu_seqlens_m is None, "Add to output not supported with varlen_m"
    if cu_seqlens_m is not None:
        assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
        assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
    if cu_seqlens_k is not None:
        assert A.stride(-2) == 1, "varlen_k requires A to be m-major"
        assert B.stride(-2) == 1, "varlen_k requires B to be n-major"

    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
        A, B, D, C, cu_seqlens_m=cu_seqlens_m, cu_seqlens_k=cu_seqlens_k, A_idx=A_idx
    )
    GemmWrapperBase.permute_tensors(
        tensor_infos, varlen_m=cu_seqlens_m is not None, varlen_k=cu_seqlens_k is not None
    )
    GemmWrapperBase.extract_dtypes(tensor_infos)
    # Expected (mode0, mode1, batch) order for each operand.
    major_configs = {
        "A": ("m", "k", "l"),
        "B": ("n", "k", "l"),
        "D": ("m", "n", "l"),
        "C": ("m", "n", "l"),
    }
    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)

    device_capacity = get_device_capacity(A.device)
    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
    GemmCls = GemmDefaultSm100 if device_capacity[0] > 9 else GemmDefaultSm90

    acc_dtype = Float32
    tile_shape_mn = (tile_M, tile_N)
    cluster_shape_mnk = (cluster_M, cluster_N, 1)
    if not GemmCls.is_valid_dtypes(
        tensor_infos["A"].dtype,
        tensor_infos["B"].dtype,
        acc_dtype,
        tensor_infos["D"].dtype,
        tensor_infos["A"].major,
        tensor_infos["B"].major,
    ):
        raise TypeError("Skipping due to unsupported combination of types and majors")

    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)

    def scalar_arg(scalar: float | Tensor):
        # Scalars: 1.0 means "identity, skip"; other floats are baked in;
        # a Tensor is passed as a device pointer read at kernel time.
        if isinstance(scalar, float):
            return Float32(scalar) if scalar != 1.0 else None
        else:
            assert isinstance(scalar, Tensor)
            return make_ptr(Float32, scalar.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)

    epi_args = GemmCls.EpilogueArguments(
        scalar_arg(alpha),
        scalar_arg(beta),
        mRowVecBroadcast=from_dlpack(rowvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
            leading_dim=1
        )
        if rowvec_bias is not None
        else None,
        mColVecBroadcast=from_dlpack(colvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
            leading_dim=1 if cu_seqlens_m is None else 0
        )
        if colvec_bias is not None
        else None,
        add_to_output=add_to_output,
    )
    scheduler_args = GemmWrapperBase.create_scheduler_args(
        max_active_clusters,
        tile_count_semaphore,
        batch_idx_permute,
        max_swizzle_size,
    )

    # Create varlen arguments if needed (assumes persistent=True when varlen)
    varlen_args = GemmWrapperBase.create_varlen_args(
        cu_seqlens_m,
        cu_seqlens_k,
        A_idx,
        max_active_clusters,
        cluster_shape_mnk,
        tensor_infos,
        GemmCls.num_epi_tensormaps,
        pingpong,
    )

    current_stream = cutlass_torch.current_stream()
    compile_key = GemmWrapperBase.get_compile_key(
        tensor_infos,
        None,  # activation
        tile_shape_mn,
        cluster_shape_mnk,
        pingpong,
        persistent,
        tile_count_semaphore is not None,
        device_capacity,
        # Technically we don't need to recompile for different max_swizzle_size, but currently
        # not recompiling will skew the autotuning results due to power throttling.
        # Effectively we're recompiling as a way to pause between benchmarks during autotuning.
        max_swizzle_size,
        rowvec_bias.dtype if rowvec_bias is not None else None,
        colvec_bias.dtype if colvec_bias is not None else None,
        # Encode each scalar as 2 (tensor), 1 (identity 1.0), or 0 (other float).
        2 if isinstance(alpha, Tensor) else (1 if alpha == 1.0 else 0),
        2 if isinstance(beta, Tensor) else (1 if beta == 1.0 else 0),
        add_to_output,
        cu_seqlens_m is not None,
        cu_seqlens_k is not None,
        gather_A,
        batch_idx_permute is not None,
        key_tensor_names=("A", "B", "D", "C"),
    )
    cache = gemm.compile_cache
    if compile_key not in cache:
        if device_capacity[0] == 9:
            # SM90 kernels take pingpong/persistence as constructor options.
            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
        gemm_obj = GemmCls(
            acc_dtype,
            tensor_infos["A"].dtype,
            tile_shape_mn,
            cluster_shape_mnk,
            gather_A=gather_A,
        )
        cache[compile_key] = cute.compile(
            gemm_obj,
            tensor_infos["A"].cute_tensor,
            tensor_infos["B"].cute_tensor,
            tensor_infos["D"].cute_tensor,
            tensor_infos["C"].cute_tensor,
            epi_args,
            scheduler_args,
            varlen_args,
            current_stream,
        )
    cache[compile_key](
        tensor_infos["A"].cute_tensor,
        tensor_infos["B"].cute_tensor,
        tensor_infos["D"].cute_tensor,
        tensor_infos["C"].cute_tensor,
        epi_args,
        scheduler_args,
        varlen_args,
        current_stream,
    )


# Process-wide cache of compiled kernels, keyed by get_compile_key(...).
gemm.compile_cache = {}
build/torch-cuda/quack/gemm_act.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Tri Dao.
2
+ from typing import Tuple, Optional, Callable
3
+ from functools import partial
4
+ from dataclasses import dataclass
5
+
6
+ from torch import Tensor
7
+
8
+ import cutlass
9
+ import cutlass.cute as cute
10
+ import cutlass.utils.hopper_helpers as sm90_utils_og
11
+ import cutlass.utils.blackwell_helpers as sm100_utils
12
+ from cutlass import Int32, Float32, Boolean, const_expr
13
+ from cutlass.cutlass_dsl import if_generate
14
+ import cutlass.torch as cutlass_torch
15
+ from cutlass.cute.runtime import from_dlpack
16
+
17
+ from .cute_dsl_utils import ArgumentsBase, ParamsBase
18
+ from .varlen_utils import VarlenManager
19
+ from .gemm_sm90 import GemmSm90
20
+ from .gemm_sm100 import GemmSm100
21
+ from .gemm_default_epi import GemmDefaultEpiMixin
22
+ from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
23
+ from .gemm_wrapper_utils import GemmWrapperBase
24
+ from . import sm90_utils as sm90_utils
25
+ from . import copy_utils as copy_utils
26
+ from . import activation
27
+
28
+
29
class GemmActMixin(GemmDefaultEpiMixin):
    """Epilogue mixin that fuses an elementwise activation into the GEMM epilogue.

    On top of the default epilogue (alpha/beta scaling, row/col vector broadcast,
    optional C addition) it writes a second output tensor ``PostAct = act_fn(D)``
    through its own TMA store path. Mixed into GemmSm90 / GemmSm100 below.
    """

    # One extra TMA tensormap for the PostAct output (on top of D's).
    num_epi_tensormaps: int = 1

    @dataclass
    class EpilogueArguments(ArgumentsBase):
        # Host-side arguments, converted by epi_to_underlying_arguments below.
        mPostAct: cute.Tensor  # activation output, same (m, n, l) extent as D
        act_fn: cutlass.Constexpr[Optional[Callable]] = None  # None => identity (pure cast)
        alpha: Optional[Float32 | cute.Tensor] = None
        beta: Optional[Float32 | cute.Tensor] = None
        mRowVecBroadcast: Optional[cute.Tensor] = None  # (l, n) bias added per row
        mColVecBroadcast: Optional[cute.Tensor] = None  # (l, m) or (total_m,) bias per column

    @dataclass
    class EpilogueParams(ParamsBase):
        # Device-side parameters derived from EpilogueArguments.
        tma_atom_postact: cute.CopyAtom
        mPostAct_mnl: cute.Tensor
        epi_postact_smem_layout_staged: cute.ComposedLayout
        epi_tile_postact: cute.Tile
        act_fn: cutlass.Constexpr[Optional[Callable]] = None
        alpha: Optional[Float32 | cute.Tensor] = None
        beta: Optional[Float32 | cute.Tensor] = None
        mRowVecBroadcast: Optional[cute.Tensor] = None
        mColVecBroadcast: Optional[cute.Tensor] = None

    def epi_to_underlying_arguments(
        self, args: EpilogueArguments, *, loc=None, ip=None
    ) -> EpilogueParams:
        """Build device params: PostAct smem layout, TMA store atom, and stride-annotated
        broadcast tensors. Also records postact dtype/layout on ``self`` for later stages."""
        self.postact_dtype = args.mPostAct.element_type
        self.postact_layout = cutlass.utils.LayoutEnum.from_tensor(args.mPostAct)

        # PostAct uses the same CTA tile and epi tile as D.
        self.cta_tile_shape_postact_mn = self.cta_tile_shape_mnk[:2]
        epi_tile_postact = self.epi_tile
        utils_cls = sm100_utils if self.arch == 100 else sm90_utils
        epi_postact_smem_layout_staged = utils_cls.make_smem_layout_epi(
            self.postact_dtype, self.postact_layout, epi_tile_postact, self.epi_stage
        )
        tma_atom_postact, tma_tensor_postact = self._make_tma_epi_atoms_and_tensors(
            args.mPostAct,
            epi_postact_smem_layout_staged,
            epi_tile_postact,
            op_type="store",
        )
        # Assume all strides are divisible by 32 bits except the last stride
        new_stride = lambda t: tuple(
            cute.assume(s, divby=32 // t.element_type.width) if not cute.is_static(s) else s
            for s in t.stride
        )
        mRowVecBroadcast, mColVecBroadcast = [
            cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
            if t is not None
            else None
            for t in (args.mRowVecBroadcast, args.mColVecBroadcast)
        ]
        return self.EpilogueParams(
            tma_atom_postact,
            tma_tensor_postact,
            epi_postact_smem_layout_staged,
            epi_tile_postact,
            args.act_fn,
            alpha=args.alpha,
            beta=args.beta,
            mRowVecBroadcast=mRowVecBroadcast,
            mColVecBroadcast=mColVecBroadcast,
        )

    def epi_get_tma_atoms(
        self, params: EpilogueParams, *, loc=None, ip=None
    ) -> list[cute.CopyAtom]:
        """Return the epilogue TMA atoms owned by this mixin (just PostAct's store atom)."""
        return [params.tma_atom_postact]

    def epi_get_tensormap_update_shapes_orders(
        self,
        params: EpilogueParams,
        cu_seqlens_m: Optional[cute.Tensor],
        batch_idx: Int32,
        *,
        loc=None,
        ip=None,
    ) -> tuple[list[Int32], list[int]]:
        """For varlen-m, each batch's PostAct tensormap must be resized to the running
        total of rows; the order index picks which tensormap dim holds M (0 if PostAct
        is m-major, 1 otherwise)."""
        shapes = [cu_seqlens_m[batch_idx + 1] if cu_seqlens_m is not None else None]
        orders = [0 if const_expr(self.postact_layout.is_m_major_c()) else 1]
        return shapes, orders

    @staticmethod
    def epi_smem_bytes_per_stage(
        args: EpilogueArguments, cta_tile_shape_mnk: Tuple[int, int, int], epi_tile: cute.Tile
    ) -> int:
        """Shared-memory budget per epilogue stage: PostAct staging buffer plus the
        base mixin's row/col broadcast vectors."""
        postact_dtype = args.mPostAct.element_type
        postact_bytes_per_stage = cute.size(cute.shape(epi_tile)) * (postact_dtype.width // 8)
        rowvec_colvec_bytes = GemmDefaultEpiMixin.epi_smem_bytes_per_stage(
            args, cta_tile_shape_mnk, epi_tile
        )
        return postact_bytes_per_stage + rowvec_colvec_bytes

    def epi_get_smem_struct(self, params: EpilogueParams):
        """Declare the epilogue smem struct: optional row/col broadcast vectors
        (zero-sized when the corresponding tensor is absent) and the staged PostAct buffer."""
        row_vec_smem_size = 0 if params.mRowVecBroadcast is None else self.cta_tile_shape_mnk[1]
        col_vec_smem_size = 0 if params.mColVecBroadcast is None else self.cta_tile_shape_mnk[0]
        row_vec_dtype = (
            params.mRowVecBroadcast.element_type if params.mRowVecBroadcast is not None else Float32
        )
        col_vec_dtype = (
            params.mColVecBroadcast.element_type if params.mColVecBroadcast is not None else Float32
        )

        @cute.struct
        class EpiSharedStorage:
            sRowVec: cute.struct.Align[cute.struct.MemRange[row_vec_dtype, row_vec_smem_size], 16]
            sColVec: cute.struct.Align[cute.struct.MemRange[col_vec_dtype, col_vec_smem_size], 16]
            sPostAct: cute.struct.Align[
                cute.struct.MemRange[
                    self.postact_dtype, cute.cosize(params.epi_postact_smem_layout_staged)
                ],
                self.buffer_align_bytes,
            ]

        return EpiSharedStorage

    def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
        """Materialize smem tensors: base (sRowVec, sColVec) plus the swizzled PostAct buffer."""
        sRowVec, sColVec = super().epi_get_smem_tensors(params, storage)
        sPostAct = storage.epi.sPostAct.get_tensor(
            params.epi_postact_smem_layout_staged.outer,
            swizzle=params.epi_postact_smem_layout_staged.inner,
        )
        return (sRowVec, sColVec, sPostAct)

    @cute.jit
    def epilogue(
        self,
        params: EpilogueParams,
        epi_smem_tensors: Tuple[cute.Tensor, ...],
        tma_desc_epi_ptrs: list[Optional[cute.Pointer]],
        epi_pipeline: cutlass.pipeline.PipelineAsync,
        epi_store_pipeline: cutlass.pipeline.PipelineAsync,
        epi_read_state: cutlass.pipeline.PipelineState,
        epi_producer_state: cutlass.pipeline.PipelineState,
        epi_tile: cute.Tile,
        load_acc_subtile: Callable,
        tRS_rD: cute.Tensor,
        tRS_rC: Optional[cute.Tensor],
        tiled_copy_t2r: Optional[cute.TiledCopy],  # Only for Sm100
        tiled_copy_r2s: cute.TiledCopy,
        tRS_sD: cute.Tensor,
        tiled_copy_s2r: Optional[cute.TiledCopy],
        tSR_rC: Optional[cute.Tensor],
        tSR_sC: Optional[cute.Tensor],
        copy_D: Optional[Callable],
        copy_C: Optional[Callable],
        tile_coord_mnkl: cute.Coord,
        varlen_manager: VarlenManager,
        epilogue_barrier: cutlass.pipeline.NamedBarrier,
        tile_scheduler,
        tidx: Int32,
        is_tma_warp: Boolean,
    ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
        """Device epilogue: iterate epi subtiles, compute D and PostAct = act_fn(D),
        and store both via TMA. Returns the advanced C-load pipeline states.

        NOTE(review): kernel-side code traced by the CuTe DSL; control flow on
        `const_expr` values is resolved at compile time.
        """
        has_C = const_expr(tRS_rC is not None)
        has_D = const_expr(copy_D is not None)

        tma_atom_postact = params.tma_atom_postact
        mPostAct_mnl = params.mPostAct_mnl
        sRowVec, sColVec, sPostAct = epi_smem_tensors
        # Pick the register->smem store op builder for the current architecture.
        get_smem_store_op = (
            partial(sm100_utils.get_smem_store_op, tiled_tmem_load=tiled_copy_t2r)
            if self.arch == 100
            else sm90_utils_og.sm90_get_smem_store_op
        )
        copy_atom_postact_r2s = get_smem_store_op(
            self.postact_layout, self.postact_dtype, self.acc_dtype
        )
        # tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
        # tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_C_atom)
        tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_r2s)
        tRS_sPostAct = tiled_copy_postact_r2s.get_slice(tidx).partition_D(sPostAct)
        (tma_desc_postact_ptr,) = tma_desc_epi_ptrs
        batch_idx = tile_coord_mnkl[3]
        copy_postact, _, _ = self.epilog_gmem_copy_and_partition(
            tma_atom_postact,
            varlen_manager.offset_batch_epi(mPostAct_mnl, batch_idx),
            self.cta_tile_shape_postact_mn,
            params.epi_tile_postact,
            sPostAct,
            tile_coord_mnkl,
            tma_desc_ptr=tma_desc_postact_ptr,
        )

        # We iterate over epi tiles in the N dimension first before the M dimension
        epi_tile_shape = cute.zipped_divide(
            cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
        ).shape[1]
        epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
        epi_tile_num = cute.size(epi_tile_shape)
        # Used to round-robin smem stage buffers across persistent tiles.
        num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num

        epi_tensors = self.epi_begin(
            params,
            epi_smem_tensors,
            epi_tile,
            tiled_copy_t2r,
            tiled_copy_r2s,
            tile_coord_mnkl,
            varlen_manager,
            epilogue_barrier,
            tidx,
        )

        # Prefetch the first epi_c_stage C subtiles (TMA warp issues; every
        # thread tracks the producer state).
        if const_expr(copy_C is not None):
            for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
                gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx)
                if is_tma_warp:
                    epi_pipeline.producer_acquire(epi_producer_state)
                    copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
                    epi_pipeline.producer_commit(epi_producer_state)
                epi_producer_state.advance()

        def tma_store_fn(src_idx, dst_idx):
            # Fence and barrier to make sure shared memory store is visible to TMA store
            cute.arch.fence_proxy(
                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
            )
            epilogue_barrier.arrive_and_wait()
            # Copy from shared memory to global memory
            if is_tma_warp:
                if const_expr(has_D):
                    copy_D(src_idx=src_idx, dst_idx=dst_idx)
                copy_postact(src_idx=src_idx, dst_idx=dst_idx)
            # Can't use if statement here, epi_store_pipeline object isn't captured somehow
            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
            epilogue_barrier.arrive_and_wait()

        # Delay each subtile's TMA store by one iteration so the smem write of the
        # next subtile overlaps the store of the previous one.
        delay_tma_store = True

        src_idx_prev, dst_idx_prev = None, None
        for epi_idx in cutlass.range_constexpr(epi_tile_num):
            # The global memory coordinate for the current epi tile
            gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
            # Copy from acc to D registers
            load_acc_subtile(tRS_rD, epi_idx)
            epi_loop_tensors = self.epi_begin_loop(params, epi_tensors, gmem_coord)
            if const_expr(has_C):
                epi_pipeline.consumer_wait(epi_read_state)
                cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
                # Fence to make sure shared memory read is visible to TMA load
                cute.arch.fence_proxy(
                    cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
                )
                cute.arch.sync_warp()
                with cute.arch.elect_one():
                    epi_pipeline.consumer_release(epi_read_state)
                epi_read_state.advance()
            # Keep the C prefetch pipeline epi_c_stage subtiles ahead.
            if const_expr(copy_C is not None and epi_idx + self.epi_c_stage < epi_tile_num):
                gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx + self.epi_c_stage)
                if is_tma_warp:
                    epi_pipeline.producer_acquire(epi_producer_state)
                    copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
                    epi_pipeline.producer_commit(epi_producer_state)
                epi_producer_state.advance()
            tRS_rPostAct = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
            epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
            if const_expr(delay_tma_store):
                if const_expr(epi_idx > 0):
                    tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
                src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
            # Copy from D registers to shared memory
            if const_expr(has_D):
                copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
            cute.copy(
                tiled_copy_postact_r2s,
                tiled_copy_postact_r2s.retile(tRS_rPostAct),
                tRS_sPostAct[None, None, None, epi_buffer],
            )
            if const_expr(not delay_tma_store):
                tma_store_fn(src_idx=epi_buffer, dst_idx=gmem_coord)

        # Flush the store for the final subtile.
        if const_expr(delay_tma_store):
            tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)

        self.epi_end(
            params,
            epi_tensors,
            epi_tile,
            tiled_copy_t2r,
            tiled_copy_r2s,
            tile_coord_mnkl,
            varlen_manager,
            tidx,
        )

        return epi_read_state, epi_producer_state

    @cute.jit
    def epi_visit_subtile(
        self,
        params: EpilogueParams,
        epi_loop_tensors: Tuple[cute.Tensor, ...],
        tRS_rD: cute.Tensor,
        tRS_rC: Optional[cute.Tensor] = None,
    ) -> Optional[cute.Tensor]:
        """Run the default epilogue on tRS_rD in place, then compute the activation
        into a fresh fragment and convert it to postact_dtype. Returns the converted
        PostAct fragment."""
        GemmDefaultEpiMixin.epi_visit_subtile(self, params, epi_loop_tensors, tRS_rD, tRS_rC)
        # Apply activation function if provided
        # If we don't have .shape here, the compiler generates local stores and loads
        if const_expr(params.act_fn is not None):
            tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
            if const_expr(self.arch < 100):
                for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
                    tRS_rPostAct[i] = params.act_fn(tRS_rD[i])
            else:
                # On Sm100 the activation is applied two elements at a time
                # (act_fn takes/returns pairs there).
                for i in cutlass.range(cute.size(tRS_rPostAct) // 2, unroll_full=True):
                    tRS_rPostAct[2 * i], tRS_rPostAct[2 * i + 1] = params.act_fn(
                        (tRS_rD[2 * i], tRS_rD[2 * i + 1])
                    )
        else:
            tRS_rPostAct = tRS_rD
        # Type conversion
        tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
        tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
        return tRS_rPostAct_out
345
+
346
+
347
class GemmActSm90(GemmActMixin, GemmSm90):
    """SM90 (Hopper) GEMM with a fused activation epilogue."""

    pass
349
+
350
+
351
class GemmActSm100(GemmActMixin, GemmSm100):
    """SM100 (Blackwell) GEMM with a fused activation epilogue."""

    pass
353
+
354
+
355
# Maps the user-facing activation name (the `activation` argument of gemm_act)
# to the device-side elementwise function; None means identity (cast only).
act_fn_map = {
    None: None,
    "relu": activation.relu,
    "relu_sq": activation.relu_sq,
    "gelu_tanh_approx": activation.gelu_tanh_approx,
}
361
+
362
+
363
def gemm_act(
    A: Tensor,  # (l, m, k) or (total_m, k) if varlen_m or (whatever, k) if gather_A with varlen_m
    B: Tensor,  # (l, n, k)
    D: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
    C: Optional[Tensor],  # (l, m, n) or (total_m, n) if varlen_m
    PostAct: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
    tile_count_semaphore: Optional[Tensor],  # (1,)
    activation: Optional[str],
    tile_M: int,
    tile_N: int,
    cluster_M: int,
    cluster_N: int,
    pingpong: bool = False,
    persistent: bool = True,
    max_swizzle_size: int = 8,
    rowvec_bias: Optional[Tensor] = None,  # (l, n)
    colvec_bias: Optional[Tensor] = None,  # (l, m), or (total_m,) if varlen_m
    cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
    A_idx: Optional[Tensor] = None,  # (total_m,) if gather_A with varlen_m
) -> None:
    """Compute D = epilogue(A @ B^T [+ C]) and PostAct = act_fn(D), fused in one kernel.

    Results are written in place into ``D`` (if given) and ``PostAct``.
    Compiled kernels are memoized in ``gemm_act.compile_cache`` keyed on dtypes,
    layouts, tiling, and feature flags.

    Raises:
        TypeError: if the dtype/major combination is unsupported by the kernel.
        AssertionError: on invalid varlen/gather/activation configurations.
    """
    if cu_seqlens_m is not None:
        assert persistent, "varlen_m requires persistent=True"
        assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
        if D is not None:
            assert D.stride(-1) == 1, "varlen_m requires D to be n-major"
        assert PostAct.stride(-1) == 1, "varlen_m requires PostAct to be n-major"
    gather_A = A_idx is not None
    if gather_A:
        assert cu_seqlens_m is not None, "gather_A requires varlen (cu_seqlens_m must be specified)"
        assert cluster_N == 1, "gather_A requires cluster_N=1"
    assert activation in act_fn_map, f"Unsupported activation {activation}"

    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
        A, B, D, C, additional_tensors={"PostAct": PostAct}, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx
    )
    GemmWrapperBase.permute_tensors(tensor_infos, varlen_m=cu_seqlens_m is not None)
    GemmWrapperBase.extract_dtypes(tensor_infos)
    # Dimension-name order used to derive each tensor's major-ness.
    major_configs = {
        "A": ("m", "k", "l"),
        "B": ("n", "k", "l"),
        "D": ("m", "n", "l"),
        "C": ("m", "n", "l"),
        "PostAct": ("m", "n", "l"),
    }
    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)

    device_capacity = get_device_capacity(A.device)
    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
    GemmCls = GemmActSm100 if device_capacity[0] > 9 else GemmActSm90

    acc_dtype = Float32
    tile_shape_mn = (tile_M, tile_N)
    cluster_shape_mnk = (cluster_M, cluster_N, 1)
    if not GemmCls.is_valid_dtypes(
        tensor_infos["A"].dtype,
        tensor_infos["B"].dtype,
        acc_dtype,
        tensor_infos["D"].dtype,
        tensor_infos["A"].major,
        tensor_infos["B"].major,
    ):
        raise TypeError("Skipping due to unsupported combination of types and majors")

    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
    act_fn = act_fn_map[activation]
    epi_args = GemmCls.EpilogueArguments(
        tensor_infos["PostAct"].cute_tensor,
        act_fn,
        mRowVecBroadcast=from_dlpack(rowvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
            leading_dim=1
        )
        if rowvec_bias is not None
        else None,
        # colvec bias is 1D (total_m,) under varlen_m, hence the different leading dim.
        mColVecBroadcast=from_dlpack(colvec_bias.detach(), assumed_align=4).mark_layout_dynamic(
            leading_dim=1 if cu_seqlens_m is None else 0
        )
        if colvec_bias is not None
        else None,
    )
    scheduler_args = GemmWrapperBase.create_scheduler_args(
        max_active_clusters, tile_count_semaphore, max_swizzle_size=max_swizzle_size
    )

    # Create varlen arguments if needed (assumes persistent=True when varlen_m)
    varlen_args = GemmWrapperBase.create_varlen_args(
        cu_seqlens_m,
        None,  # cu_seqlens_k
        A_idx,
        max_active_clusters,
        cluster_shape_mnk,
        tensor_infos,
        GemmCls.num_epi_tensormaps,
        pingpong,
    )

    current_stream = cutlass_torch.current_stream()
    # Every flag that changes the generated kernel must be part of this key.
    compile_key = GemmWrapperBase.get_compile_key(
        tensor_infos,
        activation,
        tile_shape_mn,
        cluster_shape_mnk,
        pingpong,
        persistent,
        tile_count_semaphore is not None,
        device_capacity,
        max_swizzle_size,
        rowvec_bias.dtype if rowvec_bias is not None else None,
        colvec_bias.dtype if colvec_bias is not None else None,
        cu_seqlens_m is not None,
        A_idx is not None,
        key_tensor_names=("A", "B", "D", "PostAct", "C"),
    )
    cache = gemm_act.compile_cache
    if compile_key not in cache:
        if device_capacity[0] == 9:
            # pingpong/persistent are SM90-only constructor options.
            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
        gemm_obj = GemmCls(
            acc_dtype,
            tensor_infos["A"].dtype,
            tile_shape_mn,
            cluster_shape_mnk,
            gather_A=gather_A,
        )
        cache[compile_key] = cute.compile(
            gemm_obj,
            tensor_infos["A"].cute_tensor,
            tensor_infos["B"].cute_tensor,
            tensor_infos["D"].cute_tensor,
            tensor_infos["C"].cute_tensor,
            epi_args,
            scheduler_args,
            varlen_args,
            current_stream,
        )
    cache[compile_key](
        tensor_infos["A"].cute_tensor,
        tensor_infos["B"].cute_tensor,
        tensor_infos["D"].cute_tensor,
        tensor_infos["C"].cute_tensor,
        epi_args,
        scheduler_args,
        varlen_args,
        current_stream,
    )


# Per-process cache of compiled kernels, keyed by get_compile_key above.
gemm_act.compile_cache = {}
build/torch-cuda/quack/gemm_config.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025, Fri Dao.
2
+ import itertools
3
+ from typing import Optional, List, Literal
4
+ from functools import partial
5
+ from dataclasses import dataclass
6
+
7
+
8
@dataclass(frozen=True)
class GemmConfig:
    """One candidate GEMM tuning configuration (hashable, so usable as a cache key)."""

    tile_m: int = 128  # CTA tile size along M
    tile_n: int = 192  # CTA tile size along N
    pingpong: bool = True  # SM90 pingpong schedule (False = cooperative)
    cluster_m: int = 2  # thread-block cluster size along M
    cluster_n: int = 1  # thread-block cluster size along N
    swap_ab: bool = False  # compute B^T @ A^T instead of A @ B
    # raster_order: int = 1
    max_swizzle_size: int = 8  # tile-scheduler rasterization swizzle width
18
+
19
+
20
def get_all_configs(
    device_capacity: Literal[9, 10] = 9,
    epilogue: Optional[str] = None,
    tune_coop: bool = True,
    # tune_raster_order=True,
) -> List[GemmConfig]:
    """Enumerate candidate :class:`GemmConfig` settings for autotuning.

    Args:
        device_capacity: SM major version, 9 (Hopper) or 10 (Blackwell).
        epilogue: optional epilogue kind ("gated", "lse", ...) used to prune
            configurations known to be invalid or unprofitable for it.
        tune_coop: on SM90, whether to include cooperative (non-pingpong) tiles.

    Returns:
        List of GemmConfig candidates (cartesian product of tile shapes,
        cluster shapes, and flags, after epilogue-specific pruning).

    Raises:
        AssertionError: if device_capacity is not 9 or 10.
    """
    assert device_capacity in [9, 10]
    if device_capacity == 9:
        tile_n_vals = [128, 144, 160, 176, 192, 208]
        tile_mn_coop_vals = [(256, tile_n) for tile_n in tile_n_vals] + [
            (128, 224),
            (128, 256),
            # (192, 256),  # Getting IOT instruction (core dumped) in the bwd
        ]
        tile_mn_pingpong_vals = [(128, tile_n) for tile_n in tile_n_vals] + [(192, 128)]
        if epilogue in ["gated"]:
            # Gated epilogues need tile_n divisible by 32 and don't support tile_m=192.
            tile_mn_coop_vals = [(m, n) for m, n in tile_mn_coop_vals if n % 32 == 0 and m != 192]
            tile_mn_pingpong_vals = [(m, n) for m, n in tile_mn_pingpong_vals if n % 32 == 0]
        elif epilogue in ["lse"]:
            tile_mn_coop_vals = [(m, n) for m, n in tile_mn_coop_vals if m != 192]
        tile_mn_vals = []
        if tune_coop:
            tile_mn_vals += [(m, n, False) for m, n in tile_mn_coop_vals]
        tile_mn_vals += [(m, n, True) for m, n in tile_mn_pingpong_vals]
        cluster = [(1, 2), (2, 1)]
        # cluster = [(1, 1), (1, 2), (2, 1)]
        swap_ab_vals = [False, True]
        if epilogue in ["lse", "gated"]:
            # These epilogues are only implemented for the non-swapped layout.
            swap_ab_vals = [False]
        # raster_swizzle = (
        #     [(0, 1)]
        #     if not tune_raster_order
        #     else [(1, 1), (1, 2), (1, 4), (1, 8), (2, 1), (2, 2), (2, 4), (2, 8)]
        # )
        return [
            GemmConfig(
                tile_m=tile_m,
                tile_n=tile_n,
                pingpong=pingpong,
                cluster_m=cluster_m,
                cluster_n=cluster_n,
                swap_ab=swap_ab,
                # raster_order=raster_order,
                # max_swizzle_size=max_swizzle_size,
            )
            for (tile_m, tile_n, pingpong), (cluster_m, cluster_n), swap_ab in itertools.product(
                tile_mn_vals,
                cluster,
                swap_ab_vals,
                # raster_swizzle,
            )
        ]
    elif device_capacity == 10:
        tile_n_vals = [128, 160, 192, 224, 256]
        tile_mn_cluster_vals = (
            [(128, tile_n, (1, 2)) for tile_n in tile_n_vals]
            + [(128, tile_n, (2, 1)) for tile_n in tile_n_vals]
            + [(256, tile_n, (2, 1)) for tile_n in tile_n_vals]
        )
        swap_ab_vals = [False, True]
        if epilogue in ["lse", "gated"]:
            swap_ab_vals = [False]
        max_swizzle_size_vals = [4, 8, 16]
        GemmConfigCls = partial(GemmConfig, pingpong=False)  # There's no pingpong on Sm100
        return [
            GemmConfigCls(
                tile_m=m, tile_n=n, cluster_m=cm, cluster_n=cn, swap_ab=sab, max_swizzle_size=ms
            )
            for (m, n, (cm, cn)), sab, ms in itertools.product(
                tile_mn_cluster_vals, swap_ab_vals, max_swizzle_size_vals
            )
        ]
build/torch-cuda/quack/gemm_dact.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+ from typing import Optional, Tuple
3
+ from functools import partial
4
+
5
+ from torch import Tensor
6
+
7
+ import cutlass
8
+ import cutlass.cute as cute
9
+ from cutlass import Float32, const_expr
10
+ import cutlass.torch as cutlass_torch
11
+
12
+ from .gemm_sm90 import GemmSm90
13
+ from .gemm_sm100 import GemmSm100
14
+ from .gemm_default_epi import GemmDefaultEpiMixin
15
+ from .gemm_act import GemmActMixin
16
+ from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
17
+ from .gemm_wrapper_utils import GemmWrapperBase
18
+ from . import activation
19
+
20
+
21
class GemmDActMixin(GemmActMixin):
    """Epilogue mixin for the activation *backward* pass.

    Reuses GemmActMixin's arguments/params/stores, but the fused function is an
    activation gradient: it consumes the matmul result (dout contribution, in
    tRS_rD) together with the pre-activation input (loaded via C), and produces
    both dx (written to D) and the recomputed activation output (written to PostAct).
    """

    # Different from GemmActSm90, here act_bwd_fn must take in 2 arguments (x, dout)
    # and return 2 arguments (dx, out)
    EpilogueArguments = GemmActMixin.EpilogueArguments
    EpilogueParams = GemmActMixin.EpilogueParams

    @cute.jit
    def epi_visit_subtile(
        self,
        params: EpilogueParams,
        epi_loop_tensors: Tuple[cute.Tensor, ...],
        tRS_rD: cute.Tensor,
        tRS_rC: Optional[cute.Tensor] = None,
    ) -> Optional[cute.Tensor]:
        """Apply the activation-backward function in place on tRS_rD (-> dx) and
        return the recomputed activation output converted to postact_dtype."""
        assert tRS_rC is not None
        # We don't add C to the accumulator
        GemmDefaultEpiMixin.epi_visit_subtile(self, params, epi_loop_tensors, tRS_rD, tRS_rC=None)
        # C holds the pre-activation values; promote to accumulator precision.
        tRS_rC_acc = cute.make_fragment_like(tRS_rC, self.acc_dtype)
        tRS_rC_acc.store(tRS_rC.load().to(self.acc_dtype))
        # If we don't have .shape here, the compiler generates local stores and loads
        if const_expr(params.act_fn is not None):
            tRS_rPostAct = cute.make_fragment(tRS_rD.layout.shape, self.acc_dtype)
            if const_expr(self.arch < 100):
                for i in cutlass.range(cute.size(tRS_rPostAct), unroll_full=True):
                    tRS_rD[i], tRS_rPostAct[i] = params.act_fn(tRS_rC_acc[i], tRS_rD[i])
            else:
                # Sm100 variant processes element pairs per call.
                for i in cutlass.range(cute.size(tRS_rPostAct) // 2, unroll_full=True):
                    (
                        (tRS_rD[2 * i], tRS_rD[2 * i + 1]),
                        (tRS_rPostAct[2 * i], tRS_rPostAct[2 * i + 1]),
                    ) = params.act_fn(
                        (tRS_rC_acc[2 * i], tRS_rC_acc[2 * i + 1]),
                        (tRS_rD[2 * i], tRS_rD[2 * i + 1]),
                    )
        else:
            # Identity activation: out == pre-activation input, dx stays as tRS_rD.
            tRS_rPostAct = tRS_rC_acc
        # Type conversion
        tRS_rPostAct_out = cute.make_fragment_like(tRS_rPostAct, self.postact_dtype)
        tRS_rPostAct_out.store(tRS_rPostAct.load().to(self.postact_dtype))
        return tRS_rPostAct_out
61
+
62
+
63
class GemmDActSm90(GemmDActMixin, GemmSm90):
    """SM90 (Hopper) GEMM with a fused activation-backward epilogue."""

    pass
65
+
66
+
67
class GemmDActSm100(GemmDActMixin, GemmSm100):
    """SM100 (Blackwell) GEMM with a fused activation-backward epilogue."""

    pass
69
+
70
+
71
# Maps the activation name to its backward function (takes (x, dout), returns
# (dx, out)); None means identity.
dact_fn_map = {
    None: None,
    "relu": activation.drelu,
    "relu_sq": activation.drelu_sq,
    "gelu_tanh_approx": activation.dgelu_tanh_approx,
}
77
+
78
+
79
def gemm_dact(
    A: Tensor,  # (l, m, k) or (total_m, k) if varlen_m or (whatever, k) if gather_A with varlen_m
    B: Tensor,  # (l, n, k)
    Out: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
    PreAct: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
    PostAct: Tensor,  # (l, m, n) or (total_m, n) if varlen_m
    tile_count_semaphore: Optional[Tensor],  # (1,)
    activation: Optional[str],
    tile_M: int,
    tile_N: int,
    cluster_M: int,
    cluster_N: int,
    pingpong: bool = True,
    persistent: bool = True,
    max_swizzle_size: int = 8,
    cu_seqlens_m: Optional[Tensor] = None,  # (l+1,) cumulative sum of m values for variable length
    A_idx: Optional[Tensor] = None,  # (total_m,) if gather_A with varlen_m
) -> None:
    """Fused activation-backward GEMM: given dout-side matmul A @ B^T and the saved
    pre-activation values ``PreAct``, write dx into ``Out`` and the recomputed
    activation into ``PostAct`` — all in one kernel.

    ``PreAct`` rides through the kernel's C input path; results are written in place.
    Compiled kernels are memoized in ``gemm_dact.compile_cache``.

    Raises:
        TypeError: if the dtype/major combination is unsupported by the kernel.
        AssertionError: on invalid varlen/gather/activation configurations.
    """
    if cu_seqlens_m is not None:
        assert persistent, "varlen_m requires persistent=True"
        assert A.stride(-1) == 1, "varlen_m requires A to be k-major"
        assert Out.stride(-1) == 1, "varlen_m requires Out to be n-major"
        assert PreAct.stride(-1) == 1, "varlen_m requires PreAct to be n-major"
        assert PostAct.stride(-1) == 1, "varlen_m requires PostAct to be n-major"
    gather_A = A_idx is not None
    if gather_A:
        assert cu_seqlens_m is not None, "gather_A requires varlen (cu_seqlens_m must be specified)"
        assert cluster_N == 1, "gather_A requires cluster_N=1"
    assert activation in dact_fn_map, f"Unsupported activation {activation}"

    # Out plays the role of D, PreAct the role of C.
    L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
        A,
        B,
        Out,
        PreAct,
        additional_tensors={"PostAct": PostAct},
        cu_seqlens_m=cu_seqlens_m,
        A_idx=A_idx,
    )
    GemmWrapperBase.permute_tensors(tensor_infos, varlen_m=cu_seqlens_m is not None)
    GemmWrapperBase.extract_dtypes(tensor_infos)
    # Dimension-name order used to derive each tensor's major-ness.
    major_configs = {
        "A": ("m", "k", "l"),
        "B": ("n", "k", "l"),
        "D": ("m", "n", "l"),
        "C": ("m", "n", "l"),
        "PostAct": ("m", "n", "l"),
    }
    GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)

    device_capacity = get_device_capacity(A.device)
    assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
    GemmCls = GemmDActSm100 if device_capacity[0] > 9 else GemmDActSm90

    acc_dtype = Float32
    tile_shape_mn = (tile_M, tile_N)
    cluster_shape_mnk = (cluster_M, cluster_N, 1)
    if not GemmCls.is_valid_dtypes(
        tensor_infos["A"].dtype,
        tensor_infos["B"].dtype,
        acc_dtype,
        tensor_infos["D"].dtype,
        tensor_infos["A"].major,
        tensor_infos["B"].major,
    ):
        raise TypeError("Skipping due to unsupported combination of types and majors")

    max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
    GemmWrapperBase.create_cute_tensors(tensor_infos, major_configs)
    act_fn = dact_fn_map[activation]
    epi_args = GemmCls.EpilogueArguments(tensor_infos["PostAct"].cute_tensor, act_fn)
    scheduler_args = GemmWrapperBase.create_scheduler_args(
        max_active_clusters, tile_count_semaphore, max_swizzle_size=max_swizzle_size
    )

    # Create varlen arguments if needed (assumes persistent=True when varlen_m)
    varlen_args = GemmWrapperBase.create_varlen_args(
        cu_seqlens_m,
        None,  # cu_seqlens_k
        A_idx,
        max_active_clusters,
        cluster_shape_mnk,
        tensor_infos,
        GemmCls.num_epi_tensormaps,
        pingpong,
    )

    current_stream = cutlass_torch.current_stream()
    # Every flag that changes the generated kernel must be part of this key.
    compile_key = GemmWrapperBase.get_compile_key(
        tensor_infos,
        activation,
        tile_shape_mn,
        cluster_shape_mnk,
        pingpong,
        persistent,
        tile_count_semaphore is not None,
        device_capacity,
        max_swizzle_size,
        cu_seqlens_m is not None,
        A_idx is not None,
        key_tensor_names=("A", "B", "D", "PostAct", "C"),
    )
    cache = gemm_dact.compile_cache
    if compile_key not in cache:
        if device_capacity[0] == 9:
            # pingpong/persistent are SM90-only constructor options.
            GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
        gemm = GemmCls(
            acc_dtype,
            tensor_infos["A"].dtype,
            tile_shape_mn,
            cluster_shape_mnk,
            gather_A=gather_A,
        )
        cache[compile_key] = cute.compile(
            gemm,
            tensor_infos["A"].cute_tensor,
            tensor_infos["B"].cute_tensor,
            tensor_infos["D"].cute_tensor,
            tensor_infos["C"].cute_tensor,
            epi_args,
            scheduler_args,
            varlen_args,
            current_stream,
        )
    cache[compile_key](
        tensor_infos["A"].cute_tensor,
        tensor_infos["B"].cute_tensor,
        tensor_infos["D"].cute_tensor,
        tensor_infos["C"].cute_tensor,
        epi_args,
        scheduler_args,
        varlen_args,
        current_stream,
    )


# Per-process cache of compiled kernels, keyed by get_compile_key above.
gemm_dact.compile_cache = {}
build/torch-cuda/quack/gemm_default_epi.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Tri Dao.
2
+ from typing import Optional, Tuple
3
+ from functools import partial
4
+ from dataclasses import dataclass
5
+
6
+
7
+ import cutlass
8
+ import cutlass.cute as cute
9
+ from cutlass import Int32, Float32, Boolean, const_expr
10
+
11
+ from .cute_dsl_utils import ArgumentsBase, ParamsBase
12
+ from .gemm_sm90 import GemmSm90
13
+ from .gemm_sm100 import GemmSm100
14
+ from .sm90_utils import partition_for_epilogue
15
+ from . import utils as utils
16
+ from . import copy_utils as copy_utils
17
+ from .varlen_utils import VarlenManager
18
+
19
+
20
class GemmDefaultEpiMixin:
    """Default GEMM epilogue: optional alpha/beta scaling, optional row- and
    column-vector bias broadcast, optional accumulate-into-output."""

    # No extra TMA tensormaps beyond the base GEMM's.
    num_epi_tensormaps: int = 0

    @dataclass
    class EpilogueArguments(ArgumentsBase):
        # Host-side arguments for the default epilogue.
        alpha: Optional[Float32 | cute.Tensor] = None  # scalar or 1-element tensor scale on acc
        beta: Optional[Float32 | cute.Tensor] = None  # scalar or 1-element tensor scale on C
        mRowVecBroadcast: Optional[cute.Tensor] = None  # (l, n) bias broadcast over rows
        mColVecBroadcast: Optional[cute.Tensor] = None  # (l, m) / (total_m,) bias over columns
        add_to_output: bool = False  # accumulate into existing D instead of overwriting

    @dataclass
    class EpilogueParams(ParamsBase):
        # Device-side parameters (broadcast tensors carry stride divisibility assumptions).
        alpha: Optional[Float32 | cute.Tensor] = None
        beta: Optional[Float32 | cute.Tensor] = None
        mRowVecBroadcast: Optional[cute.Tensor] = None
        mColVecBroadcast: Optional[cute.Tensor] = None
37
+
38
    def epi_to_underlying_arguments(
        self, args: EpilogueArguments, *, loc=None, ip=None
    ) -> EpilogueParams:
        """Convert host EpilogueArguments into device EpilogueParams, annotating
        dynamic strides of the broadcast tensors with divisibility assumptions
        so the compiler can vectorize their copies."""
        # Assume all strides are divisible by 32 bits except the last stride
        new_stride = lambda t: tuple(
            cute.assume(s, divby=32 // t.element_type.width) if not cute.is_static(s) else s
            for s in t.stride
        )
        mRowVecBroadcast, mColVecBroadcast = [
            cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
            if t is not None
            else None
            for t in (args.mRowVecBroadcast, args.mColVecBroadcast)
        ]
        return self.EpilogueParams(
            alpha=args.alpha,
            beta=args.beta,
            mRowVecBroadcast=mRowVecBroadcast,
            mColVecBroadcast=mColVecBroadcast,
        )
58
+
59
+ @cute.jit
60
+ def epi_begin(
61
+ self,
62
+ params: EpilogueParams,
63
+ epi_smem_tensors: Tuple[cute.Tensor, ...],
64
+ epi_tile: cute.Tile,
65
+ tiled_copy_t2r: Optional[cute.TiledCopy],
66
+ tiled_copy_r2s: cute.TiledCopy,
67
+ tile_coord_mnkl: cute.Coord,
68
+ varlen_manager: VarlenManager,
69
+ epilogue_barrier: cutlass.pipeline.NamedBarrier,
70
+ tidx: Int32,
71
+ ):
72
+ alpha, beta = None, None
73
+ if const_expr(hasattr(params, "alpha") and params.alpha is not None):
74
+ alpha = utils.load_scalar_or_pointer(params.alpha)
75
+ if const_expr(hasattr(params, "beta") and params.beta is not None):
76
+ beta = utils.load_scalar_or_pointer(params.beta)
77
+ sRowVec, sColVec, *rest = epi_smem_tensors
78
+ tile_M, tile_N = self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[1]
79
+ batch_idx = tile_coord_mnkl[3]
80
+ num_epi_threads = self.num_epi_warps * cute.arch.WARP_SIZE
81
+ # Don't need sync as we assume the previous epilogue has finished
82
+
83
+ partition_for_epilogue_fn = partial(
84
+ partition_for_epilogue,
85
+ epi_tile=epi_tile,
86
+ tiled_copy=tiled_copy_t2r if tiled_copy_t2r is not None else tiled_copy_r2s,
87
+ tidx=tidx,
88
+ reference_src=tiled_copy_t2r is None,
89
+ )
90
+
91
+ tDsRowVec = None
92
+ if const_expr(params.mRowVecBroadcast is not None):
93
+ rowvec_dtype = params.mRowVecBroadcast.element_type
94
+ num_copy_elems = const_expr(max(32, rowvec_dtype.width)) // rowvec_dtype.width
95
+ thr_copy_RV = copy_utils.tiled_copy_1d(
96
+ params.mRowVecBroadcast.element_type, num_epi_threads, num_copy_elems, is_async=True
97
+ ).get_slice(tidx)
98
+ mRowVec = params.mRowVecBroadcast[batch_idx, None]
99
+ gRowVec = cute.local_tile(mRowVec, (tile_N,), (tile_coord_mnkl[1],))
100
+ tRVgRV = thr_copy_RV.partition_S(gRowVec)
101
+ tRVsRV = thr_copy_RV.partition_D(sRowVec)
102
+ tRVcRV = thr_copy_RV.partition_S(cute.make_identity_tensor(tile_N))
103
+ limit_n = min(mRowVec.shape[0] - tile_coord_mnkl[1] * tile_N, tile_N)
104
+ tRVpRV = cute.make_fragment((1, cute.size(tRVsRV.shape[1])), Boolean)
105
+ for m in cutlass.range(cute.size(tRVsRV.shape[1]), unroll_full=True):
106
+ tRVpRV[0, m] = tRVcRV[0, m] < limit_n
107
+ cute.copy(thr_copy_RV, tRVgRV, tRVsRV, pred=tRVpRV)
108
+ # (CPY, CPY_M, CPY_N, EPI_M, EPI_N)
109
+ tDsRowVec = partition_for_epilogue_fn(
110
+ cute.make_tensor(
111
+ sRowVec.iterator, cute.make_layout((tile_M, tile_N), stride=(0, 1))
112
+ )
113
+ )
114
+ if const_expr(tiled_copy_t2r is not None):
115
+ tDsRowVec = tiled_copy_r2s.retile(tDsRowVec)
116
+
117
+ tDsColVec = None
118
+ if const_expr(params.mColVecBroadcast is not None):
119
+ colvec_dtype = params.mColVecBroadcast.element_type
120
+ num_copy_elems = const_expr(max(32, colvec_dtype.width)) // colvec_dtype.width
121
+ thr_copy_CV = copy_utils.tiled_copy_1d(
122
+ params.mColVecBroadcast.element_type, num_epi_threads, num_copy_elems, is_async=True
123
+ ).get_slice(tidx)
124
+ if const_expr(not varlen_manager.varlen_m):
125
+ mColVec = params.mColVecBroadcast[batch_idx, None]
126
+ else:
127
+ mColVec = cute.domain_offset(
128
+ (varlen_manager.params.cu_seqlens_m[batch_idx],), params.mColVecBroadcast
129
+ )
130
+ gColVec = cute.local_tile(mColVec, (tile_M,), (tile_coord_mnkl[0],))
131
+ tCVgCV = thr_copy_CV.partition_S(gColVec)
132
+ tCVsCV = thr_copy_CV.partition_D(sColVec)
133
+ tCVcCV = thr_copy_CV.partition_S(cute.make_identity_tensor(tile_M))
134
+ limit_m = min(varlen_manager.len_m(batch_idx) - tile_coord_mnkl[0] * tile_M, tile_M)
135
+ tCVpCV = cute.make_fragment((1, cute.size(tCVsCV.shape[1])), Boolean)
136
+ for m in cutlass.range(cute.size(tCVsCV.shape[1]), unroll_full=True):
137
+ tCVpCV[0, m] = tCVcCV[0, m] < limit_m
138
+ cute.copy(thr_copy_CV, tCVgCV, tCVsCV, pred=tCVpCV)
139
+ tDsColVec = partition_for_epilogue_fn(
140
+ cute.make_tensor(
141
+ sColVec.iterator, cute.make_layout((tile_M, tile_N), stride=(1, 0))
142
+ )
143
+ )
144
+ if const_expr(tiled_copy_t2r is not None):
145
+ tDsColVec = tiled_copy_r2s.retile(tDsColVec)
146
+
147
+ if const_expr(params.mRowVecBroadcast is not None or params.mColVecBroadcast is not None):
148
+ cute.arch.cp_async_commit_group()
149
+ cute.arch.cp_async_wait_group(0)
150
+ epilogue_barrier.arrive_and_wait()
151
+ return alpha, beta, tDsRowVec, tDsColVec
152
+
153
+ def epi_begin_loop(self, params: EpilogueParams, epi_tensors, epi_coord: cute.Coord):
154
+ alpha, beta, tDsRowVec, tDsColVec = epi_tensors
155
+ tDrRowVec_cvt = None
156
+ if const_expr(tDsRowVec is not None):
157
+ tDsRowVec_cur = cute.group_modes(tDsRowVec, 3, cute.rank(tDsRowVec))[
158
+ None, None, None, epi_coord
159
+ ]
160
+ # tDrRowVec = cute.make_fragment_like(tDsRowVec_cur)
161
+ tDrRowVec = cute.make_fragment(tDsRowVec_cur.layout, tDsRowVec_cur.element_type)
162
+ cute.autovec_copy(cute.filter_zeros(tDsRowVec_cur), cute.filter_zeros(tDrRowVec))
163
+ tDrRowVec_cvt = cute.make_fragment_like(tDrRowVec, self.acc_dtype)
164
+ tDrRowVec_cvt.store(tDrRowVec.load().to(self.acc_dtype))
165
+ tDrColVec_cvt = None
166
+ if const_expr(tDsColVec is not None):
167
+ tDsColVec_cur = cute.group_modes(tDsColVec, 3, cute.rank(tDsColVec))[
168
+ None, None, None, epi_coord
169
+ ]
170
+ # This somehow doesn't work, some dim with stride 0 turns to non-zero stride
171
+ # tDrRowVec = cute.make_fragment_like(tDsRowVec_cur)
172
+ tDrColVec = cute.make_fragment(tDsColVec_cur.layout, tDsColVec_cur.element_type)
173
+ cute.autovec_copy(cute.filter_zeros(tDsColVec_cur), cute.filter_zeros(tDrColVec))
174
+ tDrColVec_cvt = cute.make_fragment_like(tDrColVec, self.acc_dtype)
175
+ tDrColVec_cvt.store(tDrColVec.load().to(self.acc_dtype))
176
+ return alpha, beta, tDrRowVec_cvt, tDrColVec_cvt
177
+
178
+ @cute.jit
179
+ def epi_visit_subtile(
180
+ self,
181
+ params: EpilogueParams,
182
+ epi_loop_tensors: Tuple[cute.Tensor, ...],
183
+ tRS_rD: cute.Tensor,
184
+ tRS_rC: Optional[cute.Tensor] = None,
185
+ ) -> Optional[cute.Tensor]:
186
+ alpha, beta, tDrRowVec, tDrColVec = epi_loop_tensors
187
+ rD = tRS_rD.load()
188
+ # Apply alpha scaling to accumulator if alpha is provided (not None)
189
+ if const_expr(hasattr(params, "alpha") and params.alpha is not None):
190
+ alpha = utils.load_scalar_or_pointer(params.alpha)
191
+ rD *= alpha
192
+ # Apply C with beta scaling
193
+ if const_expr(tRS_rC is not None):
194
+ if const_expr(not hasattr(params, "beta") or params.beta is None):
195
+ # beta is None, default behavior: add C (beta=1.0)
196
+ rD += tRS_rC.load().to(tRS_rD.element_type)
197
+ else:
198
+ beta = utils.load_scalar_or_pointer(params.beta)
199
+ rD += beta * tRS_rC.load().to(tRS_rD.element_type)
200
+ tRS_rD.store(rD)
201
+ if const_expr(tDrRowVec is not None):
202
+ for i in cutlass.range(cute.size(tDrRowVec), unroll_full=True):
203
+ tRS_rD[i] += tDrRowVec[i]
204
+ if const_expr(tDrColVec is not None):
205
+ for i in cutlass.range(cute.size(tDrColVec), unroll_full=True):
206
+ tRS_rD[i] += tDrColVec[i]
207
+ return None
208
+
209
+ @staticmethod
210
+ def epi_smem_bytes_per_stage(
211
+ args: Optional[EpilogueArguments],
212
+ cta_tile_shape_mnk: Tuple[int, int, int],
213
+ epi_tile: cute.Tile,
214
+ ) -> int:
215
+ row_vec_smem_size = 0 if args.mRowVecBroadcast is None else cta_tile_shape_mnk[1]
216
+ col_vec_smem_size = 0 if args.mColVecBroadcast is None else cta_tile_shape_mnk[0]
217
+ row_vec_dtype = (
218
+ args.mRowVecBroadcast.element_type if args.mRowVecBroadcast is not None else Float32
219
+ )
220
+ col_vec_dtype = (
221
+ args.mColVecBroadcast.element_type if args.mColVecBroadcast is not None else Float32
222
+ )
223
+ return (
224
+ row_vec_smem_size * row_vec_dtype.width + col_vec_smem_size * col_vec_dtype.width
225
+ ) // 8
226
+
227
+ def epi_get_smem_struct(self, params: EpilogueParams):
228
+ row_vec_smem_size = 0 if params.mRowVecBroadcast is None else self.cta_tile_shape_mnk[1]
229
+ col_vec_smem_size = 0 if params.mColVecBroadcast is None else self.cta_tile_shape_mnk[0]
230
+ row_vec_dtype = (
231
+ params.mRowVecBroadcast.element_type if params.mRowVecBroadcast is not None else Float32
232
+ )
233
+ col_vec_dtype = (
234
+ params.mColVecBroadcast.element_type if params.mColVecBroadcast is not None else Float32
235
+ )
236
+
237
+ @cute.struct
238
+ class EpiSharedStorage:
239
+ sRowVec: cute.struct.Align[cute.struct.MemRange[row_vec_dtype, row_vec_smem_size], 16]
240
+ sColVec: cute.struct.Align[cute.struct.MemRange[col_vec_dtype, col_vec_smem_size], 16]
241
+
242
+ return EpiSharedStorage
243
+
244
+ def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
245
+ sRowVec = None
246
+ if const_expr(params.mRowVecBroadcast is not None):
247
+ sRowVec = storage.epi.sRowVec.get_tensor(cute.make_layout(self.cta_tile_shape_mnk[1]))
248
+ sColVec = None
249
+ if const_expr(params.mColVecBroadcast is not None):
250
+ sColVec = storage.epi.sColVec.get_tensor(cute.make_layout(self.cta_tile_shape_mnk[0]))
251
+ return (sRowVec, sColVec)
252
+
253
+
254
class GemmDefaultSm90(GemmDefaultEpiMixin, GemmSm90):
    """SM90 GEMM kernel with the default alpha/beta + broadcast epilogue."""

    pass
256
+
257
+
258
class GemmDefaultSm100(GemmDefaultEpiMixin, GemmSm100):
    """SM100 GEMM kernel with the default alpha/beta + broadcast epilogue."""

    pass
build/torch-cuda/quack/gemm_interface.py ADDED
@@ -0,0 +1,1058 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao
2
+ from typing import Optional, Tuple, Literal
3
+ from functools import partial
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+ from ._ops_compat import add_quack_op_namespace_prefix
9
+
10
+ from .gemm_config import GemmConfig, get_all_configs
11
+
12
+ from .autotuner import autotune, AutotuneConfig
13
+ from .cute_dsl_utils import get_device_capacity
14
+ from .gemm import gemm as gemm_sm90_sm100
15
+ from .gemm_act import gemm_act as gemm_act_sm90_sm100
16
+ from .gemm_dact import gemm_dact as gemm_dact_sm90_sm100
17
+ from .gemm_symmetric import gemm_symmetric as gemm_symmetric_sm90_sm100
18
+
19
+
20
# Dictionary mapping activation names to PyTorch functions.
# None maps to the identity so callers can index unconditionally.
def _act_identity(x):
    return x


def _act_relu_squared(x):
    return F.relu(x).square()


act_to_pytorch_fn_map = {
    None: _act_identity,
    "relu": F.relu,
    "relu_sq": _act_relu_squared,
    "gelu_tanh_approx": partial(F.gelu, approximate="tanh"),
}
27
+
28
+
29
# Dictionary mapping gated activation names to their forward functions.
# Each entry takes (gate, up) and returns the post-activation value.
def _swiglu(gate, up):
    return F.silu(gate) * up


def _swiglu_oai(gate, up):
    return gate * torch.sigmoid(1.702 * gate) * (up + 1)


def _reglu(gate, up):
    return F.relu(gate) * up


def _geglu(gate, up):
    return F.gelu(gate, approximate="tanh") * up


def _glu(gate, up):
    return torch.sigmoid(gate) * up


gated_to_pytorch_fn_map = {
    "swiglu": _swiglu,
    "swiglu_oai": _swiglu_oai,
    "reglu": _reglu,
    "geglu": _geglu,
    "glu": _glu,
}
38
+
39
+
40
+ def _get_default_device_capacity():
41
+ if not torch.cuda.is_available():
42
+ return (9, 0)
43
+ cap = get_device_capacity(torch.device("cuda"))
44
+ if cap[0] not in (9, 10):
45
+ return (9, 0)
46
+ return cap
47
+
48
+
49
+ class _LazyDeviceCapacity:
50
+ """Defer torch.cuda.get_device_capability until first access so the
51
+ module can be imported in environments without a GPU (e.g. nix build)."""
52
+ _value = None
53
+ def __getitem__(self, idx):
54
+ if self._value is None:
55
+ self._value = _get_default_device_capacity()
56
+ return self._value[idx]
57
+
58
+
59
# Module-level singleton, indexed like a (major, minor) capability tuple.
# Lazy so that importing this module does not require a visible GPU.
default_device_capacity = _LazyDeviceCapacity()
60
+
61
+
62
def default_config(device):
    """Fallback GemmConfig used when autotuning is skipped or has no entry."""
    if get_device_capacity(device)[0] == 10:
        # SM100: bigger tiles, no pingpong scheduling.
        return GemmConfig(tile_m=256, tile_n=256, cluster_m=2, cluster_n=1, pingpong=False)
    # SM90 (and anything else): pingpong with smaller tiles.
    return GemmConfig(tile_m=128, tile_n=192, cluster_m=2, cluster_n=1, pingpong=True)
67
+
68
+
69
def prune_invalid_gemm_configs(configs, named_args: dict, **kwargs):
    """Early-prune autotune candidates that the requested GEMM variant cannot run."""
    merged = {**named_args, **kwargs}
    uses_gather_A = merged.get("A_idx") is not None
    uses_varlen_m = merged.get("cu_seqlens_m") is not None
    if uses_varlen_m or uses_gather_A:
        # Neither variant supports the swap_ab trick.
        configs = [c for c in configs if not c.kwargs["config"].swap_ab]
    if uses_gather_A:
        if get_device_capacity(merged["A"].device)[0] == 9:
            # tile_n == 208 causes register spills, as gather_A requires more
            # registers for the producer; gather_A also needs cluster_n == 1.
            configs = [
                c
                for c in configs
                if c.kwargs["config"].cluster_n == 1 and c.kwargs["config"].tile_n != 208
            ]
    return configs
84
+
85
+
86
@autotune(
    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
    key=["dynamic_scheduler"],
    prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
)
def gemm_tuned(
    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    bias: Optional[Tensor] = None,  # (N,) or (L, N)
    alpha: float | Tensor = 1.0,  # (1,)
    beta: float | Tensor = 1.0,  # (1,)
    cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
    cu_seqlens_k: Optional[Tensor] = None,  # (L+1), int32
    A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
    batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
    add_to_output: bool = False,
    dynamic_scheduler: bool = False,
    config: Optional[GemmConfig] = None,
) -> None:
    """Autotuned entry: normalize shapes and dispatch to the CuTe GEMM kernel.

    Writes the result into the pre-allocated ``out``. When ``config.swap_ab``
    is set, operands and outputs are swapped/transposed so the kernel computes
    the same product with A and B exchanged.
    """
    if config is None:
        config = default_config(A.device)
    varlen_m = cu_seqlens_m is not None
    varlen_k = cu_seqlens_k is not None
    varlen = varlen_m or varlen_k
    gather_A = A_idx is not None
    if gather_A:
        assert varlen, "gather_A requires either varlen_m or varlen_k"
        assert config.cluster_n == 1, "gather_A requires cluster_n=1"
    if varlen_m:
        assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
    # Promote 2-D inputs to a batch of one (varlen layouts stay packed 2-D).
    if A.ndim == 2 and not varlen:
        A = A.unsqueeze(0)  # (1, M, K)
    B = B.mT  # (N, K) or (L, N, K) or (N, total_K)
    if B.ndim == 2 and not varlen_k:
        B = B.unsqueeze(0)  # (1, N, K)
    if C is not None and C.ndim == 2 and not varlen_m:
        C = C.unsqueeze(0)  # (1, M, N)
    if out.ndim == 2 and not varlen_m:
        out = out.unsqueeze(0)
    if bias is not None and bias.ndim == 1:
        bias = bias.unsqueeze(0)  # (L, N)
    batch_size = B.shape[0] if not varlen_k else cu_seqlens_k.shape[0] - 1
    if varlen_m:
        # If gather_A (A_idx provided), use its length; otherwise use A.shape[0]
        total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
        out_shape = (total_m, B.shape[-2])
    else:
        out_shape = (batch_size, A.shape[-2], B.shape[-2])
    assert out.shape == out_shape, f"out shape mismatch: {out.shape} vs {out_shape}"
    # Dynamic scheduling needs a device counter for tile handout.
    tile_count_semaphore = (
        torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
    )
    gemm_sm90_sm100(
        A if not config.swap_ab else B,
        B if not config.swap_ab else A,
        out if not config.swap_ab else out.mT,
        (C if not config.swap_ab else C.mT) if C is not None else None,
        tile_count_semaphore,
        config.tile_m,
        config.tile_n,
        config.cluster_m,
        config.cluster_n,
        config.pingpong,
        persistent=True,
        max_swizzle_size=config.max_swizzle_size,
        # After swap_ab the bias broadcasts along the other axis.
        rowvec_bias=bias if not config.swap_ab else None,
        colvec_bias=bias if config.swap_ab else None,
        alpha=alpha,
        beta=beta,
        cu_seqlens_m=cu_seqlens_m,
        cu_seqlens_k=cu_seqlens_k,
        A_idx=A_idx,
        batch_idx_permute=batch_idx_permute,
        add_to_output=add_to_output,
    )
164
+
165
+
166
@autotune(
    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
    key=["activation", "dynamic_scheduler"],
    prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
)
def gemm_act_tuned(
    # (M, K) or or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N)
    # (M, N) or (L, M, N) or (total_M, N) if varlen_m - None if not storing preact
    preact_out: Optional[Tensor],
    postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    bias: Optional[Tensor] = None,  # (N,) or (L, N)
    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
    cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    dynamic_scheduler: bool = False,
    config: Optional[GemmConfig] = None,
) -> None:
    """Autotuned GEMM + fused activation epilogue.

    Writes the activated result into ``postact_out``; additionally stores the
    pre-activation matmul result into ``preact_out`` when that is not None.
    """
    if config is None:
        config = default_config(A.device)
    varlen_m = cu_seqlens_m is not None
    if varlen_m:
        assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
    # Promote 2-D inputs to a batch of one (varlen layouts stay packed 2-D).
    if A.ndim == 2 and not varlen_m:
        A = A.unsqueeze(0)  # (1, M, K)
    B = B.mT  # (N, K) or (L, N, K)
    if B.ndim == 2:
        B = B.unsqueeze(0)  # (1, N, K)
    if C is not None and C.ndim == 2 and not varlen_m:
        C = C.unsqueeze(0)  # (1, M, N)
    if preact_out is not None and preact_out.ndim == 2 and not varlen_m:
        D = preact_out.unsqueeze(0)
    else:
        D = preact_out
    if postact_out.ndim == 2 and not varlen_m:
        PostAct = postact_out.unsqueeze(0)
    else:
        PostAct = postact_out
    if bias is not None and bias.ndim == 1:
        bias = bias.unsqueeze(0)  # (L, N)
    # Dynamic scheduling needs a device counter for tile handout.
    tile_count_semaphore = (
        torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
    )
    gemm_act_sm90_sm100(
        A if not config.swap_ab else B,
        B if not config.swap_ab else A,
        (D if not config.swap_ab else D.mT) if D is not None else None,
        (C if not config.swap_ab else C.mT) if C is not None else None,
        PostAct if not config.swap_ab else PostAct.mT,
        tile_count_semaphore,
        activation,
        config.tile_m,
        config.tile_n,
        config.cluster_m,
        config.cluster_n,
        config.pingpong,
        persistent=True,
        max_swizzle_size=config.max_swizzle_size,
        # After swap_ab the bias broadcasts along the other axis.
        rowvec_bias=bias if not config.swap_ab else None,
        colvec_bias=bias if config.swap_ab else None,
        cu_seqlens_m=cu_seqlens_m,
        A_idx=A_idx,
    )
231
+
232
+
233
@autotune(
    configs=[AutotuneConfig(config=c) for c in get_all_configs(default_device_capacity[0])],
    key=["activation", "dynamic_scheduler"],
    prune_configs_by={"early_config_prune": prune_invalid_gemm_configs},
)
def gemm_dact_tuned(
    # (M, K) or or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N)
    PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    dx_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    postact_out: Tensor,  # (M, N) or (L, N, N) or (total_M, N) if varlen_m
    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
    cu_seqlens_m: Optional[Tensor] = None,  # (L+1), int32
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    dynamic_scheduler: bool = True,
    config: Optional[GemmConfig] = None,
) -> None:
    """Autotuned GEMM fused with an activation-backward epilogue.

    Given the saved ``PreAct`` values, writes the activation gradient into
    ``dx_out`` and the recomputed post-activation into ``postact_out``.
    """
    if config is None:
        config = default_config(A.device)
    varlen_m = cu_seqlens_m is not None
    if varlen_m:
        assert not config.swap_ab, "Variable-length sequences not supported with swap_ab"
    # Promote 2-D inputs to a batch of one (varlen layouts stay packed 2-D).
    if A.ndim == 2 and not varlen_m:
        A = A.unsqueeze(0)  # (1, M, K)
    B = B.mT  # (N, K) or (L, N, K)
    if B.ndim == 2:
        B = B.unsqueeze(0)  # (1, N, K)
    if PreAct.ndim == 2 and not varlen_m:
        PreAct = PreAct.unsqueeze(0)  # (1, M, N)
    if dx_out.ndim == 2 and not varlen_m:
        D = dx_out.unsqueeze(0)
    else:
        D = dx_out
    if postact_out.ndim == 2 and not varlen_m:
        PostAct = postact_out.unsqueeze(0)
    else:
        PostAct = postact_out
    # Dynamic scheduling needs a device counter for tile handout.
    tile_count_semaphore = (
        torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
    )
    gemm_dact_sm90_sm100(
        A if not config.swap_ab else B,
        B if not config.swap_ab else A,
        D if not config.swap_ab else D.mT,
        PreAct if not config.swap_ab else PreAct.mT,
        PostAct if not config.swap_ab else PostAct.mT,
        tile_count_semaphore,
        activation,
        config.tile_m,
        config.tile_n,
        config.cluster_m,
        config.cluster_n,
        config.pingpong,
        persistent=True,
        max_swizzle_size=config.max_swizzle_size,
        cu_seqlens_m=cu_seqlens_m,
        A_idx=A_idx,
    )
292
+
293
+
294
def gemm(
    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    bias: Optional[Tensor] = None,  # (N,) or (L, N)
    alpha: float | Tensor = 1.0,
    out_dtype: Optional[torch.dtype] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    cu_seqlens_k: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
    batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
    dynamic_scheduler: bool = False,
    tuned: bool = True,
) -> Tensor:
    """GEMM with optional output tensor and tuning control.

    Allocates ``out`` when not provided (shape depends on the varlen mode)
    and dispatches to the ``gemm_out`` custom op. Returns ``out``.
    """
    if out is None:
        out_dtype = A.dtype if out_dtype is None else out_dtype
        varlen_m = cu_seqlens_m is not None
        varlen_k = cu_seqlens_k is not None
        if varlen_m:
            # With gather_A the output row count comes from A_idx, not A.
            total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
            out_shape = (total_m, B.shape[-1])
        elif varlen_k:
            L = cu_seqlens_k.shape[0] - 1
            # For varlen_k, the first dimension is always A.shape[0] (M dimension)
            out_shape = (L, A.shape[0], B.shape[-1])
        else:
            out_shape = (
                (A.shape[0], B.shape[-1]) if A.ndim == 2 else (A.shape[0], A.shape[-2], B.shape[-1])
            )
        out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
    # torch.library needs one fixed type per argument, so a tensor-valued
    # alpha travels separately while the float slot falls back to 1.0.
    alpha_tensor = alpha if not isinstance(alpha, float) else None
    alpha = alpha if isinstance(alpha, float) else 1.0
    gemm_out(
        A,
        B,
        out,
        bias=bias,
        alpha=alpha,
        alpha_tensor=alpha_tensor,
        cu_seqlens_m=cu_seqlens_m,
        cu_seqlens_k=cu_seqlens_k,
        A_idx=A_idx,
        batch_idx_permute=batch_idx_permute,
        dynamic_scheduler=dynamic_scheduler,
        tuned=tuned,
    )
    return out
343
+
344
+
345
@torch.library.custom_op(
    add_quack_op_namespace_prefix("gemm_out"),
    mutates_args=("out",),
    device_types="cuda",
    # We have to split out alpha and alpha_tensor since torch.library requires
    # each argument to have a fixed type
    # schema="(Tensor A, Tensor B, Tensor(a2!) out, Tensor? bias, float alpha=1.0, Tensor? alpha_tensor=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
)
def gemm_out(
    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    bias: Optional[Tensor] = None,  # (N,) or (L, N)
    alpha: float = 1.0,
    alpha_tensor: Optional[Tensor] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    cu_seqlens_k: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
    batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
    dynamic_scheduler: bool = False,
    tuned: bool = True,
) -> None:
    """GEMM with pre-allocated output tensor (registered as a custom op)."""
    # gemm_tuned.fn is the undecorated function: bypasses the autotuner cache.
    fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
    # A tensor alpha (if provided) takes precedence over the float slot.
    alpha = alpha_tensor if alpha_tensor is not None else alpha
    fn(
        A,
        B,
        out,
        C=None,
        bias=bias,
        alpha=alpha,
        cu_seqlens_m=cu_seqlens_m,
        cu_seqlens_k=cu_seqlens_k,
        A_idx=A_idx,
        batch_idx_permute=batch_idx_permute,
        dynamic_scheduler=dynamic_scheduler,
    )
384
+
385
+
386
+ def gemm_ref(
387
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
388
+ A: Tensor,
389
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
390
+ out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
391
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
392
+ alpha: float | Tensor = 1.0,
393
+ cu_seqlens_m: Optional[Tensor] = None,
394
+ cu_seqlens_k: Optional[Tensor] = None,
395
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
396
+ out_dtype: Optional[torch.dtype] = None,
397
+ ) -> Tensor:
398
+ """Reference implementation for GEMM with pre-allocated output."""
399
+ # The out_dtype argument requires torch >= 2.8
400
+ out_dtype = A.dtype if out_dtype is None else out_dtype
401
+ if cu_seqlens_m is None and cu_seqlens_k is None:
402
+ fn = torch.bmm if A.ndim == 3 else torch.mm
403
+ out = fn(A, B, out_dtype=out_dtype, out=out)
404
+ if not isinstance(alpha, float) or alpha != 1.0:
405
+ out *= alpha
406
+ if bias is not None:
407
+ bias = bias if A.ndim == 2 else bias.unsqueeze(1)
408
+ out += bias
409
+ elif cu_seqlens_m is not None:
410
+ # Handle varlen_m case
411
+ if out is None:
412
+ # When gather_A (A_idx provided), output size is determined by A_idx length
413
+ total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
414
+ out = torch.empty((total_m, B.shape[-1]), dtype=out_dtype, device=A.device)
415
+ for i in range(cu_seqlens_m.shape[0] - 1):
416
+ A_slice = (
417
+ A[A_idx[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]]
418
+ if A_idx is not None
419
+ else A[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
420
+ )
421
+ torch.mm(A_slice, B[i], out=out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]])
422
+ if not isinstance(alpha, float) or alpha != 1.0:
423
+ out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]] *= alpha
424
+ if bias is not None:
425
+ out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]] += bias[i]
426
+ else: # cu_seqlens_k is not None
427
+ L = cu_seqlens_k.shape[0] - 1
428
+ if out is None:
429
+ out = torch.empty((L, A.shape[0], B.shape[1]), dtype=out_dtype, device=A.device)
430
+ for i in range(L):
431
+ A_slice = (
432
+ A[:, A_idx[cu_seqlens_k[i] : cu_seqlens_k[i + 1]]]
433
+ if A_idx is not None
434
+ else A[:, cu_seqlens_k[i] : cu_seqlens_k[i + 1]]
435
+ )
436
+ torch.mm(A_slice, B[cu_seqlens_k[i] : cu_seqlens_k[i + 1], :], out=out[i])
437
+ if not isinstance(alpha, float) or alpha != 1.0:
438
+ out *= alpha
439
+ if bias is not None:
440
+ out += bias
441
+ return out
442
+
443
+
444
def gemm_add(
    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    C: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k
    out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    alpha: float | Tensor = 1.0,
    beta: float | Tensor = 1.0,
    out_dtype: Optional[torch.dtype] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    cu_seqlens_k: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
    batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
    dynamic_scheduler: bool = False,
    tuned: bool = True,
) -> Tensor:
    """GEMM with addition and optional output tensor.

    Computes ``out = alpha * (A @ B) + beta * C`` via the ``gemm_add_out``
    custom op, allocating ``out`` when not provided. Returns ``out``.
    """
    if out is None:
        out_dtype = A.dtype if out_dtype is None else out_dtype
        varlen_m = cu_seqlens_m is not None
        varlen_k = cu_seqlens_k is not None
        if varlen_m:
            # If A_idx is provided (gather_A), use its length; otherwise use A.shape[0]
            total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
            out_shape = (total_m, B.shape[-1])
        elif varlen_k:
            L = cu_seqlens_k.shape[0] - 1
            # For varlen_k, the first dimension is always A.shape[0] (M dimension)
            out_shape = (L, A.shape[0], B.shape[-1])
        else:
            out_shape = (
                (A.shape[0], B.shape[-1]) if A.ndim == 2 else (A.shape[0], A.shape[-2], B.shape[-1])
            )
        out = torch.empty(out_shape, dtype=out_dtype, device=A.device)
    # When C *is* out (identity) with beta == 1.0, the kernel can accumulate
    # into out directly instead of reading C as a separate operand.
    add_to_output = C is out and isinstance(beta, float) and beta == 1.0 and cu_seqlens_m is None
    # torch.library needs one fixed type per argument, hence the scalar/tensor split.
    alpha_tensor = alpha if not isinstance(alpha, float) else None
    alpha = alpha if isinstance(alpha, float) else 1.0
    beta_tensor = beta if not isinstance(beta, float) else None
    beta = beta if isinstance(beta, float) else 1.0
    gemm_add_out(
        A,
        B,
        C if not add_to_output else None,
        out,
        alpha,
        beta,
        alpha_tensor,
        beta_tensor,
        cu_seqlens_m=cu_seqlens_m,
        cu_seqlens_k=cu_seqlens_k,
        A_idx=A_idx,
        batch_idx_permute=batch_idx_permute,
        add_to_output=add_to_output,
        dynamic_scheduler=dynamic_scheduler,
        tuned=tuned,
    )
    return out
501
+
502
+
503
@torch.library.custom_op(
    add_quack_op_namespace_prefix("gemm_add_out"),
    mutates_args=("out",),
    device_types="cuda",
    # We have to split out alpha and alpha_tensor since torch.library requires
    # each argument to have a fixed type
    # schema="(Tensor A, Tensor B, Tensor C, Tensor(a3!) out, float alpha=1.0, float beta=1.0, Tensor? alpha_tensor=None, Tensor? beta_tensor=None, Tensor? cu_seqlens_m=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
)
def gemm_add_out(
    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    C: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k
    out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    alpha: float = 1.0,
    beta: float = 1.0,
    alpha_tensor: Optional[Tensor] = None,
    beta_tensor: Optional[Tensor] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    cu_seqlens_k: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
    batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
    add_to_output: bool = False,
    dynamic_scheduler: bool = False,
    tuned: bool = True,
) -> None:
    """GEMM with addition and pre-allocated output tensor (custom op)."""
    # gemm_tuned.fn is the undecorated function: bypasses the autotuner cache.
    fn = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
    # Tensor-valued alpha/beta (if provided) take precedence over the floats.
    alpha = alpha_tensor if alpha_tensor is not None else alpha
    beta = beta_tensor if beta_tensor is not None else beta
    fn(
        A,
        B,
        out,
        C,
        alpha=alpha,
        beta=beta,
        cu_seqlens_m=cu_seqlens_m,
        cu_seqlens_k=cu_seqlens_k,
        A_idx=A_idx,
        batch_idx_permute=batch_idx_permute,
        add_to_output=add_to_output,
        dynamic_scheduler=dynamic_scheduler,
    )
547
+
548
+
549
+ def gemm_add_ref(
550
+ # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
551
+ A: Tensor,
552
+ B: Tensor, # (K, N) or (L, K, N) or (total_K, N) if varlen_k
553
+ C: Tensor, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
554
+ bias: Optional[Tensor] = None, # (N,) or (L, N)
555
+ out: Optional[Tensor] = None, # (M, N) or (L, M, N) or (total_M, N) if varlen_m
556
+ alpha: float | Tensor = 1.0,
557
+ beta: float | Tensor = 1.0,
558
+ cu_seqlens_m: Optional[Tensor] = None,
559
+ cu_seqlens_k: Optional[Tensor] = None,
560
+ A_idx: Optional[Tensor] = None, # (total_M,) or (total_K,) indices for gather_A when varlen
561
+ out_dtype: Optional[torch.dtype] = None,
562
+ ) -> Tensor:
563
+ """Reference implementation for GEMM with addition and pre-allocated output."""
564
+ if cu_seqlens_m is None and cu_seqlens_k is None:
565
+ if isinstance(alpha, float) and isinstance(beta, float):
566
+ out = torch.addmm(C, A, B, out_dtype=out_dtype, alpha=alpha, beta=beta, out=out)
567
+ else:
568
+ out_dtype = (
569
+ out.dtype if out is not None else (out_dtype if out_dtype is not None else A.dtype)
570
+ )
571
+ result = (alpha * (A @ B) + beta * C).to(out_dtype)
572
+ if out is not None:
573
+ out.copy_(result)
574
+ if bias is not None:
575
+ bias = bias if A.ndim == 2 else bias.unsqueeze(1)
576
+ out += bias
577
+ elif cu_seqlens_m is not None:
578
+ # Handle varlen_m case
579
+ if out is None:
580
+ # When gather_A (A_idx provided), output size is determined by A_idx length
581
+ total_m = A_idx.shape[0] if A_idx is not None else A.shape[0]
582
+ out_dtype = out_dtype if out_dtype is not None else A.dtype
583
+ out = torch.empty((total_m, B.shape[-1]), dtype=out_dtype, device=A.device)
584
+ for i in range(cu_seqlens_m.shape[0] - 1):
585
+ A_slice = (
586
+ A[A_idx[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]]
587
+ if A_idx is not None
588
+ else A[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
589
+ )
590
+ C_slice = C[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
591
+ out_slice = out[cu_seqlens_m[i] : cu_seqlens_m[i + 1]]
592
+ result = alpha * torch.mm(A_slice, B[i]) + beta * C_slice
593
+ if bias is not None:
594
+ result += bias[i]
595
+ out_slice.copy_(result)
596
+ else: # cu_seqlens_k is not None
597
+ # Handle varlen_k case
598
+ L = cu_seqlens_k.shape[0] - 1
599
+ out_dtype = out_dtype if out_dtype is not None else A.dtype
600
+ if out is None:
601
+ out = torch.empty((L, A.shape[0], B.shape[1]), dtype=out_dtype, device=A.device)
602
+ for i in range(L):
603
+ A_slice = (
604
+ A[:, A_idx[cu_seqlens_k[i] : cu_seqlens_k[i + 1]]]
605
+ if A_idx is not None
606
+ else A[:, cu_seqlens_k[i] : cu_seqlens_k[i + 1]]
607
+ )
608
+ B_slice = B[cu_seqlens_k[i] : cu_seqlens_k[i + 1], :]
609
+ result = alpha * torch.mm(A_slice, B_slice) + beta * C[i]
610
+ out[i].copy_(result)
611
+ if bias is not None:
612
+ out += bias
613
+ return out
614
+
615
+
616
def gemm_add_inplace(
    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k
    alpha: float | Tensor = 1.0,
    beta: float | Tensor = 1.0,
    cu_seqlens_m: Optional[Tensor] = None,
    cu_seqlens_k: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
    batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
    dynamic_scheduler: bool = False,
    tuned: bool = True,
) -> None:
    """In-place GEMM with addition: out = alpha * A @ B + beta * out.

    Args:
        A: (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k - input tensor
        B: (K, N) or (L, K, N) or (total_K, N) if varlen_k - input tensor
        out: (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k - tensor accumulated into (modified in-place)
        alpha: Scalar (or tensor) multiplier for A @ B
        beta: Scalar (or tensor) multiplier for out
        cu_seqlens_m: Optional cumulative sequence lengths for variable M
        cu_seqlens_k: Optional cumulative sequence lengths for variable K
        dynamic_scheduler: Whether to use the dynamic scheduler
        tuned: Whether to use the autotuned configuration
    """
    # Split each scale factor into its scalar and tensor form, since the
    # registered custom op needs a fixed type per argument.
    if isinstance(alpha, float):
        alpha_scalar, alpha_t = alpha, None
    else:
        alpha_scalar, alpha_t = 1.0, alpha
    if isinstance(beta, float):
        beta_scalar, beta_t = beta, None
    else:
        beta_scalar, beta_t = 1.0, beta
    gemm_add_inplace_op(
        A,
        B,
        out,
        alpha_scalar,
        beta_scalar,
        alpha_t,
        beta_t,
        cu_seqlens_m,
        cu_seqlens_k,
        A_idx=A_idx,
        batch_idx_permute=batch_idx_permute,
        dynamic_scheduler=dynamic_scheduler,
        tuned=tuned,
    )
661
+
662
+
663
@torch.library.custom_op(
    add_quack_op_namespace_prefix("gemm_add_inplace"),
    mutates_args=("out",),
    device_types="cuda",
    # torch.library requires every argument to have a single fixed type, so the
    # scalar (float) and tensor forms of alpha/beta travel as separate arguments.
    # schema="(Tensor A, Tensor B, Tensor(a2!) out, float alpha=1.0, float beta=1.0, Tensor? alpha_tensor=None, Tensor? beta_tensor=None, Tensor? cu_seqlens_m=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
)
def gemm_add_inplace_op(
    # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A with varlen_m or (M, whatever) if gather_A with varlen_k
    A: Tensor,
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m or (L, M, N) if varlen_k
    alpha: float = 1.0,
    beta: float = 1.0,
    alpha_tensor: Optional[Tensor] = None,
    beta_tensor: Optional[Tensor] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    cu_seqlens_k: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) or (total_K,) indices for gather_A when varlen
    batch_idx_permute: Optional[Tensor] = None,  # (L,) permutation of batch indices for scheduler
    dynamic_scheduler: bool = False,
    tuned: bool = True,
) -> None:
    """Registered op backing gemm_add_inplace: out = alpha * A @ B + beta * out."""
    runner = gemm_tuned if tuned else partial(gemm_tuned.fn, config=None)
    if alpha_tensor is not None:
        alpha = alpha_tensor
    if beta_tensor is not None:
        beta = beta_tensor
    # With scalar beta == 1.0 (and no varlen_m) the kernel can accumulate
    # directly into `out`; otherwise `out` is also passed as the C operand.
    accumulate = isinstance(beta, float) and beta == 1.0 and cu_seqlens_m is None
    runner(
        A,
        B,
        out,
        None if accumulate else out,
        alpha=alpha,
        beta=beta,
        cu_seqlens_m=cu_seqlens_m,
        cu_seqlens_k=cu_seqlens_k,
        A_idx=A_idx,
        batch_idx_permute=batch_idx_permute,
        add_to_output=accumulate,
        dynamic_scheduler=dynamic_scheduler,
    )
706
+
707
+
708
def gemm_act(
    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
    B: Tensor,  # (K, N) or (L, K, N)
    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    bias: Optional[Tensor] = None,  # (N,) or (L, N)
    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
    preact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    postact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    out_dtype: Optional[torch.dtype] = None,
    postact_dtype: Optional[torch.dtype] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    store_preact: bool = True,
    dynamic_scheduler: bool = False,
    tuned: bool = True,
) -> Tuple[Optional[Tensor], Tensor]:
    """GEMM with activation, allocating output tensors when not supplied."""
    if out_dtype is None:
        out_dtype = A.dtype
    if postact_dtype is None:
        postact_dtype = A.dtype
    # Output row count: with gather_A (A_idx given) rows follow A_idx,
    # otherwise they follow A itself.
    if cu_seqlens_m is not None:
        rows = A_idx.shape[0] if A_idx is not None else A.shape[0]
        shape = (rows, B.shape[-1])
    elif A.ndim == 2:
        shape = (A.shape[0], B.shape[-1])
    else:
        shape = (A.shape[0], A.shape[-2], B.shape[-1])
    if store_preact and preact_out is None:
        preact_out = torch.empty(shape, dtype=out_dtype, device=A.device)
    if postact_out is None:
        postact_out = torch.empty(shape, dtype=postact_dtype, device=A.device)
    gemm_act_out(
        A,
        B,
        preact_out,
        postact_out,
        C,
        bias,
        activation,
        cu_seqlens_m,
        A_idx,
        dynamic_scheduler,
        tuned,
    )
    return preact_out, postact_out
754
+
755
+
756
@torch.library.custom_op(
    add_quack_op_namespace_prefix("gemm_act_out"),
    mutates_args=("preact_out", "postact_out"),
    device_types="cuda",
    schema="(Tensor A, Tensor B, Tensor(a2!)? preact_out, Tensor(a3!) postact_out, Tensor? C=None, Tensor? bias=None, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=False, bool tuned=True) -> ()",
)
def gemm_act_out(
    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
    B: Tensor,  # (K, N) or (L, K, N)
    preact_out: Optional[Tensor],  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    bias: Optional[Tensor] = None,  # (N,) or (L, N)
    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    dynamic_scheduler: bool = False,
    tuned: bool = True,
) -> None:
    """GEMM with activation writing into pre-allocated output tensors."""
    # Dispatch to the autotuned entry point, or its raw kernel with no config.
    runner = gemm_act_tuned if tuned else partial(gemm_act_tuned.fn, config=None)
    runner(A, B, preact_out, postact_out, C, bias, activation, cu_seqlens_m, A_idx, dynamic_scheduler)
778
+
779
+
780
def gemm_act_ref(
    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    bias: Optional[Tensor] = None,  # (N,) or (L, N)
    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    out_dtype: Optional[torch.dtype] = None,
    postact_dtype: Optional[torch.dtype] = None,
    store_preact: bool = True,
) -> Tuple[Optional[Tensor], Tensor]:
    """Reference implementation of GEMM followed by an activation.

    Returns (preact, postact); preact is None when store_preact is False.
    """
    if out_dtype is None:
        out_dtype = A.dtype
    if postact_dtype is None:
        postact_dtype = A.dtype
    # With a C operand this is a fused add; otherwise a plain GEMM.
    if C is not None:
        preact = gemm_add_ref(A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
    else:
        preact = gemm_ref(A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
    postact = act_to_pytorch_fn_map[activation](preact).to(postact_dtype)
    return (preact.to(out_dtype) if store_preact else None), postact
800
+
801
+
802
def gemm_dact(
    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
    B: Tensor,  # (K, N) or (L, K, N)
    PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
    dx_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    postact_out: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    out_dtype: Optional[torch.dtype] = None,
    postact_dtype: Optional[torch.dtype] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    dynamic_scheduler: bool = True,
    tuned: bool = True,
) -> Tuple[Tensor, Tensor]:
    """GEMM with activation gradient, allocating outputs when not supplied."""
    if out_dtype is None:
        out_dtype = A.dtype
    if postact_dtype is None:
        postact_dtype = PreAct.dtype
    # Output row count: with gather_A (A_idx given) rows follow A_idx,
    # otherwise they follow A itself.
    if cu_seqlens_m is not None:
        rows = A_idx.shape[0] if A_idx is not None else A.shape[0]
        shape = (rows, B.shape[-1])
    elif A.ndim == 2:
        shape = (A.shape[0], B.shape[-1])
    else:
        shape = (A.shape[0], A.shape[-2], B.shape[-1])
    if dx_out is None:
        dx_out = torch.empty(shape, dtype=out_dtype, device=A.device)
    if postact_out is None:
        postact_out = torch.empty(shape, dtype=postact_dtype, device=A.device)
    gemm_dact_out(
        A, B, PreAct, dx_out, postact_out, activation, cu_seqlens_m, A_idx, dynamic_scheduler, tuned
    )
    return dx_out, postact_out
836
+
837
+
838
@torch.library.custom_op(
    add_quack_op_namespace_prefix("gemm_dact_out"),
    mutates_args=("dx_out", "postact_out"),
    device_types="cuda",
    schema="(Tensor A, Tensor B, Tensor PreAct, Tensor(a3!) dx_out, Tensor(a4!) postact_out, str? activation=None, Tensor? cu_seqlens_m=None, Tensor? A_idx=None, bool dynamic_scheduler=True, bool tuned=True) -> ()",
)
def gemm_dact_out(
    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (whatever, K) if gather_A with varlen_m
    B: Tensor,  # (K, N) or (L, K, N)
    PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    dx_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    postact_out: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    dynamic_scheduler: bool = True,
    tuned: bool = True,
) -> None:
    """GEMM with activation gradient writing into pre-allocated outputs."""
    # Dispatch to the autotuned entry point, or its raw kernel with no config.
    runner = gemm_dact_tuned if tuned else partial(gemm_dact_tuned.fn, config=None)
    runner(A, B, PreAct, dx_out, postact_out, activation, cu_seqlens_m, A_idx, dynamic_scheduler)
859
+
860
+
861
def gemm_dact_ref(
    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    PreAct: Tensor,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    activation: Literal[None, "relu", "relu_sq", "gelu_tanh_approx"] = None,
    cu_seqlens_m: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    out_dtype: Optional[torch.dtype] = None,
    postact_dtype: Optional[torch.dtype] = None,
) -> Tuple[Tensor, Tensor]:
    """Reference implementation for GEMM with activation gradient.

    Returns (dx, postact) where dx backpropagates dout = A @ B through the
    activation at PreAct.
    """
    if out_dtype is None:
        out_dtype = A.dtype
    if postact_dtype is None:
        postact_dtype = PreAct.dtype
    dout = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx).to(out_dtype)
    act_fn = act_to_pytorch_fn_map[activation]
    postact = act_fn(PreAct)
    if activation is None:
        # Identity activation: the gradient passes straight through.
        dx = dout
    else:
        # Differentiate through the activation via autograd, restoring
        # PreAct's original requires_grad flag afterwards.
        saved_flag = PreAct.requires_grad
        PreAct.requires_grad_(True)
        dx = torch.autograd.grad(act_fn(PreAct), PreAct, dout, create_graph=False)[0]
        PreAct.requires_grad_(saved_flag)
    return dx.to(out_dtype), postact.to(postact_dtype)
886
+
887
+
888
def gemm_gated_ref(
    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    C: Optional[Tensor] = None,  # (M, N) or (L, M, N) or (total_M, N) if varlen_m
    bias: Optional[Tensor] = None,  # (N,) or (L, N)
    activation: Literal["glu", "swiglu", "swiglu_oai", "reglu", "geglu"] = "swiglu",
    cu_seqlens_m: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    out_dtype: Optional[torch.dtype] = None,
    postact_dtype: Optional[torch.dtype] = None,
    store_preact: bool = True,
) -> Tuple[Optional[Tensor], Tensor]:
    """Reference implementation for GEMM with gated activation forward.

    Args:
        A: (M, K) - input tensor
        B: (K, N) - weight tensor with gate and up projections interleaved
        C: (M, N) - optional bias tensor
        activation: Type of gated activation
        out_dtype: Output dtype for preact
        postact_dtype: Output dtype for postact
        store_preact: Whether to return the pre-activation

    Returns:
        (preact, postact) where:
        - preact: (M, N) pre-activation (if store_preact=True, else None)
        - postact: (M, N // 2) post-activation output
    """
    if out_dtype is None:
        out_dtype = A.dtype
    if postact_dtype is None:
        postact_dtype = A.dtype
    preact = (
        gemm_ref(A, B, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
        if C is None
        else gemm_add_ref(A, B, C, bias=bias, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx)
    )
    # Gate and up projections are interleaved along the last dimension.
    gate, up = preact[..., ::2], preact[..., 1::2]
    postact = gated_to_pytorch_fn_map[activation](gate, up)
    return (preact.to(out_dtype) if store_preact else None), postact.to(postact_dtype)
927
+
928
+
929
def gemm_dgated_ref(
    A: Tensor,  # (M, K) or (L, M, K) or (total_M, K) if varlen_m or (M, total_K) if varlen_k or (whatever, K) if gather_A
    B: Tensor,  # (K, N) or (L, K, N) or (total_K, N) if varlen_k
    PreAct: Tensor,  # (M, 2*N) or (L, M, 2*N) or (total_M, 2*N) if varlen_m
    activation: Literal["glu", "swiglu", "swiglu_oai", "reglu", "geglu"],
    cu_seqlens_m: Optional[Tensor] = None,
    A_idx: Optional[Tensor] = None,  # (total_M,) if gather_A with varlen_m
    out_dtype: Optional[torch.dtype] = None,
    postact_dtype: Optional[torch.dtype] = None,
) -> Tuple[Tensor, Tensor]:
    """Reference implementation for GEMM with gated activation gradient.

    Args:
        A: (M, K) - dout input tensor
        B: (K, N) - weight tensor
        PreAct: (M, 2*N) - pre-activation tensor with gate and up projections interleaved
        activation: Type of gated activation
        out_dtype: Output dtype for dx
        postact_dtype: Output dtype for postact

    Returns:
        (dx, postact) where:
        - dx: (M, 2*N) gradient w.r.t. PreAct
        - postact: (M, N) post-activation output
    """
    if out_dtype is None:
        out_dtype = A.dtype
    if postact_dtype is None:
        postact_dtype = PreAct.dtype
    dout = gemm_ref(A, B, cu_seqlens_m=cu_seqlens_m, A_idx=A_idx).to(out_dtype)
    # Gate and up projections are interleaved along the last dimension.
    gate, up = PreAct[..., ::2], PreAct[..., 1::2]
    # Differentiate through the gated activation via autograd, restoring the
    # original requires_grad flags afterwards.
    saved_gate_flag, saved_up_flag = gate.requires_grad, up.requires_grad
    gate.requires_grad_(True)
    up.requires_grad_(True)
    postact = gated_to_pytorch_fn_map[activation](gate, up)
    dgate, dup = torch.autograd.grad(postact, [gate, up], dout, create_graph=False)
    gate.requires_grad_(saved_gate_flag)
    up.requires_grad_(saved_up_flag)
    # Re-interleave the two gradients back into PreAct's layout.
    dx = torch.stack([dgate, dup], dim=-1).reshape(PreAct.shape)
    return dx.to(out_dtype), postact.to(postact_dtype)
971
+
972
+
973
@torch.library.custom_op(
    add_quack_op_namespace_prefix("gemm_symmetric_out"),
    mutates_args=("out",),
    device_types="cuda",
    schema="(Tensor A, Tensor B, Tensor(a2!) out, Tensor? C=None, bool dynamic_scheduler=False, float alpha=1.0, float beta=1.0) -> ()",
)
def gemm_symmetric_out(
    A: Tensor,  # (M, K) or (L, M, K)
    B: Tensor,  # (K, M) or (L, K, M)
    out: Tensor,  # (M, M) or (L, M, M)
    C: Optional[Tensor] = None,  # (M, M) or (L, M, M)
    dynamic_scheduler: bool = False,
    alpha: float = 1.0,
    beta: float = 1.0,
) -> None:
    """GEMM with guaranteed symmetric output, writing into `out`.

    Normalizes all operands to batched (L, ...) layouts and dispatches to the
    SM90/SM100 symmetric-GEMM kernel with a fixed tile configuration.
    """
    # Normalize every operand to a 3D (L, ...) layout expected by the kernel.
    if A.ndim == 2:
        A = A.unsqueeze(0)  # (1, M, K)
    B = B.mT  # view B as (M, K) or (L, M, K)
    if B.ndim == 2:
        B = B.unsqueeze(0)  # (1, M, K)
    if C is not None and C.ndim == 2:
        C = C.unsqueeze(0)  # (1, M, M)
    if out.ndim == 2:
        out = out.unsqueeze(0)
    tile_count_semaphore = (
        torch.zeros(1, dtype=torch.int32, device=A.device) if dynamic_scheduler else None
    )
    # Cleaned up: `out` is a required Tensor and `C` is already Optional, so the
    # former `x if x is not None else None` wrappers were no-ops.
    gemm_symmetric_sm90_sm100(
        A,
        B,
        out,
        C,
        tile_count_semaphore,
        tile_M=128,
        tile_N=256,
        cluster_M=2,
        cluster_N=1,
        pingpong=False,
        persistent=True,
        max_swizzle_size=8,
        alpha=alpha,
        beta=beta,
    )
1019
+
1020
+
1021
def gemm_symmetric(
    A: Tensor,  # (M, K) or (L, M, K)
    B: Tensor,  # (K, M) or (L, K, M)
    C: Optional[Tensor] = None,  # (M, M) or (L, M, M)
    out: Optional[Tensor] = None,  # (M, M) or (L, M, M)
    out_dtype: Optional[torch.dtype] = None,
    dynamic_scheduler: bool = False,
    alpha: float | Tensor = 1.0,
    beta: float | Tensor = 1.0,
) -> Tensor:
    """GEMM with symmetric output: out = alpha * A @ B + beta * C.

    Returns the (M, M) or (L, M, M) output tensor (allocated when `out` is
    not provided). Fixed return annotation: this function returns a single
    Tensor, not a tuple.

    NOTE: tensor-valued alpha/beta are accepted for interface compatibility
    but are not forwarded to the underlying op; they fall back to 1.0.
    """
    out_dtype = A.dtype if out_dtype is None else out_dtype
    # Output is (M, M) / (L, M, M); B.shape[-1] == M for a symmetric product.
    if A.ndim == 2:
        out_shape = (A.shape[0], B.shape[-1])
    else:
        out_shape = (A.shape[0], A.shape[-2], B.shape[-1])
    if out is None:
        out = torch.empty(out_shape, dtype=out_dtype, device=A.device)

    alpha_val = alpha if isinstance(alpha, float) else 1.0
    beta_val = beta if isinstance(beta, float) else 1.0

    gemm_symmetric_out(
        A, B, out, C, dynamic_scheduler=dynamic_scheduler, alpha=alpha_val, beta=beta_val
    )
    return out
1048
+
1049
+
1050
+ # TODO: this is not quite right, do we need to register gemm_add not gemm_add_out?
1051
+ # try:
1052
+ # from torch._inductor.fx_passes.reinplace import InplaceableOp
1053
+ # torch._inductor.fx_passes.reinplace.inplaceable_ops.update({
1054
+ # torch.ops.quack.gemm_add_out.default:
1055
+ # InplaceableOp(torch.ops.quack.gemm_add_inplace.default, mutated_arg=2)
1056
+ # })
1057
+ # except ImportError:
1058
+ # pass
build/torch-cuda/quack/gemm_sm100.py ADDED
The diff for this file is too large to render. See raw diff
 
build/torch-cuda/quack/gemm_sm90.py ADDED
@@ -0,0 +1,2070 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Based on the cute-dsl example:
2
+ # https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/hopper/dense_gemm.py
3
+
4
+ import enum
5
+ from typing import Tuple, Type, Callable, Optional, Union, Literal
6
+ from functools import partial
7
+ import math
8
+
9
+
10
+ import cuda.bindings.driver as cuda
11
+
12
+ import cutlass
13
+ import cutlass.cute as cute
14
+ import cutlass.pipeline as pipeline
15
+ from cutlass.cute.nvgpu import cpasync, warp, warpgroup
16
+ import cutlass.utils.hopper_helpers as sm90_utils
17
+ from cutlass import Int32, Float32, Float16, Boolean, const_expr
18
+ from cutlass.cutlass_dsl import if_generate
19
+ from cutlass.utils import LayoutEnum
20
+
21
+
22
+ from .cute_dsl_utils import ParamsBase, ArgumentsBase
23
+ from .tile_scheduler import (
24
+ TileSchedulerOptions,
25
+ TileSchedulerArguments,
26
+ TileScheduler,
27
+ VarlenMTileSchedulerArguments,
28
+ VarlenMTileScheduler,
29
+ )
30
+ from .varlen_utils import VarlenArguments, VarlenManager
31
+
32
+ # return PipelineStateWAdvance instead of PipelineState
33
+ from .pipeline import make_pipeline_state, PipelineTmaCpAsync
34
+ from . import copy_utils as copy_utils
35
+ from . import sm90_utils as quack_sm90_utils
36
+
37
+ """
38
+ A high-performance batched dense GEMM (C = A * B) example for the NVIDIA Hopper architecture
39
+ using CUTE DSL.
40
+ - Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M")
41
+ - Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K")
42
+ - Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M")
43
+
44
+ This GEMM kernel supports the following features:
45
+ - Utilizes Tensor Memory Access (TMA) for efficient memory operations
46
+ - Utilizes Hopper's WGMMA for matrix multiply-accumulate (MMA) operations
47
+ - Implements TMA multicast with cluster to reduce L2 memory traffic
48
+ - Supports multi-stage pipeline to overlap computation and memory access
49
+
50
+ This GEMM works as follows:
51
+ 1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using TMA operations.
52
+ 2. Perform matrix multiply-accumulate (MMA) operations using WGMMA instruction.
53
+ 3. Store results from registers (RMEM) to shared memory (SMEM), then to global memory (GMEM) with TMA operations.
54
+
55
+ Hopper WGMMA instructions operate as follows:
56
+ - Read matrix A from SMEM
57
+ - Read matrix B from SMEM
58
+ - Perform MMA operation and store the result in Accumulator(register)
59
+
60
+ Constraints:
61
+ * Supported input data types: fp16, fp8 (e4m3fn, e5m2)
62
+ * For fp16 types, A and B must have the same data type
63
+ * For fp8 types, A and B can have different types (e4m3fn or e5m2) but both must be 8-bit
64
+ * Fp8 types only support k-major layout
65
+ * Only fp32 accumulation is supported in this example
66
+ * CTA tile shape M must be 64/128
67
+ * CTA tile shape N must be 64/128/256
68
+ * CTA tile shape K must be 64
69
+ * Cluster shape M/N must be positive and power of 2, total cluster size <= 4
70
+ * The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned,
71
+       i.e., the number of elements must be a multiple of 8 for Float16 and 16 for Float8, respectively.
72
+ """
73
+
74
+
75
class NamedBarrierGemm(enum.IntEnum):
    """Named-barrier IDs for intra-CTA synchronization in the GEMM kernel.

    Member order matters: ``enum.auto()`` assigns consecutive values starting
    at 1, and barrier 0 is reserved for ``sync_threads()``. Do not reorder.
    """

    Epilogue = enum.auto()  # starts from 1 as barrier 0 is reserved for sync_threads()
    # For mainloop load warps to signal that the epilogue load warp can start.
    # This is to avoid loading C too early, interfering with loading A and B.
    EpilogueLoad = enum.auto()
    # Per-warp-group handoff barriers; presumably used by the pingpong scheme's
    # "mma"/"epi" stages (see pingpong_barrier_arrive/sync calls) — confirm.
    MmaWG0 = enum.auto()
    MmaWG1 = enum.auto()
    EpiWG0 = enum.auto()
    EpiWG1 = enum.auto()
    # NOTE(review): looks like a tensor-memory pointer broadcast barrier for a
    # newer-arch path; not referenced in the SM90 code visible here — confirm.
    TmemPtr = enum.auto()
+
86
+
87
+ class GemmSm90:
88
+ """
89
+ This class implements batched matrix multiplication (C = A x B) with support for various data types
90
+ and architectural features specific to Hopper GPUs with persistent tile scheduling and warp specialization.
91
+
92
+ :param acc_dtype: Data type for accumulation during computation
93
+ :type acc_dtype: type[cutlass.Numeric]
94
+ :param tile_shape_mn: Shape of the CTA tile (M,N)
95
+     :type tile_shape_mn: Tuple[int, int]
96
+ :param cluster_shape_mnk: Cluster dimensions (M,N,K) for parallel processing
97
+ :type cluster_shape_mnk: Tuple[int, int, int]
98
+
99
+ :note: Data type requirements:
100
+ - For 16-bit types: A and B must have the same data type
101
+ - For 8-bit types: A and B can have different types (Float8E4M3FN/Float8E5M2) as long as both are 8-bit
102
+ - Float8 types only support k-major layout
103
+
104
+ :note: Supported data types:
105
+ - Float16
106
+ - BFloat16
107
+ - Float8E4M3FN/Float8E5M2
108
+
109
+ :note: Supported accumulation types:
110
+ - Float32 (for all floating point inputs)
111
+
112
+ :note: Constraints:
113
+ - Cluster shape M/N must be positive and power of 2, total cluster size <= 4
114
+
115
+ Example:
116
+ >>> gemm = GemmSm90(
117
+ ... acc_dtype=Float32,
118
+ ... tile_shape_mn=(128, 256),
119
+ ... cluster_shape_mnk=(1, 1, 1)
120
+ ... )
121
+ >>> gemm(a_tensor, b_tensor, c_tensor, stream)
122
+ """
123
+
124
+ arch = 90
125
+ num_epi_tensormaps: int = 0
126
+
127
+ EpilogueArguments = ArgumentsBase
128
+ EpilogueParams = ParamsBase
129
+
130
+ def __init__(
131
+ self,
132
+ acc_dtype: Type[cutlass.Numeric],
133
+ a_dtype: Type[cutlass.Numeric],
134
+ tile_shape_mn: Tuple[int, int],
135
+ cluster_shape_mnk: Tuple[int, int, int],
136
+ pingpong: bool = False,
137
+ is_persistent: bool = True,
138
+ fp8_fast_accum: bool = False,
139
+ gather_A: bool = False,
140
+ ):
141
+ """
142
+ Initializes the configuration for a Hopper dense GEMM kernel.
143
+
144
+ This configuration includes data types for operands, tile shape, cluster configuration,
145
+ and thread layout.
146
+
147
+ :param acc_dtype: Data type for accumulation during computation
148
+ :type acc_dtype: type[cutlass.Numeric]
149
+ :param tile_shape_mn: Shape of the CTA tile (M,N)
150
+ :type tile_shape_mn: Tuple[int, int]
151
+ :param cluster_shape_mnk: Cluster dimensions (M,N,K) for parallel processing
152
+ :type cluster_shape_mnk: Tuple[int, int, int]
153
+ """
154
+
155
+ self.acc_dtype = acc_dtype
156
+ self.pingpong = pingpong
157
+ self.is_persistent = is_persistent
158
+ if self.pingpong:
159
+ assert self.is_persistent, "Pingpong gemm requires persistent scheduler"
160
+ self.fp8_slow_accum = not fp8_fast_accum and a_dtype.width == 8
161
+ self.gather_A = gather_A
162
+ if gather_A:
163
+ assert cluster_shape_mnk[1] == 1, "Cluster shape N must be 1 for gather A "
164
+
165
+ self.cluster_shape_mnk = cluster_shape_mnk
166
+ # K dimension is deferred in _setup_attributes
167
+ self.cta_tile_shape_mnk = (*tile_shape_mn, 1)
168
+ tile_M, tile_N = self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[1]
169
+ # check the cta tile shape
170
+ if not self.pingpong:
171
+ if tile_M not in [64, 128, 192, 256, 320]:
172
+ raise ValueError("CTA tile shape M must be 64/128/192/256/320")
173
+ if tile_M in [192, 320]: # special case
174
+ tile_N_max = 256 if tile_M == 192 else 160
175
+ if not (tile_N % 32 == 0 and tile_N <= tile_N_max):
176
+ raise ValueError(
177
+ f"If tile_m == {tile_M}, CTA tile shape N must be divisible by 32 and <= {tile_N_max}"
178
+ )
179
+ else:
180
+ if not (
181
+ (tile_N % 16 == 0 and tile_N <= 256) or (tile_N % 32 == 0 and tile_N <= 512)
182
+ ):
183
+ raise ValueError(
184
+ "CTA tile shape N must be divisible by 16 and <= 256, or divisible by 32 and <= 512"
185
+ )
186
+ else:
187
+ if tile_M not in [64, 128, 192]:
188
+ raise ValueError("CTA tile shape M must be 64/128/192 if pingpong")
189
+ tile_N_max = 256 if tile_M == 64 else (208 if tile_M == 128 else 128)
190
+ if not (tile_N % 16 == 0 and tile_N <= tile_N_max):
191
+ raise ValueError(f"CTA tile shape N must be divisible by 16 and <= {tile_N_max}")
192
+
193
+ if not self.pingpong:
194
+ if tile_M == 320: # tile_M / 64 is not even so we have to split along N
195
+ atom_layout_m, atom_layout_n = 1, 2
196
+ elif tile_M == 192:
197
+ if tile_N <= 128:
198
+ atom_layout_m, atom_layout_n = 3, 1
199
+ else:
200
+ atom_layout_m, atom_layout_n = 1, 2
201
+ else:
202
+ atom_layout_m = (
203
+ self.cta_tile_shape_mnk[0] // 64 if self.cta_tile_shape_mnk[0] < 256 else 2
204
+ )
205
+ atom_layout_n = 1
206
+ assert atom_layout_m in [1, 2, 3] and atom_layout_n in [1, 2]
207
+ else:
208
+ atom_layout_m, atom_layout_n = 1, 1
209
+ self.atom_layout_mnk = (atom_layout_m, atom_layout_n, 1)
210
+
211
+ self.num_mcast_ctas_a = self.cluster_shape_mnk[1]
212
+ if self.gather_A:
213
+ assert self.num_mcast_ctas_a == 1
214
+ self.num_mcast_ctas_b = self.cluster_shape_mnk[0]
215
+ self.is_a_mcast = self.num_mcast_ctas_a > 1
216
+ self.is_b_mcast = self.num_mcast_ctas_b > 1
217
+
218
+ self.occupancy = 1
219
+ self.mma_warp_groups = math.prod(self.atom_layout_mnk) * (1 if not self.pingpong else 2)
220
+ if self.pingpong:
221
+ assert self.mma_warp_groups == 2
222
+ assert self.mma_warp_groups in [1, 2, 3]
223
+ self.num_threads_per_warp_group = 128
224
+ self.threads_per_cta = (self.mma_warp_groups + 1) * self.num_threads_per_warp_group
225
+ self.smem_capacity = cutlass.utils.get_smem_capacity_in_bytes("sm_90")
226
+ self.num_epi_warps = (self.mma_warp_groups if not self.pingpong else 1) * 4
227
+ self.num_ab_load_warps = 1 if not self.gather_A else 4
228
+ self.ab_load_warp_id = self.mma_warp_groups * 4
229
+ # self.num_epi_load_threads = cute.arch.WARP_SIZE * 1
230
+ # self.epi_load_warp_id = self.ab_load_warp_id + self.num_ab_load_warps
231
+
232
+ regs_per_thread = math.prod(self.cta_tile_shape_mnk[:2]) // (
233
+ math.prod(self.atom_layout_mnk) * self.num_threads_per_warp_group
234
+ )
235
+ if self.fp8_slow_accum:
236
+ regs_per_thread *= 2
237
+ if not self.gather_A:
238
+ if self.mma_warp_groups == 3:
239
+ self.num_regs_load, self.num_regs_mma = 32, 160
240
+ else:
241
+ heavy_register_pressure = regs_per_thread >= 208
242
+ self.num_regs_load, self.num_regs_mma = (
243
+ (40, 232) if not heavy_register_pressure else (24, 240)
244
+ )
245
+ else:
246
+ if self.mma_warp_groups == 3:
247
+ self.num_regs_load, self.num_regs_mma = 56, 152
248
+ else:
249
+ self.num_regs_load, self.num_regs_mma = (56, 224)
250
+
251
+ self.ab_stage = None
252
+ self.epi_stage = None
253
+
254
+ self.a_smem_layout_staged = None
255
+ self.b_smem_layout_staged = None
256
+ self.epi_smem_layout_staged = None
257
+ self.epi_tile = None
258
+
259
+ self.shared_storage = None
260
+ self.buffer_align_bytes = 1024
261
+
262
    def _setup_attributes(self, epilogue_args: EpilogueArguments):
        """Set up configurations that are dependent on GEMM inputs.

        Must run after the per-call dtype/layout attributes (``self.a_dtype``,
        ``self.a_layout``, ``self.d_dtype``, ...) have been assigned (done in
        ``__call__``), since everything below is derived from them:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes (fills in the deferred K extent of
          ``self.cta_tile_shape_mnk``)
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout

        :param epilogue_args: epilogue arguments; passed to ``_compute_stages``
            so the epilogue's smem footprint is accounted for.
        :type epilogue_args: EpilogueArguments
        """

        self.tiled_mma = sm90_utils.make_trivial_tiled_mma(
            self.a_dtype,
            self.b_dtype,
            self.a_layout.sm90_mma_major_mode(),
            self.b_layout.sm90_mma_major_mode(),
            self.acc_dtype,
            self.atom_layout_mnk,
            tiler_mn=(64, self.cta_tile_shape_mnk[1] // self.atom_layout_mnk[1]),
        )
        if const_expr(self.atom_layout_mnk[1] > 1):
            # If N dimension is split among 2 WGs, we need to permute the N dimension so
            # that in the epilogue, WG0 and WG1 can write to epi smem of size e.g. (64, 32)
            # containing accumulators that are next to each other in the N dimension.
            # Without permutation WG0 would write to epi smem of size (64, 16) and
            # WG1 would write to a separate epi smem of size (64, 16) that's far away.
            atom_n = self.atom_layout_mnk[1]
            permutation_n = cute.make_ordered_layout(
                (8, self.cta_tile_shape_mnk[1] // atom_n // 8, atom_n), order=(0, 2, 1)
            )
            self.tiled_mma = cute.make_tiled_mma(
                cute.make_mma_atom(self.tiled_mma.op),
                self.atom_layout_mnk,
                permutation_mnk=(None, permutation_n, None),
            )
        # CTA tile K = 4 MMA instructions' worth of K (replaces the placeholder
        # K=1 set in __init__).
        mma_inst_shape_k = cute.size(self.tiled_mma.shape_mnk, mode=[2])
        mma_inst_tile_k = 4
        self.cta_tile_shape_mnk = (
            self.cta_tile_shape_mnk[0],
            self.cta_tile_shape_mnk[1],
            mma_inst_shape_k * mma_inst_tile_k,
        )

        self.cluster_layout_mnk = cute.make_layout(self.cluster_shape_mnk)

        self.epi_tile = self._sm90_compute_tile_shape_or_override(
            self.cta_tile_shape_mnk,
            self.atom_layout_mnk,
            self.d_dtype,
        )

        # Compute stage before compute smem layout
        self.ab_stage, self.epi_stage, self.epi_c_stage = self._compute_stages(
            self.cta_tile_shape_mnk,
            self.epi_tile,
            self.a_dtype,
            self.b_dtype,
            self.d_dtype,
            self.c_dtype,
            epilogue_args,
            cutlass.utils.get_smem_capacity_in_bytes(f"sm_{self.arch}"),  # smem_capacity
            self.occupancy,
            # epi_smem will reuse smem ab if not persistent.
            overlap_sD_sA=not self.is_persistent,
        )
        # Pingpong needs one scheduler pipeline stage per math warp group.
        self.sched_stage = 2 if self.pingpong else 1

        (
            self.a_smem_layout_staged,
            self.b_smem_layout_staged,
            self.epi_smem_layout_staged,
            self.epi_c_smem_layout_staged,
        ) = self._make_smem_layouts(
            self.cta_tile_shape_mnk,
            self.epi_tile,
            self.a_dtype,
            self.a_layout,
            self.b_dtype,
            self.b_layout,
            self.ab_stage,
            self.d_dtype,
            self.d_layout,
            self.epi_stage,
            self.c_dtype,
            self.c_layout,
            self.epi_c_stage,
        )
+
353
    @cute.jit
    def __call__(
        self,
        mA: cute.Tensor,
        mB: cute.Tensor,
        mD: Optional[cute.Tensor],
        mC: Optional[cute.Tensor],
        epilogue_args: ArgumentsBase,
        scheduler_args: TileSchedulerOptions,
        varlen_args: Optional[VarlenArguments],
        stream: cuda.CUstream,
    ):
        """Execute the GEMM operation in steps:
        - Setup static attributes (dtypes/layouts read from the tensors)
        - Setup TMA load/store atoms and tensors
        - Compute grid size
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param mA: Input tensor A
        :type mA: cute.Tensor
        :param mB: Input tensor B
        :type mB: cute.Tensor
        :param mD: Output tensor D, or None if no D output is produced
        :type mD: Optional[cute.Tensor]
        :param mC: Optional source operand C loaded in the epilogue, or None
        :type mC: Optional[cute.Tensor]
        :param epilogue_args: epilogue-specific arguments (converted to params
            via ``epi_to_underlying_arguments``)
        :type epilogue_args: ArgumentsBase
        :param scheduler_args: tile scheduler options (e.g. max active clusters)
        :type scheduler_args: TileSchedulerOptions
        :param varlen_args: variable-length / gather arguments; None means a
            default (non-varlen) ``VarlenArguments()``
        :type varlen_args: Optional[VarlenArguments]
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :raises TypeError: if A/B dtypes are inconsistent or unsupported
        """

        # setup static attributes before smem/grid/tma computation
        self.a_dtype = mA.element_type
        self.b_dtype = mB.element_type
        self.d_dtype = mD.element_type if mD is not None else None
        self.c_dtype = mC.element_type if mC is not None else None
        self.a_layout = LayoutEnum.from_tensor(mA)
        self.b_layout = LayoutEnum.from_tensor(mB)
        self.d_layout = LayoutEnum.from_tensor(mD) if mD is not None else None
        self.c_layout = LayoutEnum.from_tensor(mC) if mC is not None else None

        # Dtype constraints: 16-bit A/B must match exactly; 8-bit A/B may mix
        # fp8 flavors but must have equal width; nothing else is supported.
        if const_expr(self.a_dtype.width == 16 and self.a_dtype != self.b_dtype):
            raise TypeError(f"Type mismatch: {self.a_dtype} != {self.b_dtype}")
        if const_expr(self.a_dtype.width != self.b_dtype.width):
            raise TypeError(f"Type width mismatch: {self.a_dtype.width} != {self.b_dtype.width}")
        if const_expr(self.a_dtype.width != 16 and self.a_dtype.width != 8):
            raise TypeError("a_dtype should be float16 or float8")

        if const_expr(varlen_args is None):
            varlen_args = VarlenArguments()
        # Gather-A mode requires (and non-gather mode forbids) an A index tensor.
        assert (varlen_args.mAIdx is not None) == self.gather_A

        # Assume all strides are divisible by 128 bits except the last stride
        new_stride = lambda t: tuple(
            cute.assume(s, divby=128 // t.element_type.width) if not cute.is_static(s) else s
            for s in t.stride
        )
        mA, mD = [
            cute.make_tensor(t.iterator, cute.make_layout(t.shape, stride=new_stride(t)))
            if t is not None
            else None
            for t in (mA, mD)
        ]

        # Derive tiled MMA, stage counts, and smem layouts from the dtypes/layouts above.
        self._setup_attributes(epilogue_args)

        a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, 0))
        b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, 0))
        tma_atom_a, tma_tensor_a = None, None
        # In gather-A mode, A is loaded with cp.async instead of TMA.
        if const_expr(not self.gather_A):
            tma_atom_a, tma_tensor_a = self._make_tma_atoms_and_tensors(
                mA,
                a_smem_layout,
                (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]),
                self.cluster_shape_mnk[1],
            )
        tma_atom_b, tma_tensor_b = self._make_tma_atoms_and_tensors(
            mB,
            b_smem_layout,
            (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]),
            self.cluster_shape_mnk[0],
        )

        # Bytes per k-tile tracked by the AB pipeline's TMA transaction barrier.
        self.num_tma_load_bytes = cute.size_in_bytes(self.b_dtype, b_smem_layout)
        if const_expr(not self.gather_A):
            self.num_tma_load_bytes += cute.size_in_bytes(self.a_dtype, a_smem_layout)

        tma_atom_d, tma_tensor_d = None, None
        if const_expr(mD is not None):
            tma_atom_d, tma_tensor_d = self._make_tma_epi_atoms_and_tensors(
                mD,
                self.epi_smem_layout_staged,
                self.epi_tile,
                # "add" makes the TMA store accumulate into D (D += result)
                # when the epilogue requests add_to_output.
                op_type="store"
                if not (hasattr(epilogue_args, "add_to_output") and epilogue_args.add_to_output)
                else "add",
            )
        tma_atom_c, tma_tensor_c = None, None
        if const_expr(mC is not None):
            tma_atom_c, tma_tensor_c = self._make_tma_epi_atoms_and_tensors(
                mC, self.epi_c_smem_layout_staged, self.epi_tile, op_type="load"
            )

        epilogue_params = self.epi_to_underlying_arguments(epilogue_args)
        varlen_params = VarlenManager.to_underlying_arguments(varlen_args)

        TileSchedulerCls = self.get_scheduler_class(varlen_m=varlen_args.mCuSeqlensM is not None)
        tile_sched_args = self.get_scheduler_arguments(mA, mB, mD, scheduler_args, varlen_args)
        tile_sched_params = TileSchedulerCls.to_underlying_arguments(tile_sched_args)
        grid = TileSchedulerCls.get_grid_shape(
            tile_sched_params, scheduler_args.max_active_clusters
        )

        # When not persistent, D's epilogue smem aliases sA (see the kernel's
        # recast of sA), so no dedicated sD allocation is needed.
        epi_smem_size = (
            cute.cosize(self.epi_smem_layout_staged) if self.is_persistent and mD is not None else 0
        )
        epi_c_smem_size = cute.cosize(self.epi_c_smem_layout_staged) if mC is not None else 0

        # Per-CTA shared memory: pipeline mbarriers, scheduler tile counts,
        # epilogue buffers, then the A/B mainloop buffers.
        @cute.struct
        class SharedStorage:
            ab_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.ab_stage * 2]
            epi_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_c_stage * 2]
            sched_pipeline_array_ptr: cute.struct.MemRange[cutlass.Int64, self.sched_stage * 2]
            tile_count: cute.struct.MemRange[Int32, self.sched_stage]
            sD: cute.struct.Align[
                cute.struct.MemRange[
                    self.d_dtype if self.d_dtype is not None else Int32, epi_smem_size
                ],
                self.buffer_align_bytes,
            ]
            sC: cute.struct.Align[
                cute.struct.MemRange[
                    self.c_dtype if self.c_dtype is not None else Int32, epi_c_smem_size
                ],
                self.buffer_align_bytes,
            ]
            epi: self.epi_get_smem_struct(epilogue_params)
            sA: cute.struct.Align[
                cute.struct.MemRange[self.a_dtype, cute.cosize(self.a_smem_layout_staged)],
                self.buffer_align_bytes,
            ]
            sB: cute.struct.Align[
                cute.struct.MemRange[self.b_dtype, cute.cosize(self.b_smem_layout_staged)],
                self.buffer_align_bytes,
            ]

        self.shared_storage = SharedStorage

        # Launch the kernel synchronously
        self.kernel(
            self.tiled_mma,
            tma_atom_a,
            tma_tensor_a if const_expr(not self.gather_A) else mA,
            tma_atom_b,
            tma_tensor_b,
            tma_atom_d,
            tma_tensor_d,
            tma_atom_c,
            tma_tensor_c,
            epilogue_params,
            varlen_params,
            self.cluster_layout_mnk,
            self.a_smem_layout_staged,
            self.b_smem_layout_staged,
            self.epi_smem_layout_staged,
            self.epi_c_smem_layout_staged,
            tile_sched_params,
            TileSchedulerCls,
        ).launch(
            grid=grid,
            block=[self.threads_per_cta, 1, 1],
            cluster=self.cluster_shape_mnk,
            stream=stream,
            min_blocks_per_mp=1,
        )
        return
+
528
+ # GPU device kernel
529
+ @cute.kernel
530
+ def kernel(
531
+ self,
532
+ tiled_mma: cute.TiledMma,
533
+ tma_atom_a: Optional[cute.CopyAtom],
534
+ mA_mkl: cute.Tensor,
535
+ tma_atom_b: cute.CopyAtom,
536
+ mB_nkl: cute.Tensor,
537
+ tma_atom_d: Optional[cute.CopyAtom],
538
+ mD_mnl: Optional[cute.Tensor],
539
+ tma_atom_c: Optional[cute.CopyAtom],
540
+ mC_mnl: Optional[cute.Tensor],
541
+ epilogue_params: ParamsBase,
542
+ varlen_params: VarlenManager.Params,
543
+ cluster_layout_mnk: cute.Layout,
544
+ a_smem_layout: cute.ComposedLayout,
545
+ b_smem_layout: cute.ComposedLayout,
546
+ epi_smem_layout: cute.ComposedLayout,
547
+ epi_c_smem_layout: cute.ComposedLayout,
548
+ tile_sched_params: ParamsBase,
549
+ TileSchedulerCls: cutlass.Constexpr[Callable],
550
+ ):
551
+ """
552
+ GPU device kernel performing the batched GEMM computation.
553
+
554
+ :param tma_atom_a: TMA copy atom for A tensor
555
+ :type tma_atom_a: cute.CopyAtom
556
+ :param mA_mkl: Input tensor A
557
+ :type mA_mkl: cute.Tensor
558
+ :param tma_atom_b: TMA copy atom for B tensor
559
+ :type tma_atom_b: cute.CopyAtom
560
+ :param mB_nkl: Input tensor B
561
+ :type mB_nkl: cute.Tensor
562
+ :param tma_atom_d: TMA copy atom for D tensor
563
+ :type tma_atom_d: cute.CopyAtom
564
+ :param mD_mnl: Output tensor D
565
+ :type mD_mnl: cute.Tensor
566
+ :param tiled_mma: Tiled MMA object
567
+ :type tiled_mma: cute.TiledMma
568
+ :param cluster_layout_mnk: CTA layout
569
+ :type cluster_layout_mnk: cute.Layout
570
+ :param a_smem_layout: Shared memory layout for A
571
+ :type a_smem_layout: cute.ComposedLayout
572
+ :param b_smem_layout: Shared memory layout for B
573
+ :type b_smem_layout: cute.ComposedLayout
574
+ :param epi_smem_layout: Shared memory layout for epilogue
575
+ :type epi_smem_layout: cute.ComposedLayout
576
+ """
577
+
578
+ varlen_m = const_expr(varlen_params.cu_seqlens_m is not None)
579
+ varlen_k = const_expr(varlen_params.cu_seqlens_k is not None)
580
+ assert not (varlen_m and varlen_k)
581
+ if const_expr(self.gather_A):
582
+ assert varlen_m or varlen_k
583
+ has_D = const_expr(mD_mnl is not None)
584
+ has_C = const_expr(mC_mnl is not None)
585
+
586
+ warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
587
+
588
+ # /////////////////////////////////////////////////////////////////////////////
589
+ # Prefetch Tma desc
590
+ # /////////////////////////////////////////////////////////////////////////////
591
+ if warp_idx == self.ab_load_warp_id:
592
+ for tma_atom in (tma_atom_a, tma_atom_b, tma_atom_d, tma_atom_c):
593
+ if const_expr(tma_atom is not None):
594
+ cpasync.prefetch_descriptor(tma_atom)
595
+
596
+ # /////////////////////////////////////////////////////////////////////////////
597
+ # Alloc and init AB full/empty + ACC full mbar (pipeline)
598
+ # /////////////////////////////////////////////////////////////////////////////
599
+ smem = cutlass.utils.SmemAllocator()
600
+ storage = smem.allocate(self.shared_storage)
601
+
602
+ ab_pipeline = self.make_ab_pipeline(
603
+ tiled_mma=tiled_mma,
604
+ cluster_layout_vmnk=cute.make_layout((1, *cluster_layout_mnk.shape)),
605
+ ab_pipeline_mbar_ptr=storage.ab_pipeline_array_ptr.data_ptr(),
606
+ )
607
+ epi_pipeline = None
608
+ if const_expr(has_C):
609
+ epi_pipeline = self.make_epi_pipeline(
610
+ c_smem_layout=cute.slice_(epi_c_smem_layout, (None, None, 0)),
611
+ epi_pipeline_mbar_ptr=storage.epi_pipeline_array_ptr.data_ptr(),
612
+ )
613
+ sched_pipeline = None
614
+ tile_count = None
615
+ if const_expr(tile_sched_params.tile_count_semaphore is not None):
616
+ # Dynamic persistent scheduler
617
+ sched_pipeline = self.make_sched_pipeline(
618
+ cluster_layout_mnk,
619
+ sched_pipeline_mbar_ptr=storage.sched_pipeline_array_ptr.data_ptr(),
620
+ varlen_k=varlen_k,
621
+ )
622
+ tile_count = storage.tile_count.get_tensor((self.sched_stage,))
623
+
624
+ # ///////////////////////////////////////////////////////////////////////////////
625
+ # Generate smem tensor A/B
626
+ # ///////////////////////////////////////////////////////////////////////////////
627
+ sA = storage.sA.get_tensor(a_smem_layout.outer, swizzle=a_smem_layout.inner)
628
+ sB = storage.sB.get_tensor(b_smem_layout.outer, swizzle=b_smem_layout.inner)
629
+ sD = None
630
+ if const_expr(has_D):
631
+ if const_expr(not self.is_persistent):
632
+ sD_ptr = cute.recast_ptr(sA.iterator, epi_smem_layout.inner, dtype=self.d_dtype)
633
+ sD = cute.make_tensor(sD_ptr, epi_smem_layout.outer)
634
+ else:
635
+ sD = storage.sD.get_tensor(epi_smem_layout.outer, swizzle=epi_smem_layout.inner)
636
+ sC = None
637
+ if const_expr(has_C):
638
+ sC = storage.sC.get_tensor(epi_c_smem_layout.outer, swizzle=epi_c_smem_layout.inner)
639
+ epi_smem_tensors = self.epi_get_smem_tensors(epilogue_params, storage)
640
+
641
+ varlen_manager = VarlenManager.create(
642
+ varlen_params,
643
+ has_D,
644
+ self.num_epi_tensormaps,
645
+ # Only used if not varlen_m
646
+ len_m_static=Int32(
647
+ mA_mkl.shape[0]
648
+ if varlen_k or varlen_params.mAIdx is None
649
+ else varlen_params.mAIdx.shape[0]
650
+ ),
651
+ len_k_static=Int32(mA_mkl.shape[1]),
652
+ pingpong=self.pingpong,
653
+ warp_idx=warp_idx,
654
+ )
655
+
656
+ TileSchedulerCls = partial(
657
+ TileSchedulerCls.create, tile_sched_params, tile_count, sched_pipeline
658
+ )
659
+
660
+ if warp_idx >= self.ab_load_warp_id:
661
+ cute.arch.warpgroup_reg_dealloc(self.num_regs_load)
662
+ if (
663
+ warp_idx >= self.ab_load_warp_id
664
+ and warp_idx < self.ab_load_warp_id + self.num_ab_load_warps
665
+ ):
666
+ is_tma_warp = self.num_ab_load_warps == 1 or warp_idx == self.ab_load_warp_id
667
+ # initialize tensormap for A & B
668
+ varlen_manager.init_tensormap_AB(tma_atom_a, tma_atom_b, is_tma_warp)
669
+ tma_desc_a_ptr = varlen_manager.get_tma_desc_a_ptr()
670
+ tma_desc_b_ptr = varlen_manager.get_tma_desc_b_ptr()
671
+ # ///////////////////////////////////////////////////////////////////////////////
672
+ # Get mcast mask
673
+ # ///////////////////////////////////////////////////////////////////////////////
674
+ cta_rank_in_cluster = cute.arch.make_warp_uniform(cute.arch.block_idx_in_cluster())
675
+ block_in_cluster_coord_mnk = cluster_layout_mnk.get_flat_coord(cta_rank_in_cluster)
676
+ a_mcast_mask = cute.make_layout_image_mask(
677
+ cluster_layout_mnk, block_in_cluster_coord_mnk, mode=1
678
+ )
679
+ b_mcast_mask = cute.make_layout_image_mask(
680
+ cluster_layout_mnk, block_in_cluster_coord_mnk, mode=0
681
+ )
682
+ a_mcast_mask = a_mcast_mask if self.is_a_mcast else 0
683
+ b_mcast_mask = b_mcast_mask if self.is_b_mcast else 0
684
+
685
+ # Persistent tile scheduling loop
686
+ is_scheduler_warp = self.num_ab_load_warps == 1 or warp_idx == self.ab_load_warp_id
687
+ if const_expr(cute.size(cluster_layout_mnk) > 1):
688
+ is_scheduler_warp = is_scheduler_warp and cute.arch.block_idx_in_cluster() == 0
689
+ tile_scheduler = TileSchedulerCls(is_scheduler_warp=is_scheduler_warp)
690
+ work_tile = tile_scheduler.initial_work_tile_info()
691
+ ab_producer_state = make_pipeline_state(
692
+ pipeline.PipelineUserType.Producer, self.ab_stage
693
+ )
694
+ if const_expr(varlen_k):
695
+ # wait tensormap initialization complete before update
696
+ varlen_manager.fence_tensormap_init()
697
+ while work_tile.is_valid_tile:
698
+ tile_coord_mnkl = work_tile.tile_idx
699
+ batch_idx = tile_coord_mnkl[3]
700
+ varlen_manager.update_tensormap_AB(
701
+ batch_idx,
702
+ self.a_layout,
703
+ self.b_layout,
704
+ is_tma_warp,
705
+ )
706
+ # ///////////////////////////////////////////////////////////////////////////
707
+ # Local_tile partition global tensors
708
+ # ///////////////////////////////////////////////////////////////////////////
709
+ if const_expr(not self.gather_A):
710
+ mA_mk = varlen_manager.offset_batch_A(mA_mkl, batch_idx)
711
+ # (bM, bK, RestK)
712
+ gA_mk = cute.local_tile(
713
+ mA_mk,
714
+ cute.select(self.cta_tile_shape_mnk, [0, 2]),
715
+ (tile_coord_mnkl[0], None),
716
+ )
717
+ else:
718
+ mAIdx_mk = varlen_manager.offset_batch_AIdx(batch_idx)
719
+ if const_expr(varlen_m):
720
+ gAIdx = cute.local_tile(
721
+ mAIdx_mk, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0],)
722
+ )
723
+ # (M, K)
724
+ mA_mk = mA_mkl
725
+ else:
726
+ assert varlen_k
727
+ # (tile_K, RestK)
728
+ gAIdx = cute.flat_divide(mAIdx_mk, (self.cta_tile_shape_mnk[2],))
729
+ # (tile_M, K)
730
+ mA_mk = cute.local_tile(
731
+ mA_mkl, (self.cta_tile_shape_mnk[0],), (tile_coord_mnkl[0], None)
732
+ )
733
+ # (bN, bK, RestK)
734
+ gB_nk = cute.local_tile(
735
+ varlen_manager.offset_batch_B(mB_nkl, batch_idx),
736
+ cute.select(self.cta_tile_shape_mnk, [1, 2]),
737
+ (tile_coord_mnkl[1], None),
738
+ )
739
+ # //////////////////////////////////////////////////////////////////////////
740
+ # Partition shared tensor for TMA load A/B
741
+ # //////////////////////////////////////////////////////////////////////////
742
+ varlen_manager.fence_tensormap_update_AB(is_tma_warp)
743
+ len_m = varlen_manager.len_m(batch_idx)
744
+ len_k = varlen_manager.len_k(batch_idx)
745
+ # TMA load A partition_S/D
746
+ copy_A = None
747
+ if const_expr(not self.gather_A):
748
+ copy_A, _, _ = copy_utils.tma_get_copy_fn(
749
+ tma_atom_a,
750
+ cta_coord=block_in_cluster_coord_mnk[1],
751
+ cta_layout=cute.make_layout(
752
+ cute.slice_(cluster_layout_mnk, (0, None, 0)).shape
753
+ ),
754
+ src_tensor=gA_mk,
755
+ dst_tensor=sA,
756
+ mcast_mask=a_mcast_mask,
757
+ tma_desc_ptr=tma_desc_a_ptr,
758
+ )
759
+ else:
760
+ tiled_copy_A = self._make_gmem_tiled_copy_A(
761
+ mA_mkl.element_type, self.a_layout, self.num_ab_load_warps * 32
762
+ )
763
+ tidx = (
764
+ cute.arch.thread_idx()[0] - cute.arch.WARP_SIZE * self.ab_load_warp_id
765
+ )
766
+ thr_copy_A = tiled_copy_A.get_slice(tidx)
767
+ copy_A, prefetch_A = None, None
768
+ if const_expr(varlen_m):
769
+ copy_A = copy_utils.gather_m_get_copy_fn(
770
+ thr_copy_A,
771
+ mA_mk,
772
+ sA,
773
+ gAIdx,
774
+ limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
775
+ limit_k=len_k,
776
+ )
777
+ else:
778
+ copy_A, prefetch_A = copy_utils.gather_k_get_copy_fn(
779
+ thr_copy_A,
780
+ mA_mk,
781
+ sA,
782
+ gAIdx,
783
+ limit_m=len_m - tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0],
784
+ limit_k=len_k,
785
+ )
786
+ # TMA load B partition_S/D
787
+ copy_B, _, _ = copy_utils.tma_get_copy_fn(
788
+ tma_atom_b,
789
+ cta_coord=block_in_cluster_coord_mnk[0],
790
+ cta_layout=cute.make_layout(
791
+ cute.slice_(cluster_layout_mnk, (None, 0, 0)).shape
792
+ ),
793
+ src_tensor=gB_nk,
794
+ dst_tensor=sB,
795
+ mcast_mask=b_mcast_mask,
796
+ tma_desc_ptr=tma_desc_b_ptr,
797
+ )
798
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
799
+ if const_expr(not self.gather_A):
800
+ ab_producer_state = self.load_AB(
801
+ ab_pipeline, ab_producer_state, copy_A, copy_B, k_tile_cnt
802
+ )
803
+ else:
804
+ ab_producer_state = self.load_AB_gather_A(
805
+ ab_pipeline,
806
+ ab_producer_state,
807
+ copy_A,
808
+ prefetch_A,
809
+ copy_B,
810
+ k_tile_cnt,
811
+ varlen_m=varlen_m,
812
+ )
813
+ tile_scheduler.fetch_next_work(is_scheduler_warp=is_scheduler_warp)
814
+ tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
815
+ work_tile = tile_scheduler.get_current_work()
816
+ # End of persistent scheduler loop
817
+ if const_expr(self.pingpong and not varlen_k):
818
+ # Need to write the tile_idx to smem for the next WG in the pingpong mode
819
+ tile_scheduler.advance_to_next_work(is_scheduler_warp=is_scheduler_warp)
820
+ ab_pipeline.producer_tail(ab_producer_state)
821
+ if is_scheduler_warp:
822
+ tile_scheduler.producer_tail()
823
+
824
+ if warp_idx < self.ab_load_warp_id:
825
+ cute.arch.warpgroup_reg_alloc(self.num_regs_mma)
826
+ is_tma_warp = Boolean(
827
+ (not self.pingpong and warp_idx == 0)
828
+ or (self.pingpong and (warp_idx == 0 or warp_idx == 4))
829
+ )
830
+ varlen_manager.init_tensormap_epi(
831
+ tma_atom_d, self.epi_get_tma_atoms(epilogue_params), is_tma_warp
832
+ )
833
+ tma_desc_d_ptr = varlen_manager.get_tma_desc_d_ptr()
834
+ tma_desc_epi_ptrs = varlen_manager.get_tma_desc_epi_ptrs()
835
+ # //////////////////////////////////////////////////////////////////////////////
836
+ # Partition global tensor for TiledMMA_A/B/C
837
+ # //////////////////////////////////////////////////////////////////////////////
838
+ tidx, _, _ = cute.arch.thread_idx()
839
+ warp_group_idx = cute.arch.make_warp_uniform(tidx // self.num_threads_per_warp_group)
840
+ if const_expr(self.pingpong):
841
+ tidx = tidx % self.num_threads_per_warp_group
842
+ warp_group_thread_layout = cute.make_layout(
843
+ self.mma_warp_groups if not self.pingpong else 1,
844
+ stride=self.num_threads_per_warp_group,
845
+ )
846
+ thr_mma = tiled_mma.get_slice(
847
+ warp_group_thread_layout(warp_group_idx if not self.pingpong else 0)
848
+ )
849
+
850
+ # //////////////////////////////////////////////////////////////////////////////
851
+ # Make fragments
852
+ # //////////////////////////////////////////////////////////////////////////////
853
+ tCrA = tiled_mma.make_fragment_A(thr_mma.partition_A(sA))
854
+ tCrB = tiled_mma.make_fragment_B(thr_mma.partition_B(sB))
855
+
856
+ acc_shape = tiled_mma.partition_shape_C(
857
+ cute.select(self.cta_tile_shape_mnk, mode=[0, 1])
858
+ )
859
+ acc = cute.make_fragment(acc_shape, self.acc_dtype)
860
+ acc_slow = None
861
+ if const_expr(self.fp8_slow_accum):
862
+ acc_slow = cute.make_fragment(acc_shape, self.acc_dtype)
863
+
864
+ if const_expr(self.pingpong):
865
+ if warp_group_idx == 0:
866
+ # WG0 needs a start signal at the very beginning
867
+ self.pingpong_barrier_arrive(warp_group_idx=0, stage="mma")
868
+ self.pingpong_barrier_arrive(warp_group_idx=0, stage="epi")
869
+
870
+ k_tile_cnt_static = cute.ceil_div(mA_mkl.shape[1], self.cta_tile_shape_mnk[2])
871
+ c_tile_cnt = cute.size(cute.ceil_div(self.cta_tile_shape_mnk[:2], self.epi_tile))
872
+
873
+ ab_read_state = make_pipeline_state(pipeline.PipelineUserType.Consumer, self.ab_stage)
874
+ epi_store_pipeline = self.make_epi_store_pipeline()
875
+ epi_read_state = make_pipeline_state(
876
+ pipeline.PipelineUserType.Consumer, self.epi_c_stage
877
+ )
878
+ epi_producer_state = make_pipeline_state(
879
+ pipeline.PipelineUserType.Producer, self.epi_c_stage
880
+ )
881
+ tile_scheduler = TileSchedulerCls()
882
+ work_tile = None
883
+ if const_expr(self.pingpong):
884
+ if const_expr(varlen_k):
885
+ work_tile = tile_scheduler.initial_work_tile_info()
886
+ if warp_idx >= 4:
887
+ # Advance 2nd Math WG pipeline states to the end of 1st Math WG
888
+ epi_read_state.advance_iters(c_tile_cnt)
889
+ epi_producer_state.advance_iters(c_tile_cnt)
890
+ if const_expr(not varlen_k):
891
+ ab_read_state.advance_iters(k_tile_cnt_static)
892
+ else:
893
+ len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
894
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
895
+ ab_read_state.advance_iters(k_tile_cnt)
896
+ tile_scheduler.advance_to_next_work()
897
+ if const_expr(varlen_k):
898
+ work_tile = tile_scheduler.get_current_work()
899
+ if const_expr(not varlen_k):
900
+ work_tile = tile_scheduler.initial_work_tile_info()
901
+ else:
902
+ work_tile = tile_scheduler.initial_work_tile_info()
903
+ if const_expr(varlen_m):
904
+ # wait tensormap initialization complete before update
905
+ varlen_manager.fence_tensormap_init()
906
+ while work_tile.is_valid_tile:
907
+ tile_coord_mnkl = work_tile.tile_idx
908
+ batch_idx = tile_coord_mnkl[3]
909
+ epi_shapes, epi_orders = self.epi_get_tensormap_update_shapes_orders(
910
+ epilogue_params, varlen_params.cu_seqlens_m, batch_idx
911
+ )
912
+ varlen_manager.update_tensormap_epi(
913
+ batch_idx,
914
+ self.d_layout,
915
+ epi_shapes,
916
+ epi_orders,
917
+ is_tma_warp,
918
+ )
919
+ len_k = varlen_manager.len_k(batch_idx)
920
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
921
+ ab_read_state, tiled_mma = self.mma(
922
+ ab_pipeline,
923
+ ab_read_state,
924
+ tiled_mma,
925
+ tCrA,
926
+ tCrB,
927
+ acc,
928
+ acc_slow,
929
+ k_tile_cnt,
930
+ warp_group_idx,
931
+ )
932
+ if const_expr(varlen_k):
933
+ if k_tile_cnt == 0:
934
+ acc.fill(0.0)
935
+
936
+ # /////////////////////////////////////////////////////////////////////////////
937
+ # EPILOGUE
938
+ # /////////////////////////////////////////////////////////////////////////////
939
+ if const_expr(self.pingpong):
940
+ self.pingpong_barrier_sync(warp_group_idx, "epi")
941
+
942
+ epilogue_barrier = pipeline.NamedBarrier(
943
+ barrier_id=int(NamedBarrierGemm.Epilogue),
944
+ num_threads=self.num_epi_warps * cute.arch.WARP_SIZE,
945
+ )
946
+
947
+ varlen_manager.fence_tensormap_update_epi(is_tma_warp)
948
+
949
+ copy_D = None
950
+ if const_expr(has_D):
951
+ copy_D, _, _ = self.epilog_gmem_copy_and_partition(
952
+ tma_atom_d,
953
+ varlen_manager.offset_batch_epi(mD_mnl, batch_idx),
954
+ self.cta_tile_shape_mnk[:2],
955
+ self.epi_tile,
956
+ sD,
957
+ tile_coord_mnkl,
958
+ tma_desc_ptr=tma_desc_d_ptr,
959
+ )
960
+ copy_C = None
961
+ if const_expr(has_C):
962
+ copy_C_fn, _, _ = self.epilog_gmem_copy_and_partition(
963
+ tma_atom_c,
964
+ varlen_manager.offset_batch_epi(mC_mnl, batch_idx),
965
+ self.cta_tile_shape_mnk[:2],
966
+ self.epi_tile,
967
+ sC,
968
+ tile_coord_mnkl,
969
+ )
970
+ copy_C = copy_utils.tma_producer_copy_fn(copy_C_fn, epi_pipeline)
971
+
972
+ d_dtype_for_layout = self.d_dtype if self.d_dtype is not None else cutlass.BFloat16
973
+ tiled_copy_r2s, tRS_rD, tRS_sD = self.epilog_smem_store_and_partition(
974
+ tiled_mma, self.d_layout, d_dtype_for_layout, sD, tidx
975
+ )
976
+ # (R2S, R2S_M, R2S_N)
977
+ tRS_rAcc = tiled_copy_r2s.retile(acc)
978
+ load_acc_subtile = partial(self.epi_load_acc_subtile, tRS_rAcc)
979
+ if const_expr(has_C):
980
+ tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC = self.epilog_smem_load_and_partition(
981
+ tiled_mma, self.c_layout, self.c_dtype, sC, tRS_rD.layout, tidx
982
+ )
983
+ else:
984
+ tiled_copy_s2r, tSR_sC, tRS_rC, tSR_rC = None, None, None, None
985
+
986
+ # Wait for all warp groups in the thread block to finish, because smem for tensor
987
+ # A in the mainloop is reused in the epilogue if not persistent.
988
+ if const_expr(not self.is_persistent):
989
+ epilogue_barrier.arrive_and_wait()
990
+
991
+ self.epi_visit_acc(epilogue_params, acc, tiled_mma, tile_coord_mnkl, tidx)
992
+
993
+ epi_read_state, epi_producer_state = self.epilogue(
994
+ epilogue_params,
995
+ epi_smem_tensors,
996
+ tma_desc_epi_ptrs,
997
+ epi_pipeline,
998
+ epi_store_pipeline,
999
+ epi_read_state,
1000
+ epi_producer_state,
1001
+ self.epi_tile,
1002
+ load_acc_subtile,
1003
+ tRS_rD,
1004
+ tRS_rC,
1005
+ None, # tiled_copy_t2r, for Sm100 only
1006
+ tiled_copy_r2s,
1007
+ tRS_sD,
1008
+ tiled_copy_s2r,
1009
+ tSR_rC,
1010
+ tSR_sC,
1011
+ copy_D,
1012
+ copy_C,
1013
+ tile_coord_mnkl,
1014
+ varlen_manager,
1015
+ epilogue_barrier,
1016
+ tile_scheduler,
1017
+ tidx,
1018
+ is_tma_warp,
1019
+ )
1020
+
1021
+ if const_expr(self.pingpong):
1022
+ # With pingpong, 2 WGs write two different output tiles to the same smem,
1023
+ # so we have to make sure the smem content is done reading before signaling
1024
+ # the next WG's epilogue.
1025
+ if is_tma_warp:
1026
+ epi_store_pipeline.producer_tail()
1027
+ self.pingpong_barrier_arrive(1 - warp_group_idx, stage="epi")
1028
+
1029
+ if const_expr(not self.pingpong):
1030
+ tile_scheduler.advance_to_next_work()
1031
+ work_tile = tile_scheduler.get_current_work()
1032
+ else: # Skip a tile for pingpong
1033
+ # Update starting load/store pipeline states for the next tile
1034
+ epi_read_state.advance_iters(c_tile_cnt)
1035
+ epi_producer_state.advance_iters(c_tile_cnt)
1036
+ # Update starting mainloop pipeline state for the next tile
1037
+ if const_expr(not varlen_k):
1038
+ ab_read_state.advance_iters(k_tile_cnt_static)
1039
+ tile_scheduler.advance_to_next_work(advance_count=self.mma_warp_groups)
1040
+ work_tile = tile_scheduler.get_current_work()
1041
+ else:
1042
+ tile_scheduler.advance_to_next_work()
1043
+ work_tile = tile_scheduler.get_current_work()
1044
+ if work_tile.is_valid_tile:
1045
+ len_k = varlen_manager.len_k(batch_idx=work_tile.tile_idx[3])
1046
+ k_tile_cnt = cute.ceil_div(len_k, self.cta_tile_shape_mnk[2])
1047
+ ab_read_state.advance_iters(k_tile_cnt)
1048
+ tile_scheduler.advance_to_next_work()
1049
+ work_tile = tile_scheduler.get_current_work()
1050
+ # End of persistent scheduler loop
1051
+
1052
+ # Wait for D store complete
1053
+ if const_expr(not self.pingpong):
1054
+ if is_tma_warp:
1055
+ epi_store_pipeline.producer_tail()
1056
+
1057
    @cute.jit
    def load_AB(
        self,
        ab_pipeline: cutlass.pipeline.PipelineAsync,
        ab_producer_state: cutlass.pipeline.PipelineState,
        copy_A: Optional[Callable],
        copy_B: Callable,
        k_tile_cnt: Int32,
        # These are for Sm100 blockscaled gemm
        copy_SFA: Optional[Callable] = None,
        copy_SFB: Optional[Callable] = None,
    ) -> cutlass.pipeline.PipelineState:
        """Producer loop: issue TMA loads of A/B (and SFA/SFB scale factors when
        block-scaled) for ``k_tile_cnt`` K-tiles into the staged smem buffers.

        Each copy callback is invoked as ``copy(k_tile, smem_stage_idx,
        tma_bar_ptr=...)``.  ``producer_try_acquire`` is issued one iteration
        ahead so ``producer_acquire`` can skip the wait when the stage is
        already empty.  Returns the advanced producer pipeline state.
        """
        blockscaled = const_expr(copy_SFA is not None)
        if const_expr(blockscaled):
            assert copy_SFB is not None
        # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
        peek_ab_empty_status = Boolean(True)
        if 0 < k_tile_cnt:
            peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
        # /////////////////////////////////////////////////////////////////////////
        # TMA load
        # /////////////////////////////////////////////////////////////////////////
        for k_tile in cutlass.range(k_tile_cnt, unroll=1):
            # Wait for A/B buffers to be empty before loading into them
            # Also sets the transaction barrier for the A/B buffers
            ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status)
            tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
            smem_idx = ab_producer_state.index
            if const_expr(copy_A is not None):
                copy_A(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
            copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
            if const_expr(blockscaled):
                copy_SFA(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
                copy_SFB(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
            # Mainloop pipeline's producer commit is a NOP
            ab_pipeline.producer_commit(ab_producer_state)
            ab_producer_state.advance()
            peek_ab_empty_status = Boolean(True)
            if k_tile + 1 < k_tile_cnt:
                peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
        return ab_producer_state
1098
+
1099
    @cute.jit
    def load_AB_gather_A(
        self,
        ab_pipeline: cutlass.pipeline.PipelineAsync,
        ab_producer_state: cutlass.pipeline.PipelineState,
        copy_A: Callable,
        prefetch_A: Optional[Callable],
        copy_B: Callable,
        k_tile_cnt: Int32,
        varlen_m: bool = True,
    ) -> cutlass.pipeline.PipelineState:
        """Producer loop for gather-A: B is loaded with TMA while A is gathered
        with cp.async (``copy_A``), optionally prefetching A's row indices via
        ``prefetch_A``.

        The last K-tile is peeled out of the loop so it can run with
        ``pred=True`` for bounds checking in K.  Returns the advanced producer
        pipeline state.
        """
        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
        # Peek (try_wait) AB buffer empty for k_block = prefetch_k_tile_cnt
        peek_ab_empty_status = Boolean(True)
        if 0 < k_tile_cnt:
            peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
        # /////////////////////////////////////////////////////////////////////////
        # TMA load on B and cp.async on A
        # /////////////////////////////////////////////////////////////////////////
        for k_tile in cutlass.range(k_tile_cnt - 1, unroll=1):
            prefetch_out = ()
            if const_expr(prefetch_A is not None):  # Prefetch early, even before smem is free
                prefetch_out = (prefetch_A(k_tile),)
            # Wait for A/B buffers to be empty before loading into them
            # Also sets the transaction barrier for the A/B buffers
            # A tiny bit faster to rotate the warp that does TMA
            # However, for varlen_k, we must use the warp_idx == self.ab_load_warp_id
            # since that's the warp that does the tensormap update.
            is_tma_warp = warp_idx == self.ab_load_warp_id + (
                (k_tile % self.num_ab_load_warps) if const_expr(varlen_m) else 0
            )
            ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
            smem_idx = ab_producer_state.index
            # A bit faster to load B first while we calculate the indices for A
            if is_tma_warp:
                tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
                copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
            copy_A(k_tile, smem_idx, *prefetch_out)
            # This tells mbarrier to track the completion of cp.async
            ab_pipeline.producer_cpasync_commit(ab_producer_state)
            ab_producer_state.advance()
            peek_ab_empty_status = Boolean(True)
            if k_tile + 1 < k_tile_cnt:
                peek_ab_empty_status = ab_pipeline.producer_try_acquire(ab_producer_state)
        # bound checking in the K dimension on the last k_tile
        if 0 < k_tile_cnt:
            k_tile = k_tile_cnt - 1
            prefetch_out = ()
            if const_expr(prefetch_A is not None):  # Prefetch early, even before smem is free
                prefetch_out = (prefetch_A(k_tile, pred=True),)
            is_tma_warp = warp_idx == self.ab_load_warp_id + (
                (k_tile % self.num_ab_load_warps) if const_expr(varlen_m) else 0
            )
            ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status, is_tma_warp)
            smem_idx = ab_producer_state.index
            if is_tma_warp:
                tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
                copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
            copy_A(k_tile, smem_idx, *prefetch_out, pred=True)
            ab_pipeline.producer_cpasync_commit(ab_producer_state)
            ab_producer_state.advance()
        return ab_producer_state
1161
+
1162
    @cute.jit
    def mma(
        self,
        ab_pipeline: cutlass.pipeline.PipelineAsync,
        ab_read_state: cutlass.pipeline.PipelineState,
        tiled_mma: cute.TiledMma,
        tCrA: cute.Tensor,
        tCrB: cute.Tensor,
        acc: cute.Tensor,
        acc_slow: Optional[cute.Tensor],
        k_tile_cnt: Int32,
        warp_group_idx: Int32,
    ) -> Tuple[cutlass.pipeline.PipelineState, cute.TiledMma]:
        """Consumer loop: run WGMMA over ``k_tile_cnt`` K-tiles, accumulating
        into ``acc``.

        Keeps ``k_pipe_mmas`` (=1) wgmma commit groups in flight, so each smem
        stage is released one tile behind the issue point (via
        ``ab_release_state``).  With ``fp8_slow_accum`` the fast accumulator is
        drained into ``acc_slow`` each tile and copied back at the end.  With
        pingpong, the two math warp groups serialize their MMA phases through
        named barriers.
        """
        # /////////////////////////////////////////////////////////////////////////////
        # Prologue MMAs
        # /////////////////////////////////////////////////////////////////////////////
        k_pipe_mmas = 1
        ab_release_state = ab_read_state.clone()
        num_prologue_mma = min(k_pipe_mmas, k_tile_cnt)
        if const_expr(self.pingpong):
            self.pingpong_barrier_sync(warp_group_idx, stage="mma")
        peek_ab_full_status = Boolean(True)
        if 0 < k_tile_cnt:
            peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_read_state)
        tiled_mma.set(warpgroup.Field.ACCUMULATE, False)
        num_k_blocks = cute.size(tCrA, mode=[2])
        for k_tile in cutlass.range(num_prologue_mma):
            # Wait for A/B buffer to be ready
            ab_pipeline.consumer_wait(ab_read_state, peek_ab_full_status)
            warpgroup.fence()
            for k_blk_idx in cutlass.range(num_k_blocks, unroll_full=True):
                k_blk_coord = (None, None, k_blk_idx, ab_read_state.index)
                cute.gemm(tiled_mma, acc, tCrA[k_blk_coord], tCrB[k_blk_coord], acc)
                # Only the very first gemm runs with ACCUMULATE=False
                tiled_mma.set(warpgroup.Field.ACCUMULATE, True)
            warpgroup.commit_group()
            ab_read_state.advance()
            peek_ab_full_status = Boolean(True)
            if k_tile + 1 < k_tile_cnt:
                peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_read_state)
        # If k_tile_cnt == 0, this is not correct. But we will set acc to 0 in the mainloop
        # in that case.
        if const_expr(self.fp8_slow_accum):
            warpgroup.wait_group(0)
            acc_slow.store(acc.load())

        # /////////////////////////////////////////////////////////////////////////////
        # MAINLOOP
        # /////////////////////////////////////////////////////////////////////////////
        for k_tile in cutlass.range(num_prologue_mma, k_tile_cnt, unroll=1):
            # Wait for TMA copies to complete
            ab_pipeline.consumer_wait(ab_read_state, peek_ab_full_status)
            # WGMMA
            warpgroup.fence()
            if const_expr(self.fp8_slow_accum):
                # Restart accumulation in the fast accumulator each tile
                tiled_mma.set(warpgroup.Field.ACCUMULATE, False)
            for k_blk_idx in cutlass.range(num_k_blocks, unroll_full=True):
                k_blk_coord = (None, None, k_blk_idx, ab_read_state.index)
                cute.gemm(tiled_mma, acc, tCrA[k_blk_coord], tCrB[k_blk_coord], acc)
                tiled_mma.set(warpgroup.Field.ACCUMULATE, True)
            warpgroup.commit_group()
            # Wait on the wgmma barrier for previous k_pipe_mmas wgmmas to complete
            if const_expr(not self.fp8_slow_accum):
                warpgroup.wait_group(k_pipe_mmas)
            else:
                warpgroup.wait_group(0)
                acc_slow.store(acc_slow.load() + acc.load())
            ab_pipeline.consumer_release(ab_release_state)
            ab_read_state.advance()
            ab_release_state.advance()
            peek_ab_full_status = Boolean(True)
            if k_tile + 1 < k_tile_cnt:
                peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_read_state)
        if const_expr(self.pingpong):
            # Cue for next WG's MMA to start
            self.pingpong_barrier_arrive(1 - warp_group_idx, stage="mma")
        if const_expr(not self.fp8_slow_accum):
            # fp8_slow_accum would already called wait_group(0) inside the loop
            warpgroup.wait_group(0)
        for k_tile in cutlass.range(num_prologue_mma, unroll=1):
            # Release the stages still held by the in-flight prologue wgmmas
            ab_pipeline.consumer_release(ab_release_state)
            ab_release_state.advance()
        if const_expr(self.fp8_slow_accum):
            acc.store(acc_slow.load())
        # If we don't return the tiled_mma, we get compiler error
        # "operand #0 does not dominate this use"
        return ab_read_state, tiled_mma
1248
+
1249
    @cute.jit
    def epilogue(
        self,
        params: EpilogueParams,
        epi_smem_tensors: Tuple[cute.Tensor, ...],
        tma_desc_epi_ptrs: list[Optional[cute.Pointer]],
        epi_pipeline: cutlass.pipeline.PipelineAsync,
        epi_store_pipeline: cutlass.pipeline.PipelineAsync,
        epi_read_state: cutlass.pipeline.PipelineState,
        epi_producer_state: Optional[cutlass.pipeline.PipelineState],
        epi_tile: cute.Tile,
        load_acc_subtile: Callable,
        tRS_rD: cute.Tensor,
        tRS_rC: Optional[cute.Tensor],
        tiled_copy_t2r: Optional[cute.TiledCopy],  # Only for Sm100
        tiled_copy_r2s: cute.TiledCopy,
        tRS_sD: cute.Tensor,
        tiled_copy_s2r: Optional[cute.ThrCopy],
        tSR_rC: Optional[cute.Tensor],
        tSR_sC: Optional[cute.Tensor],
        copy_D: Optional[Callable],
        copy_C: Optional[Callable],
        tile_coord_mnkl: cute.Coord,
        varlen_manager: VarlenManager,
        epilogue_barrier: cutlass.pipeline.NamedBarrier,
        tile_scheduler,
        tidx: Int32,
        is_tma_warp: Boolean,
    ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
        """Epilogue for one output tile: for each epi subtile, move accumulator
        registers to smem (optionally consuming a TMA-fed C and applying the
        ``epi_visit_subtile`` hook), then TMA-store smem to gmem.

        Returns the advanced epi read/producer pipeline states so callers (e.g.
        the pingpong path) can keep both warp groups' states in sync.
        """
        has_C = const_expr(tRS_rC is not None)
        has_D = const_expr(copy_D is not None)
        epi_tile_shape = cute.zipped_divide(
            cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
        ).shape[1]
        # We iterate over epi tiles in the N dimension first before the M dimension
        epi_tile_layout = cute.make_ordered_layout(epi_tile_shape, order=(1, 0))
        epi_tile_num = cute.size(epi_tile_shape)
        num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num

        epi_tensors = self.epi_begin(
            params,
            epi_smem_tensors,
            epi_tile,
            tiled_copy_t2r,
            tiled_copy_r2s,
            tile_coord_mnkl,
            varlen_manager,
            epilogue_barrier,
            tidx,
        )

        # Prime the C pipeline: issue up to epi_c_stage loads ahead of consumption
        if const_expr(copy_C is not None):
            for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
                gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx)
                if is_tma_warp:
                    epi_pipeline.producer_acquire(epi_producer_state)
                    copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
                    epi_pipeline.producer_commit(epi_producer_state)
                epi_producer_state.advance()

        def tma_store_fn(src_idx, dst_idx):
            # Fence and barrier to make sure shared memory store is visible to TMA store
            cute.arch.fence_proxy(
                cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
            )
            epilogue_barrier.arrive_and_wait()
            # Copy from shared memory to global memory
            if is_tma_warp:
                if const_expr(has_D):
                    copy_D(src_idx=src_idx, dst_idx=dst_idx)
            # Can't use if statement here, epi_store_pipeline object isn't captured somehow
            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
            if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
            epilogue_barrier.arrive_and_wait()

        # We could delay the TMA store by 1 epi tile to better overlap the non-TMA ops
        # with the TMA store. However, currently this doesn't seem to improve perf.
        delay_tma_store = False

        src_idx_prev, dst_idx_prev = None, None
        for epi_idx in cutlass.range_constexpr(epi_tile_num):
            # The global memory coordinate for the current epi tile
            gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
            # Copy from acc to D registers
            load_acc_subtile(tRS_rD, epi_idx)
            epi_loop_tensors = self.epi_begin_loop(params, epi_tensors, gmem_coord)
            if const_expr(has_C):
                epi_pipeline.consumer_wait(epi_read_state)
                cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
                # Fence to make sure shared memory read is visible to TMA load
                cute.arch.fence_proxy(
                    cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
                )
                cute.arch.sync_warp()
                with cute.arch.elect_one():
                    epi_pipeline.consumer_release(epi_read_state)
                epi_read_state.advance()
            # Refill the C pipeline epi_c_stage tiles ahead
            if const_expr(copy_C is not None and epi_idx + self.epi_c_stage < epi_tile_num):
                gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx + self.epi_c_stage)
                if is_tma_warp:
                    epi_pipeline.producer_acquire(epi_producer_state)
                    copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
                    epi_pipeline.producer_commit(epi_producer_state)
                epi_producer_state.advance()
            tRS_rEpi = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
            epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
            if const_expr(delay_tma_store):
                if const_expr(epi_idx > 0):
                    tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)
                src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
            # Copy from D registers to shared memory
            if const_expr(has_D):
                copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
            if const_expr(not delay_tma_store):
                tma_store_fn(src_idx=epi_buffer, dst_idx=gmem_coord)

        if const_expr(delay_tma_store):
            # Flush the last deferred store
            tma_store_fn(src_idx=src_idx_prev, dst_idx=dst_idx_prev)

        self.epi_end(
            params,
            epi_tensors,
            epi_tile,
            tiled_copy_t2r,
            tiled_copy_r2s,
            tile_coord_mnkl,
            varlen_manager,
            tidx,
        )

        return epi_read_state, epi_producer_state
1380
+
1381
+ def get_scheduler_class(self, varlen_m: bool = False):
1382
+ """Return the scheduler class to use. Override in subclasses for custom schedulers."""
1383
+ return TileScheduler if not varlen_m else VarlenMTileScheduler
1384
+
1385
    def get_scheduler_arguments(
        self,
        mA: cute.Tensor,
        mB: cute.Tensor,
        mD: Optional[cute.Tensor],
        scheduler_args,
        varlen_args,
    ):
        """Create scheduler arguments. Override in subclasses for custom schedulers."""
        if const_expr(varlen_args.mCuSeqlensM is None):
            # Batch count: prefer D's batch dim; otherwise B's, unless K is
            # varlen, in which case it comes from the cu_seqlens_k vector.
            num_problems = (
                mD.shape[2]
                if mD is not None
                else (
                    mB.shape[2]
                    if varlen_args.mCuSeqlensK is None
                    else varlen_args.mCuSeqlensK.shape[0] - 1
                )
            )
            problem_shape_ntile_mnl = (
                cute.ceil_div(mA.shape[0], self.cta_tile_shape_mnk[0]),
                cute.ceil_div(mB.shape[0], self.cta_tile_shape_mnk[1]),
                num_problems,
            )
            tile_sched_args = TileSchedulerArguments(
                problem_shape_ntile_mnl=problem_shape_ntile_mnl,
                raster_order=scheduler_args.raster_order,
                group_size=scheduler_args.max_swizzle_size,
                cluster_shape_mnk=self.cluster_shape_mnk,
                tile_count_semaphore=scheduler_args.tile_count_semaphore,
                batch_idx_permute=scheduler_args.batch_idx_permute,
                is_persistent=self.is_persistent,
            )
        else:
            assert mD is not None or not self.gather_A
            # With varlen M the M-tile count is not static (None); the varlen
            # scheduler derives it per batch from cu_seqlens_m.
            problem_shape_ntile_mnl = (
                None,
                cute.ceil_div(mB.shape[0], self.cta_tile_shape_mnk[1]),
                varlen_args.mCuSeqlensM.shape[0] - 1,
            )
            tile_sched_args = VarlenMTileSchedulerArguments(
                problem_shape_ntile_mnl=problem_shape_ntile_mnl,
                total_m=mD.shape[0] if mD is not None else varlen_args.mAIdx.shape[0],
                cu_seqlens_m=varlen_args.mCuSeqlensM,
                raster_order=scheduler_args.raster_order,
                group_size=scheduler_args.max_swizzle_size,
                tile_shape_mn=self.cta_tile_shape_mnk[:2],
                cluster_shape_mnk=self.cluster_shape_mnk,
                tile_count_semaphore=scheduler_args.tile_count_semaphore,
                is_persistent=self.is_persistent,
            )
        return tile_sched_args
1437
+
1438
    @cute.jit
    def epi_load_acc_subtile(self, tRS_rAcc: cute.Tensor, tRS_rD: cute.Tensor, epi_idx: int):
        """Copy the ``epi_idx``-th contiguous chunk of the accumulator fragment
        ``tRS_rAcc`` into the D register fragment ``tRS_rD``."""
        for epi_v in cutlass.range_constexpr(cute.size(tRS_rD)):
            tRS_rD[epi_v] = tRS_rAcc[epi_idx * cute.size(tRS_rD) + epi_v]
1442
+
1443
+ @cute.jit
1444
+ def epi_begin(
1445
+ self,
1446
+ params: EpilogueParams,
1447
+ epi_smem_tensors: Tuple[cute.Tensor, ...],
1448
+ epi_tile: cute.Tile,
1449
+ tiled_copy_t2r: Optional[cute.TiledCopy],
1450
+ tiled_copy_r2s: cute.TiledCopy,
1451
+ tile_coord_mnkl: cute.Coord,
1452
+ varlen_manager: VarlenManager,
1453
+ epilogue_barrier: cutlass.pipeline.NamedBarrier,
1454
+ tidx: Int32,
1455
+ ) -> Tuple[cute.Tensor, ...]:
1456
+ return ()
1457
+
1458
+ def epi_begin_loop(
1459
+ self, params: EpilogueParams, epi_tensors: Tuple[cute.Tensor, ...], epi_coord: cute.Coord
1460
+ ) -> Tuple[cute.Tensor, ...]:
1461
+ return ()
1462
+
1463
+ def epi_visit_subtile(
1464
+ self,
1465
+ params: EpilogueParams,
1466
+ epi_loop_tensors: Tuple[cute.Tensor, ...],
1467
+ tRS_rD: cute.Tensor,
1468
+ tRS_rC: Optional[cute.Tensor] = None,
1469
+ ) -> Optional[cute.Tensor]:
1470
+ return None
1471
+
1472
+ def epi_visit_acc(
1473
+ self,
1474
+ params: EpilogueParams,
1475
+ acc: cute.Tensor,
1476
+ tiled_mma: cute.TiledMma,
1477
+ tile_coord_mnkl: cute.Coord,
1478
+ tidx: Int32,
1479
+ ) -> None:
1480
+ pass
1481
+
1482
+ @cute.jit
1483
+ def epi_end(
1484
+ self,
1485
+ params: EpilogueParams,
1486
+ epi_tensors: Tuple[cute.Tensor, ...],
1487
+ epi_tile: cute.Tile,
1488
+ tiled_copy_t2r: Optional[cute.TiledCopy],
1489
+ tiled_copy_r2s: cute.TiledCopy,
1490
+ tile_coord_mnkl: cute.Coord,
1491
+ varlen_manager,
1492
+ tidx,
1493
+ ) -> None:
1494
+ pass
1495
+
1496
+ def epi_to_underlying_arguments(
1497
+ self, args: EpilogueArguments, *, loc=None, ip=None
1498
+ ) -> EpilogueParams:
1499
+ return self.EpilogueParams()
1500
+
1501
+ def epi_get_tma_atoms(
1502
+ self, params: EpilogueParams, *, loc=None, ip=None
1503
+ ) -> list[cute.CopyAtom]:
1504
+ """Subclasses can override this"""
1505
+ return []
1506
+
1507
+ def epi_get_tensormap_update_shapes_orders(
1508
+ self,
1509
+ params: EpilogueParams,
1510
+ cu_seqlens_m: cute.Tensor,
1511
+ batch_idx: Int32,
1512
+ *,
1513
+ loc=None,
1514
+ ip=None,
1515
+ ) -> tuple[list[Int32], list[int]]:
1516
+ """Subclasses can override this"""
1517
+ return [], []
1518
+
1519
+ @staticmethod
1520
+ def epi_smem_bytes_per_stage(
1521
+ args: Optional[EpilogueArguments],
1522
+ cta_tile_shape_mnk: Tuple[int, int, int],
1523
+ epi_tile: cute.Tile,
1524
+ ) -> int:
1525
+ return 0
1526
+
1527
+ def epi_get_smem_struct(self, params: EpilogueParams):
1528
+ return cute.struct.MemRange[Int32, 0] # Dummy struct
1529
+
1530
+ def epi_get_smem_tensors(self, params: EpilogueParams, storage) -> Tuple[cute.Tensor, ...]:
1531
+ return tuple()
1532
+
1533
+ def pingpong_barrier_sync(self, warp_group_idx: Int32, stage: Literal["mma", "epi"]):
1534
+ assert stage in ["mma", "epi"]
1535
+ barrier = NamedBarrierGemm.MmaWG0 if stage == "mma" else NamedBarrierGemm.EpiWG0
1536
+ cute.arch.barrier(
1537
+ barrier_id=int(barrier) + warp_group_idx,
1538
+ number_of_threads=2 * self.num_threads_per_warp_group,
1539
+ )
1540
+
1541
+ def pingpong_barrier_arrive(self, warp_group_idx: Int32, stage: Literal["mma", "epi"]):
1542
+ assert stage in ["mma", "epi"]
1543
+ barrier = NamedBarrierGemm.MmaWG0 if stage == "mma" else NamedBarrierGemm.EpiWG0
1544
+ cute.arch.barrier_arrive(
1545
+ barrier_id=int(barrier) + warp_group_idx,
1546
+ number_of_threads=2 * self.num_threads_per_warp_group,
1547
+ )
1548
+
1549
+ def epilog_smem_copy_atom(self, tiled_mma: cute.TiledMma) -> cute.TiledCopy:
1550
+ copy_atom_C = cute.make_copy_atom(
1551
+ warp.StMatrix8x8x16bOp(
1552
+ self.d_layout.is_m_major_c() if self.d_layout is not None else False,
1553
+ num_matrices=4 if self.epi_tile[1] % 16 == 0 else 2,
1554
+ ),
1555
+ Float16, # this is just to get the right source layout
1556
+ )
1557
+ tiled_copy_C_atom = cute.make_tiled_copy_C_atom(copy_atom_C, tiled_mma)
1558
+ return tiled_copy_C_atom
1559
+
1560
    def epilog_smem_store_and_partition(
        self,
        tiled_mma: cute.TiledMma,
        d_layout: Optional[LayoutEnum],
        dtype: Type[cutlass.Numeric],
        sD: Optional[cute.Tensor],
        tidx: Int32,
    ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
        """Set up the register->smem store of D for the epilogue.

        Returns ``(tiled_copy_r2s, tRS_rD, tRS_sD)``: the R2S tiled copy, a
        fresh D register fragment (in acc dtype), and this thread's partition
        of ``sD`` (None when ``sD`` is None).
        """
        if d_layout is None:
            d_layout = LayoutEnum.ROW_MAJOR
        tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
        # Doesn't work with tile_N % 8 == 0 but tile_n % 16 != since this always
        # get st.matrix with num_matrices=4
        copy_atom_r2s = sm90_utils.sm90_get_smem_store_op(
            d_layout, elem_ty_d=dtype, elem_ty_acc=self.acc_dtype
        )
        tiled_copy_r2s = cute.make_tiled_copy_S(copy_atom_r2s, tiled_copy_C_atom)
        # (R2S, R2S_M, R2S_N, PIPE_D)
        thr_copy_r2s = tiled_copy_r2s.get_slice(tidx)
        tRS_sD = thr_copy_r2s.partition_D(sD) if sD is not None else None
        # Derive the register fragment shape from an identity tensor when sD is absent
        sD_shape = sD.shape[:2] if sD is not None else self.epi_tile
        tRS_rD_shape = thr_copy_r2s.partition_S(cute.make_identity_tensor(sD_shape)).shape
        tRS_rD = cute.make_fragment(tRS_rD_shape, self.acc_dtype)
        return tiled_copy_r2s, tRS_rD, tRS_sD
1584
+
1585
+ def epilog_smem_load_and_partition(
1586
+ self,
1587
+ tiled_mma: cute.TiledMma,
1588
+ c_layout: LayoutEnum,
1589
+ dtype: Type[cutlass.Numeric],
1590
+ sC: cute.Tensor,
1591
+ tRS_rD_layout: cutlass.Layout,
1592
+ tidx: Int32,
1593
+ ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
1594
+ tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
1595
+ copy_atom_s2r = copy_utils.sm90_get_smem_load_op(c_layout, dtype)
1596
+ tiled_copy_s2r = cute.make_tiled_copy_S(copy_atom_s2r, tiled_copy_C_atom)
1597
+ thr_copy_s2r = tiled_copy_s2r.get_slice(tidx)
1598
+ tSR_sC = thr_copy_s2r.partition_S(sC)
1599
+ tRS_rC = cute.make_fragment(tRS_rD_layout, dtype)
1600
+ tSR_rC = thr_copy_s2r.retile(tRS_rC)
1601
+ return tiled_copy_s2r, tRS_rC, tSR_rC, tSR_sC
1602
+
1603
+ def epilog_gmem_copy_and_partition(
1604
+ self,
1605
+ atom: Union[cute.CopyAtom, cute.TiledCopy],
1606
+ mD_mn: cute.Tensor,
1607
+ tile_shape_mn: cute.Tile,
1608
+ epi_tile: cute.Tile,
1609
+ sD: cute.Tensor,
1610
+ tile_coord_mnkl: cute.Coord,
1611
+ tma_desc_ptr: Optional[cute.Pointer] = None,
1612
+ ) -> Tuple[cute.Tensor, cute.Tensor]:
1613
+ # (bM, bN)
1614
+ gD = cute.local_tile(mD_mn, tile_shape_mn, tile_coord_mnkl[:2])
1615
+ tDgD_for_tma_partition = cute.zipped_divide(gD, epi_tile)
1616
+ is_s2g = isinstance(
1617
+ atom.op, (cpasync.CopyBulkTensorTileS2GOp, cpasync.CopyReduceBulkTensorTileS2GOp)
1618
+ )
1619
+ src_tensor, dst_tensor = (
1620
+ (sD, tDgD_for_tma_partition) if is_s2g else (tDgD_for_tma_partition, sD)
1621
+ )
1622
+ return copy_utils.tma_get_copy_fn(
1623
+ atom,
1624
+ cta_coord=0,
1625
+ cta_layout=cute.make_layout(1),
1626
+ src_tensor=src_tensor,
1627
+ dst_tensor=dst_tensor,
1628
+ tma_desc_ptr=tma_desc_ptr,
1629
+ )
1630
+
1631
    def make_ab_pipeline(
        self,
        tiled_mma: cute.TiledMma,
        cluster_layout_vmnk: cute.Layout,
        ab_pipeline_mbar_ptr: cute.Pointer,
    ):
        """Create the mainloop A/B pipeline (TMA, or TMA+cp.async for gather_A).

        Producer count is 1 (TMA) or 1 + all A-load threads when gathering A;
        each consumer warp arrives once per CTA it multicasts to.
        """
        # Threads/warps participating in this pipeline
        producer_cnt = 1 if const_expr(not self.gather_A) else 1 + self.num_ab_load_warps * 32
        ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, producer_cnt)
        # Each warp will contribute to the arrive count with the number of mcast size
        mcast_size = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
        consumer_arrive_cnt = mcast_size * tiled_mma.size // cute.arch.WARP_SIZE
        ab_pipeline_consumer_group = pipeline.CooperativeGroup(
            pipeline.Agent.Thread, consumer_arrive_cnt
        )
        pipeline_cls = pipeline.PipelineTmaAsync if not self.gather_A else PipelineTmaCpAsync
        return pipeline_cls.create(
            barrier_storage=ab_pipeline_mbar_ptr,
            num_stages=self.ab_stage,
            producer_group=ab_pipeline_producer_group,
            consumer_group=ab_pipeline_consumer_group,
            tx_count=self.num_tma_load_bytes,
            cta_layout_vmnk=cluster_layout_vmnk,
        )
1655
+
1656
+ def make_epi_pipeline(
1657
+ self, c_smem_layout: cute.Layout | cute.ComposedLayout, epi_pipeline_mbar_ptr: cute.Pointer
1658
+ ):
1659
+ # Threads/warps participating in this pipeline
1660
+ epi_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
1661
+ # Each warp will contribute 1 to the arrive count
1662
+ consumer_arrive_cnt = self.num_epi_warps
1663
+ epi_pipeline_consumer_group = pipeline.CooperativeGroup(
1664
+ pipeline.Agent.Thread, consumer_arrive_cnt
1665
+ )
1666
+ tma_copy_c_bytes = cute.size_in_bytes(self.c_dtype, c_smem_layout)
1667
+ return pipeline.PipelineTmaAsync.create(
1668
+ barrier_storage=epi_pipeline_mbar_ptr,
1669
+ num_stages=self.epi_c_stage,
1670
+ producer_group=epi_pipeline_producer_group,
1671
+ consumer_group=epi_pipeline_consumer_group,
1672
+ tx_count=tma_copy_c_bytes,
1673
+ )
1674
+
1675
+ def make_epi_store_pipeline(self):
1676
+ # Threads/warps participating in tma store pipeline
1677
+ num_epi_threads = self.num_epi_warps * cute.arch.WARP_SIZE
1678
+ epi_store_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, num_epi_threads)
1679
+ return pipeline.PipelineTmaStore.create(
1680
+ num_stages=self.epi_stage, producer_group=epi_store_producer_group
1681
+ )
1682
+
1683
+ def make_sched_pipeline(
1684
+ self, cluster_layout_mnk: cute.Layout, sched_pipeline_mbar_ptr: cute.Pointer, varlen_k: bool
1685
+ ):
1686
+ # Threads/warps participating in this pipeline
1687
+ sched_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
1688
+ cluster_size = cute.size(cluster_layout_mnk)
1689
+ # Each warp that are not the scheduler warp will contribute 1 to the arrive count
1690
+ # If pingpong and varlen_k, then all 8 mma warps will participate in the scheduler barrier
1691
+ # at each round. If pingpong and not varlen_k, then only 4 mma warp will participate.
1692
+ consumer_arrive_cnt = (
1693
+ (self.mma_warp_groups if not (self.pingpong and not varlen_k) else 1) * 4
1694
+ + self.num_ab_load_warps
1695
+ ) * cluster_size - 1
1696
+ sched_pipeline_consumer_group = pipeline.CooperativeGroup(
1697
+ pipeline.Agent.Thread, consumer_arrive_cnt
1698
+ )
1699
+ return pipeline.PipelineAsync.create(
1700
+ barrier_storage=sched_pipeline_mbar_ptr,
1701
+ num_stages=self.sched_stage,
1702
+ producer_group=sched_pipeline_producer_group,
1703
+ consumer_group=sched_pipeline_consumer_group,
1704
+ # If there's cluster, the consumers must arrive at the mbar of CTA 0 in the cluster.
1705
+ consumer_mask=None if const_expr(cluster_size == 1) else 0,
1706
+ )
1707
+
1708
+ @classmethod
1709
+ def _compute_stages(
1710
+ cls,
1711
+ cta_tile_shape_mnk: Tuple[int, int, int],
1712
+ epi_tile: Tuple[int, int],
1713
+ a_dtype: Type[cutlass.Numeric],
1714
+ b_dtype: Type[cutlass.Numeric],
1715
+ d_dtype: Optional[Type[cutlass.Numeric]],
1716
+ c_dtype: Optional[Type[cutlass.Numeric]],
1717
+ epilogue_args: EpilogueArguments,
1718
+ smem_capacity: int,
1719
+ occupancy: int,
1720
+ overlap_sD_sA: bool = False,
1721
+ ) -> Tuple[int, int]:
1722
+ """Computes the number of stages for A/B/C operands based on heuristics.
1723
+
1724
+ :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
1725
+ :type cta_tile_shape_mnk: Tuple[int, int, int]
1726
+ :param a_dtype: Data type of operand A.
1727
+ :type a_dtype: type[cutlass.Numeric]
1728
+ :param b_dtype: Data type of operand B.
1729
+ :type b_dtype: type[cutlass.Numeric]
1730
+ :param smem_capacity: Total available shared memory capacity in bytes.
1731
+ :type smem_capacity: int
1732
+ :param occupancy: Target number of CTAs per SM (occupancy).
1733
+ :type occupancy: int
1734
+
1735
+ :return: A tuple containing the computed number of stages for:
1736
+ (A/B operand stages, epilogue stages)
1737
+ :rtype: Tuple[int, int]
1738
+ """
1739
+
1740
+ epi_stage = 4 if epi_tile[1] <= 16 else 2
1741
+ if overlap_sD_sA:
1742
+ epi_bytes = 0
1743
+ else:
1744
+ d_bytes_per_stage = (
1745
+ cute.size(epi_tile) * d_dtype.width // 8 if d_dtype is not None else 0
1746
+ )
1747
+ epi_bytes_per_stage = d_bytes_per_stage + cls.epi_smem_bytes_per_stage(
1748
+ epilogue_args, cta_tile_shape_mnk, epi_tile
1749
+ )
1750
+ epi_bytes = epi_bytes_per_stage * epi_stage
1751
+ epi_c_stage = 0 if c_dtype is None else (4 if epi_tile[1] <= 16 else 2)
1752
+ if c_dtype is not None:
1753
+ epi_bytes += cute.size(epi_tile) * c_dtype.width // 8 * epi_c_stage
1754
+
1755
+ a_shape = cute.slice_(cta_tile_shape_mnk, (None, 0, None))
1756
+ b_shape = cute.slice_(cta_tile_shape_mnk, (0, None, None))
1757
+ ab_bytes_per_stage = (
1758
+ cute.size(a_shape) * a_dtype.width // 8 + cute.size(b_shape) * b_dtype.width // 8
1759
+ )
1760
+ mbar_helpers_bytes = 1024
1761
+
1762
+ remaining_bytes = smem_capacity // occupancy - mbar_helpers_bytes - epi_bytes
1763
+ ab_stage = remaining_bytes // ab_bytes_per_stage
1764
+
1765
+ # Refine epilogue stages:
1766
+ # Calculate remaining smem after allocating for A/B stages and reserved bytes
1767
+ # Add remaining unused smem to epilogue
1768
+ if not overlap_sD_sA and epi_bytes_per_stage > 0:
1769
+ epi_stage += (remaining_bytes - ab_bytes_per_stage * ab_stage) // epi_bytes_per_stage
1770
+ return ab_stage, epi_stage, epi_c_stage
1771
+
1772
+ @staticmethod
1773
+ def _sm90_compute_tile_shape_or_override(
1774
+ cta_tile_shape_mnk: Tuple[int, int, int],
1775
+ atom_layout_mnk: Tuple[int, int, int],
1776
+ element_type: Optional[Type[cutlass.Numeric]] = None,
1777
+ epi_tile_override: Tuple[int, int] | None = None,
1778
+ ) -> Tuple[int, int]:
1779
+ """Compute the epilogue tile shape or use override if provided.
1780
+
1781
+ :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
1782
+ :type cta_tile_shape_mnk: Tuple[int, int, int]
1783
+ :param element_type: Data type of elements
1784
+ :type element_type: type[cutlass.Numeric]
1785
+ :param is_cooperative: Whether to use cooperative approach
1786
+ :type is_cooperative: bool
1787
+ :param epi_tile_override: Optional override for epilogue tile shape
1788
+ :type epi_tile_override: Tuple[int, int] or None
1789
+
1790
+ :return: Computed epilogue tile shape
1791
+ :rtype: Tuple[int, int]
1792
+ """
1793
+ if epi_tile_override is not None:
1794
+ return epi_tile_override
1795
+ if cta_tile_shape_mnk[0] % 128 == 0 and atom_layout_mnk[0] > 1:
1796
+ tile_m = math.gcd(128, cute.size(cta_tile_shape_mnk, mode=[0]))
1797
+ tile_n = math.gcd(32, cute.size(cta_tile_shape_mnk, mode=[1]))
1798
+ elif cta_tile_shape_mnk[0] % 192 == 0 and atom_layout_mnk[0] > 1:
1799
+ tile_m = math.gcd(192, cute.size(cta_tile_shape_mnk, mode=[0]))
1800
+ tile_n = math.gcd(32, cute.size(cta_tile_shape_mnk, mode=[1]))
1801
+ else:
1802
+ # In the case of tile shape 128 x N but atom_layout 1 x 2, we need to set
1803
+ # epi_tile_m = 64. If epi_tile_m = 128, the epilogue would iterate along the
1804
+ # M dimension first, then move to the N dimension. But the accumulator in registers
1805
+ # iterate along the N dimension first, then move to the M dimension.
1806
+ # We could change the epilogue to accommodate this,
1807
+ # but it's easier to just set epi_tile_m = 64.
1808
+ n_perf = 64 if element_type is not None and element_type.width == 8 else 32
1809
+ tile_m = math.gcd(64, cute.size(cta_tile_shape_mnk, mode=[0]))
1810
+ tile_n = math.gcd(n_perf, cute.size(cta_tile_shape_mnk, mode=[1]))
1811
+ return (tile_m, tile_n)
1812
+
1813
+ @staticmethod
1814
+ def _make_smem_layouts(
1815
+ cta_tile_shape_mnk: Tuple[int, int, int],
1816
+ epi_tile: Tuple[int, int],
1817
+ a_dtype: Type[cutlass.Numeric],
1818
+ a_layout: LayoutEnum,
1819
+ b_dtype: Type[cutlass.Numeric],
1820
+ b_layout: LayoutEnum,
1821
+ ab_stage: int,
1822
+ d_dtype: Optional[Type[cutlass.Numeric]],
1823
+ d_layout: LayoutEnum,
1824
+ epi_stage: int,
1825
+ c_dtype: Optional[Type[cutlass.Numeric]],
1826
+ c_layout: Optional[LayoutEnum],
1827
+ epi_c_stage: int,
1828
+ ) -> Tuple[
1829
+ cute.ComposedLayout, cute.ComposedLayout, cute.ComposedLayout, Optional[cute.ComposedLayout]
1830
+ ]:
1831
+ """Create shared memory layouts for A, B, and C tensors.
1832
+
1833
+ :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
1834
+ :type cta_tile_shape_mnk: Tuple[int, int, int]
1835
+ :param epi_tile: Epilogue tile shape
1836
+ :type epi_tile: Tuple[int, int]
1837
+ :param a_dtype: Data type for matrix A
1838
+ :type a_dtype: type[cutlass.Numeric]
1839
+ :param a_layout: Layout enum for matrix A
1840
+ :type a_layout: LayoutEnum
1841
+ :param b_dtype: Data type for matrix B
1842
+ :type b_dtype: type[cutlass.Numeric]
1843
+ :param b_layout: Layout enum for matrix B
1844
+ :type b_layout: LayoutEnum
1845
+ :param ab_stage: Number of stages for A/B tensors
1846
+ :type ab_stage: int
1847
+ :param d_dtype: Data type for output matrix D
1848
+ :type d_dtype: type[cutlass.Numeric]
1849
+ :param d_layout: Layout enum for the output matrix C
1850
+ :type d_layout: LayoutEnum
1851
+ :param epi_stage: Number of epilogue stages
1852
+ :type epi_stage: int
1853
+
1854
+ :return: Tuple of shared memory layouts for A, B, and C
1855
+ :rtype: Tuple[cute.ComposedLayout, cute.ComposedLayout, cute.ComposedLayout]
1856
+ """
1857
+ a_smem_shape = cute.slice_(cta_tile_shape_mnk, (None, 0, None))
1858
+
1859
+ a_is_k_major = a_layout.sm90_mma_major_mode() == warpgroup.OperandMajorMode.K
1860
+ b_is_k_major = b_layout.sm90_mma_major_mode() == warpgroup.OperandMajorMode.K
1861
+ a_major_mode_size = cta_tile_shape_mnk[2 if a_is_k_major else 0]
1862
+ a_smem_layout_atom = warpgroup.make_smem_layout_atom(
1863
+ sm90_utils.get_smem_layout_atom(a_layout, a_dtype, a_major_mode_size),
1864
+ a_dtype,
1865
+ )
1866
+ a_smem_layout_staged = cute.tile_to_shape(
1867
+ a_smem_layout_atom,
1868
+ cute.append(a_smem_shape, ab_stage),
1869
+ order=(0, 1, 2) if a_is_k_major else (1, 0, 2),
1870
+ )
1871
+
1872
+ b_smem_shape = cute.slice_(cta_tile_shape_mnk, (0, None, None))
1873
+
1874
+ b_major_mode_size = cta_tile_shape_mnk[2 if b_is_k_major else 1]
1875
+ b_smem_layout_atom = warpgroup.make_smem_layout_atom(
1876
+ sm90_utils.get_smem_layout_atom(b_layout, b_dtype, b_major_mode_size),
1877
+ b_dtype,
1878
+ )
1879
+ b_smem_layout_staged = cute.tile_to_shape(
1880
+ b_smem_layout_atom,
1881
+ cute.append(b_smem_shape, ab_stage),
1882
+ order=(0, 1, 2) if b_is_k_major else (1, 0, 2),
1883
+ )
1884
+
1885
+ epi_smem_layout_staged = None
1886
+ if d_dtype is not None:
1887
+ epi_smem_layout_staged = quack_sm90_utils.make_smem_layout_epi(
1888
+ d_dtype, d_layout, epi_tile, epi_stage
1889
+ )
1890
+
1891
+ epi_c_smem_layout_staged = None
1892
+ if c_dtype is not None:
1893
+ assert c_layout is not None
1894
+ epi_c_smem_layout_staged = quack_sm90_utils.make_smem_layout_epi(
1895
+ c_dtype, c_layout, epi_tile, epi_c_stage
1896
+ )
1897
+
1898
+ return (
1899
+ a_smem_layout_staged,
1900
+ b_smem_layout_staged,
1901
+ epi_smem_layout_staged,
1902
+ epi_c_smem_layout_staged,
1903
+ )
1904
+
1905
+ @staticmethod
1906
+ def _make_tma_epi_atoms_and_tensors(
1907
+ tensor_d: cute.Tensor,
1908
+ epi_smem_layout_staged: cute.ComposedLayout,
1909
+ epi_tile: Tuple[int, int],
1910
+ op_type: Literal["store", "load", "add"],
1911
+ ) -> Tuple[cute.CopyAtom, cute.Tensor]:
1912
+ """Create TMA atoms and tensors for storing D or loading C.
1913
+
1914
+ :param tensor_d: Output tensor D
1915
+ :type tensor_d: cute.Tensor
1916
+ :param epi_smem_layout_staged: Shared memory layout for epilogue
1917
+ :type epi_smem_layout_staged: cute.ComposedLayout
1918
+ :param epi_tile: Epilogue tile shape
1919
+ :type epi_tile: Tuple[int, int]
1920
+
1921
+ :return: TMA atom and tensor for C
1922
+ :rtype: Tuple[cute.CopyAtom, cute.Tensor]
1923
+ """
1924
+ assert op_type in ["load", "store", "add"]
1925
+ epi_smem_layout = cute.slice_(epi_smem_layout_staged, (None, None, 0))
1926
+ d_cta_v_layout = cute.composition(cute.make_identity_layout(tensor_d.shape), epi_tile)
1927
+ op = (
1928
+ cpasync.CopyBulkTensorTileG2SOp()
1929
+ if op_type == "load"
1930
+ else cpasync.CopyBulkTensorTileS2GOp()
1931
+ if op_type == "store"
1932
+ else cpasync.CopyReduceBulkTensorTileS2GOp(cute.ReductionOp.ADD)
1933
+ )
1934
+ tma_atom_d, tma_tensor_d = cpasync.make_tiled_tma_atom(
1935
+ op, tensor_d, epi_smem_layout, d_cta_v_layout
1936
+ )
1937
+ return tma_atom_d, tma_tensor_d
1938
+
1939
+ @staticmethod
1940
+ def _make_tma_atoms_and_tensors(
1941
+ tensor: cute.Tensor,
1942
+ smem_layout: cute.ComposedLayout,
1943
+ smem_tile: Tuple[int, int],
1944
+ mcast_dim: int,
1945
+ ) -> Tuple[cute.CopyAtom, cute.Tensor]:
1946
+ """Create TMA atoms and tensors for input tensors.
1947
+
1948
+ :param tensor: Input tensor (A or B)
1949
+ :type tensor: cute.Tensor
1950
+ :param smem_layout: Shared memory layout for the tensor
1951
+ :type smem_layout: cute.ComposedLayout
1952
+ :param smem_tile: Shared memory tile shape
1953
+ :type smem_tile: Tuple[int, int]
1954
+ :param mcast_dim: Multicast dimension
1955
+ :type mcast_dim: int
1956
+
1957
+ :return: TMA atom and tensor
1958
+ :rtype: Tuple[cute.CopyAtom, cute.Tensor]
1959
+ """
1960
+ op = (
1961
+ cpasync.CopyBulkTensorTileG2SOp()
1962
+ if mcast_dim == 1
1963
+ else cpasync.CopyBulkTensorTileG2SMulticastOp()
1964
+ )
1965
+ tma_atom, tma_tensor = cpasync.make_tiled_tma_atom(
1966
+ op,
1967
+ tensor,
1968
+ smem_layout,
1969
+ smem_tile,
1970
+ num_multicast=mcast_dim,
1971
+ )
1972
+ return tma_atom, tma_tensor
1973
+
1974
+ def _make_gmem_tiled_copy_A(self, dtype, major_mode, num_threads, copy_bits=128):
1975
+ atom_async_copy = cute.make_copy_atom(
1976
+ cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL),
1977
+ dtype,
1978
+ num_bits_per_copy=copy_bits,
1979
+ )
1980
+ copy_elems = copy_bits // dtype.width
1981
+ loads_per_cache_line = 128 * 8 // copy_bits # 128 bytes per cache line
1982
+ shape_dim_1 = cute.size(self.cta_tile_shape_mnk[2]) // copy_elems
1983
+ if shape_dim_1 > loads_per_cache_line:
1984
+ shape_dim_1 = math.gcd(shape_dim_1, loads_per_cache_line)
1985
+ # thread layout for copy
1986
+ thread_layout = cute.make_layout(
1987
+ (num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1)
1988
+ )
1989
+ if major_mode != LayoutEnum.ROW_MAJOR:
1990
+ shape_dim_0 = cute.size(self.cta_tile_shape_mnk[0]) // copy_elems
1991
+ if shape_dim_0 > loads_per_cache_line:
1992
+ shape_dim_0 = math.gcd(shape_dim_0, loads_per_cache_line)
1993
+ thread_layout = cute.make_layout(
1994
+ (shape_dim_0, num_threads // shape_dim_0), stride=(1, shape_dim_0)
1995
+ )
1996
+ # Value layout for copy
1997
+ value_layout = (
1998
+ cute.make_layout((1, copy_elems))
1999
+ if major_mode == LayoutEnum.ROW_MAJOR
2000
+ else cute.make_layout((copy_elems, 1))
2001
+ )
2002
+ return cute.make_tiled_copy_tv(atom_async_copy, thread_layout, value_layout)
2003
+
2004
+ @staticmethod
2005
+ def is_valid_dtypes(
2006
+ a_dtype: Type[cutlass.Numeric],
2007
+ b_dtype: Type[cutlass.Numeric],
2008
+ acc_dtype: Type[cutlass.Numeric],
2009
+ d_dtype: Optional[Type[cutlass.Numeric]],
2010
+ a_major: str,
2011
+ b_major: str,
2012
+ ) -> bool:
2013
+ """
2014
+ Check if the dtypes are valid
2015
+
2016
+ :param a_dtype: The data type of tensor A
2017
+ :type a_dtype: Type[cutlass.Numeric]
2018
+ :param b_dtype: The data type of tensor B
2019
+ :type b_dtype: Type[cutlass.Numeric]
2020
+ :param acc_dtype: The data type of the accumulator
2021
+ :type acc_dtype: Type[cutlass.Numeric]
2022
+ :param d_dtype: The data type of the output tensor
2023
+ :type d_dtype: Type[cutlass.Numeric]
2024
+ :param a_major: major mode of tensor A
2025
+ :type a_major: str
2026
+ :param b_major: major mode of tensor B
2027
+ :type b_major: str
2028
+
2029
+ :return: True if the dtypes are valid, False otherwise
2030
+ :rtype: bool
2031
+ """
2032
+ is_valid = True
2033
+ if a_dtype not in {
2034
+ Float16,
2035
+ cutlass.BFloat16,
2036
+ cutlass.Float8E4M3FN,
2037
+ cutlass.Float8E5M2,
2038
+ }:
2039
+ is_valid = False
2040
+ # tested b_dtype
2041
+ if b_dtype not in {
2042
+ Float16,
2043
+ cutlass.BFloat16,
2044
+ cutlass.Float8E4M3FN,
2045
+ cutlass.Float8E5M2,
2046
+ }:
2047
+ is_valid = False
2048
+ if acc_dtype not in {Float32, Float16}:
2049
+ is_valid = False
2050
+ # tested d_dtype
2051
+ if d_dtype not in {
2052
+ None,
2053
+ Float32,
2054
+ Float16,
2055
+ cutlass.BFloat16,
2056
+ cutlass.Float8E4M3FN,
2057
+ cutlass.Float8E5M2,
2058
+ }:
2059
+ is_valid = False
2060
+ # make sure a_dtype == b_dtype for Float16
2061
+ if a_dtype.width == 16 and a_dtype != b_dtype:
2062
+ is_valid = False
2063
+ # make sure a_dtype.width == b_dtype.width (i.e, Float8E4M3FN or Float8E5M2)
2064
+ if a_dtype.width != b_dtype.width:
2065
+ is_valid = False
2066
+
2067
+ # for Float8 types, this implementation only supports k-major layout
2068
+ if (a_dtype.width == 8 and a_major != "k") or (b_dtype.width == 8 and b_major != "k"):
2069
+ is_valid = False
2070
+ return is_valid
build/torch-cuda/quack/gemm_symmetric.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, Optional, Callable
2
+ from functools import partial
3
+ from torch import Tensor
4
+ from .gemm_act import GemmActMixin, act_fn_map, gemm_act
5
+ from .gemm_sm90 import GemmSm90
6
+ from .gemm_sm100 import GemmSm100
7
+ from .tile_scheduler import TriangularTileScheduler
8
+ from .gemm_wrapper_utils import GemmWrapperBase
9
+ from .cute_dsl_utils import get_device_capacity, get_max_active_clusters
10
+ from .varlen_utils import VarlenManager
11
+ from . import copy_utils as copy_utils
12
+ import cutlass
13
+ import cutlass.cute as cute
14
+ import cutlass.torch as cutlass_torch
15
+ from cutlass.cute.runtime import make_ptr
16
+ from cutlass import Int32, Float32, Boolean, const_expr
17
+ import cutlass.utils.hopper_helpers as sm90_utils_og
18
+ import cutlass.utils.blackwell_helpers as sm100_utils
19
+ from cutlass.cutlass_dsl import if_generate
20
+
21
+
22
+ class GemmSymmetricMixin(GemmActMixin, GemmSm90):
23
+ def get_scheduler_class(self, varlen_m: bool = False):
24
+ return TriangularTileScheduler
25
+
26
+ @cute.jit
27
+ def epilogue(
28
+ self,
29
+ params: GemmActMixin.EpilogueParams,
30
+ epi_smem_tensors: Tuple[cute.Tensor, ...],
31
+ tma_desc_epi_ptrs: list[Optional[cute.Pointer]],
32
+ epi_pipeline: cutlass.pipeline.PipelineAsync,
33
+ epi_store_pipeline: cutlass.pipeline.PipelineAsync,
34
+ epi_read_state: cutlass.pipeline.PipelineState,
35
+ epi_producer_state: cutlass.pipeline.PipelineState,
36
+ epi_tile: cute.Tile,
37
+ load_acc_subtile: Callable,
38
+ tRS_rD: cute.Tensor,
39
+ tRS_rC: Optional[cute.Tensor],
40
+ tiled_copy_t2r: Optional[cute.TiledCopy], # Only for Sm100
41
+ tiled_copy_r2s: cute.TiledCopy,
42
+ tRS_sD: cute.Tensor,
43
+ tiled_copy_s2r: Optional[cute.TiledCopy],
44
+ tSR_rC: Optional[cute.Tensor],
45
+ tSR_sC: Optional[cute.Tensor],
46
+ copy_D: Optional[Callable],
47
+ copy_C: Optional[Callable],
48
+ tile_coord_mnkl: cute.Coord,
49
+ varlen_manager: VarlenManager,
50
+ epilogue_barrier: cutlass.pipeline.NamedBarrier,
51
+ tile_scheduler,
52
+ tidx: Int32,
53
+ is_tma_warp: Boolean,
54
+ ) -> Tuple[cutlass.pipeline.PipelineState, cutlass.pipeline.PipelineState]:
55
+ has_C = const_expr(tRS_rC is not None)
56
+ has_D = const_expr(copy_D is not None)
57
+
58
+ tma_atom_postact = params.tma_atom_postact
59
+ mPostAct_mnl = params.mPostAct_mnl
60
+ sRowVec, sColVec, sPostAct = epi_smem_tensors
61
+ get_smem_store_op = (
62
+ partial(sm100_utils.get_smem_store_op, tiled_tmem_load=tiled_copy_t2r)
63
+ if self.arch == 100
64
+ else sm90_utils_og.sm90_get_smem_store_op
65
+ )
66
+ copy_atom_postact_r2s = get_smem_store_op(
67
+ self.postact_layout, self.postact_dtype, self.acc_dtype
68
+ )
69
+ # tiled_copy_C_atom = self.epilog_smem_copy_atom(tiled_mma)
70
+ # tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_C_atom)
71
+ tiled_copy_postact_r2s = cute.make_tiled_copy_S(copy_atom_postact_r2s, tiled_copy_r2s)
72
+ tRS_sPostAct = tiled_copy_postact_r2s.get_slice(tidx).partition_D(sPostAct)
73
+ (tma_desc_postact_ptr,) = tma_desc_epi_ptrs
74
+ batch_idx = tile_coord_mnkl[3]
75
+ copy_postact, _, _ = self.epilog_gmem_copy_and_partition(
76
+ tma_atom_postact,
77
+ varlen_manager.offset_batch_epi(mPostAct_mnl, batch_idx),
78
+ self.cta_tile_shape_postact_mn,
79
+ params.epi_tile_postact,
80
+ sPostAct,
81
+ tile_coord_mnkl,
82
+ tma_desc_ptr=tma_desc_postact_ptr,
83
+ )
84
+
85
+ # We iterate over epi tiles in the N dimension first before the M dimension
86
+ epi_tile_shape = cute.zipped_divide(
87
+ cute.make_layout(self.cta_tile_shape_mnk[:2]), epi_tile
88
+ ).shape[1]
89
+ epi_tile_layout = cute.make_layout(epi_tile_shape, stride=(epi_tile_shape[1], 1))
90
+ epi_tile_num = cute.size(epi_tile_shape)
91
+ num_prev_subtiles = tile_scheduler.num_tiles_executed * epi_tile_num
92
+
93
+ epi_tensors = self.epi_begin(
94
+ params,
95
+ epi_smem_tensors,
96
+ epi_tile,
97
+ tiled_copy_t2r,
98
+ tiled_copy_r2s,
99
+ tile_coord_mnkl,
100
+ varlen_manager,
101
+ epilogue_barrier,
102
+ tidx,
103
+ )
104
+
105
+ if const_expr(copy_C is not None):
106
+ for epi_idx in cutlass.range(min(epi_tile_num, self.epi_c_stage), unroll=1):
107
+ gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx)
108
+ if is_tma_warp:
109
+ epi_pipeline.producer_acquire(epi_producer_state)
110
+ copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
111
+ epi_pipeline.producer_commit(epi_producer_state)
112
+ epi_producer_state.advance()
113
+
114
+ def tma_store_fn(src_idx, dst_idx, tile_coord_mnkl):
115
+ pid_m = tile_coord_mnkl[0]
116
+ pid_n = tile_coord_mnkl[1]
117
+ # Fence and barrier to make sure shared memory store is visible to TMA store
118
+ cute.arch.fence_proxy(
119
+ cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
120
+ )
121
+ epilogue_barrier.arrive_and_wait()
122
+ # Copy from shared memory to global memory
123
+ if is_tma_warp:
124
+ square_tile_m = pid_m // self.cluster_shape_mnk[0]
125
+ square_tile_n = pid_n // self.cluster_shape_mnk[1]
126
+ if const_expr(has_D):
127
+ copy_D(src_idx=src_idx, dst_idx=dst_idx)
128
+ if square_tile_m != square_tile_n: # don't write twice to the same tile
129
+ copy_postact(src_idx=src_idx, dst_idx=dst_idx)
130
+ # Can't use if statement here, epi_store_pipeline object isn't captured somehow
131
+ if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_commit())
132
+ if_generate(is_tma_warp, lambda: epi_store_pipeline.producer_acquire())
133
+ epilogue_barrier.arrive_and_wait()
134
+
135
+ delay_tma_store = True
136
+
137
+ src_idx_prev, dst_idx_prev = None, None
138
+ for epi_idx in cutlass.range_constexpr(epi_tile_num):
139
+ # The global memory coordinate for the current epi tile
140
+ gmem_coord = epi_tile_layout.get_hier_coord(epi_idx)
141
+ # Copy from acc to D registers
142
+ load_acc_subtile(tRS_rD, epi_idx)
143
+ epi_loop_tensors = self.epi_begin_loop(params, epi_tensors, gmem_coord)
144
+ if const_expr(has_C):
145
+ epi_pipeline.consumer_wait(epi_read_state)
146
+ cute.copy(tiled_copy_s2r, tSR_sC[None, None, None, epi_read_state.index], tSR_rC)
147
+ # Fence to make sure shared memory read is visible to TMA load
148
+ cute.arch.fence_proxy(
149
+ cute.arch.ProxyKind.async_shared, space=cute.arch.SharedSpace.shared_cta
150
+ )
151
+ cute.arch.sync_warp()
152
+ with cute.arch.elect_one():
153
+ epi_pipeline.consumer_release(epi_read_state)
154
+ epi_read_state.advance()
155
+ if const_expr(copy_C is not None and epi_idx + self.epi_c_stage < epi_tile_num):
156
+ gmem_coord_C = epi_tile_layout.get_hier_coord(epi_idx + self.epi_c_stage)
157
+ if is_tma_warp:
158
+ epi_pipeline.producer_acquire(epi_producer_state)
159
+ copy_C(src_idx=gmem_coord_C, producer_state=epi_producer_state)
160
+ epi_pipeline.producer_commit(epi_producer_state)
161
+ epi_producer_state.advance()
162
+ tRS_rPostAct = self.epi_visit_subtile(params, epi_loop_tensors, tRS_rD, tRS_rC)
163
+ epi_buffer = (num_prev_subtiles + epi_idx) % self.epi_stage
164
+ if const_expr(delay_tma_store):
165
+ if const_expr(epi_idx > 0):
166
+ tma_store_fn(
167
+ src_idx=src_idx_prev, dst_idx=dst_idx_prev, tile_coord_mnkl=tile_coord_mnkl
168
+ )
169
+ src_idx_prev, dst_idx_prev = epi_buffer, gmem_coord
170
+ # Copy from D registers to shared memory
171
+ if const_expr(has_D):
172
+ copy_utils.cvt_copy(tiled_copy_r2s, tRS_rD, tRS_sD[None, None, None, epi_buffer])
173
+ cute.copy(
174
+ tiled_copy_postact_r2s,
175
+ tiled_copy_postact_r2s.retile(tRS_rPostAct),
176
+ tRS_sPostAct[None, None, None, epi_buffer],
177
+ )
178
+ if const_expr(not delay_tma_store):
179
+ tma_store_fn(
180
+ src_idx=epi_buffer, dst_idx=gmem_coord, tile_coord_mnkl=tile_coord_mnkl
181
+ )
182
+
183
+ if const_expr(delay_tma_store):
184
+ tma_store_fn(
185
+ src_idx=src_idx_prev, dst_idx=dst_idx_prev, tile_coord_mnkl=tile_coord_mnkl
186
+ )
187
+
188
+ self.epi_end(
189
+ params,
190
+ epi_tensors,
191
+ epi_tile,
192
+ tiled_copy_t2r,
193
+ tiled_copy_r2s,
194
+ tile_coord_mnkl,
195
+ varlen_manager,
196
+ tidx,
197
+ )
198
+
199
+ return epi_read_state, epi_producer_state
200
+
201
+
202
+ class GemmSymmetricSm90(GemmSymmetricMixin, GemmSm90):
203
+ pass
204
+
205
+
206
+ class GemmSymmetricSm100(GemmSymmetricMixin, GemmSm100):
207
+ pass
208
+
209
+
210
+ def gemm_symmetric(
211
+ A: Tensor, # (l, m, k)
212
+ B: Tensor, # (l, m, k)
213
+ D: Optional[Tensor], # (l, m, m)
214
+ C: Optional[Tensor], # (l, m, m)
215
+ tile_count_semaphore: Optional[Tensor], # (1,)
216
+ tile_M: int,
217
+ tile_N: int,
218
+ cluster_M: int,
219
+ cluster_N: int,
220
+ pingpong: bool = False,
221
+ persistent: bool = True,
222
+ max_swizzle_size: int = 8,
223
+ alpha: float | Tensor = 1.0,
224
+ beta: float | Tensor = 1.0,
225
+ ) -> None:
226
+ # Tranpose D so the "activation" is a write to the mirrored tile
227
+ PostAct = D.mT
228
+
229
+ L, M, K, N, tensor_infos = GemmWrapperBase.validate_and_prepare_tensors(
230
+ A, B, D, C, additional_tensors={"PostAct": PostAct}
231
+ )
232
+ assert M == N, "M and N must be the same; symmetric gemm only supports square matrices"
233
+ GemmWrapperBase.permute_tensors(tensor_infos)
234
+ GemmWrapperBase.extract_dtypes(tensor_infos)
235
+ major_configs = {
236
+ "A": ("m", "k", "l"),
237
+ "B": ("n", "k", "l"),
238
+ "D": ("m", "n", "l"),
239
+ "C": ("m", "n", "l"),
240
+ "PostAct": ("m", "n", "l"),
241
+ }
242
+ GemmWrapperBase.determine_major_orders(tensor_infos, major_configs)
243
+
244
+ device_capacity = get_device_capacity(A.device)
245
+ assert device_capacity[0] in [9, 10], "Only SM90 and SM100 are supported"
246
+ GemmCls = GemmSymmetricSm90 if device_capacity[0] == 9 else GemmSymmetricSm100
247
+
248
+ acc_dtype = Float32
249
+ tile_shape_mn = (tile_M, tile_N)
250
+ cluster_shape_mnk = (cluster_M, cluster_N, 1)
251
+ if not GemmCls.is_valid_dtypes(
252
+ tensor_infos["A"].dtype,
253
+ tensor_infos["B"].dtype,
254
+ acc_dtype,
255
+ tensor_infos["D"].dtype,
256
+ tensor_infos["A"].major,
257
+ tensor_infos["B"].major,
258
+ ):
259
+ raise TypeError("Skipping due to unsupported combination of types and majors")
260
+
261
+ max_active_clusters = get_max_active_clusters(cluster_M * cluster_N) if persistent else 0
262
+ GemmWrapperBase.create_cute_tensors({k: v for k, v in tensor_infos.items()}, major_configs)
263
+
264
+ def scalar_arg(scalar: float | Tensor):
265
+ if isinstance(scalar, float):
266
+ return Float32(scalar) if scalar != 1.0 else None
267
+ else:
268
+ assert isinstance(scalar, Tensor)
269
+ return make_ptr(Float32, scalar.data_ptr(), cute.AddressSpace.gmem, assumed_align=4)
270
+
271
+ activation = None # Equivalent to identity
272
+ act_fn = act_fn_map[activation]
273
+ epi_args = GemmCls.EpilogueArguments(
274
+ tensor_infos["PostAct"].cute_tensor, act_fn, scalar_arg(alpha), scalar_arg(beta)
275
+ )
276
+ scheduler_args = GemmWrapperBase.create_scheduler_args(
277
+ max_active_clusters, tile_count_semaphore, max_swizzle_size=max_swizzle_size
278
+ )
279
+ varlen_args = None
280
+
281
+ current_stream = cutlass_torch.current_stream()
282
+ compile_key = GemmWrapperBase.get_compile_key(
283
+ tensor_infos,
284
+ activation,
285
+ tile_shape_mn,
286
+ cluster_shape_mnk,
287
+ pingpong,
288
+ persistent,
289
+ tile_count_semaphore is not None,
290
+ device_capacity,
291
+ max_swizzle_size,
292
+ 2 if isinstance(alpha, Tensor) else (1 if alpha == 1.0 else 0),
293
+ 2 if isinstance(beta, Tensor) else (1 if beta == 1.0 else 0),
294
+ key_tensor_names=("A", "B", "D", "PostAct", "C"),
295
+ )
296
+ cache = gemm_act.compile_cache
297
+ if compile_key not in cache:
298
+ if device_capacity[0] == 9:
299
+ GemmCls = partial(GemmCls, pingpong=pingpong, is_persistent=persistent)
300
+ gemm_obj = GemmCls(
301
+ acc_dtype,
302
+ tensor_infos["A"].dtype,
303
+ tile_shape_mn,
304
+ cluster_shape_mnk,
305
+ gather_A=False,
306
+ )
307
+ cache[compile_key] = cute.compile(
308
+ gemm_obj,
309
+ tensor_infos["A"].cute_tensor,
310
+ tensor_infos["B"].cute_tensor,
311
+ tensor_infos["D"].cute_tensor,
312
+ tensor_infos["C"].cute_tensor,
313
+ epi_args,
314
+ scheduler_args,
315
+ varlen_args,
316
+ current_stream,
317
+ )
318
+ cache[compile_key](
319
+ tensor_infos["A"].cute_tensor,
320
+ tensor_infos["B"].cute_tensor,
321
+ tensor_infos["D"].cute_tensor,
322
+ tensor_infos["C"].cute_tensor,
323
+ epi_args,
324
+ scheduler_args,
325
+ varlen_args,
326
+ current_stream,
327
+ )
328
+
329
+
330
+ gemm_act.compile_cache = {}
build/torch-cuda/quack/gemm_wrapper_utils.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+ from typing import Optional, Tuple, Dict, Any
3
+ from dataclasses import dataclass
4
+
5
+ import torch
6
+ from torch import Tensor
7
+
8
+ import cutlass.cute as cute
9
+ from cutlass import Int32
10
+ from cutlass.cute.runtime import from_dlpack, make_ptr
11
+
12
+ from .cute_dsl_utils import torch2cute_dtype_map
13
+ from .varlen_utils import VarlenArguments
14
+ from .tile_scheduler import TileSchedulerOptions
15
+
16
+
17
@dataclass
class GemmTensorInfo:
    """Per-operand state threaded through the GemmWrapperBase helpers.

    Starts out holding only the torch tensor; the remaining fields are filled
    in by ``extract_dtypes``, ``determine_major_orders`` and
    ``create_cute_tensors`` respectively.
    """

    tensor: Optional[Tensor]  # the (possibly permuted) torch tensor; None if operand absent
    dtype: Optional[Any] = None  # cute dtype, filled in by extract_dtypes
    major: Optional[str] = None  # name of the contiguous logical dim, from determine_major_orders
    cute_tensor: Optional[cute.Tensor] = None  # DLPack view, from create_cute_tensors
23
+
24
+
25
class GemmWrapperBase:
    """Static helpers shared by the GEMM wrapper entry points.

    Responsibilities: operand validation (fixed-shape and varlen modes),
    permutation from (L, *, *) to (*, *, L), dtype / major-order extraction,
    cute tensor construction via DLPack, and packing of the scheduler /
    varlen argument structs plus the compile-cache key.
    """

    @staticmethod
    def validate_tensor(tensor: Tensor, name: str, ndim: int) -> None:
        """Assert ``tensor`` is a rank-``ndim`` CUDA tensor with a cute-mappable dtype."""
        assert tensor.dim() == ndim and tensor.is_cuda, f"{name} must be a {ndim}D CUDA tensor"
        assert tensor.dtype in torch2cute_dtype_map, f"Unsupported dtype for {name}"

    @staticmethod
    def validate_shape(tensor: Tensor, expected_shape: Tuple[int, ...], name: str) -> None:
        """Assert ``tensor`` has exactly ``expected_shape``."""
        assert tensor.shape == expected_shape, (
            f"{name} must have shape {expected_shape}, got {tensor.shape}"
        )

    @staticmethod
    def get_major_order(tensor: Tensor, dims: Tuple[str, str, str]) -> str:
        """Return the name of the contiguous (innermost) logical dimension."""
        # Tensor is already permuted to (dims[0], dims[1], dims[2])
        # stride(1) == 1 means dims[1] is contiguous (innermost)
        return dims[1] if tensor.stride(1) == 1 else dims[0]

    @staticmethod
    def create_cute_tensor(
        tensor: Optional[Tensor],
        major: Optional[str],
        dims: Tuple[str, str, str],
        assumed_align: int = 16,
    ) -> Optional[cute.Tensor]:
        """Wrap ``tensor`` as a cute tensor with a dynamic layout along its leading dim.

        Returns None when ``tensor`` is None so optional operands pass through.
        """
        if tensor is None:
            return None
        # Tensor is already permuted to (dims[0], dims[1], dims[2]) or (dim[0], dim[1])
        # If major is dims[1], leading_dim is 1; if major is dims[0], leading_dim is 0
        leading_dim = 1 if major == dims[1] else 0
        return from_dlpack(tensor.detach(), assumed_align=assumed_align).mark_layout_dynamic(
            leading_dim=leading_dim
        )

    @staticmethod
    def validate_and_prepare_tensors(
        A: Tensor,
        B: Tensor,
        D: Optional[Tensor] = None,
        C: Optional[Tensor] = None,
        additional_tensors: Optional[Dict[str, Tensor]] = None,
        cu_seqlens_m: Optional[Tensor] = None,
        cu_seqlens_k: Optional[Tensor] = None,
        A_idx: Optional[Tensor] = None,
    ) -> Tuple[int, int, int, int, Dict[str, GemmTensorInfo]]:
        """Validate operand shapes for the selected mode and bundle them as GemmTensorInfo.

        Three mutually exclusive modes:
        - fixed: A is (L, M, K), B is (L, N, K), D/C are (L, M, N)
        - varlen_m (cu_seqlens_m given): A is (total_M, K), B is (L, N, K), D/C (total_M, N)
        - varlen_k (cu_seqlens_k given): A is (M, total_K), B is (N, total_K), D/C (L, M, N)
        With ``A_idx`` (gather_A), the varlen dim of A is taken from A_idx instead.

        Returns ``(L, M, K, N, tensors)`` where ``tensors`` maps operand name to
        GemmTensorInfo (only the ``tensor`` field populated at this point).
        """
        assert not (cu_seqlens_m is not None and cu_seqlens_k is not None), (
            "Only one of cu_seqlens_m and cu_seqlens_k can be specified"
        )
        assert B.dtype == A.dtype, "A and B must have the same dtype"

        # Validate A_idx if provided (for gather_A case)
        gather_A = A_idx is not None
        if gather_A:
            assert cu_seqlens_m is not None or cu_seqlens_k is not None, (
                "gather_A requires either varlen_m or varlen_k"
            )
            assert A_idx.dtype == torch.int32, f"A_idx must be int32, got {A_idx.dtype}"
            assert A_idx.dim() == 1, f"A_idx must be 1D, got {A_idx.dim()}D"

        # Determine mode and extract dimensions
        if cu_seqlens_m is not None:
            # varlen_m: A is (total_m, k) or (whatever, k) if gather_A, B is (l, n, k), D/C are (total_m, n)
            assert A.dim() == 2, f"A must be 2D when using varlen_m, got {A.dim()}D"
            assert B.dim() == 3, f"B must be 3D with varlen_m, got {B.dim()}D"

            if gather_A:
                # When gather_A, A can have any number of rows, we use A_idx.shape[0] as total_M
                total_M = A_idx.shape[0]
                _, K = A.shape
            else:
                total_M, K = A.shape

            L, N, K_B = B.shape
            assert K == K_B, f"K dimension mismatch: A has {K}, B has {K_B}"
            assert cu_seqlens_m.shape == (L + 1,), (
                f"cu_seqlens_m must have shape ({L + 1},), got {cu_seqlens_m.shape}"
            )
            M = total_M
            dc_shape = (total_M, N)
            dc_ndim = 2
        elif cu_seqlens_k is not None:
            # varlen_k: A is (m, total_k) or (m, whatever) if gather_A, B is (n, total_k), D/C are (l, m, n)
            assert A.dim() == 2, f"A must be 2D when using varlen_k, got {A.dim()}D"
            assert B.dim() == 2, f"B must be 2D with varlen_k, got {B.dim()}D"

            if gather_A:
                # When gather_A with varlen_k, A can have any number of columns, we use A_idx.shape[0] as total_K
                M, _ = A.shape
                total_K = A_idx.shape[0]
            else:
                M, total_K = A.shape

            N, K_B = B.shape
            assert total_K == K_B, f"K dimension mismatch: expected {total_K}, B has {K_B}"
            L = cu_seqlens_k.shape[0] - 1
            # NOTE(review): L is derived from cu_seqlens_k.shape just above, so this
            # assert is tautological; it only guards against a non-1D cu_seqlens_k.
            assert cu_seqlens_k.shape == (L + 1,), (
                f"cu_seqlens_k must have shape ({L + 1},), got {cu_seqlens_k.shape}"
            )
            K = total_K
            dc_shape = (L, M, N)
            dc_ndim = 3
        else:
            # Normal case - all tensors must be 3D
            GemmWrapperBase.validate_tensor(A, "A", 3)
            GemmWrapperBase.validate_tensor(B, "B", 3)
            L, M, K = A.shape
            _, N, K_B = B.shape
            assert K == K_B, f"K dimension mismatch: A has {K}, B has {K_B}"
            GemmWrapperBase.validate_shape(B, (L, N, K), "B")
            dc_shape = (L, M, N)
            dc_ndim = 3

        # Validate D and C shapes uniformly
        for tensor, name in [(D, "D"), (C, "C")]:
            if tensor is not None:
                assert tensor.dim() == dc_ndim, (
                    f"{name} must be {dc_ndim}D for this mode, got {tensor.dim()}D"
                )
                assert tensor.shape == dc_shape, (
                    f"{name} shape {tensor.shape} doesn't match expected {dc_shape}"
                )

        tensors = {
            "A": GemmTensorInfo(A),
            "B": GemmTensorInfo(B),
            "D": GemmTensorInfo(D),
            "C": GemmTensorInfo(C),
        }

        if additional_tensors:
            # Extra epilogue outputs (e.g. "PostAct") must match D/C's shape.
            for name, tensor in additional_tensors.items():
                if tensor is not None:
                    assert tensor.dim() == dc_ndim, (
                        f"{name} must be {dc_ndim}D for this mode, got {tensor.dim()}D"
                    )
                    assert tensor.shape == dc_shape, (
                        f"{name} shape {tensor.shape} doesn't match expected {dc_shape}"
                    )
                    tensors[name] = GemmTensorInfo(tensor)

        return L, M, K, N, tensors

    @staticmethod
    def permute_tensors(
        tensors: Dict[str, GemmTensorInfo], varlen_m: bool = False, varlen_k: bool = False
    ) -> None:
        """Permute 3D operands in place from (L, *, *) to (*, *, L).

        In the varlen modes some operands are 2D and are left untouched.
        """
        # Determine which tensors need permutation
        if varlen_m:
            # Only B needs permutation (3D tensor)
            tensors_to_permute = ["B"]
        elif varlen_k:
            # Only D and C need permutation (3D tensors)
            tensors_to_permute = ["D", "C"]
        else:
            # All tensors need permutation
            tensors_to_permute = None

        # Apply permutation from (L, *, *) -> (*, *, L) for selected tensors
        for name, info in tensors.items():
            if info.tensor is not None and info.tensor.ndim == 3:
                if tensors_to_permute is None or name in tensors_to_permute:
                    info.tensor = info.tensor.permute(1, 2, 0)

    @staticmethod
    def extract_dtypes(tensors: Dict[str, GemmTensorInfo]) -> None:
        """Fill each info's ``dtype`` with the cute dtype for its torch dtype."""
        for name, info in tensors.items():
            if info.tensor is not None:
                info.dtype = torch2cute_dtype_map[info.tensor.dtype]

    @staticmethod
    def determine_major_orders(
        tensors: Dict[str, GemmTensorInfo], major_configs: Dict[str, Tuple[str, str, str]]
    ) -> None:
        """Fill each info's ``major`` from its strides, per the operand's dim names."""
        for name, dims in major_configs.items():
            if name in tensors and tensors[name].tensor is not None:
                tensors[name].major = GemmWrapperBase.get_major_order(tensors[name].tensor, dims)

    @staticmethod
    def create_cute_tensors(
        tensors: Dict[str, GemmTensorInfo], major_configs: Dict[str, Tuple[str, str, str]]
    ) -> None:
        """Fill each info's ``cute_tensor`` via DLPack (only operands in major_configs)."""
        for name, info in tensors.items():
            if info.tensor is not None and name in major_configs:
                info.cute_tensor = GemmWrapperBase.create_cute_tensor(
                    info.tensor, info.major, major_configs[name]
                )

    @staticmethod
    def create_scheduler_args(
        max_active_clusters: int,
        tile_count_semaphore: Optional[Tensor] = None,
        batch_idx_permute: Optional[Tensor] = None,
        max_swizzle_size: int = 8,
    ) -> TileSchedulerOptions:
        """Pack tile-scheduler options, wrapping optional device buffers for cute."""
        return TileSchedulerOptions(
            Int32(max_active_clusters),
            # Raw gmem pointer: the semaphore is a single int32 counter.
            tile_count_semaphore=make_ptr(
                Int32, tile_count_semaphore.data_ptr(), cute.AddressSpace.gmem, assumed_align=4
            )
            if tile_count_semaphore is not None
            else None,
            batch_idx_permute=(
                from_dlpack(batch_idx_permute, assumed_align=4).mark_layout_dynamic(leading_dim=0)
            )
            if batch_idx_permute is not None
            else None,
            max_swizzle_size=Int32(max_swizzle_size),
        )

    @staticmethod
    def create_varlen_args(
        cu_seqlens_m: Optional[Tensor],
        cu_seqlens_k: Optional[Tensor],
        A_idx: Optional[Tensor],
        max_active_clusters: int,
        cluster_shape_mnk: Tuple[int, int, int],
        tensors: Dict[str, GemmTensorInfo],
        num_epi_tensormaps: int = 0,
        pingpong: bool = False,
    ) -> Optional[Any]:
        """Build VarlenArguments (cu_seqlens views, per-CTA tensormap scratch, A_idx).

        Returns None in the fixed-shape case. Allocates a device buffer holding
        one set of tensormaps per potentially-resident CTA.
        """
        if cu_seqlens_m is None and cu_seqlens_k is None:
            return None
        # When varlen_m, we assume persistent=True
        # Grid size depends on num_active_clusters and cluster size
        cluster_size = cluster_shape_mnk[0] * cluster_shape_mnk[1]
        num_blocks = max_active_clusters * cluster_size
        # Calculate number of tensormaps needed
        if cu_seqlens_m is not None:
            # For varlen_m: need tensormaps for D and epilogue tensors
            # pingpong runs two alternating math warpgroups, each needing its own set
            num_tensormaps = num_epi_tensormaps * (1 if not pingpong else 2)
            if tensors["D"].tensor is not None:
                num_tensormaps += 1 if not pingpong else 2  # D tensormap
        else:
            # For varlen_k: need tensormaps for A & B
            # (when gather_A, A is loaded via A_idx rather than TMA, so only B needs one)
            num_tensormaps = 2 if A_idx is None else 1
        # Create tensormap buffer (each tensormap is 128 bytes = 16 int64s)
        tensormap_size = 128 // 8  # 16 int64s
        if num_tensormaps > 0:
            device = cu_seqlens_m.device if cu_seqlens_m is not None else cu_seqlens_k.device
            tensormaps = torch.empty(
                (num_blocks, num_tensormaps, tensormap_size),
                dtype=torch.int64,
                device=device,
            )
            tensormaps_cute = from_dlpack(tensormaps, assumed_align=128).mark_compact_shape_dynamic(
                mode=0, stride_order=(0, 1, 2)
            )
        else:
            tensormaps_cute = None

        return VarlenArguments(
            mCuSeqlensM=(
                from_dlpack(cu_seqlens_m, assumed_align=4).mark_layout_dynamic(leading_dim=0)
                if cu_seqlens_m is not None
                else None
            ),
            mCuSeqlensK=(
                from_dlpack(cu_seqlens_k, assumed_align=4).mark_layout_dynamic(leading_dim=0)
                if cu_seqlens_k is not None
                else None
            ),
            mTensormaps=tensormaps_cute,
            mAIdx=(
                from_dlpack(A_idx, assumed_align=4).mark_layout_dynamic(leading_dim=0)
                if A_idx is not None
                else None
            ),
        )

    @staticmethod
    def get_compile_key(
        tensors: Dict[str, GemmTensorInfo],
        activation: Optional[str],
        tile_shape_mn: Tuple[int, int],
        cluster_shape_mnk: Tuple[int, int, int],
        pingpong: bool,
        persistent: bool,
        has_semaphore: bool,
        *args,
        key_tensor_names: Tuple[str, ...] = ("A", "B", "D", "C"),
    ) -> Tuple:
        """Build a hashable key of everything the compiled kernel specializes on.

        Callers may append extra discriminators through ``*args``.
        """
        key_parts = []
        for name in key_tensor_names:
            if name in tensors:
                key_parts.append(tensors[name].dtype)
        key_parts.append(activation)
        key_parts.extend([tile_shape_mn, cluster_shape_mnk])
        for name in key_tensor_names:
            if name in tensors:
                key_parts.append(tensors[name].major)
        key_parts.extend([pingpong, persistent, has_semaphore])
        key_parts.extend(args)
        return tuple(key_parts)
build/torch-cuda/quack/layout_utils.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
+
3
+
4
+ import cutlass
5
+ import cutlass.cute as cute
6
+
7
+ from cutlass import Int32, const_expr
8
+
9
+ from .utils import prmt
10
+
11
+
12
def transpose_view(a: cute.Tensor) -> cute.Tensor:
    """Return a view of ``a`` with its first two modes swapped (no data movement)."""
    swapped_shape = (a.shape[1], a.shape[0], *a.shape[2:])
    swapped_order = (1, 0, *range(2, cute.rank(a)))
    transposed_layout = cute.make_ordered_layout(swapped_shape, order=swapped_order)
    return cute.composition(a, transposed_layout)
17
+
18
+
19
def select(a: cute.Tensor, mode: list[int]) -> cute.Tensor:
    """View ``a`` restricted to the layout modes listed in ``mode``."""
    sub_layout = cute.select(a.layout, mode)
    return cute.make_tensor(a.iterator, sub_layout)
21
+
22
+
23
def expand(a: cute.Tensor, dim: int, size: Int32 | int) -> cute.Tensor:
    """Insert a broadcast mode of extent ``size`` (stride 0) at position ``dim``."""
    expanded_shape = (*a.shape[:dim], size, *a.shape[dim:])
    expanded_stride = (*a.layout.stride[:dim], 0, *a.layout.stride[dim:])
    expanded_layout = cute.make_layout(expanded_shape, stride=expanded_stride)
    return cute.make_tensor(a.iterator, expanded_layout)
27
+
28
+
29
@cute.jit
def permute_gated_Cregs_b16(t: cute.Tensor) -> None:
    """Permute pairs of 16-bit C-register values across each quad of lanes.

    Works in place on ``t`` (a per-thread register fragment of 16-bit
    elements), recast as packed 32-bit words. For every pair of u32 words it
    exchanges halves between the four lanes of a quad via ``shuffle_sync`` and
    then reassembles them with byte-level ``prmt`` selectors.
    Presumably this interleaves the two halves of a gated activation so they
    land adjacently for the epilogue — TODO confirm against callers.
    """
    assert t.element_type.width == 16
    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b16 permutation"
    t_u32 = cute.recast_tensor(t, Int32)

    quad_idx = cute.arch.lane_idx() % 4
    # Lanes 0 and 3 keep their halves in the original order; lanes 1 and 2 swap.
    lane_03 = quad_idx == 0 or quad_idx == 3
    selector_upper = Int32(0x5410) if lane_03 else Int32(0x1054)
    selector_lower = Int32(0x7632) if lane_03 else Int32(0x3276)
    # upper_map = [0, 3, 1, 2]
    # lower_map = [1, 2, 0, 3]
    # upper_idx = upper_map[quad_idx]
    # indexing isn't supported so we have to do arithmetic
    upper_idx = quad_idx // 2 if quad_idx % 2 == 0 else 3 - quad_idx // 2
    lower_idx = upper_idx ^ 1

    # shfl.sync mask_and_clamp encoding: segment mask in bits 8..12, clamp in bits 0..4.
    # 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100, 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
    width = 4
    mask = cute.arch.WARP_SIZE - width
    clamp = cute.arch.WARP_SIZE - 1
    mask_and_clamp = mask << 8 | clamp

    for i in cutlass.range(cute.size(t_u32.shape) // 2, unroll_full=True):
        upper, lower = t_u32[i * 2 + 0], t_u32[i * 2 + 1]
        # Pre-swap halves on lanes 1/2 so the shuffles below fetch the right word.
        upper0 = upper if lane_03 else lower
        lower0 = lower if lane_03 else upper
        upper0 = cute.arch.shuffle_sync(upper0, offset=upper_idx, mask_and_clamp=mask_and_clamp)
        lower0 = cute.arch.shuffle_sync(lower0, offset=lower_idx, mask_and_clamp=mask_and_clamp)
        # Byte-permute the shuffled words back into the two output u32 slots.
        t_u32[i * 2 + 0] = prmt(upper0, lower0, selector_upper)
        t_u32[i * 2 + 1] = prmt(upper0, lower0, selector_lower)
60
+
61
+
62
@cute.jit
def permute_Cregs_b32_for_stsm(t: cute.Tensor) -> None:
    """Permute and shuffle within 4 threads to change the layout from
    T0 | T1 | T2 | T3
    a b | c d | e f | g h
    to
    T0 | T1 | T2 | T3 | T0 | T1 | T2 | T3
    a | b | c | d | e | f | g | h
    This is so that we can use STSM (instead of STS.64) to store C registers without bank conflict.
    """

    assert t.element_type.width == 32
    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b32 permutation"

    quad_idx = cute.arch.lane_idx() % 4
    # left_map = [0, 2, 1, 3]
    # right_map = [2, 0, 3, 1]
    # indexing isn't supported so we have to do arithmetic
    left_idx = quad_idx // 2 if quad_idx % 2 == 0 else 2 + quad_idx // 2
    right_idx = left_idx ^ 0b10

    # shfl.sync mask_and_clamp encoding: segment mask in bits 8..12, clamp in bits 0..4.
    # 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100, 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
    width = 4
    mask = cute.arch.WARP_SIZE - width
    clamp = cute.arch.WARP_SIZE - 1
    mask_and_clamp = mask << 8 | clamp

    # Process 4 registers at a time; each step is annotated with the quad-wide
    # value arrangement it produces (values a..h distributed over lanes T0..T3).
    for i in cutlass.range(cute.size(t.shape) // 4, unroll_full=True):
        for r in cutlass.range(2, unroll_full=True):
            left, right = t[i * 4 + r * 2 + 0], t[i * 4 + r * 2 + 1]
            # a b | c d | e f | g h -> a b | c d | f e | h g
            left0 = left if quad_idx < 2 else right
            right0 = right if quad_idx < 2 else left
            # a b | c d | f e | h g -> a b | f d | c e | h g
            left0 = cute.arch.shuffle_sync(left0, offset=left_idx, mask_and_clamp=mask_and_clamp)
            # a b | f d | c e | h g -> a e | f b | c g | h d
            right0 = cute.arch.shuffle_sync(right0, offset=right_idx, mask_and_clamp=mask_and_clamp)
            # a e | f b | c g | h d -> a e | b f | c g | d h
            t[i * 4 + r * 2 + 0] = left0 if quad_idx % 2 == 0 else right0
            t[i * 4 + r * 2 + 1] = right0 if quad_idx % 2 == 0 else left0
        # Swap the middle pair so the four registers are in STSM order.
        t[i * 4 + 1], t[i * 4 + 2] = t[i * 4 + 2], t[i * 4 + 1]
103
+
104
+
105
@cute.jit
def permute_Cregs_b32_for_ldsm(t: cute.Tensor) -> None:
    """Permute and shuffle within 4 threads to change the layout from
    T0 | T1 | T2 | T3 | T0 | T1 | T2 | T3
    a | b | c | d | e | f | g | h
    to
    T0 | T1 | T2 | T3
    a b | c d | e f | g h
    This is so that we can use LDSM (instead of LDS.64) to store C registers without bank conflict.
    """

    assert t.element_type.width == 32
    assert cute.size(t.shape) % 4 == 0, "Tensor size must be a multiple of 4 for b32 permutation"

    quad_idx = cute.arch.lane_idx() % 4
    # left_map = [0, 2, 1, 3]
    # right_map = [1, 3, 0, 2]
    # indexing isn't supported so we have to do arithmetic
    left_idx = quad_idx // 2 if quad_idx % 2 == 0 else 2 + quad_idx // 2
    right_idx = left_idx ^ 0b01

    # shfl.sync mask_and_clamp encoding: segment mask in bits 8..12, clamp in bits 0..4.
    # 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100, 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
    width = 4
    mask = cute.arch.WARP_SIZE - width
    clamp = cute.arch.WARP_SIZE - 1
    mask_and_clamp = mask << 8 | clamp

    # This is just the inverse of permute_Cregs_b32_for_stsm
    # (same steps in reverse order; annotations track the quad-wide arrangement).
    for i in cutlass.range(cute.size(t.shape) // 4, unroll_full=True):
        t[i * 4 + 1], t[i * 4 + 2] = t[i * 4 + 2], t[i * 4 + 1]
        for r in cutlass.range(2, unroll_full=True):
            left, right = t[i * 4 + r * 2 + 0], t[i * 4 + r * 2 + 1]
            # a e | b f | c g | d h -> a e | f b | c g | h d
            left0 = left if quad_idx % 2 == 0 else right
            right0 = right if quad_idx % 2 == 0 else left
            # a e | f b | c g | h d -> a b | f d | c e | h g
            right0 = cute.arch.shuffle_sync(right0, offset=right_idx, mask_and_clamp=mask_and_clamp)
            # a b | f d | c e | h g -> a b | c d | f e | h g
            left0 = cute.arch.shuffle_sync(left0, offset=left_idx, mask_and_clamp=mask_and_clamp)
            # a b | c d | f e | h g -> a b | c d | e f | g h
            t[i * 4 + r * 2 + 0] = left0 if quad_idx < 2 else right0
            t[i * 4 + r * 2 + 1] = right0 if quad_idx < 2 else left0
147
+
148
+
149
@cute.jit
def concat_layout(*layouts: cute.Layout) -> cute.Layout:
    """Combine several layouts into one whose top-level modes are the inputs, in order."""
    combined_shapes = tuple(layout.shape for layout in layouts)
    combined_strides = tuple(layout.stride for layout in layouts)
    return cute.make_layout(combined_shapes, stride=combined_strides)
155
+
156
+
157
def convert_layout_acc_mn(acc_layout: cute.Layout) -> cute.Layout:
    """
    For Sm80, convert ((2, 2), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, MMA_N), ...).
    For Sm90, convert ((2, 2, V), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, V, MMA_N), ...).
    """
    # Build a col-major reference layout with the same shape, regroup its modes
    # into (M, N, ...) form, then compose so the regrouping applies to the
    # original strides.
    acc_layout_col_major = cute.make_layout(acc_layout.shape)
    acc_layout_mn = cute.make_layout(
        (
            (acc_layout_col_major.shape[0][1], acc_layout_col_major.shape[1]),  # MMA_M
            (
                acc_layout_col_major.shape[0][0],
                *acc_layout_col_major.shape[0][2:],
                acc_layout_col_major.shape[2],
            ),  # MMA_N
            *acc_layout_col_major.shape[3:],  # any remaining trailing modes pass through
        ),
        stride=(
            (acc_layout_col_major.stride[0][1], acc_layout_col_major.stride[1]),  # MMA_M
            (
                acc_layout_col_major.stride[0][0],
                *acc_layout_col_major.stride[0][2:],
                acc_layout_col_major.stride[2],
            ),  # MMA_N
            *acc_layout_col_major.stride[3:],
        ),
    )
    return cute.composition(acc_layout, acc_layout_mn)
184
+
185
+
186
def make_acc_tensor_mn_view(acc: cute.Tensor) -> cute.Tensor:
    """View an accumulator fragment with its layout regrouped into (M, N, ...) modes."""
    mn_layout = convert_layout_acc_mn(acc.layout)
    return cute.make_tensor(acc.iterator, mn_layout)
188
+
189
+
190
def reshape_acc_to_mn(acc: cute.Tensor) -> cute.Tensor:
    """Alias of :func:`make_acc_tensor_mn_view`, kept for API compatibility.

    The original body duplicated make_acc_tensor_mn_view line for line;
    delegating instead keeps the two names from drifting apart.
    """
    return make_acc_tensor_mn_view(acc)
192
+
193
+
194
@cute.jit
def convert_layout_acc_frgA(acc_layout: cute.Layout) -> cute.Layout:
    """Regroup an accumulator layout so it can feed a second GEMM as the A fragment."""
    # For back to back gemm, convert layout of acc0 to gemm 1 accept layout.
    # For Sm80, as the mma instruction shape is 16x8x16, we need to convert from (4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2)
    # For Sm90, FP16/BF16, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to ((2, 2, 2), MMA_M, (N / 16, MMA_N))
    # TODO: Sm90 FP8
    # Rank-3 first mode distinguishes the Sm90 accumulator shape from Sm80's.
    if const_expr(cute.rank(acc_layout.shape[0]) == 3):  # Sm90
        l = cute.logical_divide(
            acc_layout, ((None, None, 2), None, None)
        )  # ((2, 2, (2, N / 16)), MMA_M, MMA_N)
        rA_mma_view = cute.make_layout(
            (
                (l.shape[0][0], l.shape[0][1], l.shape[0][2][0]),
                l.shape[1],
                (l.shape[0][2][1], l.shape[2]),
            ),
            stride=(
                (l.stride[0][0], l.stride[0][1], l.stride[0][2][0]),
                l.stride[1],
                (l.stride[0][2][1], l.stride[2]),
            ),
        )
    else:  # Sm80
        # (4, MMA_M, MMA_N) -> (4, MMA_M, (2, MMA_N / 2))
        l = cute.logical_divide(acc_layout, (None, None, 2))
        rA_mma_view = cute.make_layout(
            (
                (l.shape[0], l.shape[2][0]),
                l.shape[1],
                l.shape[2][1],
            ),
            stride=(
                (l.stride[0], l.stride[2][0]),
                l.stride[1],
                l.stride[2][1],
            ),
        )
    return rA_mma_view
232
+
233
+
234
def reshape_acc_to_frgA(acc: cute.Tensor) -> cute.Tensor:
    """View an accumulator fragment with the layout expected by a follow-up GEMM's A operand."""
    frgA_layout = convert_layout_acc_frgA(acc.layout)
    return cute.make_tensor(acc.iterator, frgA_layout)
236
+
237
+
238
def convert_layout_zero_stride(
    input: cute.Tensor | cute.Layout, ref_layout: cute.Layout
) -> cute.Tensor | cute.Layout:
    """Regroup ``input``'s flat modes into (broadcast-free, broadcast) halves.

    Modes of ``ref_layout`` with stride 0 (broadcast modes) are collected into
    the second top-level mode; all other modes go into the first. Returns a
    tensor when given a tensor, a layout when given a layout (the original
    ``-> cute.Layout`` annotation missed the tensor case).
    """
    layout = input.layout if const_expr(isinstance(input, cute.Tensor)) else input
    # Group the modes with non-zero stride in the ref_layout together,
    # and the modes with zero stride together
    layout_flat = cute.flatten(layout)
    ref_layout_flat = cute.flatten(ref_layout)
    nonzero_modes = [i for i in range(cute.rank(layout_flat)) if ref_layout_flat[i].stride != 0]
    zero_modes = [i for i in range(cute.rank(layout_flat)) if ref_layout_flat[i].stride == 0]
    # There's an edge case when all modes are zero stride
    new_shape = (
        tuple(layout_flat[i].shape for i in nonzero_modes) if len(nonzero_modes) > 0 else (1,),
        tuple(layout_flat[i].shape for i in zero_modes),
    )
    new_stride = (
        tuple(layout_flat[i].stride for i in nonzero_modes) if len(nonzero_modes) > 0 else (0,),
        tuple(layout_flat[i].stride for i in zero_modes),
    )
    out_layout = cute.make_layout(new_shape, stride=new_stride)
    if const_expr(isinstance(input, cute.Tensor)):
        return cute.make_tensor(input.iterator, out_layout)
    else:
        return out_layout
262
+
263
+
264
def mma_partition_C_vec(
    sVec: cute.Tensor, thr_mma: cute.core.ThrMma, expand_shape: int, is_colvec: bool
) -> cute.Tensor:
    """Partition a staged smem vector as a broadcast C-operand for ``thr_mma``.

    ``sVec`` is (length, stage). A stride-0 mode of extent ``expand_shape`` is
    inserted so the vector broadcasts along columns (is_colvec) or rows
    (rowvec), then the result is partitioned with ``partition_C`` and collapsed
    back to the per-thread vector (keeping the stage mode).
    """
    assert cute.rank(sVec) == 2
    assert sVec.stride[0] == 1
    stage = sVec.shape[1]
    shape = (
        (sVec.shape[0], expand_shape, stage)
        if const_expr(is_colvec)
        else (expand_shape, sVec.shape[0], stage)
    )
    # Broadcast mode gets stride 0; the stage mode keeps its original stride.
    stride = (1, 0, sVec.stride[1]) if const_expr(is_colvec) else (0, 1, sVec.stride[1])
    sVec_mma = cute.make_tensor(sVec.iterator, cute.make_layout(shape, stride=stride))
    tC_sVec = make_acc_tensor_mn_view(thr_mma.partition_C(sVec_mma))
    # Drop the broadcast mode again; only the vector dim and stage remain.
    return tC_sVec[None, 0, None] if const_expr(is_colvec) else tC_sVec[0, None, None]
279
+
280
+
281
def mma_partition_A_vec(
    sVec: cute.Tensor, thr_mma: cute.core.ThrMma, expand_shape: int, is_colvec: bool
) -> cute.Tensor:
    """Partition a staged smem vector as a broadcast A-operand for ``thr_mma``.

    ``sVec`` is (length, stage); a stride-0 mode of extent ``expand_shape`` is
    inserted so the vector broadcasts along the other dimension, the expanded
    tensor is partitioned via ``partition_A``, and the broadcast mode is then
    collapsed away again.
    """
    assert cute.rank(sVec) == 2
    assert sVec.stride[0] == 1
    num_stages = sVec.shape[1]
    bcast_shape = (
        (sVec.shape[0], expand_shape, num_stages)
        if const_expr(is_colvec)
        else (expand_shape, sVec.shape[0], num_stages)
    )
    bcast_stride = (
        (1, 0, sVec.stride[1]) if const_expr(is_colvec) else (0, 1, sVec.stride[1])
    )
    bcast_layout = cute.make_layout(bcast_shape, stride=bcast_stride)
    sVec_bcast = cute.make_tensor(sVec.iterator, bcast_layout)
    tA_sVec = make_acc_tensor_mn_view(thr_mma.partition_A(sVec_bcast))
    return tA_sVec[None, 0, None] if const_expr(is_colvec) else tA_sVec[0, None, None]
build/torch-cuda/quack/pipeline.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Optional
4
+ from dataclasses import dataclass
5
+
6
+ import cutlass.cute as cute
7
+ from cutlass import Boolean, Int32, const_expr
8
+ from cutlass.cutlass_dsl import if_generate, and_, dsl_user_op
9
+ from cutlass.pipeline import MbarrierArray, CooperativeGroup, PipelineOp, pipeline_init_wait
10
+ from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType
11
+ from cutlass.pipeline import PipelineTmaUmma
12
+
13
+
14
class PipelineStateWAdvance(PipelineState):
    """PipelineState extended with O(1) advancement by multiple iterations."""

    @dsl_user_op
    def advance_iters(self, num_iterations: Int32, *, loc=None, ip=None):
        """Advance count/index/phase as if ``advance()`` were called ``num_iterations`` times."""
        self._count += Int32(num_iterations)
        new_index = self._index + Int32(num_iterations)
        # How many times did we cross the stages boundary
        num_crossings = new_index // self.stages
        # The phase is a single bit that flips once per crossing. XOR-ing with
        # the raw crossing count corrupts it whenever the advance wraps the
        # stage ring more than once (num_crossings >= 2), so reduce the
        # crossing count to its parity first.
        self._phase ^= num_crossings % 2
        self._index = new_index % self.stages

    # This can be overridden by derived classes
    def __new_from_mlir_values__(self, values):
        return PipelineStateWAdvance(
            self.stages, Int32(values[0]), Int32(values[1]), Int32(values[2])
        )
29
+
30
+
31
def make_pipeline_state(type: PipelineUserType, stages: int):
    """
    Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
    """
    if type is PipelineUserType.Producer:
        initial_phase = Int32(1)
    elif type is PipelineUserType.Consumer:
        initial_phase = Int32(0)
    else:
        assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."
    return PipelineStateWAdvance(stages, Int32(0), Int32(0), initial_phase)
51
+
52
+
53
@dataclass(frozen=True)
class PipelineTmaCpAsync(PipelineTmaAsync):
    """
    PipelineTmaCpAsync is used for CpAsync + TMA producers and AsyncThread consumers
    """

    @staticmethod
    def create(
        *,
        num_stages: int,
        producer_group: CooperativeGroup,
        consumer_group: CooperativeGroup,
        tx_count: int,
        barrier_storage: cute.Pointer = None,
        cta_layout_vmnk: Optional[cute.Layout] = None,
        tidx: Optional[Int32] = None,
    ):
        """
        This helper function computes any necessary attributes and returns an instance of PipelineTmaAsync.
        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
        :type barrier_storage: cute.Pointer
        :param num_stages: Number of buffer stages for this pipeline
        :type num_stages: Int32
        :param producer_group: CooperativeGroup for the producer agent
        :type producer_group: CooperativeGroup
        :param consumer_group: CooperativeGroup for the consumer agent
        :type consumer_group: CooperativeGroup
        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
        :type tx_count: int
        :param cta_layout_vmnk: Layout of the cluster shape
        :type cta_layout_vmnk: cute.Layout | None
        :param tidx: thread index to consumer async threads
        :type tidx: Int32 | None
        """
        if not isinstance(barrier_storage, cute.Pointer):
            raise ValueError(
                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
            )

        producer_type = PipelineOp.TmaLoad
        consumer_type = PipelineOp.AsyncThread

        producer = (producer_type, producer_group)
        consumer = (consumer_type, consumer_group)

        # Two mbarrier arrays laid out back to back in smem: full (transaction)
        # barriers first, then empty barriers.
        sync_object_full = PipelineAsync._make_sync_object(
            barrier_storage.align(min_align=8), num_stages, producer, tx_count
        )
        sync_object_empty = PipelineAsync._make_sync_object(
            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
        )
        if tidx is None:
            tidx, _, _ = cute.arch.thread_idx()
        if cta_layout_vmnk is None:
            cta_layout_vmnk = cute.make_layout((1, 1, 1, 1))
        (
            dst_rank,
            is_signalling_thread,
        ) = PipelineTmaAsync.init_empty_barrier_arrive_signal(cta_layout_vmnk, tidx)
        # For a single-CTA cluster, empty-barrier arrives are local (no dst rank).
        # NOTE(review): the else branch is a no-op (dst_rank keeps its value).
        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
            dst_rank = None
        else:
            dst_rank = dst_rank

        producer_mask = None

        pipeline_init_wait(cta_layout_vmnk)

        return PipelineTmaCpAsync(
            sync_object_full,
            sync_object_empty,
            num_stages,
            producer_mask,
            dst_rank,
            is_signalling_thread,
        )

    @dsl_user_op
    def producer_acquire(
        self,
        state: PipelineState,
        try_acquire_token: Optional[Boolean] = None,
        is_tma_warp: Optional[Boolean] = True,
        *,
        loc=None,
        ip=None,
    ):
        """
        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier.

        :param try_acquire_token: if nonzero, a prior try_acquire already
            observed the buffer empty and the wait is skipped
        :param is_tma_warp: only the warp issuing TMA arrives on the full barrier
        """
        if_generate(
            try_acquire_token is None or try_acquire_token == 0,
            lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
        )
        # This is the difference between this and PipelineTmaAsync: we could have multiple
        # warps calling this, but only 1 warp should do the arrive on the full barrier
        if_generate(
            is_tma_warp,
            lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
        )

    @dsl_user_op
    def producer_cpasync_commit(self, state: PipelineState, *, loc=None, ip=None):
        """
        We need the mbarrier to track the completion of cp.async
        """
        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip)
160
+
161
+
162
class MbarrierArrayWDropCount(MbarrierArray):
    """MbarrierArray variant whose expected arrive count can be reduced at runtime.

    ``drop_count`` (a dynamic Int32) is subtracted from the cooperative
    group's static size before the mbarriers are initialized — presumably to
    exclude warps that opt out of participating; confirm against callers.
    """

    @dsl_user_op
    def __init__(
        self,
        barrier_storage: cute.Pointer,
        num_stages: int,
        agent: tuple[PipelineOp, CooperativeGroup],
        tx_count: int = 0,
        drop_count: Optional[Int32] = None,
        *,
        loc=None,
        ip=None,
    ) -> None:
        # Mirrors MbarrierArray.__init__ but applies drop_count before init;
        # deliberately does not call super().__init__().
        self.barrier_storage = barrier_storage
        self.tx_count = tx_count
        self.num_stages = num_stages
        self.op_type, self.cg = agent
        self.arrive_count = self.cg.size
        self.drop_count = drop_count

        if self.num_stages <= 0:
            raise ValueError("Error: Mbarrier stage count must be greater than 0.")
        if self.arrive_count <= 0:
            raise ValueError("Error: Mbarrier arrive count must be greater than 0.")
        if self.op_type is PipelineOp.TmaLoad and self.tx_count < 0:
            raise ValueError("Error: Mbarrier tx count must not be less than 0 for TMA ops.")

        if const_expr(drop_count is not None):
            self.arrive_count = self.arrive_count - drop_count

        # Store mbarrier base pointer
        self.mbarrier_base = self.barrier_storage

        # Mbarrier initialization in constructor
        self.mbarrier_init(loc=loc, ip=ip)

    def __extract_mlir_values__(self):
        # NOTE(review): assumes drop_count is not None here — a None drop_count
        # would put a non-MLIR value in the list; confirm callers always pass one.
        return [self.barrier_storage, self.drop_count]

    def __new_from_mlir_values__(self, values):
        return MbarrierArrayWDropCount(
            values[0], self.num_stages, (self.op_type, self.cg), self.tx_count, values[1]
        )
204
+ )
205
+
206
+
207
+ @dataclass(frozen=True)
208
+ class PipelineTmaCpAsyncUmma(PipelineTmaUmma):
209
+ """
210
+ PipelineTmaCpAsync is used for CpAsync + TMA producers and UMMA consumers
211
+ (e.g. Blackwell mainloops)
212
+ """
213
+
214
+ @staticmethod
215
+ def create(
216
+ *,
217
+ num_stages: int,
218
+ producer_group: CooperativeGroup,
219
+ consumer_group: CooperativeGroup,
220
+ tx_count: int,
221
+ barrier_storage: cute.Pointer = None,
222
+ cta_layout_vmnk: Optional[cute.Layout] = None,
223
+ producer_drop_count: Optional[Int32] = None,
224
+ mcast_mode_mn: tuple[int, int] = (1, 1),
225
+ ):
226
+ """
227
+ This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma.
228
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
229
+ :type barrier_storage: cute.Pointer
230
+ :param num_stages: Number of buffer stages for this pipeline
231
+ :type num_stages: Int32
232
+ :param producer_group: `CooperativeGroup` for the producer agent
233
+ :type producer_group: CooperativeGroup
234
+ :param consumer_group: `CooperativeGroup` for the consumer agent
235
+ :type consumer_group: CooperativeGroup
236
+ :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
237
+ :type tx_count: int
238
+ :param cta_layout_vmnk: Layout of the cluster shape
239
+ :type cta_layout_vmnk: cute.Layout | None
240
+ :param mcast_mode_mn: Tuple specifying multicast modes for m and n dimensions (each 0 or 1)
241
+ :type mcast_mode_mn: tuple[int, int], optional
242
+ """
243
+ if not isinstance(barrier_storage, cute.Pointer):
244
+ raise ValueError(
245
+ f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
246
+ )
247
+
248
+ producer_type = PipelineOp.TmaLoad
249
+ consumer_type = PipelineOp.TCGen05Mma
250
+
251
+ producer = (producer_type, producer_group)
252
+ consumer = (consumer_type, consumer_group)
253
+
254
+ sync_object_full = MbarrierArrayWDropCount(
255
+ barrier_storage.align(min_align=8),
256
+ num_stages,
257
+ producer,
258
+ tx_count,
259
+ drop_count=producer_drop_count,
260
+ )
261
+ sync_object_empty = PipelineTmaUmma._make_sync_object(
262
+ barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
263
+ )
264
+
265
+ if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
266
+ # No mcast mask if not using clusters
267
+ producer_mask = None
268
+ # All threadblocks are leaders if not using clusters
269
+ is_leader_cta = True
270
+ else:
271
+ producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk, mcast_mode_mn)
272
+ is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk)
273
+
274
+ cta_group = (
275
+ cute.nvgpu.tcgen05.CtaGroup.ONE
276
+ if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1
277
+ else cute.nvgpu.tcgen05.CtaGroup.TWO
278
+ )
279
+
280
+ consumer_mask = producer_mask
281
+
282
+ pipeline_init_wait(cta_layout_vmnk)
283
+
284
+ return PipelineTmaCpAsyncUmma(
285
+ sync_object_full,
286
+ sync_object_empty,
287
+ num_stages,
288
+ producer_mask,
289
+ consumer_mask,
290
+ is_leader_cta,
291
+ cta_group,
292
+ )
293
+
294
+ @dsl_user_op
295
+ def producer_acquire(
296
+ self,
297
+ state: PipelineState,
298
+ try_acquire_token: Optional[Boolean] = None,
299
+ is_tma_warp: Optional[Boolean] = True,
300
+ *,
301
+ loc=None,
302
+ ip=None,
303
+ ):
304
+ """
305
+ TMA producer commit conditionally waits on buffer empty and sets the
306
+ transaction barrier for leader threadblocks.
307
+ """
308
+ if_generate(
309
+ try_acquire_token is None or try_acquire_token == 0,
310
+ lambda: self.sync_object_empty.wait(state.index, state.phase, loc=loc, ip=ip),
311
+ )
312
+ # This is the difference between this and PipelineTmaAsync: we could have multiple
313
+ # warps calling this, but only 1 warp should do the arrive on the full barrier
314
+ if_generate(
315
+ and_(self.is_leader_cta, is_tma_warp),
316
+ lambda: self.sync_object_full.arrive(state.index, self.producer_mask, loc=loc, ip=ip),
317
+ )
318
+
319
+ @dsl_user_op
320
+ def producer_cpasync_commit(self, state: PipelineState, *, loc=None, ip=None):
321
+ """
322
+ We need the mbarrier to track the completion of cp.async
323
+ """
324
+ cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state, loc=loc, ip=ip), loc=loc, ip=ip)
build/torch-cuda/quack/reduce.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ import math
4
+ import operator
5
+ from typing import Callable, Optional
6
+
7
+ import cutlass
8
+ import cutlass.cute as cute
9
+ from cutlass import Int32, Int64, Float32, Boolean, const_expr
10
+
11
+ from . import utils as utils
12
+
13
+
14
+ @cute.jit
15
+ def block_reduce(
16
+ val: cute.Numeric, op: Callable, reduction_buffer: cute.Tensor, init_val: cute.Numeric = 0.0
17
+ ) -> cute.Numeric:
18
+ """reduction_buffer has shape (num_warps / warp_per_row, warps_per_row)"""
19
+ lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
20
+ warps_per_row = cute.size(reduction_buffer.shape[1])
21
+ row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
22
+ if lane_idx == 0:
23
+ reduction_buffer[row_idx, col_idx] = val
24
+ cute.arch.barrier()
25
+ block_reduce_val = init_val
26
+ if lane_idx < warps_per_row:
27
+ block_reduce_val = reduction_buffer[row_idx, lane_idx]
28
+ return cute.arch.warp_reduction(block_reduce_val, op)
29
+
30
+
31
+ @cute.jit
32
+ def cluster_reduce(
33
+ val: cute.Numeric,
34
+ op: Callable,
35
+ reduction_buffer: cute.Tensor,
36
+ mbar_ptr: cute.Pointer,
37
+ init_val: cute.Numeric = 0.0,
38
+ phase: Optional[Int32] = None,
39
+ ) -> cute.Numeric:
40
+ """reduction_buffer has shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
41
+ cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
42
+ lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
43
+ rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
44
+ row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
45
+ if warp_idx == 0:
46
+ with cute.arch.elect_one():
47
+ num_warps = rows_per_block * warps_per_row
48
+ cute.arch.mbarrier_arrive_and_expect_tx(
49
+ mbar_ptr,
50
+ num_warps * cluster_n * reduction_buffer.element_type.width // 8,
51
+ )
52
+ if lane_idx < cluster_n:
53
+ utils.store_shared_remote(
54
+ val,
55
+ utils.elem_pointer(reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))),
56
+ mbar_ptr,
57
+ peer_cta_rank_in_cluster=lane_idx,
58
+ )
59
+ cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
60
+ block_reduce_val = init_val
61
+ num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
62
+ for i in cutlass.range_constexpr(num_iter):
63
+ idx = lane_idx + i * cute.arch.WARP_SIZE
64
+ if idx < cute.size(reduction_buffer, mode=[1]):
65
+ block_reduce_val = op(block_reduce_val, reduction_buffer[row_idx, idx])
66
+ return cute.arch.warp_reduction(block_reduce_val, op)
67
+
68
+
69
+ @cute.jit
70
+ def block_or_cluster_reduce(
71
+ val: cute.Numeric,
72
+ op: Callable,
73
+ reduction_buffer: cute.Tensor,
74
+ mbar_ptr: Optional[cute.Pointer],
75
+ phase: Optional[Int32] = None,
76
+ init_val: cute.Numeric = 0.0,
77
+ ) -> cute.Numeric:
78
+ """Perform either block or cluster reduction based on whether mbar_ptr is provided."""
79
+ if const_expr(mbar_ptr is None):
80
+ return block_reduce(val, op, reduction_buffer, init_val=init_val)
81
+ else:
82
+ return cluster_reduce(val, op, reduction_buffer, mbar_ptr, phase=phase, init_val=init_val)
83
+
84
+
85
+ @cute.jit
86
+ def row_reduce(
87
+ x: cute.TensorSSA | cute.Numeric,
88
+ op: cute.ReductionOp,
89
+ threads_per_row: cutlass.Constexpr[int],
90
+ reduction_buffer: Optional[cute.Tensor] = None,
91
+ mbar_ptr: Optional[cute.Pointer] = None,
92
+ phase: Optional[Int32] = None,
93
+ init_val: cute.Numeric = 0.0,
94
+ hook_fn: Optional[Callable] = None,
95
+ ) -> cute.Numeric:
96
+ """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n))"""
97
+ if const_expr(isinstance(x, cute.TensorSSA)):
98
+ val = x.reduce(op, init_val=init_val, reduction_profile=0)
99
+ else:
100
+ val = x
101
+ warp_op = {
102
+ cute.ReductionOp.ADD: operator.add,
103
+ cute.ReductionOp.MAX: cute.arch.fmax if const_expr(x.dtype == Float32) else max,
104
+ cute.ReductionOp.MIN: min,
105
+ cute.ReductionOp.MUL: operator.mul,
106
+ }[op]
107
+ val = cute.arch.warp_reduction(
108
+ val,
109
+ warp_op,
110
+ threads_in_group=min(threads_per_row, cute.arch.WARP_SIZE),
111
+ )
112
+ if const_expr(hook_fn is not None):
113
+ hook_fn()
114
+ if const_expr(reduction_buffer is not None):
115
+ warps_per_row, cluster_n = reduction_buffer.shape[1]
116
+ assert cluster_n == 1 or mbar_ptr is not None, (
117
+ "mbar_ptr must be provided for cluster reduction"
118
+ )
119
+ if const_expr(warps_per_row > 1 or cluster_n > 1):
120
+ val = block_or_cluster_reduce(
121
+ val, warp_op, reduction_buffer, mbar_ptr, phase=phase, init_val=init_val
122
+ )
123
+ return val
124
+
125
+
126
+ @cute.jit
127
+ def online_softmax_reduce(
128
+ x: cute.TensorSSA,
129
+ threads_per_row: cutlass.Constexpr[int],
130
+ reduction_buffer: Optional[cute.Tensor] = None,
131
+ mbar_ptr: Optional[cute.Pointer] = None,
132
+ hook_fn: Optional[Callable] = None,
133
+ phase: Optional[Int32] = None,
134
+ return_exp_x: bool = False,
135
+ ) -> [Float32, Float32, Optional[cute.TensorSSA]]:
136
+ assert x.dtype == Float32, "x must be of type Float32"
137
+ """reduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n), 2)"""
138
+ max_x = cute.arch.warp_reduction(
139
+ x.reduce(cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0),
140
+ cute.arch.fmax,
141
+ threads_in_group=min(threads_per_row, cute.arch.WARP_SIZE),
142
+ )
143
+ log2_e = math.log2(math.e)
144
+ exp_x = cute.math.exp2(x * log2_e - (max_x * log2_e), fastmath=True)
145
+ sum_exp_x = cute.arch.warp_reduction(
146
+ exp_x.reduce(cute.ReductionOp.ADD, init_val=0.0, reduction_profile=0),
147
+ operator.add,
148
+ threads_in_group=min(threads_per_row, cute.arch.WARP_SIZE),
149
+ )
150
+ if const_expr(hook_fn is not None):
151
+ hook_fn()
152
+ if const_expr(reduction_buffer is not None):
153
+ rows_per_block, (warps_per_row, cluster_n) = reduction_buffer.shape
154
+ assert cluster_n == 1 or mbar_ptr is not None, (
155
+ "mbar_ptr must be provided for cluster reduction"
156
+ )
157
+ if const_expr(warps_per_row > 1 or cluster_n > 1):
158
+ assert reduction_buffer.element_type == Int64, (
159
+ "reduction_buffer must be of type cute.Int64"
160
+ )
161
+ lane_idx, warp_idx = cute.arch.lane_idx(), cute.arch.warp_idx()
162
+ row_idx, col_idx = warp_idx // warps_per_row, warp_idx % warps_per_row
163
+ if const_expr(mbar_ptr is None):
164
+ if lane_idx == 0:
165
+ reduction_buffer[row_idx, col_idx] = utils.f32x2_to_i64(max_x, sum_exp_x)
166
+ cute.arch.barrier()
167
+ max_x_single_warp = -Float32.inf
168
+ sum_exp_x = 0.0
169
+ if lane_idx < warps_per_row:
170
+ max_x_single_warp, sum_exp_x = utils.i64_to_f32x2(
171
+ reduction_buffer[row_idx, lane_idx]
172
+ )
173
+ max_x_final = cute.arch.warp_reduction(max_x_single_warp, cute.arch.fmax)
174
+ sum_exp_x *= cute.math.exp(max_x_single_warp - max_x_final, fastmath=True)
175
+ sum_exp_x = cute.arch.warp_reduction(sum_exp_x, operator.add)
176
+ if const_expr(return_exp_x):
177
+ exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
178
+ max_x = max_x_final
179
+ else:
180
+ cta_rank_in_cluster = cute.arch.block_idx_in_cluster()
181
+ if warp_idx == 0:
182
+ with cute.arch.elect_one():
183
+ num_warps = rows_per_block * warps_per_row
184
+ cute.arch.mbarrier_arrive_and_expect_tx(
185
+ mbar_ptr,
186
+ num_warps * cluster_n * reduction_buffer.element_type.width // 8,
187
+ )
188
+ if lane_idx < cluster_n:
189
+ utils.store_shared_remote(
190
+ utils.f32x2_to_i64(max_x, sum_exp_x),
191
+ utils.elem_pointer(
192
+ reduction_buffer, (row_idx, (col_idx, cta_rank_in_cluster))
193
+ ),
194
+ mbar_ptr,
195
+ peer_cta_rank_in_cluster=lane_idx,
196
+ )
197
+ cute.arch.mbarrier_wait(mbar_ptr, phase=phase if phase is not None else 0)
198
+ num_iter = cute.ceil_div(warps_per_row * cluster_n, cute.arch.WARP_SIZE)
199
+ max_x_single_warp = cute.make_fragment(num_iter, Float32)
200
+ max_x_single_warp.fill(-Float32.inf)
201
+ sum_exp_x_single_warp = cute.make_fragment(num_iter, Float32)
202
+ sum_exp_x_single_warp.fill(0.0)
203
+ for i in cutlass.range_constexpr(num_iter):
204
+ idx = lane_idx + i * cute.arch.WARP_SIZE
205
+ if idx < cute.size(reduction_buffer, mode=[1]):
206
+ max_x_single_warp[i], sum_exp_x_single_warp[i] = utils.i64_to_f32x2(
207
+ reduction_buffer[row_idx, idx]
208
+ )
209
+ max_x_final = max_x_single_warp.load().reduce(
210
+ cute.ReductionOp.MAX, init_val=-Float32.inf, reduction_profile=0
211
+ )
212
+ max_x_final = cute.arch.warp_reduction(max_x_final, cute.arch.fmax)
213
+ sum_exp_x = 0.0
214
+ for i in cutlass.range_constexpr(num_iter):
215
+ sum_exp_x += sum_exp_x_single_warp[i] * cute.math.exp(
216
+ max_x_single_warp[i] - max_x_final, fastmath=True
217
+ )
218
+ sum_exp_x = cute.arch.warp_reduction(sum_exp_x, operator.add)
219
+ if const_expr(return_exp_x):
220
+ exp_x *= cute.math.exp(max_x - max_x_final, fastmath=True)
221
+ max_x = max_x_final
222
+ return max_x, sum_exp_x, (exp_x if const_expr(return_exp_x) else None)
223
+
224
+
225
+ @cute.jit
226
+ def sum_swap_shuffle(
227
+ X: cute.Tensor, elem_per_lane: int = 1, subwarp_size: int = 1, warp_size: int = 32
228
+ ) -> cute.Tensor:
229
+ """
230
+ For warp reduction, we use Swap Shuffle
231
+ The normal way to reduction among threads:
232
+ use shuffle to let *** the first half of threads *** have *** whole data *** from the second half of threads.
233
+ After each step of reduction, a half of threads won't work in the following steps.
234
+ That is, as the reduction progresses, the efficiency of shuffle & reduction instructions gradually change from 1/2, 1/4 to 1/32 (the worst case).
235
+ To overcome this shortcoming, for a NxN matrix to be reduced among N threads as a 1XN vectors,
236
+ we use swap & shuffle aiming to let *** each half of threads *** have *** a half of data *** from the other half of threads.
237
+ After reduction, each half of threads should deal with a (N/2)x(N/2) sub-matrix independently in the following step.
238
+ We can recursively do this until the problem size is 1.
239
+ """
240
+ assert (
241
+ subwarp_size >= 1
242
+ and subwarp_size <= 32
243
+ and subwarp_size == 1 << int(math.log2(subwarp_size))
244
+ )
245
+ assert (
246
+ warp_size <= 32
247
+ and warp_size % subwarp_size == 0
248
+ and warp_size == 1 << int(math.log2(warp_size))
249
+ )
250
+ lane_idx = cute.arch.lane_idx() // subwarp_size
251
+ X = cute.logical_divide(X, cute.make_layout(elem_per_lane)) # (elem_per_lane, M)
252
+ numvec = cute.size(X, mode=[1])
253
+ assert numvec <= 32 // subwarp_size
254
+ # If X has more values than warp_size // subwarp_size, we first do a normal warp reduction
255
+ # to sum up values held by lanes further than size(X) away
256
+ for i in cutlass.range(
257
+ int(math.log2(numvec)), int(math.log2(warp_size // subwarp_size)), unroll_full=True
258
+ ):
259
+ for v in cutlass.range(cute.size(X), unroll_full=True):
260
+ shfl_val = cute.arch.shuffle_sync_bfly(X[v], offset=(1 << i) * subwarp_size)
261
+ X[v] = X[v] + shfl_val
262
+ for logm in cutlass.range_constexpr(int(math.log2(cute.size(X, mode=[1]))) - 1, -1, -1):
263
+ m = 1 << logm
264
+ for r in cutlass.range(m, unroll_full=True):
265
+ frg_A = X[None, r]
266
+ frg_B = X[None, r + m]
267
+ # First half of threads swap fragments from the first half of data to the second
268
+ should_swap = not Boolean(lane_idx & m)
269
+ for v in cutlass.range(cute.size(frg_A), unroll_full=True):
270
+ # Step 1: swap
271
+ lower, upper = frg_A[v], frg_B[v]
272
+ frg_A[v] = upper if should_swap else lower
273
+ frg_B[v] = lower if should_swap else upper
274
+ # Step 2: shuffle
275
+ # each half of threads get a half of data from the other half of threads
276
+ shfl_val = cute.arch.shuffle_sync_bfly(frg_A[v], offset=m * subwarp_size)
277
+ # Step 3: reduction
278
+ frg_A[v] = frg_B[v] + shfl_val
279
+ return X[None, 0]
build/torch-cuda/quack/reduction_base.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
2
+
3
+ from typing import Type, Tuple, Optional
4
+
5
+ import cutlass
6
+ import cutlass.cute as cute
7
+ from cutlass import Int32, Int64, Float32, const_expr
8
+
9
+ from . import copy_utils as copy_utils
10
+
11
+
12
+ class ReductionBase:
13
+ def __init__(self, dtype: Type[cutlass.Numeric], N: int, stage: int, reduction_dtype=Float32):
14
+ self.dtype = dtype
15
+ self.N = N
16
+ self.stage = stage
17
+ self.reduction_dtype = reduction_dtype
18
+
19
+ def _threads_per_row(self):
20
+ raise NotImplementedError()
21
+
22
+ def _num_threads(self):
23
+ return 128 if self.N <= 16384 else 256
24
+
25
+ def _set_cluster_n(self):
26
+ self.cluster_n = 1
27
+
28
+ def _get_tiled_copy(self, vecsize: int = 1):
29
+ assert self.N % vecsize == 0, f"Input N {self.N} is not divisible by vector size {vecsize}"
30
+ threads_per_row = self._threads_per_row()
31
+ num_threads = self._num_threads()
32
+ assert num_threads % cute.arch.WARP_SIZE == 0
33
+ num_blocks_N = cute.ceil_div(self.N // vecsize, threads_per_row * self.cluster_n)
34
+ tiler_mn = (num_threads // threads_per_row, vecsize * num_blocks_N * threads_per_row)
35
+ tiled_copy = copy_utils.tiled_copy_2d(self.dtype, threads_per_row, num_threads, vecsize)
36
+ return tiled_copy, tiler_mn, threads_per_row
37
+
38
+ def _get_reduction_buffer_layout(self, tv_layout: cute.Layout, cluster_n: int):
39
+ num_warps = cute.size(tv_layout, mode=[0]) // cute.arch.WARP_SIZE
40
+ warps_per_row = (
41
+ num_warps
42
+ if cute.rank(tv_layout.shape[0]) == 1
43
+ else max(tv_layout.shape[0][0] // cute.arch.WARP_SIZE, 1)
44
+ )
45
+ return cute.make_ordered_layout(
46
+ (num_warps // warps_per_row, (warps_per_row, cluster_n), self.stage),
47
+ order=(1, 0, 2),
48
+ )
49
+
50
+ def _allocate_reduction_buffer_and_mbar(
51
+ self, smem: cutlass.utils.SmemAllocator, tv_layout: cute.Layout, is_persistent: bool = False
52
+ ) -> Tuple[cute.Tensor, Optional[cute.Pointer]]:
53
+ reduction_buffer = smem.allocate_tensor(
54
+ self.reduction_dtype,
55
+ self._get_reduction_buffer_layout(tv_layout, self.cluster_n),
56
+ byte_alignment=8,
57
+ )
58
+ if const_expr(self.cluster_n > 1):
59
+ mbar_ptr = smem.allocate_array(
60
+ Int64, num_elems=self.stage if not is_persistent else self.stage * 2
61
+ )
62
+ else:
63
+ mbar_ptr = None
64
+ return reduction_buffer, mbar_ptr
65
+
66
+ @cute.jit
67
+ def _initialize_cluster(
68
+ self,
69
+ tidx: Int32,
70
+ mbar_ptr: cute.Pointer,
71
+ num_warps: int,
72
+ is_persistent: bool = False,
73
+ ):
74
+ if const_expr(self.cluster_n > 1):
75
+ if tidx < self.stage: # Initialize full barrier
76
+ cute.arch.mbarrier_init(mbar_ptr + tidx, 1)
77
+ if const_expr(is_persistent): # Initialize empty barrier
78
+ cute.arch.mbarrier_init(
79
+ mbar_ptr + self.stage + tidx, num_warps * self.cluster_n
80
+ )
81
+ cute.arch.mbarrier_init_fence()
82
+ # Cluster arrive after barrier init
83
+ cute.arch.cluster_arrive_relaxed()
build/torch-cuda/quack/sm100_utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Type, Union
4
+
5
+ import cutlass.cute as cute
6
+ import cutlass.utils.blackwell_helpers as sm100_utils_og
7
+ from cutlass.cute.nvgpu.tcgen05 import OperandMajorMode
8
+ from cutlass.cutlass_dsl import Numeric, dsl_user_op
9
+
10
+
11
+ @dsl_user_op
12
+ def make_smem_layout_cpasync_a(
13
+ tiled_mma: cute.TiledMma,
14
+ mma_tiler_mnk: cute.Tile,
15
+ a_dtype: Type[Numeric],
16
+ num_stages: int,
17
+ *,
18
+ loc=None,
19
+ ip=None,
20
+ ) -> Union[cute.Layout, cute.ComposedLayout]:
21
+ """
22
+ :param tiled_mma: The tiled MMA used to partition tensor A
23
+ :type tiled_mma: cute.TiledMma
24
+ :param mma_tiler_mnk: The MMA tile shape
25
+ :type mma_tiler_mnk: cute.cute.Tile
26
+ :param a_dtype: The element type for tensor A
27
+ :type a_dtype: Type[Numeric]
28
+ :param num_stages: The number of pipeline stages for tensor A
29
+ :type num_stages: int
30
+
31
+ :return: SMEM layout for tensor A
32
+ :rtype: Union[cute.Layout, cute.ComposedLayout]
33
+ """
34
+
35
+ is_k_major = tiled_mma.op.a_major_mode == OperandMajorMode.K
36
+ a_smem_shape = tiled_mma.partition_shape_A(
37
+ cute.dice(mma_tiler_mnk, (1, None, 1), loc=loc, ip=ip)
38
+ )
39
+ a_smem_shape_mn_k = (
40
+ cute.size(a_smem_shape[0][0], loc=loc, ip=ip) * a_smem_shape[1],
41
+ cute.size(a_smem_shape[0][1], loc=loc, ip=ip) * a_smem_shape[2],
42
+ )
43
+ a_smem_layout_atom = sm100_utils_og.make_smem_layout_atom(
44
+ sm100_utils_og.get_smem_layout_atom_ab(
45
+ tiled_mma.op.a_major_mode,
46
+ a_dtype,
47
+ a_smem_shape_mn_k,
48
+ loc=loc,
49
+ ip=ip,
50
+ ),
51
+ a_dtype,
52
+ loc=loc,
53
+ ip=ip,
54
+ )
55
+ a_smem_layout_staged = cute.tile_to_shape(
56
+ a_smem_layout_atom,
57
+ cute.append(a_smem_shape_mn_k, num_stages, loc=loc, ip=ip),
58
+ order=((1, 0, 2) if not is_k_major else (0, 1, 2)),
59
+ loc=loc,
60
+ ip=ip,
61
+ )
62
+ return a_smem_layout_staged
build/torch-cuda/quack/sm90_utils.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Type, Union, Optional
4
+
5
+ import cutlass
6
+ import cutlass.cute as cute
7
+ import cutlass.utils.hopper_helpers as sm90_utils_og
8
+ from cutlass.cute.nvgpu import warpgroup
9
+ from cutlass.cutlass_dsl import Numeric, dsl_user_op
10
+ from cutlass import Float32, Int32, Boolean, const_expr
11
+ from cutlass.utils import LayoutEnum
12
+
13
+
14
+ @dsl_user_op
15
+ def make_smem_layout(
16
+ dtype: Type[Numeric],
17
+ layout: LayoutEnum,
18
+ tile: cute.Tile,
19
+ stage: Optional[int] = None,
20
+ *,
21
+ loc=None,
22
+ ip=None,
23
+ ) -> Union[cute.Layout, cute.ComposedLayout]:
24
+ shape = cute.product_each(cute.shape(tile, loc=loc, ip=ip), loc=loc, ip=ip)
25
+ major_mode_size = shape[1] if layout.is_n_major_c() else shape[0]
26
+ smem_layout_atom = warpgroup.make_smem_layout_atom(
27
+ sm90_utils_og.get_smem_layout_atom(layout, dtype, major_mode_size),
28
+ dtype,
29
+ )
30
+ order = (1, 0, 2) if const_expr(layout.is_m_major_c()) else (0, 1, 2)
31
+ smem_layout_staged = cute.tile_to_shape(
32
+ smem_layout_atom,
33
+ cute.append(shape, stage) if const_expr(stage is not None) else shape,
34
+ order=order if const_expr(stage is not None) else order[:2],
35
+ )
36
+ return smem_layout_staged
37
+
38
+
39
+ # For compatibility with blackwell_helpers.py
40
+ make_smem_layout_epi = make_smem_layout
41
+
42
+
43
+ @dsl_user_op
44
+ def partition_for_epilogue(
45
+ cT: cute.Tensor,
46
+ epi_tile: cute.Tile,
47
+ tiled_copy: cute.TiledCopy,
48
+ tidx: Int32,
49
+ reference_src: bool, # do register tensors reference the src or dst layout of the tiled copy
50
+ *,
51
+ loc=None,
52
+ ip=None,
53
+ ) -> cute.Tensor:
54
+ thr_copy = tiled_copy.get_slice(tidx)
55
+ cT_epi = cute.flat_divide(cT, epi_tile)
56
+ # (CPY, CPY_M, CPY_N, EPI_M, EPI_N)
57
+ if const_expr(reference_src):
58
+ return thr_copy.partition_S(cT_epi, loc=loc, ip=ip)
59
+ else:
60
+ return thr_copy.partition_D(cT_epi, loc=loc, ip=ip)
61
+
62
+
63
+ @cute.jit
64
+ def gemm(
65
+ tiled_mma: cute.TiledMma,
66
+ acc: cute.Tensor,
67
+ tCrA: cute.Tensor,
68
+ tCrB: cute.Tensor,
69
+ zero_init: cutlass.Constexpr[bool] = False,
70
+ wg_wait: cutlass.Constexpr[int] = 0,
71
+ # A_in_regs: cutlass.Constexpr[bool] = False,
72
+ swap_AB: cutlass.Constexpr[bool] = False,
73
+ ) -> None:
74
+ if const_expr(swap_AB):
75
+ gemm(tiled_mma, acc, tCrB, tCrA, zero_init=zero_init, wg_wait=wg_wait, swap_AB=False)
76
+ else:
77
+ warpgroup.fence()
78
+ # We make a new mma_atom since we'll be modifying its attribute (accumulate).
79
+ # Otherwise the compiler complains "operand #0 does not dominate this use"
80
+ mma_atom = cute.make_mma_atom(tiled_mma.op)
81
+ mma_atom.set(warpgroup.Field.ACCUMULATE, not zero_init)
82
+ for k in cutlass.range_constexpr(cute.size(tCrA.shape[2])):
83
+ cute.gemm(mma_atom, acc, tCrA[None, None, k], tCrB[None, None, k], acc)
84
+ mma_atom.set(warpgroup.Field.ACCUMULATE, True)
85
+ warpgroup.commit_group()
86
+ if const_expr(wg_wait >= 0):
87
+ warpgroup.wait_group(wg_wait)
88
+
89
+
90
+ def gemm_zero_init(
91
+ tiled_mma: cute.TiledMma,
92
+ shape: cute.Shape,
93
+ tCrA: cute.Tensor,
94
+ tCrB: cute.Tensor,
95
+ A_idx: Optional[Int32] = None,
96
+ B_idx: Optional[Int32] = None,
97
+ wg_wait: int = -1,
98
+ swap_AB: bool = False,
99
+ ) -> cute.Tensor:
100
+ if const_expr(swap_AB):
101
+ return gemm_zero_init(
102
+ tiled_mma, shape[::-1], tCrB, tCrA, B_idx, A_idx, wg_wait, swap_AB=False
103
+ )
104
+ else:
105
+ acc = cute.make_fragment(tiled_mma.partition_shape_C(shape), Float32)
106
+ rA = tCrA if const_expr(A_idx is None) else tCrA[None, None, None, A_idx]
107
+ rB = tCrB if const_expr(B_idx is None) else tCrB[None, None, None, B_idx]
108
+ gemm(tiled_mma, acc, rA, rB, zero_init=True, wg_wait=wg_wait)
109
+ return acc
110
+
111
+
112
+ def gemm_w_idx(
113
+ tiled_mma: cute.TiledMma,
114
+ acc: cute.Tensor,
115
+ tCrA: cute.Tensor,
116
+ tCrB: cute.Tensor,
117
+ zero_init: Boolean,
118
+ A_idx: Optional[Int32] = None,
119
+ B_idx: Optional[Int32] = None,
120
+ wg_wait: int = -1,
121
+ swap_AB: bool = False,
122
+ ) -> None:
123
+ if const_expr(swap_AB):
124
+ gemm_w_idx(tiled_mma, acc, tCrB, tCrA, zero_init, B_idx, A_idx, wg_wait, swap_AB=False)
125
+ else:
126
+ rA = tCrA if const_expr(A_idx is None) else tCrA[None, None, None, A_idx]
127
+ rB = tCrB if const_expr(B_idx is None) else tCrB[None, None, None, B_idx]
128
+ gemm(tiled_mma, acc, rA, rB, zero_init=zero_init, wg_wait=wg_wait)
129
+
130
+
131
+ def partition_fragment_ABC(
132
+ thr_mma: cute.ThrMma,
133
+ shape_mnk: cute.Shape,
134
+ sA: Optional[cute.Tensor],
135
+ sB: Optional[cute.Tensor],
136
+ swap_AB: bool = False,
137
+ ):
138
+ is_rs = thr_mma.op.a_src == warpgroup.OperandSource.RMEM
139
+ if const_expr(not swap_AB):
140
+ acc = cute.make_fragment(thr_mma.partition_shape_C(shape_mnk[:2]), Float32)
141
+ if const_expr(not is_rs):
142
+ assert sA is not None
143
+ tCrA = thr_mma.make_fragment_A(thr_mma.partition_A(sA))
144
+ else:
145
+ tCrA = thr_mma.make_fragment_A(thr_mma.partition_shape_A((shape_mnk[0], shape_mnk[2])))
146
+ assert sB is not None
147
+ tCrB = thr_mma.make_fragment_B(thr_mma.partition_B(sB))
148
+ else:
149
+ acc = cute.make_fragment(thr_mma.partition_shape_C((shape_mnk[1], shape_mnk[0])), Float32)
150
+ if const_expr(not is_rs):
151
+ assert sB is not None
152
+ tCrB = thr_mma.make_fragment_A(thr_mma.partition_A(sB))
153
+ else: # B in rmem
154
+ tCrB = thr_mma.make_fragment_A(thr_mma.partition_shape_A((shape_mnk[1], shape_mnk[2])))
155
+ assert sA is not None
156
+ tCrA = thr_mma.make_fragment_B(thr_mma.partition_B(sA))
157
+ return acc, tCrA, tCrB
build/torch-cuda/quack/sort/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
build/torch-cuda/quack/sort/bitonic_sort.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Tri Dao.
2
+
3
+ import math
4
+ from typing import Optional
5
+
6
+ import cutlass
7
+ import cutlass.cute as cute
8
+ from cutlass import Int32, Float32, const_expr
9
+
10
+ from .. import utils
11
+ from .utils import compare_and_swap
12
+ from .sorting_networks import optimal_sort
13
+
14
+
15
+ @cute.jit
16
+ def bitonic_merge(
17
+ arr: cute.Tensor,
18
+ n: Optional[cutlass.Constexpr[int]] = None,
19
+ start: cutlass.Constexpr[int] = 0,
20
+ ascending: cutlass.Constexpr[bool] = True,
21
+ ) -> None:
22
+ """Merge a bitonic sequence into a sorted sequence using iterative approach."""
23
+ if const_expr(n is None):
24
+ n = cute.size(arr.shape)
25
+ if const_expr(n > 1):
26
+ num_levels = int(math.log2(n))
27
+ assert n == 2**num_levels, "n must be a power of 2"
28
+ # This one must be range_constexpr otherwise it's very slow for n = 128
29
+ for level in cutlass.range_constexpr(num_levels):
30
+ length = n >> level # n // (2^level)
31
+ step = length // 2
32
+ for i in cutlass.range(n // length, unroll_full=True):
33
+ start_i = start + i * length
34
+ for j in cutlass.range(step, unroll_full=True):
35
+ compare_and_swap(arr, start_i + j, start_i + j + step, ascending)
36
+
37
+
38
+ @cute.jit
39
+ def bitonic_sort(
40
+ arr: cute.Tensor,
41
+ n: Optional[cutlass.Constexpr[int]] = None,
42
+ start: cutlass.Constexpr[int] = 0,
43
+ ascending: cutlass.Constexpr[bool] = True,
44
+ ) -> None:
45
+ """
46
+ Bitonic sort for small arrays of size N (power of 2, N <= 128).
47
+
48
+ Args:
49
+ arr: Array to sort
50
+ n: Size of array (must be power of 2 and <= 128)
51
+ start: Starting index (default 0)
52
+ ascending: Sort in ascending order (default True)
53
+ """
54
+ if const_expr(n is None):
55
+ n = cute.size(arr.shape)
56
+ assert n <= 128
57
+ if const_expr(n > 1):
58
+ if const_expr(n in [2, 4, 8, 16, 32, 64]):
59
+ optimal_sort(arr, n, start, ascending)
60
+ else: # Fall back to bitonic sort
61
+ assert n % 2 == 0
62
+ # Sort first half in ascending order
63
+ bitonic_sort(arr, n // 2, start, True)
64
+ # Sort second half in descending order
65
+ bitonic_sort(arr, n // 2, start + n // 2, False)
66
+ # Merge the whole sequence
67
+ bitonic_merge(arr, n, start, ascending)
68
+
69
+
70
+ @cute.jit
71
+ def bitonic_topk_merge(
72
+ arr0: cute.Tensor,
73
+ arr1: cute.Tensor,
74
+ k: Optional[cutlass.Constexpr[int]] = None,
75
+ start0: cutlass.Constexpr[int] = 0,
76
+ start1: cutlass.Constexpr[int] = 0,
77
+ ascending: cutlass.Constexpr[bool] = False,
78
+ ) -> None:
79
+ if const_expr(k is None):
80
+ k = cute.size(arr0.shape)
81
+ if const_expr(arr0.element_type == Float32):
82
+ minmax_fn = utils.fmin if ascending else cute.arch.fmax
83
+ else:
84
+ minmax_fn = min if ascending else max
85
+ # Write the top k elements to the first half of the array
86
+ for i in cutlass.range(k, unroll_full=True):
87
+ arr0[start0 + i] = minmax_fn(arr0[start0 + i], arr1[start1 + k - 1 - i])
88
+ # Now the 1st half is bitonic, we just need to merge it
89
+ bitonic_merge(arr0, k, start0, ascending)
90
+
91
+
92
+ @cute.jit
93
+ def bitonic_topk(
94
+ arr: cute.Tensor,
95
+ k: cutlass.Constexpr[int],
96
+ ascending: cutlass.Constexpr[bool] = False,
97
+ warp_width: cutlass.Constexpr[int] = cute.arch.WARP_SIZE,
98
+ ) -> cute.Tensor:
99
+ """
100
+ Bitonic top-k for small arrays of size N (power of 2, N <= 128).
101
+
102
+ Args:
103
+ arr: Array to sort
104
+ k: must be power of 2 and <= 128
105
+ ascending: Sort in ascending order (default False)
106
+ """
107
+ assert arr.element_type in [Float32, Int32]
108
+ n = cute.size(arr.shape)
109
+ assert k == 1 << int(math.log2(k)), "k must be a power of 2"
110
+ assert n % k == 0, "n must be divisible by k"
111
+ topk_vals = cute.make_fragment(k, arr.element_type)
112
+ for v in cutlass.range(k, unroll_full=True):
113
+ topk_vals[v] = arr[v]
114
+ bitonic_sort(topk_vals, ascending=ascending)
115
+ for i in cutlass.range(1, n // k, unroll_full=True):
116
+ other_vals = cute.make_fragment(k, arr.element_type)
117
+ for v in cutlass.range(k, unroll_full=True):
118
+ other_vals[v] = arr[i * k + v]
119
+ bitonic_sort(other_vals, ascending=ascending)
120
+ # Merge 2 sorted top-k sequences to get a new top-k sequence
121
+ bitonic_topk_merge(topk_vals, other_vals, ascending=ascending)
122
+ # TODO: this is not efficient for large k (e.g. >= 16) since threads in the same warps
123
+ # do duplicate work.
124
+ for i in cutlass.range(int(math.log2(warp_width)), unroll_full=True):
125
+ other_vals = cute.make_fragment(k, arr.element_type)
126
+ for v in cutlass.range(k, unroll_full=True):
127
+ other_vals[v] = cute.arch.shuffle_sync_bfly(topk_vals[v], offset=1 << i)
128
+ bitonic_topk_merge(topk_vals, other_vals, ascending=ascending)
129
+ return topk_vals
build/torch-cuda/quack/sort/generate_sorting_networks.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate optimized sorting network code from the optimal sorting network data.
4
+ Based on data from: https://bertdobbelaere.github.io/sorting_networks.html
5
+
6
+ This script generates CUTE DSL functions for optimal sorting networks of various sizes.
7
+ """
8
+
9
+ import argparse
10
+ import os
11
+ import re
12
+ from typing import List, Tuple, Dict
13
+
14
# Network strings from bertdobbelaere.github.io/sorting_networks.html
# Copy-paste network strings here, then run initialize_networks() to parse them.
# Each string holds one layer per line; a layer is a "[(i,j),(k,l),...]" list of
# compare-exchange index pairs.  Whitespace and indentation inside the strings
# are irrelevant: parse_network_string() extracts the pairs with a regex.
NETWORK_STRINGS = {
    # Size 2: 1 CE, depth 1
    2: """
    [(0,1)]
    """,
    # Size 4: 5 CEs, depth 3
    4: """
    [(0,2),(1,3)]
    [(0,1),(2,3)]
    [(1,2)]
    """,
    # Size 8: 19 CEs, depth 6
    8: """
    [(0,2),(1,3),(4,6),(5,7)]
    [(0,4),(1,5),(2,6),(3,7)]
    [(0,1),(2,3),(4,5),(6,7)]
    [(2,4),(3,5)]
    [(1,4),(3,6)]
    [(1,2),(3,4),(5,6)]
    """,
    # Size 16: 60 CEs, depth 10
    16: """
    [(0,13),(1,12),(2,15),(3,14),(4,8),(5,6),(7,11),(9,10)]
    [(0,5),(1,7),(2,9),(3,4),(6,13),(8,14),(10,15),(11,12)]
    [(0,1),(2,3),(4,5),(6,8),(7,9),(10,11),(12,13),(14,15)]
    [(0,2),(1,3),(4,10),(5,11),(6,7),(8,9),(12,14),(13,15)]
    [(1,2),(3,12),(4,6),(5,7),(8,10),(9,11),(13,14)]
    [(1,4),(2,6),(5,8),(7,10),(9,13),(11,14)]
    [(2,4),(3,6),(9,12),(11,13)]
    [(3,5),(6,8),(7,9),(10,12)]
    [(3,4),(5,6),(7,8),(9,10),(11,12)]
    [(6,7),(8,9)]
    """,
    # Size 32: 185 CEs, depth 14
    32: """
    [(0,1),(2,3),(4,5),(6,7),(8,9),(10,11),(12,13),(14,15),(16,17),(18,19),(20,21),(22,23),(24,25),(26,27),(28,29),(30,31)]
    [(0,2),(1,3),(4,6),(5,7),(8,10),(9,11),(12,14),(13,15),(16,18),(17,19),(20,22),(21,23),(24,26),(25,27),(28,30),(29,31)]
    [(0,4),(1,5),(2,6),(3,7),(8,12),(9,13),(10,14),(11,15),(16,20),(17,21),(18,22),(19,23),(24,28),(25,29),(26,30),(27,31)]
    [(0,8),(1,9),(2,10),(3,11),(4,12),(5,13),(6,14),(7,15),(16,24),(17,25),(18,26),(19,27),(20,28),(21,29),(22,30),(23,31)]
    [(0,16),(1,8),(2,4),(3,12),(5,10),(6,9),(7,14),(11,13),(15,31),(17,24),(18,20),(19,28),(21,26),(22,25),(23,30),(27,29)]
    [(1,2),(3,5),(4,8),(6,22),(7,11),(9,25),(10,12),(13,14),(17,18),(19,21),(20,24),(23,27),(26,28),(29,30)]
    [(1,17),(2,18),(3,19),(4,20),(5,10),(7,23),(8,24),(11,27),(12,28),(13,29),(14,30),(21,26)]
    [(3,17),(4,16),(5,21),(6,18),(7,9),(8,20),(10,26),(11,23),(13,25),(14,28),(15,27),(22,24)]
    [(1,4),(3,8),(5,16),(7,17),(9,21),(10,22),(11,19),(12,20),(14,24),(15,26),(23,28),(27,30)]
    [(2,5),(7,8),(9,18),(11,17),(12,16),(13,22),(14,20),(15,19),(23,24),(26,29)]
    [(2,4),(6,12),(9,16),(10,11),(13,17),(14,18),(15,22),(19,25),(20,21),(27,29)]
    [(5,6),(8,12),(9,10),(11,13),(14,16),(15,17),(18,20),(19,23),(21,22),(25,26)]
    [(3,5),(6,7),(8,9),(10,12),(11,14),(13,16),(15,18),(17,20),(19,21),(22,23),(24,25),(26,28)]
    [(3,4),(5,6),(7,8),(9,10),(11,12),(13,14),(15,16),(17,18),(19,20),(21,22),(23,24),(25,26),(27,28)]
    """,
    # Size 64: 521 CEs, depth 21
    # (the layers below sum to 521 compare-exchanges, matching the generated
    # sorting_networks.py; an earlier version of this comment said 512)
    64: """
    [(0,2),(1,3),(4,6),(5,7),(8,10),(9,11),(12,14),(13,15),(16,18),(17,19),(20,22),(21,23),(24,26),(25,27),(28,30),(29,31),(32,34),(33,35),(36,38),(37,39),(40,42),(41,43),(44,46),(45,47),(48,50),(49,51),(52,54),(53,55),(56,58),(57,59),(60,62),(61,63)]
    [(0,1),(2,3),(4,5),(6,7),(8,9),(10,11),(12,13),(14,15),(16,17),(18,19),(20,21),(22,23),(24,25),(26,27),(28,29),(30,31),(32,33),(34,35),(36,37),(38,39),(40,41),(42,43),(44,45),(46,47),(48,49),(50,51),(52,53),(54,55),(56,57),(58,59),(60,61),(62,63)]
    [(0,52),(1,2),(3,55),(4,48),(5,6),(7,51),(8,60),(9,10),(11,63),(12,56),(13,14),(15,59),(16,32),(17,18),(19,35),(20,24),(21,22),(23,27),(25,26),(28,44),(29,30),(31,47),(33,34),(36,40),(37,38),(39,43),(41,42),(45,46),(49,50),(53,54),(57,58),(61,62)]
    [(0,20),(1,53),(2,54),(3,23),(4,28),(5,49),(6,50),(7,31),(8,36),(9,61),(10,62),(11,39),(12,16),(13,57),(14,58),(15,19),(17,33),(18,34),(21,25),(22,26),(24,52),(27,55),(29,45),(30,46),(32,56),(35,59),(37,41),(38,42),(40,60),(43,63),(44,48),(47,51)]
    [(0,4),(1,21),(2,22),(3,7),(5,29),(6,30),(8,12),(9,37),(10,38),(11,15),(13,17),(14,18),(16,20),(19,23),(24,32),(25,53),(26,54),(27,35),(28,36),(31,39),(33,57),(34,58),(40,44),(41,61),(42,62),(43,47),(45,49),(46,50),(48,52),(51,55),(56,60),(59,63)]
    [(0,8),(1,5),(2,6),(3,11),(4,12),(7,15),(9,13),(10,14),(16,40),(17,21),(18,22),(19,43),(20,44),(23,47),(24,28),(25,33),(26,34),(27,31),(29,37),(30,38),(32,36),(35,39),(41,45),(42,46),(48,56),(49,53),(50,54),(51,59),(52,60),(55,63),(57,61),(58,62)]
    [(1,9),(2,10),(4,8),(5,13),(6,14),(7,11),(12,48),(15,51),(16,24),(17,41),(18,42),(19,27),(20,28),(21,45),(22,46),(23,31),(25,29),(26,30),(32,40),(33,37),(34,38),(35,43),(36,44),(39,47),(49,57),(50,58),(52,56),(53,61),(54,62),(55,59)]
    [(4,16),(5,9),(6,10),(7,19),(8,24),(11,27),(13,49),(14,50),(17,25),(18,26),(20,32),(21,29),(22,30),(23,35),(28,40),(31,43),(33,41),(34,42),(36,52),(37,45),(38,46),(39,55),(44,56),(47,59),(53,57),(54,58)]
    [(1,4),(5,17),(6,18),(8,16),(9,25),(10,26),(11,19),(12,24),(15,27),(21,33),(22,34),(29,41),(30,42),(36,48),(37,53),(38,54),(39,51),(44,52),(45,57),(46,58),(47,55),(59,62)]
    [(2,8),(9,17),(10,18),(12,20),(13,25),(14,26),(15,23),(24,32),(27,35),(28,36),(31,39),(37,49),(38,50),(40,48),(43,51),(45,53),(46,54),(55,61)]
    [(2,4),(12,16),(13,21),(14,22),(15,19),(20,24),(23,27),(25,33),(26,34),(28,32),(29,37),(30,38),(31,35),(36,40),(39,43),(41,49),(42,50),(44,48),(47,51),(59,61)]
    [(4,16),(5,20),(10,40),(13,17),(14,18),(21,25),(22,26),(23,53),(24,28),(27,31),(29,33),(30,34),(32,36),(35,39),(37,41),(38,42),(43,58),(45,49),(46,50),(47,59)]
    [(3,17),(6,36),(7,21),(8,32),(9,24),(11,41),(13,28),(14,44),(15,45),(18,48),(19,49),(22,52),(25,29),(26,30),(27,57),(31,55),(33,37),(34,38),(35,50),(39,54),(42,56),(46,60)]
    [(6,20),(8,16),(10,24),(11,25),(14,28),(15,29),(17,33),(18,32),(21,37),(22,36),(26,42),(27,41),(30,46),(31,45),(34,48),(35,49),(38,52),(39,53),(43,57),(47,55)]
    [(3,18),(5,8),(6,12),(7,22),(15,21),(17,32),(19,33),(23,37),(26,40),(30,44),(31,46),(41,56),(42,48),(45,60),(51,57),(55,58)]
    [(3,16),(7,20),(11,26),(18,24),(19,25),(22,28),(23,29),(27,33),(30,36),(34,40),(35,41),(37,52),(38,44),(39,45),(43,56),(47,60)]
    [(3,9),(7,13),(10,16),(11,17),(14,20),(15,30),(19,34),(21,36),(23,38),(25,40),(26,32),(27,42),(29,44),(31,37),(33,48),(43,49),(46,52),(47,53),(50,56),(54,60)]
    [(3,8),(7,10),(9,12),(11,18),(13,14),(15,24),(17,22),(19,28),(21,26),(23,25),(27,34),(29,36),(30,32),(31,33),(35,44),(37,42),(38,40),(39,48),(41,46),(45,52),(49,50),(51,54),(53,56),(55,60)]
    [(3,6),(7,12),(11,16),(15,17),(18,20),(19,24),(21,22),(23,30),(25,32),(26,28),(27,29),(31,38),(33,40),(34,36),(35,37),(39,44),(41,42),(43,45),(46,48),(47,52),(51,56),(57,60)]
    [(3,5),(6,8),(7,9),(10,12),(11,13),(14,16),(15,18),(17,20),(19,21),(22,24),(23,26),(25,28),(27,30),(29,32),(31,34),(33,36),(35,38),(37,40),(39,41),(42,44),(43,46),(45,48),(47,49),(50,52),(51,53),(54,56),(55,57),(58,60)]
    [(3,4),(7,8),(11,12),(13,14),(15,16),(17,18),(19,20),(21,22),(23,24),(25,26),(27,28),(29,30),(31,32),(33,34),(35,36),(37,38),(39,40),(41,42),(43,44),(45,46),(47,48),(49,50),(51,52),(55,56),(59,60)]
    """,
}

# Populated from NETWORK_STRINGS by initialize_networks(); maps
# size -> (depth, num_comparisons, layers).
OPTIMAL_NETWORKS: Dict[int, Tuple[int, int, List[List[Tuple[int, int]]]]] = {}
94
+
95
+
96
def parse_network_string(network_str: str) -> List[List[Tuple[int, int]]]:
    """
    Parse a sorting network string from bertdobbelaere.github.io format.

    Examples:
        Input: "[(0,2),(1,3)], [(0,1),(2,3)], [(1,2)]"
        Output: [[(0, 2), (1, 3)], [(0, 1), (2, 3)], [(1, 2)]]

        Input: "[(0,1)], [(1,2)], [(0,1)]"
        Output: [[(0, 1)], [(1, 2)], [(0, 1)]]
    """
    text = network_str.strip()
    if not text:
        return []

    # A layer is a bracketed, comma-separated run of "(i,j)" pairs (possibly empty).
    layer_re = r"\[((?:\(\d+,\d+\)(?:,\(\d+,\d+\))*)?)\]"
    pair_re = re.compile(r"\((\d+),(\d+)\)")

    return [
        [(int(lo), int(hi)) for lo, hi in pair_re.findall(layer_body)]
        for layer_body in re.findall(layer_re, text)
    ]
133
+
134
+
135
def calculate_network_stats(layers: List[List[Tuple[int, int]]]) -> Tuple[int, int, int]:
    """Return (depth, total_comparisons, network_size) for a parsed network.

    Depth is the number of layers, total_comparisons the number of
    compare-exchange pairs, and network_size is max index + 1 (indices are
    0-based); an empty network reports size 1.
    """
    total_comparisons = sum(len(layer) for layer in layers)
    max_index = max(
        (idx for layer in layers for pair in layer for idx in pair),
        default=0,
    )
    return len(layers), total_comparisons, max_index + 1
148
+
149
+
150
def add_network_from_string(size: int, network_str: str, description: str = ""):
    """Parse a network string and register it in OPTIMAL_NETWORKS.

    Args:
        size: Expected number of elements the network sorts.
        network_str: Network string in bertdobbelaere.github.io format.
        description: Optional label echoed on success for debugging.

    Returns:
        True if the network parsed and its detected size matched `size`,
        False on a size mismatch or a parse error (both reported on stdout).
    """
    try:
        parsed_layers = parse_network_string(network_str)
        net_depth, net_ces, found_size = calculate_network_stats(parsed_layers)

        if found_size != size:
            print(f"Warning: Network size mismatch! Expected {size}, detected {found_size}")
            print(f"Network string: {network_str[:100]}...")
            return False

        OPTIMAL_NETWORKS[size] = (net_depth, net_ces, parsed_layers)

        if description:
            print(f"Added network for size {size}: {description}")
            print(f"  Depth: {net_depth}, Comparisons: {net_ces}")
        return True

    except Exception as exc:
        # Best-effort CLI tool: report the failure and keep going.
        print(f"Error parsing network for size {size}: {exc}")
        print(f"Network string: {network_str[:100]}...")
        return False
179
+
180
+
181
def generate_networks_dict(
    networks_data: Dict[int, Tuple[int, int, List[List[Tuple[int, int]]]]]
) -> str:
    """Render the module-level `networks = {...}` source text.

    Sizes are emitted in ascending order; each entry is preceded by a
    summary comment and followed by a blank line.  Single-layer networks
    are rendered on one line, multi-layer ones on one line per layer.
    """
    out = ["networks = {"]

    for size in sorted(networks_data):
        depth, num_comparisons, layers = networks_data[size]
        if len(layers) == 1:
            rendered = f"[{layers[0]}]"
        else:
            body = ",\n    ".join(str(layer) for layer in layers)
            rendered = "[\n    " + body + "\n    ]"
        out.append(f"    # Size {size}: {num_comparisons} CEs, depth {depth}")
        out.append(f"    {size}: {rendered},")
        out.append("")

    out.append("}")
    return "\n".join(out)
207
+
208
+
209
def generate_optimal_sort_function() -> str:
    """Generate the single optimal_sort function that looks up networks by size.

    Returns the verbatim source text of `optimal_sort` that is written into
    the generated sorting_networks.py; it must stay in sync with the
    `networks` dict emitted by generate_networks_dict().
    """
    # The escaped \"\"\" below become the docstring quotes of the emitted function.
    return """@cute.jit
def optimal_sort(
    arr: cute.Tensor,
    n: cutlass.Constexpr[int],
    start: cutlass.Constexpr[int] = 0,
    ascending: cutlass.Constexpr[bool] = True
) -> None:
    \"\"\"
    Optimal sorting network dispatcher.

    Args:
        arr: Array to sort
        n: Size of array (must be power of 2 and available in networks)
        start: Starting index (default 0)
        ascending: Sort in ascending order (default True)

    Source: https://bertdobbelaere.github.io/sorting_networks.html
    \"\"\"
    assert n in networks
    for level in networks[n]:
        for i, j in level:
            compare_and_swap(arr, start + i, start + j, ascending)
"""
234
+
235
+
236
def generate_sorting_networks_file(max_size: int = 64):
    """Generate quack/sort/sorting_networks.py with optimal networks up to max_size.

    Args:
        max_size: Largest network size to include in the generated file.

    Returns:
        The sorted list of network sizes that were written.
    """

    output_file = os.path.join(os.path.dirname(__file__), "sorting_networks.py")

    # Header
    header = '''# Copyright (c) 2025, Wentao Guo, Mayank Mishra, Tri Dao.
"""
Optimal sorting networks generated from: https://bertdobbelaere.github.io/sorting_networks.html

This file was auto-generated by quack/sort/generate_sorting_networks.py. Do not edit it directly.
"""

# fmt: off
# ruff: noqa
# isort: skip_file

import cutlass
import cutlass.cute as cute

from .utils import compare_and_swap


'''

    # Only emit networks within the requested size budget.  Previously the full
    # OPTIMAL_NETWORKS dict was passed to generate_networks_dict() regardless of
    # max_size, so the --max-size flag had no effect on the generated file.
    selected = {n: net for n, net in OPTIMAL_NETWORKS.items() if n <= max_size}
    sizes = sorted(selected)
    networks_dict = generate_networks_dict(selected)
    optimal_sort_func = generate_optimal_sort_function()

    # Combine everything
    content = header + networks_dict + "\n\n\n" + optimal_sort_func

    with open(output_file, "w") as f:
        f.write(content)

    print(f"Generated optimal sorting networks for sizes {sizes}")
    print(f"Output written to: {output_file}")
    return sizes
275
+
276
+
277
def initialize_networks():
    """Reset OPTIMAL_NETWORKS and repopulate it by parsing NETWORK_STRINGS."""
    OPTIMAL_NETWORKS.clear()

    for size, raw_network in NETWORK_STRINGS.items():
        ok = add_network_from_string(size, raw_network, f"Size {size} optimal network")
        if not ok:
            print(f"Warning: Failed to parse network for size {size}")
286
+
287
+
288
def main():
    """CLI entry point: parse arguments, build the networks, and regenerate the file."""
    parser = argparse.ArgumentParser(
        description="Generate optimal sorting network code from bertdobbelaere.github.io data"
    )
    parser.add_argument(
        "--max-size",
        "-m",
        type=int,
        default=64,
        # %(default)s keeps the help text in sync with the actual default.
        # The previous text hard-coded "(default: 32)" while default was 64.
        help="Maximum sorting network size to generate (default: %(default)s)",
    )
    parser.add_argument(
        "--stats", "-s", action="store_true", help="Print statistics about the optimal networks"
    )

    args = parser.parse_args()

    # Initialize networks from strings
    initialize_networks()

    if args.stats:
        print("Optimal Sorting Network Statistics:")
        print("Size\tDepth\tComparisons\tLayers")
        print("-" * 35)
        for n in sorted(OPTIMAL_NETWORKS.keys()):
            if n <= args.max_size:
                depth, comparisons, layers = OPTIMAL_NETWORKS[n]
                print(f"{n}\t{depth}\t{comparisons}\t\t{len(layers)}")

    # Generate the sorting networks file
    sizes = generate_sorting_networks_file(args.max_size)

    print(f"\nGenerated optimal sorting networks for {len(sizes)} sizes")
    print(f"Total networks: {len(sizes)}")
    print(f"Max network size: {max(sizes)}")


if __name__ == "__main__":
    main()
build/torch-cuda/quack/sort/sorting_networks.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Wentao Guo, Mayank Mishra, Tri Dao.
2
+ """
3
+ Optimal sorting networks generated from: https://bertdobbelaere.github.io/sorting_networks.html
4
+
5
+ This file was auto-generated by quack/sort/generate_sorting_networks.py. Do not edit it directly.
6
+ """
7
+
8
+ # fmt: off
9
+ # ruff: noqa
10
+ # isort: skip_file
11
+
12
+ import cutlass
13
+ import cutlass.cute as cute
14
+
15
+ from .utils import compare_and_swap
16
+
17
+
18
# Maps network size -> list of layers; each layer is a list of (i, j)
# compare-exchange index pairs that are data-independent within the layer.
# Auto-generated by generate_sorting_networks.py — regenerate rather than edit.
networks = {
    # Size 2: 1 CEs, depth 1
    2: [[(0, 1)]],

    # Size 4: 5 CEs, depth 3
    4: [
    [(0, 2), (1, 3)],
    [(0, 1), (2, 3)],
    [(1, 2)]
    ],

    # Size 8: 19 CEs, depth 6
    8: [
    [(0, 2), (1, 3), (4, 6), (5, 7)],
    [(0, 4), (1, 5), (2, 6), (3, 7)],
    [(0, 1), (2, 3), (4, 5), (6, 7)],
    [(2, 4), (3, 5)],
    [(1, 4), (3, 6)],
    [(1, 2), (3, 4), (5, 6)]
    ],

    # Size 16: 60 CEs, depth 10
    16: [
    [(0, 13), (1, 12), (2, 15), (3, 14), (4, 8), (5, 6), (7, 11), (9, 10)],
    [(0, 5), (1, 7), (2, 9), (3, 4), (6, 13), (8, 14), (10, 15), (11, 12)],
    [(0, 1), (2, 3), (4, 5), (6, 8), (7, 9), (10, 11), (12, 13), (14, 15)],
    [(0, 2), (1, 3), (4, 10), (5, 11), (6, 7), (8, 9), (12, 14), (13, 15)],
    [(1, 2), (3, 12), (4, 6), (5, 7), (8, 10), (9, 11), (13, 14)],
    [(1, 4), (2, 6), (5, 8), (7, 10), (9, 13), (11, 14)],
    [(2, 4), (3, 6), (9, 12), (11, 13)],
    [(3, 5), (6, 8), (7, 9), (10, 12)],
    [(3, 4), (5, 6), (7, 8), (9, 10), (11, 12)],
    [(6, 7), (8, 9)]
    ],

    # Size 32: 185 CEs, depth 14
    32: [
    [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11), (12, 13), (14, 15), (16, 17), (18, 19), (20, 21), (22, 23), (24, 25), (26, 27), (28, 29), (30, 31)],
    [(0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11), (12, 14), (13, 15), (16, 18), (17, 19), (20, 22), (21, 23), (24, 26), (25, 27), (28, 30), (29, 31)],
    [(0, 4), (1, 5), (2, 6), (3, 7), (8, 12), (9, 13), (10, 14), (11, 15), (16, 20), (17, 21), (18, 22), (19, 23), (24, 28), (25, 29), (26, 30), (27, 31)],
    [(0, 8), (1, 9), (2, 10), (3, 11), (4, 12), (5, 13), (6, 14), (7, 15), (16, 24), (17, 25), (18, 26), (19, 27), (20, 28), (21, 29), (22, 30), (23, 31)],
    [(0, 16), (1, 8), (2, 4), (3, 12), (5, 10), (6, 9), (7, 14), (11, 13), (15, 31), (17, 24), (18, 20), (19, 28), (21, 26), (22, 25), (23, 30), (27, 29)],
    [(1, 2), (3, 5), (4, 8), (6, 22), (7, 11), (9, 25), (10, 12), (13, 14), (17, 18), (19, 21), (20, 24), (23, 27), (26, 28), (29, 30)],
    [(1, 17), (2, 18), (3, 19), (4, 20), (5, 10), (7, 23), (8, 24), (11, 27), (12, 28), (13, 29), (14, 30), (21, 26)],
    [(3, 17), (4, 16), (5, 21), (6, 18), (7, 9), (8, 20), (10, 26), (11, 23), (13, 25), (14, 28), (15, 27), (22, 24)],
    [(1, 4), (3, 8), (5, 16), (7, 17), (9, 21), (10, 22), (11, 19), (12, 20), (14, 24), (15, 26), (23, 28), (27, 30)],
    [(2, 5), (7, 8), (9, 18), (11, 17), (12, 16), (13, 22), (14, 20), (15, 19), (23, 24), (26, 29)],
    [(2, 4), (6, 12), (9, 16), (10, 11), (13, 17), (14, 18), (15, 22), (19, 25), (20, 21), (27, 29)],
    [(5, 6), (8, 12), (9, 10), (11, 13), (14, 16), (15, 17), (18, 20), (19, 23), (21, 22), (25, 26)],
    [(3, 5), (6, 7), (8, 9), (10, 12), (11, 14), (13, 16), (15, 18), (17, 20), (19, 21), (22, 23), (24, 25), (26, 28)],
    [(3, 4), (5, 6), (7, 8), (9, 10), (11, 12), (13, 14), (15, 16), (17, 18), (19, 20), (21, 22), (23, 24), (25, 26), (27, 28)]
    ],

    # Size 64: 521 CEs, depth 21
    64: [
    [(0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11), (12, 14), (13, 15), (16, 18), (17, 19), (20, 22), (21, 23), (24, 26), (25, 27), (28, 30), (29, 31), (32, 34), (33, 35), (36, 38), (37, 39), (40, 42), (41, 43), (44, 46), (45, 47), (48, 50), (49, 51), (52, 54), (53, 55), (56, 58), (57, 59), (60, 62), (61, 63)],
    [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11), (12, 13), (14, 15), (16, 17), (18, 19), (20, 21), (22, 23), (24, 25), (26, 27), (28, 29), (30, 31), (32, 33), (34, 35), (36, 37), (38, 39), (40, 41), (42, 43), (44, 45), (46, 47), (48, 49), (50, 51), (52, 53), (54, 55), (56, 57), (58, 59), (60, 61), (62, 63)],
    [(0, 52), (1, 2), (3, 55), (4, 48), (5, 6), (7, 51), (8, 60), (9, 10), (11, 63), (12, 56), (13, 14), (15, 59), (16, 32), (17, 18), (19, 35), (20, 24), (21, 22), (23, 27), (25, 26), (28, 44), (29, 30), (31, 47), (33, 34), (36, 40), (37, 38), (39, 43), (41, 42), (45, 46), (49, 50), (53, 54), (57, 58), (61, 62)],
    [(0, 20), (1, 53), (2, 54), (3, 23), (4, 28), (5, 49), (6, 50), (7, 31), (8, 36), (9, 61), (10, 62), (11, 39), (12, 16), (13, 57), (14, 58), (15, 19), (17, 33), (18, 34), (21, 25), (22, 26), (24, 52), (27, 55), (29, 45), (30, 46), (32, 56), (35, 59), (37, 41), (38, 42), (40, 60), (43, 63), (44, 48), (47, 51)],
    [(0, 4), (1, 21), (2, 22), (3, 7), (5, 29), (6, 30), (8, 12), (9, 37), (10, 38), (11, 15), (13, 17), (14, 18), (16, 20), (19, 23), (24, 32), (25, 53), (26, 54), (27, 35), (28, 36), (31, 39), (33, 57), (34, 58), (40, 44), (41, 61), (42, 62), (43, 47), (45, 49), (46, 50), (48, 52), (51, 55), (56, 60), (59, 63)],
    [(0, 8), (1, 5), (2, 6), (3, 11), (4, 12), (7, 15), (9, 13), (10, 14), (16, 40), (17, 21), (18, 22), (19, 43), (20, 44), (23, 47), (24, 28), (25, 33), (26, 34), (27, 31), (29, 37), (30, 38), (32, 36), (35, 39), (41, 45), (42, 46), (48, 56), (49, 53), (50, 54), (51, 59), (52, 60), (55, 63), (57, 61), (58, 62)],
    [(1, 9), (2, 10), (4, 8), (5, 13), (6, 14), (7, 11), (12, 48), (15, 51), (16, 24), (17, 41), (18, 42), (19, 27), (20, 28), (21, 45), (22, 46), (23, 31), (25, 29), (26, 30), (32, 40), (33, 37), (34, 38), (35, 43), (36, 44), (39, 47), (49, 57), (50, 58), (52, 56), (53, 61), (54, 62), (55, 59)],
    [(4, 16), (5, 9), (6, 10), (7, 19), (8, 24), (11, 27), (13, 49), (14, 50), (17, 25), (18, 26), (20, 32), (21, 29), (22, 30), (23, 35), (28, 40), (31, 43), (33, 41), (34, 42), (36, 52), (37, 45), (38, 46), (39, 55), (44, 56), (47, 59), (53, 57), (54, 58)],
    [(1, 4), (5, 17), (6, 18), (8, 16), (9, 25), (10, 26), (11, 19), (12, 24), (15, 27), (21, 33), (22, 34), (29, 41), (30, 42), (36, 48), (37, 53), (38, 54), (39, 51), (44, 52), (45, 57), (46, 58), (47, 55), (59, 62)],
    [(2, 8), (9, 17), (10, 18), (12, 20), (13, 25), (14, 26), (15, 23), (24, 32), (27, 35), (28, 36), (31, 39), (37, 49), (38, 50), (40, 48), (43, 51), (45, 53), (46, 54), (55, 61)],
    [(2, 4), (12, 16), (13, 21), (14, 22), (15, 19), (20, 24), (23, 27), (25, 33), (26, 34), (28, 32), (29, 37), (30, 38), (31, 35), (36, 40), (39, 43), (41, 49), (42, 50), (44, 48), (47, 51), (59, 61)],
    [(4, 16), (5, 20), (10, 40), (13, 17), (14, 18), (21, 25), (22, 26), (23, 53), (24, 28), (27, 31), (29, 33), (30, 34), (32, 36), (35, 39), (37, 41), (38, 42), (43, 58), (45, 49), (46, 50), (47, 59)],
    [(3, 17), (6, 36), (7, 21), (8, 32), (9, 24), (11, 41), (13, 28), (14, 44), (15, 45), (18, 48), (19, 49), (22, 52), (25, 29), (26, 30), (27, 57), (31, 55), (33, 37), (34, 38), (35, 50), (39, 54), (42, 56), (46, 60)],
    [(6, 20), (8, 16), (10, 24), (11, 25), (14, 28), (15, 29), (17, 33), (18, 32), (21, 37), (22, 36), (26, 42), (27, 41), (30, 46), (31, 45), (34, 48), (35, 49), (38, 52), (39, 53), (43, 57), (47, 55)],
    [(3, 18), (5, 8), (6, 12), (7, 22), (15, 21), (17, 32), (19, 33), (23, 37), (26, 40), (30, 44), (31, 46), (41, 56), (42, 48), (45, 60), (51, 57), (55, 58)],
    [(3, 16), (7, 20), (11, 26), (18, 24), (19, 25), (22, 28), (23, 29), (27, 33), (30, 36), (34, 40), (35, 41), (37, 52), (38, 44), (39, 45), (43, 56), (47, 60)],
    [(3, 9), (7, 13), (10, 16), (11, 17), (14, 20), (15, 30), (19, 34), (21, 36), (23, 38), (25, 40), (26, 32), (27, 42), (29, 44), (31, 37), (33, 48), (43, 49), (46, 52), (47, 53), (50, 56), (54, 60)],
    [(3, 8), (7, 10), (9, 12), (11, 18), (13, 14), (15, 24), (17, 22), (19, 28), (21, 26), (23, 25), (27, 34), (29, 36), (30, 32), (31, 33), (35, 44), (37, 42), (38, 40), (39, 48), (41, 46), (45, 52), (49, 50), (51, 54), (53, 56), (55, 60)],
    [(3, 6), (7, 12), (11, 16), (15, 17), (18, 20), (19, 24), (21, 22), (23, 30), (25, 32), (26, 28), (27, 29), (31, 38), (33, 40), (34, 36), (35, 37), (39, 44), (41, 42), (43, 45), (46, 48), (47, 52), (51, 56), (57, 60)],
    [(3, 5), (6, 8), (7, 9), (10, 12), (11, 13), (14, 16), (15, 18), (17, 20), (19, 21), (22, 24), (23, 26), (25, 28), (27, 30), (29, 32), (31, 34), (33, 36), (35, 38), (37, 40), (39, 41), (42, 44), (43, 46), (45, 48), (47, 49), (50, 52), (51, 53), (54, 56), (55, 57), (58, 60)],
    [(3, 4), (7, 8), (11, 12), (13, 14), (15, 16), (17, 18), (19, 20), (21, 22), (23, 24), (25, 26), (27, 28), (29, 30), (31, 32), (33, 34), (35, 36), (37, 38), (39, 40), (41, 42), (43, 44), (45, 46), (47, 48), (49, 50), (51, 52), (55, 56), (59, 60)]
    ],

}
97
+
98
+
99
@cute.jit
def optimal_sort(
    arr: cute.Tensor,
    n: cutlass.Constexpr[int],
    start: cutlass.Constexpr[int] = 0,
    ascending: cutlass.Constexpr[bool] = True
) -> None:
    """
    Optimal sorting network dispatcher.

    Args:
        arr: Array to sort
        n: Size of array (must be power of 2 and available in networks)
        start: Starting index (default 0)
        ascending: Sort in ascending order (default True)

    Source: https://bertdobbelaere.github.io/sorting_networks.html
    """
    assert n in networks
    # networks[n] is a plain Python list and n is a Constexpr, so these loops
    # run at trace time and emit a fixed sequence of compare_and_swap calls.
    for level in networks[n]:
        for i, j in level:
            compare_and_swap(arr, start + i, start + j, ascending)
build/torch-cuda/quack/sort/utils.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cutlass.cute as cute
2
+ from cutlass import Float32, const_expr
3
+
4
+ from .. import utils
5
+
6
+
7
@cute.jit
def compare_and_swap(
    arr: cute.Tensor, i: int, j: int, ascending: bool = True, use_selection: bool = False
) -> None:
    """Compare and swap elements at indices i and j in ascending or descending order.

    Args:
        arr: fragment/tensor whose elements are compared in place.
        i, j: indices of the pair; on return arr[i] <= arr[j] when ascending,
            arr[i] >= arr[j] otherwise.
        use_selection: if True, use an explicit compare-and-conditional-swap;
            otherwise express the pair as (min, max) / (max, min).
    """
    if const_expr(use_selection):
        a, b = arr[i], arr[j]
        # XOR with `not ascending` folds both directions into one predicate:
        # ascending swaps when a > b, descending swaps when a <= b... wait —
        # when a > b is False and ascending is False, the pair is swapped,
        # which also covers the a < b descending case (equal elements may
        # swap, which is harmless for sorting).
        if (a > b) ^ (not ascending):
            arr[i] = b
            arr[j] = a
    else:
        # For Float32, route through utils.fmin / cute.arch.fmax rather than
        # Python min/max — presumably to get the hardware float min/max
        # instructions (NaN semantics differ from Python's min/max); confirm
        # in quack.utils.
        min_fn = min if const_expr(arr.element_type != Float32) else utils.fmin
        max_fn = max if const_expr(arr.element_type != Float32) else cute.arch.fmax
        if const_expr(ascending):
            arr[i], arr[j] = min_fn(arr[i], arr[j]), max_fn(arr[i], arr[j])
        else:
            arr[i], arr[j] = max_fn(arr[i], arr[j]), min_fn(arr[i], arr[j])
build/torch-cuda/quack/tensormap_manager.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, Tri Dao.
2
+
3
+ from typing import Tuple
4
+ from dataclasses import dataclass
5
+
6
+ import cutlass
7
+ import cutlass.cute as cute
8
+ from cutlass.cutlass_dsl import Boolean, const_expr, Int32
9
+ from cutlass.utils import TensorMapUpdateMode, TensorMapManager
10
+ from cutlass._mlir.dialects import llvm
11
+
12
+
13
@dataclass(frozen=True)
class TensorMapManagerSm90(TensorMapManager):
    """
    We have to subclass cutlass.utils.TensorMapManager bc it takes in warp_id and only
    perform the operation if warp_id matches the current warp.
    But for Hopper pingpong gemm we want to call it with warp_id 0 and 4.
    So we take in a boolean `is_manager_warp` to determine whether to perform the operation or not.
    """

    @cute.jit
    def init_tensormap_from_atom(
        self, copy_atom: cute.CopyAtom, dst_ptr: cute.Pointer, is_manager_warp: Boolean
    ) -> None:
        """Copy the TMA descriptor held by `copy_atom` to `dst_ptr`.

        Only threads for which `is_manager_warp` is true participate; one
        elected thread performs the copy and the warp syncs afterwards.
        """
        if is_manager_warp:
            with cute.arch.elect_one():
                cute.nvgpu.cpasync.copy_tensormap(copy_atom, dst_ptr)
            cute.arch.sync_warp()
        return

    @cute.jit
    def update_tensormap(
        self,
        tensor_gmem: Tuple[cute.Tensor, ...],
        tma_copy_atom: Tuple[cute.CopyAtom, ...],
        tensormap_gmem_ptr: Tuple[cute.Pointer, ...],
        is_manager_warp: Boolean,
        tensormap_smem_ptr: Tuple[cute.Pointer, ...],
    ) -> None:
        """Rewrite one TMA descriptor per (tensor, atom, ptr) triple.

        In SMEM mode the descriptor is staged in shared memory and then copied
        to global memory with release semantics; in GMEM mode it is patched
        directly in global memory and fenced.
        """
        # updates before touching tensormap in global memory
        if is_manager_warp:
            if const_expr(self.tensormap_update_mode == TensorMapUpdateMode.SMEM):
                for copy_atom, tensor, smem_ptr in zip(
                    tma_copy_atom, tensor_gmem, tensormap_smem_ptr
                ):
                    cute.nvgpu.cpasync.update_tma_descriptor(copy_atom, tensor, smem_ptr)
            # wait until it's safe to update tensormap in global memory
            with cute.arch.elect_one():
                cute.arch.cp_async_bulk_commit_group()
                cute.arch.cp_async_bulk_wait_group(0, read=True)
            cute.arch.sync_warp()
            # updates to tensormap in global memory
            if const_expr(self.tensormap_update_mode == TensorMapUpdateMode.SMEM):
                # cp_fence_tma_desc_release both copies smem -> gmem and
                # provides the release fence for subsequent TMA use.
                for gmem_ptr, smem_ptr in zip(tensormap_gmem_ptr, tensormap_smem_ptr):
                    cute.nvgpu.cpasync.cp_fence_tma_desc_release(gmem_ptr, smem_ptr)
            else:
                for copy_atom, tensor, gmem_ptr in zip(
                    tma_copy_atom, tensor_gmem, tensormap_gmem_ptr
                ):
                    cute.nvgpu.cpasync.update_tma_descriptor(copy_atom, tensor, gmem_ptr)
                cute.arch.sync_warp()
                cute.nvgpu.cpasync.fence_tma_desc_release()

    @cute.jit
    def update_tensormap_shape(
        self,
        tensormap_gmem_ptr: Tuple[cute.Pointer, ...],
        is_manager_warp: Boolean,
        tensormap_smem_ptr: Tuple[cute.Pointer, ...],
        shapes: Tuple[Int32, ...],
        orders: cutlass.Constexpr[Tuple[int, ...]],
    ) -> None:
        """Patch only the global_dim field of each descriptor via PTX
        `tensormap.replace.tile.global_dim`, instead of rebuilding the whole
        descriptor.  `orders` (which dimension to replace) must be
        compile-time constants since they are interpolated into the asm text.
        """
        # updates before touching tensormap in global memory
        if is_manager_warp:
            if const_expr(self.tensormap_update_mode == TensorMapUpdateMode.SMEM):
                for smem_ptr, shape, order in zip(tensormap_smem_ptr, shapes, orders):
                    smem_ptr_i32 = smem_ptr.toint().ir_value()
                    # NOTE(review): three operands are passed but the asm only
                    # references $0/$1 with constraints "r,r"; `order` is already
                    # interpolated as an immediate, so the Int32(order) operand
                    # looks redundant — verify against the MLIR inline_asm API.
                    llvm.inline_asm(
                        None,
                        [smem_ptr_i32, Int32(shape).ir_value(), Int32(order).ir_value()],
                        "{\n\t"
                        ".reg .b64 smem_ptr_i64;\n\t"
                        "cvt.u64.u32 smem_ptr_i64, $0;\n\t"
                        f"tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [smem_ptr_i64], {order}, $1;\n\t"
                        "}\n",
                        "r,r",
                        has_side_effects=True,
                        is_align_stack=False,
                        asm_dialect=llvm.AsmDialect.AD_ATT,
                    )
            # wait until it's safe to update tensormap in global memory
            with cute.arch.elect_one():
                cute.arch.cp_async_bulk_commit_group()
                cute.arch.cp_async_bulk_wait_group(0, read=True)
            cute.arch.sync_warp()
            # updates to tensormap in global memory
            if const_expr(self.tensormap_update_mode == TensorMapUpdateMode.SMEM):
                for gmem_ptr, smem_ptr in zip(tensormap_gmem_ptr, tensormap_smem_ptr):
                    cute.nvgpu.cpasync.cp_fence_tma_desc_release(gmem_ptr, smem_ptr)
            else:
                assert len(shapes) == len(orders) == len(tensormap_gmem_ptr)
                for gmem_ptr, shape, order in zip(tensormap_gmem_ptr, shapes, orders):
                    gmem_ptr_i64 = gmem_ptr.toint().ir_value()
                    # NOTE(review): same operand/constraint-count question as the
                    # SMEM branch above ("l,r" with three operands).
                    llvm.inline_asm(
                        None,
                        [gmem_ptr_i64, Int32(shape).ir_value(), Int32(order).ir_value()],
                        f"tensormap.replace.tile.global_dim.global.b1024.b32 [$0], {order}, $1;",
                        "l,r",
                        has_side_effects=True,
                        is_align_stack=False,
                        asm_dialect=llvm.AsmDialect.AD_ATT,
                    )
                cute.arch.sync_warp()
                cute.nvgpu.cpasync.fence_tma_desc_release()