Kernels
danieldk HF Staff committed on
Commit
639dc8e
·
verified ·
1 Parent(s): 0d682db

Build uploaded using `kernels` (batch 10/10).

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h +985 -0
  2. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h +610 -0
  3. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h +684 -0
  4. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h +250 -0
  5. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h +452 -0
  6. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h +70 -0
  7. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h +265 -0
  8. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h +325 -0
  9. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h +69 -0
  10. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h +231 -0
  11. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h +75 -0
  12. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h +236 -0
  13. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h +572 -0
  14. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h +543 -0
  15. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h +301 -0
  16. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +70 -0
  17. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h +69 -0
  18. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp +253 -0
  19. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h +234 -0
  20. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/reduction_op.h +97 -0
  21. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/scale_type.h +66 -0
  22. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h +255 -0
  23. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h +264 -0
  24. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h +74 -0
  25. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h +241 -0
  26. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h +443 -0
  27. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h +904 -0
  28. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h +175 -0
  29. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h +337 -0
  30. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h +126 -0
  31. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h +376 -0
  32. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h +177 -0
  33. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h +165 -0
  34. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h +127 -0
  35. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h +208 -0
  36. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h +228 -0
  37. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h +113 -0
  38. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h +142 -0
  39. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue.h +548 -0
  40. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h +234 -0
  41. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h +197 -0
  42. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h +335 -0
  43. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h +347 -0
  44. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h +206 -0
  45. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h +401 -0
  46. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h +224 -0
  47. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h +443 -0
  48. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h +513 -0
  49. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h +922 -0
  50. build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h +1717 -0
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h ADDED
@@ -0,0 +1,985 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Functor performing linear combination operations used by epilogues.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/array.h"
41
+ #include "cutlass/functional.h"
42
+ #include "cutlass/numeric_conversion.h"
43
+ #include "cutlass/platform/platform.h"
44
+
45
+ #include "cutlass/epilogue/thread/activation.h"
46
+ #include "cutlass/epilogue/thread/scale_type.h"
47
+
48
+ /////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ namespace cutlass {
51
+ namespace epilogue {
52
+ namespace thread {
53
+
54
+ /////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
+ namespace detail {
57
+
58
+ struct EmptyArguments {};
59
+
60
+ template<class T, class = void>
61
+ struct ElementwiseOpDispatcher {
62
+ using Arguments = EmptyArguments;
63
+
64
+ T op;
65
+
66
+ CUTLASS_HOST_DEVICE
67
+ ElementwiseOpDispatcher(Arguments) {}
68
+
69
+ template <typename ValueType>
70
+ CUTLASS_HOST_DEVICE
71
+ ValueType operator()(ValueType value) {
72
+ return op(value);
73
+ }
74
+ };
75
+
76
+ template<class T>
77
+ struct ElementwiseOpDispatcher<T, std::void_t<typename T::Arguments>> {
78
+ using Arguments = typename T::Arguments;
79
+
80
+ Arguments args;
81
+ T op;
82
+
83
+ CUTLASS_HOST_DEVICE
84
+ ElementwiseOpDispatcher(Arguments args_):args(args_) {}
85
+
86
+ template <typename ValueType>
87
+ CUTLASS_HOST_DEVICE
88
+ ValueType operator()(ValueType value) {
89
+ return op(value, args);
90
+ }
91
+ };
92
+
93
+ }
94
+
95
+ /////////////////////////////////////////////////////////////////////////////////////////////////
96
+
97
+ /// This base class is meant to define the concept required of the
98
+ /// EpilogueWithBroadcast::OutputOp
99
+ template <
100
+ typename ElementC_,
101
+ typename ElementAccumulator_,
102
+ typename ElementCompute_,
103
+ typename ElementZ_,
104
+ typename ElementT_,
105
+ int ElementsPerAccess,
106
+ typename ElementwiseOp_ = Identity<ElementCompute_>,
107
+ typename BinaryOp_ = plus<ElementCompute_>,
108
+ bool StoreT_ = true,
109
+ typename ElementVector_ = ElementC_
110
+ >
111
+ class LinearCombinationBiasElementwise {
112
+ public:
113
+
114
+ using ElementOutput = ElementC_;
115
+ using ElementD = ElementOutput;
116
+ using ElementC = ElementC_;
117
+ using ElementAccumulator = ElementAccumulator_;
118
+ using ElementCompute = ElementCompute_;
119
+ using ElementScalar = ElementCompute;
120
+ using ElementZ = ElementZ_;
121
+ using ElementT = ElementT_;
122
+ using ElementVector = ElementVector_;
123
+ static int const kElementsPerAccess = ElementsPerAccess;
124
+ static int const kCount = kElementsPerAccess;
125
+
126
+ /// Follow cutlass3x EVT aliases
127
+ static bool const IsEltActSupported = true;
128
+
129
+ using ElementwiseOp = ElementwiseOp_;
130
+ using BinaryOp = BinaryOp_;
131
+
132
+ using ElementwiseOpDispatcher = detail::ElementwiseOpDispatcher<ElementwiseOp>;
133
+ using ElementwiseArguments = typename ElementwiseOpDispatcher::Arguments;
134
+
135
+ // Indicates that this epilogue applies only one binary operation
136
+ static bool const kIsSingleSource = true;
137
+
138
+
139
+ using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
140
+ using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
141
+ using FragmentC = Array<ElementC, kElementsPerAccess>;
142
+ using FragmentZ = Array<ElementZ, kElementsPerAccess>;
143
+ using FragmentT = Array<ElementT, kElementsPerAccess>;
144
+
145
+ // Definitions needed for collective epilogue
146
+ using FragmentSource = FragmentC;
147
+ using FragmentOutput = FragmentZ;
148
+ using ElementBias = ElementVector;
149
+ using FragmentBias = Array<ElementBias, kElementsPerAccess>;
150
+ using ActivationFn = ElementwiseOp;
151
+ static const ScaleType::Kind kScale = ScaleType::Default;
152
+
153
+ static bool const kIsHeavy = kIsHeavy_member_or_false<ElementwiseOp>::value;
154
+
155
+ /// If true, the 'Z' tensor is stored
156
+ static bool const kStoreZ = true;
157
+
158
+ /// If true, the 'T' tensor is stored
159
+ static bool const kStoreT = StoreT_;
160
+
161
+ /// Host-constructable parameters structure
162
+ struct Params {
163
+
164
+ ElementCompute alpha; ///< scales accumulators
165
+ ElementCompute beta; ///< scales source tensor
166
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
167
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
168
+ ElementwiseArguments elementwise; ///< Arguments for elementwise operation
169
+
170
+ //
171
+ // Methods
172
+ //
173
+
174
+ CUTLASS_HOST_DEVICE
175
+ Params():
176
+ alpha(ElementCompute(1)),
177
+ beta(ElementCompute(0)),
178
+ alpha_ptr(nullptr),
179
+ beta_ptr(nullptr) { }
180
+
181
+ CUTLASS_HOST_DEVICE
182
+ Params(
183
+ ElementCompute alpha,
184
+ ElementCompute beta,
185
+ ElementwiseArguments elementwise_ = ElementwiseArguments{}
186
+ ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr), elementwise(elementwise_) {
187
+
188
+ }
189
+
190
+ CUTLASS_HOST_DEVICE
191
+ Params(
192
+ ElementCompute alpha
193
+ ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {
194
+
195
+ }
196
+
197
+ CUTLASS_HOST_DEVICE
198
+ Params(
199
+ ElementCompute const *alpha_ptr,
200
+ ElementCompute const *beta_ptr,
201
+ ElementwiseArguments elementwise_ = ElementwiseArguments{}
202
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), elementwise(elementwise_) {
203
+
204
+ }
205
+
206
+ CUTLASS_HOST_DEVICE
207
+ Params(
208
+ ElementCompute const *alpha_ptr
209
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
210
+
211
+ }
212
+ };
213
+
214
+ private:
215
+
216
+ //
217
+ // Data members
218
+ //
219
+
220
+ ElementCompute alpha_;
221
+ ElementCompute beta_;
222
+ ElementwiseArguments const &elementwise_;
223
+ bool skip_elementwise_;
224
+
225
+ public:
226
+
227
+ //
228
+ // Methods
229
+ //
230
+
231
+ /// Constructor from Params
232
+ CUTLASS_HOST_DEVICE
233
+ LinearCombinationBiasElementwise(Params const &params): elementwise_(params.elementwise) {
234
+
235
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
236
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
237
+ skip_elementwise_ = false;
238
+ }
239
+
240
+ /// Returns true if source is needed
241
+ CUTLASS_HOST_DEVICE
242
+ bool is_source_needed() const {
243
+ return beta_ != ElementCompute(0);
244
+ }
245
+
246
+ /// Functionally required for serial reduction in the epilogue
247
+ CUTLASS_HOST_DEVICE
248
+ void set_k_partition(int k_partition, int k_partition_count) {
249
+ if (k_partition) {
250
+ beta_ = ElementCompute(1);
251
+ }
252
+
253
+ if (k_partition != k_partition_count - 1) {
254
+ skip_elementwise_ = true;
255
+ }
256
+ }
257
+
258
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is true
259
+ template <typename ElementwiseArgs>
260
+ CUTLASS_HOST_DEVICE
261
+ void operator()(
262
+ FragmentZ &frag_Z,
263
+ FragmentT &frag_T,
264
+ FragmentAccumulator const &AB,
265
+ FragmentC const &frag_C,
266
+ FragmentCompute const &V,
267
+ ElementwiseArgs const &elementwise_args) const {
268
+
269
+ ElementwiseOp elementwise_op;
270
+ BinaryOp binary_op;
271
+
272
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
273
+ FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
274
+ FragmentCompute result_Z;
275
+ FragmentCompute result_T;
276
+
277
+ CUTLASS_PRAGMA_UNROLL
278
+ for (int i = 0; i < kElementsPerAccess; ++i) {
279
+ ElementCompute z = binary_op(alpha_ * tmp_Accum[i] + beta_ * tmp_C[i], V[i]);
280
+ result_T[i] = z;
281
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
282
+ }
283
+
284
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
285
+ frag_Z = convert_z(result_Z);
286
+
287
+ if constexpr (kStoreT) {
288
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
289
+ frag_T = convert_t(result_T);
290
+ }
291
+ }
292
+
293
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is false
294
+ template <typename ElementwiseArgs>
295
+ CUTLASS_HOST_DEVICE
296
+ void operator()(
297
+ FragmentZ &frag_Z,
298
+ FragmentT &frag_T,
299
+ FragmentAccumulator const &AB,
300
+ FragmentCompute const &V,
301
+ ElementwiseArgs const &elementwise_args) const {
302
+
303
+ ElementwiseOp elementwise_op;
304
+ BinaryOp binary_op;
305
+
306
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
307
+ FragmentCompute result_Z;
308
+ FragmentCompute result_T;
309
+
310
+ CUTLASS_PRAGMA_UNROLL
311
+ for (int i = 0; i < kElementsPerAccess; ++i) {
312
+ ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
313
+ result_T[i] = z;
314
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
315
+ }
316
+
317
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
318
+ frag_Z = convert_z(result_Z);
319
+
320
+ if constexpr (kStoreT) {
321
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
322
+ frag_T = convert_t(result_T);
323
+ }
324
+ }
325
+
326
+ /// Applies the operation when is_source_needed() is true
327
+ CUTLASS_HOST_DEVICE
328
+ void operator()(
329
+ FragmentZ &frag_Z,
330
+ FragmentT &frag_T,
331
+ FragmentAccumulator const &AB,
332
+ FragmentC const &frag_C,
333
+ FragmentCompute const &V) const {
334
+
335
+ ElementwiseOpDispatcher elementwise_op(elementwise_);
336
+ BinaryOp binary_op;
337
+
338
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
339
+ FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
340
+ FragmentCompute result_Z;
341
+ FragmentCompute result_T;
342
+
343
+ CUTLASS_PRAGMA_UNROLL
344
+ for (int i = 0; i < kElementsPerAccess; ++i) {
345
+ ElementCompute z = binary_op(alpha_ * tmp_Accum[i] + beta_ * tmp_C[i], V[i]);
346
+ result_T[i] = z;
347
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);
348
+ }
349
+
350
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
351
+ frag_Z = convert_z(result_Z);
352
+
353
+ if constexpr (kStoreT) {
354
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
355
+ frag_T = convert_t(result_T);
356
+ }
357
+ }
358
+
359
+ /// Applies the operation when is_source_needed() is false
360
+ CUTLASS_HOST_DEVICE
361
+ void operator()(
362
+ FragmentZ &frag_Z,
363
+ FragmentT &frag_T,
364
+ FragmentAccumulator const &AB,
365
+ FragmentCompute const &V) const {
366
+
367
+ ElementwiseOpDispatcher elementwise_op(elementwise_);
368
+ BinaryOp binary_op;
369
+
370
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
371
+ FragmentCompute result_Z;
372
+ FragmentCompute result_T;
373
+
374
+ CUTLASS_PRAGMA_UNROLL
375
+ for (int i = 0; i < kElementsPerAccess; ++i) {
376
+ ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
377
+ result_T[i] = z;
378
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);
379
+ }
380
+
381
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
382
+ frag_Z = convert_z(result_Z);
383
+
384
+ if constexpr (kStoreT) {
385
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
386
+ frag_T = convert_t(result_T);
387
+ }
388
+ }
389
+
390
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is true
391
+ template <typename ElementwiseArgs>
392
+ CUTLASS_HOST_DEVICE
393
+ void operator()(
394
+ ElementZ &Z,
395
+ ElementT &T,
396
+ ElementAccumulator const &AB,
397
+ ElementC const &C,
398
+ ElementCompute const &V,
399
+ ElementwiseArgs const &elementwise_args) const {
400
+
401
+ ElementwiseOp elementwise_op;
402
+ BinaryOp binary_op;
403
+
404
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
405
+ ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
406
+
407
+ ElementCompute z = binary_op(alpha_ * tmp_Accum + beta_ * tmp_C, V);
408
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
409
+
410
+ NumericConverter<ElementZ, ElementCompute> convert_z;
411
+ Z = convert_z(result_Z);
412
+
413
+ if constexpr (kStoreT) {
414
+ ElementCompute result_T = z;
415
+ NumericConverter<ElementT, ElementCompute> convert_t;
416
+ T = convert_t(result_T);
417
+ }
418
+ }
419
+
420
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is false
421
+ template <typename ElementwiseArgs>
422
+ CUTLASS_HOST_DEVICE
423
+ void operator()(
424
+ ElementZ &Z,
425
+ ElementT &T,
426
+ ElementAccumulator const &AB,
427
+ ElementCompute const &V,
428
+ ElementwiseArgs const &elementwise_args) const {
429
+
430
+ ElementwiseOp elementwise_op;
431
+ BinaryOp binary_op;
432
+
433
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
434
+
435
+ ElementCompute z = binary_op(alpha_ * tmp_Accum, V);
436
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
437
+
438
+ NumericConverter<ElementZ, ElementCompute> convert_z;
439
+ Z = convert_z(result_Z);
440
+
441
+ if constexpr (kStoreT) {
442
+ ElementCompute result_T = z;
443
+ NumericConverter<ElementT, ElementCompute> convert_t;
444
+ T = convert_t(result_T);
445
+ }
446
+ }
447
+
448
+ /// Applies the operation when is_source_needed() is true
449
+ CUTLASS_HOST_DEVICE
450
+ void operator()(
451
+ ElementZ &Z,
452
+ ElementT &T,
453
+ ElementAccumulator const &AB,
454
+ ElementC const &C,
455
+ ElementCompute const &V) const {
456
+
457
+ ElementwiseOpDispatcher elementwise_op(elementwise_);
458
+ BinaryOp binary_op;
459
+
460
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
461
+ ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
462
+
463
+ ElementCompute z = binary_op(alpha_ * tmp_Accum + beta_ * tmp_C, V);
464
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z);
465
+
466
+ NumericConverter<ElementZ, ElementCompute> convert_z;
467
+ Z = convert_z(result_Z);
468
+
469
+ if constexpr (kStoreT) {
470
+ ElementCompute result_T = z;
471
+ NumericConverter<ElementT, ElementCompute> convert_t;
472
+ T = convert_t(result_T);
473
+ }
474
+ }
475
+
476
+ /// Applies the operation when is_source_needed() is false
477
+ CUTLASS_HOST_DEVICE
478
+ void operator()(
479
+ ElementZ &Z,
480
+ ElementT &T,
481
+ ElementAccumulator const &AB,
482
+ ElementCompute const &V) const {
483
+
484
+ ElementwiseOpDispatcher elementwise_op(elementwise_);
485
+ BinaryOp binary_op;
486
+
487
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
488
+
489
+ ElementCompute z = binary_op(alpha_ * tmp_Accum, V);
490
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z);
491
+
492
+ NumericConverter<ElementZ, ElementCompute> convert_z;
493
+ Z = convert_z(result_Z);
494
+
495
+ if constexpr (kStoreT) {
496
+ ElementCompute result_T = z;
497
+ NumericConverter<ElementT, ElementCompute> convert_t;
498
+ T = convert_t(result_T);
499
+ }
500
+ }
501
+ };
502
+
503
+
504
+ /// This base class is meant to define the concept required of the
505
+ /// EpilogueWithBroadcast::OutputOp
506
+ template <
507
+ typename ElementC_,
508
+ typename ElementAccumulator_,
509
+ typename ElementCompute_,
510
+ typename ElementZ_,
511
+ typename ElementT_,
512
+ int ElementsPerAccess,
513
+ typename ElementwiseOp_ = Identity<ElementCompute_>,
514
+ typename BinaryOp_ = plus<ElementCompute_>,
515
+ bool StoreT_ = true,
516
+ typename ElementVector_ = ElementC_
517
+ >
518
+ class LinearCombinationPerChannelScalingBiasElementwise {
519
+ public:
520
+
521
+ using ElementOutput = ElementC_;
522
+ using ElementD = ElementOutput;
523
+ using ElementC = ElementC_;
524
+ using ElementAccumulator = ElementAccumulator_;
525
+ using ElementCompute = ElementCompute_;
526
+ using ElementScalar = ElementCompute;
527
+ using ElementZ = ElementZ_;
528
+ using ElementT = ElementT_;
529
+ using ElementVector = ElementVector_;
530
+ static int const kElementsPerAccess = ElementsPerAccess;
531
+ static int const kCount = kElementsPerAccess;
532
+
533
+ /// Follow cutlass3x EVT aliases
534
+ static bool const IsEltActSupported = true;
535
+ static bool const IsPerChannelScalingSupported = true;
536
+
537
+ using ElementwiseOp = ElementwiseOp_;
538
+ using BinaryOp = BinaryOp_;
539
+
540
+ using ElementwiseOpDispatcher = detail::ElementwiseOpDispatcher<ElementwiseOp>;
541
+ using ElementwiseArguments = typename ElementwiseOpDispatcher::Arguments;
542
+
543
+ // Indicates that this epilogue applies only one binary operation
544
+ static bool const kIsSingleSource = true;
545
+
546
+
547
+ using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
548
+ using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
549
+ using FragmentC = Array<ElementC, kElementsPerAccess>;
550
+ using FragmentZ = Array<ElementZ, kElementsPerAccess>;
551
+ using FragmentT = Array<ElementT, kElementsPerAccess>;
552
+
553
+ // Definitions needed for collective epilogue
554
+ using FragmentSource = FragmentC;
555
+ using FragmentOutput = FragmentZ;
556
+ using ElementBias = ElementVector;
557
+ using FragmentBias = Array<ElementBias, kElementsPerAccess>;
558
+ using ActivationFn = ElementwiseOp;
559
+ static const ScaleType::Kind kScale = ScaleType::PerChannelScaling;
560
+
561
+ static bool const kIsHeavy = kIsHeavy_member_or_false<ElementwiseOp>::value;
562
+
563
+ /// If true, the 'Z' tensor is stored
564
+ static bool const kStoreZ = true;
565
+
566
+ /// If true, the 'T' tensor is stored
567
+ static bool const kStoreT = StoreT_;
568
+
569
+ /// Host-constructable parameters structure
570
+ struct Params {
571
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
572
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
573
+ ElementCompute beta; ///< scales source tensor
574
+ ElementwiseArguments elementwise; ///< Arguments for elementwise operation
575
+
576
+ //
577
+ // Methods
578
+ //
579
+
580
+ CUTLASS_HOST_DEVICE
581
+ Params():
582
+ alpha_ptr(nullptr),
583
+ beta_ptr(nullptr),
584
+ beta(ElementCompute(0)) { }
585
+
586
+ CUTLASS_HOST_DEVICE
587
+ Params(
588
+ ElementCompute const *alpha_ptr,
589
+ ElementCompute const *beta_ptr,
590
+ ElementwiseArguments elementwise_ = ElementwiseArguments{}
591
+ ): beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), elementwise(elementwise_) {
592
+
593
+ }
594
+
595
+ CUTLASS_HOST_DEVICE
596
+ Params(
597
+ ElementCompute const *alpha_ptr
598
+ ): beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
599
+
600
+ }
601
+ };
602
+
603
+ private:
604
+
605
+ //
606
+ // Data members
607
+ //
608
+
609
+ ElementCompute const* beta_ptr_ = nullptr;
610
+ ElementCompute beta_ = 0;
611
+ ElementwiseArguments const &elementwise_;
612
+ bool skip_elementwise_;
613
+
614
+ public:
615
+
616
+ //
617
+ // Methods
618
+ //
619
+
620
+ /// Constructor from Params
621
+ CUTLASS_HOST_DEVICE
622
+ LinearCombinationPerChannelScalingBiasElementwise(Params const &params): elementwise_(params.elementwise) {
623
+ if (params.beta_ptr) {
624
+ beta_ptr_ = params.beta_ptr;
625
+ }
626
+ else {
627
+ beta_ = params.beta;
628
+ }
629
+ skip_elementwise_ = false;
630
+ }
631
+
632
+ /// Returns true if source is needed
633
+ CUTLASS_HOST_DEVICE
634
+ bool is_source_needed() const {
635
+ return beta_ptr_ != nullptr || beta_ != ElementCompute(0);
636
+ }
637
+
638
+ CUTLASS_HOST_DEVICE
639
+ bool is_beta_vector() const {
640
+ return beta_ptr_ != nullptr;
641
+ }
642
+
643
+ /// Functionally required for serial reduction in the epilogue
644
+ CUTLASS_HOST_DEVICE
645
+ void set_k_partition(int k_partition, int k_partition_count) {
646
+ if (k_partition) {
647
+ beta_ = ElementCompute(1);
648
+ }
649
+
650
+ if (k_partition != k_partition_count - 1) {
651
+ skip_elementwise_ = true;
652
+ }
653
+ }
654
+
655
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is true
656
+ template <typename ElementwiseArgs>
657
+ CUTLASS_HOST_DEVICE
658
+ void operator()(
659
+ FragmentZ &frag_Z,
660
+ FragmentT &frag_T,
661
+ FragmentAccumulator const &AB,
662
+ FragmentC const &frag_C,
663
+ FragmentCompute const & valpha,
664
+ FragmentCompute const & vbias,
665
+ ElementwiseArgs const &elementwise_args) const {
666
+
667
+ ElementwiseOp elementwise_op;
668
+ BinaryOp binary_op;
669
+
670
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
671
+ FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
672
+ FragmentCompute result_Z;
673
+ FragmentCompute result_T;
674
+
675
+ CUTLASS_PRAGMA_UNROLL
676
+ for (int i = 0; i < kElementsPerAccess; ++i) {
677
+ ElementCompute z = binary_op(valpha[i] * tmp_Accum[i] + beta_ * tmp_C[i], vbias[i]);
678
+ result_T[i] = z;
679
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
680
+ }
681
+
682
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
683
+ frag_Z = convert_z(result_Z);
684
+
685
+ if constexpr (kStoreT) {
686
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
687
+ frag_T = convert_t(result_T);
688
+ }
689
+ }
690
+
691
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is true
692
+ /// D = elementwise_op(vector_alpha * accumulator + vector_beta * source + bias)
693
+ template <typename ElementwiseArgs>
694
+ CUTLASS_HOST_DEVICE
695
+ void operator()(
696
+ FragmentZ &frag_Z,
697
+ FragmentT &frag_T,
698
+ FragmentAccumulator const &AB,
699
+ FragmentC const &frag_C,
700
+ FragmentCompute const & valpha,
701
+ FragmentCompute const & vbeta,
702
+ FragmentCompute const & vbias,
703
+ ElementwiseArgs const &elementwise_args) const {
704
+
705
+ ElementwiseOp elementwise_op;
706
+ BinaryOp binary_op;
707
+
708
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
709
+ FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
710
+ FragmentCompute result_Z;
711
+ FragmentCompute result_T;
712
+
713
+ CUTLASS_PRAGMA_UNROLL
714
+ for (int i = 0; i < kElementsPerAccess; ++i) {
715
+ ElementCompute z = binary_op(valpha[i] * tmp_Accum[i] + vbeta[i] * tmp_C[i], vbias[i]);
716
+ result_T[i] = z;
717
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
718
+ }
719
+
720
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
721
+ frag_Z = convert_z(result_Z);
722
+
723
+ if constexpr (kStoreT) {
724
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
725
+ frag_T = convert_t(result_T);
726
+ }
727
+ }
728
+
729
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is false
730
+ template <typename ElementwiseArgs>
731
+ CUTLASS_HOST_DEVICE
732
+ void operator()(
733
+ FragmentZ &frag_Z,
734
+ FragmentT &frag_T,
735
+ FragmentAccumulator const &AB,
736
+ FragmentCompute const & valpha,
737
+ FragmentCompute const & vbias,
738
+ ElementwiseArgs const &elementwise_args) const {
739
+
740
+ ElementwiseOp elementwise_op;
741
+ BinaryOp binary_op;
742
+
743
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
744
+ FragmentCompute result_Z;
745
+ FragmentCompute result_T;
746
+
747
+ CUTLASS_PRAGMA_UNROLL
748
+ for (int i = 0; i < kElementsPerAccess; ++i) {
749
+ ElementCompute z = binary_op(valpha[i] * tmp_Accum[i], vbias[i]);
750
+ result_T[i] = z;
751
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
752
+ }
753
+
754
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
755
+ frag_Z = convert_z(result_Z);
756
+
757
+ if constexpr (kStoreT) {
758
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
759
+ frag_T = convert_t(result_T);
760
+ }
761
+ }
762
+
763
+ /// Applies the operation when is_source_needed() is true
764
+ CUTLASS_HOST_DEVICE
765
+ void operator()(
766
+ FragmentZ &frag_Z,
767
+ FragmentT &frag_T,
768
+ FragmentAccumulator const &AB,
769
+ FragmentC const &frag_C,
770
+ FragmentCompute const & valpha,
771
+ FragmentCompute const & vbias) const {
772
+
773
+ ElementwiseOpDispatcher elementwise_op(elementwise_);
774
+ BinaryOp binary_op;
775
+
776
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
777
+ FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
778
+ FragmentCompute result_Z;
779
+ FragmentCompute result_T;
780
+
781
+ CUTLASS_PRAGMA_UNROLL
782
+ for (int i = 0; i < kElementsPerAccess; ++i) {
783
+ ElementCompute z = binary_op(valpha[i] * tmp_Accum[i] + beta_ * tmp_C[i], vbias[i]);
784
+ result_T[i] = z;
785
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);
786
+ }
787
+
788
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
789
+ frag_Z = convert_z(result_Z);
790
+
791
+ if constexpr (kStoreT) {
792
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
793
+ frag_T = convert_t(result_T);
794
+ }
795
+ }
796
+
797
+ /// Applies the operation when is_source_needed() is false
798
+ CUTLASS_HOST_DEVICE
799
+ void operator()(
800
+ FragmentZ &frag_Z,
801
+ FragmentT &frag_T,
802
+ FragmentAccumulator const &AB,
803
+ FragmentCompute const & valpha,
804
+ FragmentCompute const & vbias) const {
805
+
806
+ ElementwiseOpDispatcher elementwise_op(elementwise_);
807
+ BinaryOp binary_op;
808
+
809
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
810
+ FragmentCompute result_Z;
811
+ FragmentCompute result_T;
812
+
813
+ CUTLASS_PRAGMA_UNROLL
814
+ for (int i = 0; i < kElementsPerAccess; ++i) {
815
+ ElementCompute z = binary_op(valpha[i] * tmp_Accum[i], vbias[i]);
816
+ result_T[i] = z;
817
+ result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);
818
+ }
819
+
820
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
821
+ frag_Z = convert_z(result_Z);
822
+
823
+ if constexpr (kStoreT) {
824
+ NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
825
+ frag_T = convert_t(result_T);
826
+ }
827
+ }
828
+
829
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is true
830
+ template <typename ElementwiseArgs>
831
+ CUTLASS_HOST_DEVICE
832
+ void operator()(
833
+ ElementZ &Z,
834
+ ElementT &T,
835
+ ElementAccumulator const &AB,
836
+ ElementC const &C,
837
+ ElementCompute const & valpha,
838
+ ElementCompute const & vbias,
839
+ ElementwiseArgs const &elementwise_args) const {
840
+
841
+ ElementwiseOp elementwise_op;
842
+ BinaryOp binary_op;
843
+
844
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
845
+ ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
846
+
847
+ ElementCompute z = binary_op(valpha * tmp_Accum + beta_ * tmp_C, vbias);
848
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
849
+
850
+ NumericConverter<ElementZ, ElementCompute> convert_z;
851
+ Z = convert_z(result_Z);
852
+
853
+ if constexpr (kStoreT) {
854
+ ElementCompute result_T = z;
855
+ NumericConverter<ElementT, ElementCompute> convert_t;
856
+ T = convert_t(result_T);
857
+ }
858
+ }
859
+
860
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is true
861
+ /// D = elementwise_op(vector_alpha * accumulator + vector_beta * source + bias)
862
+ template <typename ElementwiseArgs>
863
+ CUTLASS_HOST_DEVICE
864
+ void operator()(
865
+ ElementZ &Z,
866
+ ElementT &T,
867
+ ElementAccumulator const &AB,
868
+ ElementC const &C,
869
+ ElementCompute const & valpha,
870
+ ElementCompute const & vbeta,
871
+ ElementCompute const & vbias,
872
+ ElementwiseArgs const &elementwise_args) const {
873
+
874
+ ElementwiseOp elementwise_op;
875
+ BinaryOp binary_op;
876
+
877
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
878
+ ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
879
+
880
+ ElementCompute z = binary_op(valpha * tmp_Accum + vbeta * tmp_C, vbias);
881
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
882
+
883
+ NumericConverter<ElementZ, ElementCompute> convert_z;
884
+ Z = convert_z(result_Z);
885
+
886
+ if constexpr (kStoreT) {
887
+ ElementCompute result_T = z;
888
+ NumericConverter<ElementT, ElementCompute> convert_t;
889
+ T = convert_t(result_T);
890
+ }
891
+ }
892
+
893
+ /// Applies the operation when elementwise_op require arguments and is_source_needed() is false
894
+ template <typename ElementwiseArgs>
895
+ CUTLASS_HOST_DEVICE
896
+ void operator()(
897
+ ElementZ &Z,
898
+ ElementT &T,
899
+ ElementAccumulator const &AB,
900
+ ElementCompute const & valpha,
901
+ ElementCompute const & vbias,
902
+ ElementwiseArgs const &elementwise_args) const {
903
+
904
+ ElementwiseOp elementwise_op;
905
+ BinaryOp binary_op;
906
+
907
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
908
+
909
+ ElementCompute z = binary_op(valpha * tmp_Accum, vbias);
910
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
911
+
912
+ NumericConverter<ElementZ, ElementCompute> convert_z;
913
+ Z = convert_z(result_Z);
914
+
915
+ if constexpr (kStoreT) {
916
+ ElementCompute result_T = z;
917
+ NumericConverter<ElementT, ElementCompute> convert_t;
918
+ T = convert_t(result_T);
919
+ }
920
+ }
921
+
922
+ /// Applies the operation when is_source_needed() is true
923
+ CUTLASS_HOST_DEVICE
924
+ void operator()(
925
+ ElementZ &Z,
926
+ ElementT &T,
927
+ ElementAccumulator const &AB,
928
+ ElementC const &C,
929
+ ElementCompute const & valpha,
930
+ ElementCompute const & vbias) const {
931
+
932
+ ElementwiseOpDispatcher elementwise_op(elementwise_);
933
+ BinaryOp binary_op;
934
+
935
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
936
+ ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
937
+
938
+ ElementCompute z = binary_op(valpha * tmp_Accum + beta_ * tmp_C, vbias);
939
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z);
940
+
941
+ NumericConverter<ElementZ, ElementCompute> convert_z;
942
+ Z = convert_z(result_Z);
943
+
944
+ if constexpr (kStoreT) {
945
+ ElementCompute result_T = z;
946
+ NumericConverter<ElementT, ElementCompute> convert_t;
947
+ T = convert_t(result_T);
948
+ }
949
+ }
950
+
951
+ /// Applies the operation when is_source_needed() is false
952
+ CUTLASS_HOST_DEVICE
953
+ void operator()(
954
+ ElementZ &Z,
955
+ ElementT &T,
956
+ ElementAccumulator const &AB,
957
+ ElementCompute const & valpha,
958
+ ElementCompute const & vbias) const {
959
+
960
+ ElementwiseOpDispatcher elementwise_op(elementwise_);
961
+ BinaryOp binary_op;
962
+
963
+ ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
964
+
965
+ ElementCompute z = binary_op(valpha * tmp_Accum, vbias);
966
+ ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z);
967
+
968
+ NumericConverter<ElementZ, ElementCompute> convert_z;
969
+ Z = convert_z(result_Z);
970
+
971
+ if constexpr (kStoreT) {
972
+ ElementCompute result_T = z;
973
+ NumericConverter<ElementT, ElementCompute> convert_t;
974
+ T = convert_t(result_T);
975
+ }
976
+ }
977
+ };
978
+
979
+ /////////////////////////////////////////////////////////////////////////////////////////////////
980
+
981
+ } // namespace thread
982
+ } // namespace epilogue
983
+ } // namespace cutlass
984
+
985
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination operations used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <cuda_fp16.h>
38
+
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/numeric_types.h"
41
+ #include "cutlass/array.h"
42
+ #include "cutlass/functional.h"
43
+ #include "cutlass/numeric_conversion.h"
44
+ #include "cutlass/epilogue/thread/activation.h"
45
+
46
+ /////////////////////////////////////////////////////////////////////////////////////////////////
47
+
48
+ namespace cutlass {
49
+ namespace epilogue {
50
+ namespace thread {
51
+
52
+ /////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ namespace detail {
55
+
56
+ template <typename Element, int ElementsPerAccess>
57
+ struct ArrayMaximum {
58
+
59
+ CUTLASS_HOST_DEVICE
60
+ Array<Element, ElementsPerAccess> operator()(
61
+ Array<Element, ElementsPerAccess> const &lhs,
62
+ Array<Element, ElementsPerAccess> const &rhs) const {
63
+
64
+ Array<Element, ElementsPerAccess> result;
65
+
66
+ CUTLASS_PRAGMA_UNROLL
67
+ for (int i = 0; i < ElementsPerAccess; ++i) {
68
+ result[i] = platform::max(lhs[i].get(), rhs[i]);
69
+ }
70
+
71
+ return result;
72
+ }
73
+
74
+ CUTLASS_HOST_DEVICE
75
+ Array<Element, ElementsPerAccess> operator()(
76
+ Array<Element, ElementsPerAccess> const &lhs,
77
+ Element rhs) const {
78
+
79
+ Array<Element, ElementsPerAccess> result;
80
+
81
+ CUTLASS_PRAGMA_UNROLL
82
+ for (int i = 0; i < ElementsPerAccess; ++i) {
83
+ result[i] = platform::max(lhs[i].get(), rhs);
84
+ }
85
+
86
+ return result;
87
+ }
88
+ };
89
+
90
+
91
+ /// Partial specialization: Element=float
92
+ template <int ElementsPerAccess>
93
+ struct ArrayMaximum<float, ElementsPerAccess> {
94
+
95
+ CUTLASS_HOST_DEVICE
96
+ Array<float, ElementsPerAccess> operator()(
97
+ Array<float, ElementsPerAccess> const &lhs,
98
+ Array<float, ElementsPerAccess> const &rhs) const {
99
+
100
+ Array<float, ElementsPerAccess> result;
101
+
102
+ CUTLASS_PRAGMA_UNROLL
103
+ for (int i = 0; i < ElementsPerAccess; ++i) {
104
+ result[i] = fmax(lhs[i], rhs[i]);
105
+ }
106
+
107
+ return result;
108
+ }
109
+
110
+ CUTLASS_HOST_DEVICE
111
+ Array<float, ElementsPerAccess> operator()(
112
+ Array<float, ElementsPerAccess> const &lhs,
113
+ float rhs) const {
114
+
115
+ Array<float, ElementsPerAccess> result;
116
+
117
+ CUTLASS_PRAGMA_UNROLL
118
+ for (int i = 0; i < ElementsPerAccess; ++i) {
119
+ result[i] = fmax(lhs[i], rhs);
120
+ }
121
+
122
+ return result;
123
+ }
124
+ };
125
+
126
/// Partial specialization: Element=half
///
/// On SM80+ the arrays are reinterpreted as packed __half2 pairs and
/// reduced with the __hmax2 intrinsic (two lanes per instruction);
/// ElementsPerAccess must therefore be even on that path. Older
/// architectures fall back to a scalar compare-and-select per lane.
template <int ElementsPerAccess>
struct ArrayMaximum<half_t, ElementsPerAccess> {

  /// Elementwise max of two half arrays.
  CUTLASS_DEVICE
  Array<half_t, ElementsPerAccess> operator()(
    Array<half_t, ElementsPerAccess> const &lhs,
    Array<half_t, ElementsPerAccess> const &rhs) const {

    Array<half_t, ElementsPerAccess> result;

#if __CUDA_ARCH__ >= 800
    // Vectorized path: process two halves at a time via __hmax2.
    int const kVectorCount = ElementsPerAccess / 2;


    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(lhs.raw_data());
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(rhs.raw_data());
    __half2 *res_ptr = reinterpret_cast<__half2 *>(result.raw_data());

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kVectorCount; ++i) {
      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_ptr[i]);
    }

    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");

#else
    // Scalar fallback: compare-and-select one __half at a time.
    __half const *lhs_ptr = reinterpret_cast<__half const *>(lhs.raw_data());
    __half const *rhs_ptr = reinterpret_cast<__half const *>(rhs.raw_data());
    __half *res_ptr = reinterpret_cast<__half *>(result.raw_data());

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < ElementsPerAccess; ++i) {
      res_ptr[i] = ((lhs_ptr[i] < rhs_ptr[i]) ? rhs_ptr[i] : lhs_ptr[i]);
    }

#endif

    return result;
  }

  /// Elementwise max of a half array against a scalar threshold
  /// (the scalar is broadcast across both lanes on the SM80+ path).
  CUTLASS_DEVICE
  Array<half_t, ElementsPerAccess> operator()(
    Array<half_t, ElementsPerAccess> const &lhs,
    half_t const &rhs) const {

    Array<half_t, ElementsPerAccess> result;

#if __CUDA_ARCH__ >= 800
    int const kVectorCount = ElementsPerAccess / 2;


    // Broadcast the scalar threshold into both halves of a __half2.
    __half rhs_raw = reinterpret_cast<__half const &>(rhs);
    __half2 rhs_pair = __half2half2(rhs_raw);

    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(lhs.raw_data());
    __half2 *res_ptr = reinterpret_cast<__half2 *>(result.raw_data());

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kVectorCount; ++i) {
      res_ptr[i] = __hmax2(lhs_ptr[i], rhs_pair);
    }

    static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");

#else

    __half const *lhs_ptr = reinterpret_cast<__half const *>(lhs.raw_data());
    __half const rhs_raw = reinterpret_cast<__half const &>(rhs);
    __half *res_ptr = reinterpret_cast<__half *>(result.raw_data());

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < ElementsPerAccess; ++i) {
      res_ptr[i] = ((lhs_ptr[i] < rhs_raw) ? rhs_raw : lhs_ptr[i]);
    }

#endif

    return result;
  }
};
207
+
208
+ /// Partial specialization: Element=bfloat16_t
209
+ template <int ElementsPerAccess>
210
+ struct ArrayMaximum<bfloat16_t, ElementsPerAccess> {
211
+
212
+ using NvType = __nv_bfloat16;
213
+ using NvTypeV2 = __nv_bfloat162;
214
+
215
+ CUTLASS_DEVICE
216
+ Array<bfloat16_t, ElementsPerAccess> operator()(
217
+ Array<bfloat16_t, ElementsPerAccess> const &lhs,
218
+ Array<bfloat16_t, ElementsPerAccess> const &rhs) const {
219
+
220
+ Array<bfloat16_t, ElementsPerAccess> result;
221
+
222
+ #if __CUDA_ARCH__ >= 800
223
+ int const kVectorCount = ElementsPerAccess / 2;
224
+
225
+
226
+ NvTypeV2 const *lhs_ptr = reinterpret_cast<NvTypeV2 const *>(lhs.raw_data());
227
+ NvTypeV2 const *rhs_ptr = reinterpret_cast<NvTypeV2 const *>(rhs.raw_data());
228
+ NvTypeV2 *res_ptr = reinterpret_cast<NvTypeV2 *>(result.raw_data());
229
+
230
+ CUTLASS_PRAGMA_UNROLL
231
+ for (int i = 0; i < kVectorCount; ++i) {
232
+ res_ptr[i] = __hmax2(lhs_ptr[i], rhs_ptr[i]);
233
+ }
234
+
235
+ #else
236
+ NvType const *lhs_ptr = reinterpret_cast<NvType const *>(lhs.raw_data());
237
+ NvType const *rhs_ptr = reinterpret_cast<NvType const *>(rhs.raw_data());
238
+ NvType *res_ptr = reinterpret_cast<NvType *>(result.raw_data());
239
+
240
+ CUTLASS_PRAGMA_UNROLL
241
+ for (int i = 0; i < ElementsPerAccess; ++i) {
242
+ res_ptr[i] = ((lhs_ptr[i] < rhs_ptr[i]) ? rhs_ptr[i] : lhs_ptr[i]);
243
+ }
244
+
245
+ #endif
246
+
247
+ return result;
248
+ }
249
+
250
+ CUTLASS_DEVICE
251
+ Array<bfloat16_t, ElementsPerAccess> operator()(
252
+ Array<bfloat16_t, ElementsPerAccess> const &lhs,
253
+ bfloat16_t rhs) const {
254
+
255
+ Array<bfloat16_t, ElementsPerAccess> result;
256
+
257
+ #if __CUDA_ARCH__ >= 800
258
+ int const kVectorCount = ElementsPerAccess / 2;
259
+
260
+
261
+ NvType rhs_raw = reinterpret_cast<NvType const &>(rhs);
262
+ NvTypeV2 rhs_pair = __bfloat162bfloat162(rhs_raw);
263
+
264
+ NvTypeV2 const *lhs_ptr = reinterpret_cast<NvTypeV2 const *>(lhs.raw_data());
265
+ NvTypeV2 *res_ptr = reinterpret_cast<NvTypeV2 *>(result.raw_data());
266
+
267
+ CUTLASS_PRAGMA_UNROLL
268
+ for (int i = 0; i < kVectorCount; ++i) {
269
+ res_ptr[i] = __hmax2(lhs_ptr[i], rhs_pair);
270
+ }
271
+
272
+ static_assert(!(ElementsPerAccess % 2), "Output array must be divisible by vector length.");
273
+
274
+ #else
275
+
276
+ NvType const *lhs_ptr = reinterpret_cast<NvType const *>(lhs.raw_data());
277
+ NvType const rhs_raw = reinterpret_cast<NvType const &>(rhs);
278
+ NvType *res_ptr = reinterpret_cast<NvType *>(result.raw_data());
279
+
280
+ CUTLASS_PRAGMA_UNROLL
281
+ for (int i = 0; i < ElementsPerAccess; ++i) {
282
+ res_ptr[i] = ((lhs_ptr[i] < rhs_raw) ? rhs_raw : lhs_ptr[i]);
283
+ }
284
+
285
+ #endif
286
+
287
+ return result;
288
+ }
289
+ };
290
+
291
+
292
+ /////////////////////////////////////////////////////////////////////////////////////////////////
293
+
294
+ template <typename Element, int ElementsPerAccess>
295
+ struct ReluConditional {
296
+
297
+ CUTLASS_HOST_DEVICE
298
+ void operator()(
299
+ bool conditional[],
300
+ Array<Element, ElementsPerAccess> const &fragment,
301
+ Element threshold) const {
302
+
303
+ CUTLASS_PRAGMA_UNROLL
304
+ for (int i = 0; i < ElementsPerAccess; ++i) {
305
+ conditional[i] = !(fragment[i] < threshold);
306
+ }
307
+ }
308
+ };
309
+
310
+ template <int ElementsPerAccess>
311
+ struct ReluConditional<half_t, ElementsPerAccess> {
312
+
313
+ CUTLASS_DEVICE
314
+ void operator()(
315
+ bool conditional[],
316
+ Array<half_t, ElementsPerAccess> const &fragment,
317
+ half_t threshold) const {
318
+
319
+ __half y = reinterpret_cast<__half const &>(threshold);
320
+ __half const *x = reinterpret_cast<__half const *>(fragment.raw_data());
321
+
322
+ CUTLASS_PRAGMA_UNROLL
323
+ for (int i = 0; i < ElementsPerAccess; ++i) {
324
+ conditional[i] = !__hlt(x[i], y);
325
+ }
326
+ }
327
+ };
328
+
329
+ template <int ElementsPerAccess>
330
+ struct ReluConditional<bfloat16_t, ElementsPerAccess> {
331
+
332
+ CUTLASS_DEVICE
333
+ void operator()(
334
+ bool conditional[],
335
+ Array<bfloat16_t, ElementsPerAccess> const &fragment,
336
+ bfloat16_t threshold) const {
337
+
338
+ __nv_bfloat16 y = reinterpret_cast<__nv_bfloat16 const &>(threshold);
339
+ __nv_bfloat16 const *x = reinterpret_cast<__nv_bfloat16 const *>(fragment.raw_data());
340
+
341
+ CUTLASS_PRAGMA_UNROLL
342
+ for (int i = 0; i < ElementsPerAccess; ++i) {
343
+ conditional[i] = !__hlt(x[i], y);
344
+ }
345
+ }
346
+ };
347
+
348
+ } // namespace detail
349
+
350
+ /////////////////////////////////////////////////////////////////////////////////////////////////
351
+
352
+ /// This is a partial specialization for fused Bias and ReLU. It supports the option of packing
353
+ /// ReLU conditionals in a bit vector that may be used by backwards passes as an optimization.
354
+ ///
355
+ /// This class can only be used with cutlass::epilogue::threadblock::EpilogueWithBroadcast<>.
356
+ ///
357
+ /// This base class is meant to define the concept required of the
358
+ /// EpilogueWithBroadcast::OutputOp
359
+ template <
360
+ typename ElementC_,
361
+ typename ElementAccumulator_,
362
+ typename ElementCompute_,
363
+ typename ElementZ_,
364
+ int ElementsPerAccess,
365
+ bool StoreT_ = true,
366
+ typename ElementVector_ = ElementC_
367
+ >
368
+ class LinearCombinationBiasRelu {
369
+ public:
370
+
371
+ using ElementOutput = ElementC_;
372
+ using ElementC = ElementC_;
373
+ using ElementAccumulator = ElementAccumulator_;
374
+ using ElementCompute = ElementCompute_;
375
+ using ElementZ = ElementZ_;
376
+ using ElementVector = ElementVector_;
377
+
378
+ using ElementT = uint1b_t;
379
+
380
+ static int const kElementsPerAccess = ElementsPerAccess;
381
+ static int const kCount = kElementsPerAccess;
382
+
383
+ using ElementwiseOp = ReLu<ElementCompute>;
384
+ using BinaryOp = plus<ElementCompute>;
385
+
386
+ // Indicates that this epilogue applies only one binary operation
387
+ static bool const kIsSingleSource = true;
388
+
389
+ using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
390
+ using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
391
+ using FragmentC = Array<ElementOutput, kElementsPerAccess>;
392
+ using FragmentZ = Array<ElementZ, kElementsPerAccess>;
393
+ using FragmentT = Array<ElementT, kElementsPerAccess>;
394
+
395
+ /// If true, the 'Z' tensor is stored
396
+ static bool const kStoreZ = true;
397
+
398
+ /// If true, the 'T' tensor is stored
399
+ static bool const kStoreT = StoreT_;
400
+
401
+ /// Host-constructable parameters structure
402
+ struct Params {
403
+
404
+ ElementCompute alpha; ///< scales accumulators
405
+ ElementCompute beta; ///< scales source tensor
406
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
407
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
408
+ ElementZ threshold; ///< ReLu threshold
409
+
410
+ //
411
+ // Methods
412
+ //
413
+ //
414
+ // Methods
415
+ //
416
+
417
+ CUTLASS_HOST_DEVICE
418
+ Params():
419
+ alpha(ElementCompute(1)),
420
+ beta(ElementCompute()),
421
+ alpha_ptr(nullptr),
422
+ beta_ptr(nullptr),
423
+ threshold(ElementCompute()) { }
424
+
425
+ CUTLASS_HOST_DEVICE
426
+ Params(
427
+ ElementCompute alpha,
428
+ ElementCompute beta,
429
+ ElementCompute threshold_ = ElementCompute()
430
+ ):
431
+ alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
432
+
433
+ NumericConverter<ElementZ, ElementCompute> convert_threshold;
434
+
435
+ threshold = convert_threshold(threshold_);
436
+ }
437
+
438
+ CUTLASS_HOST_DEVICE
439
+ Params(
440
+ ElementCompute alpha
441
+ ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(ElementZ()) {
442
+
443
+ }
444
+
445
+ CUTLASS_HOST_DEVICE
446
+ Params(
447
+ ElementCompute const *alpha_ptr,
448
+ ElementCompute const *beta_ptr,
449
+ ElementCompute threshold_ = ElementCompute()
450
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
451
+
452
+ NumericConverter<ElementZ, ElementCompute> convert_threshold;
453
+
454
+ threshold = convert_threshold(threshold_);
455
+ }
456
+
457
+ CUTLASS_HOST_DEVICE
458
+ Params(
459
+ ElementCompute const *alpha_ptr
460
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr), threshold(ElementZ()) {
461
+ }
462
+
463
+ };
464
+
465
+ private:
466
+
467
+ //
468
+ // Data members
469
+ //
470
+
471
+ ElementCompute alpha_;
472
+ ElementCompute beta_;
473
+ ElementZ threshold_;
474
+
475
+ public:
476
+
477
+ //
478
+ // Methods
479
+ //
480
+
481
+ /// Constructor from Params
482
+ CUTLASS_HOST_DEVICE
483
+ LinearCombinationBiasRelu(Params const &params) {
484
+
485
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
486
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
487
+ threshold_ = params.threshold;
488
+ }
489
+
490
+ /// Returns true if source is needed
491
+ CUTLASS_HOST_DEVICE
492
+ bool is_source_needed() const {
493
+ return beta_ != ElementCompute(0);
494
+ }
495
+
496
+ /// Functionally required for serial reduction in the epilogue
497
+ CUTLASS_HOST_DEVICE
498
+ void set_k_partition(int k_partition, int k_partition_count) {
499
+ if (k_partition) {
500
+ beta_ = ElementCompute(1);
501
+ }
502
+
503
+ if (k_partition != k_partition_count - 1) {
504
+ // set to NaN to make ReLU no-op for all except last k partitions
505
+ int64_t allones = -1;
506
+ threshold_ = reinterpret_cast<ElementZ const &>(allones);
507
+ }
508
+ }
509
+
510
+ /// Applies the operation when is_source_needed() is true
511
+ CUTLASS_HOST_DEVICE
512
+ void operator()(
513
+ FragmentZ &frag_Z,
514
+ FragmentT &frag_T,
515
+ FragmentAccumulator const &AB,
516
+ FragmentC const &frag_C,
517
+ FragmentCompute const &V) const {
518
+
519
+ BinaryOp binary_op;
520
+
521
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
522
+ FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
523
+ FragmentCompute result_Z;
524
+
525
+ bool conditions[kElementsPerAccess];
526
+
527
+ CUTLASS_PRAGMA_UNROLL
528
+ for (int i = 0; i < kElementsPerAccess; ++i) {
529
+
530
+ ElementCompute z = alpha_ * tmp_Accum[i];
531
+ z += beta_ * tmp_C[i];
532
+
533
+ z = binary_op(z, V[i]);
534
+ result_Z[i] = z;
535
+ }
536
+
537
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
538
+ frag_Z = convert_z(result_Z);
539
+
540
+ //
541
+ // Compute condition
542
+ //
543
+
544
+ detail::ReluConditional<ElementZ, kElementsPerAccess> relu_conditional;
545
+ relu_conditional(conditions, frag_Z, threshold_);
546
+
547
+ detail::ArrayMaximum<ElementZ, kElementsPerAccess> maximum_op;
548
+ frag_Z = maximum_op(frag_Z, threshold_);
549
+
550
+ if (kStoreT) {
551
+ PackPredicates<kElementsPerAccess> pack_predicates;
552
+ frag_T = pack_predicates(conditions);
553
+ }
554
+ }
555
+
556
+ /// Applies the operation when is_source_needed() is false
557
+ CUTLASS_HOST_DEVICE
558
+ void operator()(
559
+ FragmentZ &frag_Z,
560
+ FragmentT &frag_T,
561
+ FragmentAccumulator const &AB,
562
+ FragmentCompute const &V) const {
563
+
564
+ BinaryOp binary_op;
565
+
566
+ FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
567
+ FragmentCompute result_Z;
568
+
569
+ bool conditions[kElementsPerAccess];
570
+
571
+ CUTLASS_PRAGMA_UNROLL
572
+ for (int i = 0; i < kElementsPerAccess; ++i) {
573
+ ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
574
+ result_Z[i] = z;
575
+ }
576
+
577
+ NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
578
+ frag_Z = convert_z(result_Z);
579
+
580
+ //
581
+ // Compute condition
582
+ //
583
+
584
+ detail::ReluConditional<ElementZ, kElementsPerAccess> relu_conditional;
585
+ relu_conditional(conditions, frag_Z, threshold_);
586
+
587
+ detail::ArrayMaximum<ElementZ, kElementsPerAccess> maximum_op;
588
+ frag_Z = maximum_op(frag_Z, threshold_);
589
+
590
+ //
591
+ // Compute conditions
592
+ //
593
+
594
+ //
595
+ // Store
596
+ //
597
+ if (kStoreT) {
598
+ PackPredicates<kElementsPerAccess> pack_predicates;
599
+ frag_T = pack_predicates(conditions);
600
+ }
601
+ }
602
+ };
603
+
604
+ /////////////////////////////////////////////////////////////////////////////////////////////////
605
+
606
+ } // namespace thread
607
+ } // namespace epilogue
608
+ } // namespace cutlass
609
+
610
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h ADDED
@@ -0,0 +1,684 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear scaling operations used by epilogues. Values are clamped before
33
+ converting to the output element type.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/array.h"
41
+ #include "cutlass/functional.h"
42
+ #include "cutlass/numeric_conversion.h"
43
+ #include "cutlass/epilogue/thread/scale_type.h"
44
+
45
+ /////////////////////////////////////////////////////////////////////////////////////////////////
46
+
47
+ namespace cutlass {
48
+ namespace epilogue {
49
+ namespace thread {
50
+
51
+ /////////////////////////////////////////////////////////////////////////////////////////////////
52
+
53
+ namespace detail {
54
+
55
+ /// Single source of truth for whether to unroll for `LinearCombinationClamp()`
56
+ constexpr bool LinearCombinationClampIsHeavy() {
57
+ return false;
58
+ }
59
+
60
+ }
61
+
62
+ /////////////////////////////////////////////////////////////////////////////////////////////////
63
+
64
+ /// Applies a linear combination operator to an array of elements then clamps the output before
65
+ /// converting to the output element type.
66
+ ///
67
+ /// D = alpha * accumulator + beta * source + uniform
68
+ ///
69
+ template <
70
+ typename ElementOutput_, ///< Data type used to load and store tensors
71
+ int Count, ///< Number of elements computed per operation
72
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
73
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
74
+ typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
75
+ typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination
76
+ ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling
77
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
78
+ >
79
+ class LinearCombinationClamp {
80
+ public:
81
+
82
+ using ElementOutput = ElementOutput_;
83
+ using ElementAccumulator = ElementAccumulator_;
84
+ using ElementCompute = ElementCompute_;
85
+
86
+ static int const kCount = Count;
87
+
88
+ using FragmentOutput = Array<ElementOutput, kCount>;
89
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
90
+ using ComputeFragment = Array<ElementCompute, kCount>;
91
+ using FragmentSource = Array<ElementOutput, kCount>;
92
+
93
+ static FloatRoundStyle const kRound = Round;
94
+
95
+ static bool const kIsHeavy = detail::LinearCombinationClampIsHeavy();
96
+
97
+ /// Host-constructable parameters structure
98
+ struct Params {
99
+
100
+ ElementCompute alpha; ///< scales accumulators
101
+ ElementCompute beta; ///< scales source tensor
102
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
103
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
104
+
105
+ //
106
+ // Methods
107
+ //
108
+
109
+ CUTLASS_HOST_DEVICE
110
+ Params():
111
+ alpha(ElementCompute(1)),
112
+ beta(ElementCompute(0)),
113
+ alpha_ptr(nullptr),
114
+ beta_ptr(nullptr) { }
115
+
116
+ CUTLASS_HOST_DEVICE
117
+ Params(
118
+ ElementCompute alpha,
119
+ ElementCompute beta
120
+ ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
121
+
122
+ }
123
+
124
+ CUTLASS_HOST_DEVICE
125
+ Params(
126
+ ElementCompute alpha
127
+ ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {
128
+
129
+ }
130
+
131
+ CUTLASS_HOST_DEVICE
132
+ Params(
133
+ ElementCompute const *alpha_ptr,
134
+ ElementCompute const *beta_ptr
135
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
136
+
137
+ }
138
+
139
+ CUTLASS_HOST_DEVICE
140
+ Params(
141
+ ElementCompute const *alpha_ptr
142
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
143
+
144
+ }
145
+ };
146
+
147
+ private:
148
+
149
+ //
150
+ // Data members
151
+ //
152
+
153
+ ElementCompute alpha_;
154
+ ElementCompute beta_;
155
+
156
+ public:
157
+
158
+ /// Constructs the function object, possibly loading from pointers in host memory
159
+ CUTLASS_HOST_DEVICE
160
+ LinearCombinationClamp(Params const &params) {
161
+
162
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
163
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
164
+ }
165
+
166
+ /// Returns true if source is needed
167
+ CUTLASS_HOST_DEVICE
168
+ bool is_source_needed() const {
169
+ if (Scale == ScaleType::NoBetaScaling) return true;
170
+
171
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
172
+
173
+ if (Scale == ScaleType::Nothing) return false;
174
+
175
+ return beta_ != ElementCompute(0);
176
+ }
177
+
178
+ /// Functionally required for serial reduction in the epilogue
179
+ CUTLASS_HOST_DEVICE
180
+ void set_k_partition(int k_partition, int k_partition_count) {
181
+ if (k_partition) {
182
+ beta_ = ElementCompute(1);
183
+ }
184
+ }
185
+
186
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
187
+ CUTLASS_HOST_DEVICE
188
+ FragmentOutput operator()(
189
+ FragmentAccumulator const &accumulator,
190
+ FragmentOutput const &source,
191
+ ElementCompute uniform = ElementCompute(0)) const {
192
+
193
+ // Convert source to interal compute numeric type
194
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
195
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
196
+
197
+ ComputeFragment converted_source = source_converter(source);
198
+ ComputeFragment converted_accumulator = accumulator_converter(accumulator);
199
+
200
+ // Perform binary operations
201
+
202
+ ComputeFragment intermediate;
203
+
204
+ multiplies<ComputeFragment> mul_add_source;
205
+ multiply_add<ComputeFragment> mul_add_accumulator;
206
+
207
+ minimum<ComputeFragment> min_accumulator;
208
+ maximum<ComputeFragment> max_accumulator;
209
+
210
+ if (Scale == ScaleType::NoBetaScaling) {
211
+ intermediate = converted_source;
212
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
213
+ } else if (Scale == ScaleType::Nothing) {
214
+ intermediate = converted_accumulator;
215
+ } else {
216
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
217
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
218
+ }
219
+
220
+ /// Clamping constant value
221
+ ElementCompute const kClampMax =
222
+ ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::max());
223
+
224
+ ElementCompute const kClampMin =
225
+ ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::lowest());
226
+
227
+ intermediate = max_accumulator(intermediate, kClampMin);
228
+ intermediate = min_accumulator(intermediate, kClampMax);
229
+
230
+ // Convert to destination numeric type
231
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
232
+
233
+ return destination_converter(intermediate);
234
+ }
235
+
236
+ /// Computes linear scaling: D = alpha * accumulator
237
+ CUTLASS_HOST_DEVICE
238
+ FragmentOutput operator()(
239
+ FragmentAccumulator const &accumulator) const {
240
+
241
+ // Convert source to interal compute numeric type
242
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
243
+
244
+ ComputeFragment converted_accumulator = accumulator_converter(accumulator);
245
+
246
+ // Perform binary operations
247
+
248
+ ComputeFragment intermediate;
249
+
250
+ multiplies<ComputeFragment> mul_accumulator;
251
+
252
+ minimum<ComputeFragment> min_accumulator;
253
+ maximum<ComputeFragment> max_accumulator;
254
+
255
+ if (Scale == ScaleType::Nothing) {
256
+ intermediate = converted_accumulator;
257
+ } else {
258
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
259
+ }
260
+
261
+ /// Clamping constant value
262
+ ElementCompute const kClampMax =
263
+ ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::max());
264
+
265
+ ElementCompute const kClampMin =
266
+ ElementCompute(cutlass::platform::numeric_limits<ElementOutput>::lowest());
267
+
268
+ intermediate = max_accumulator(intermediate, kClampMin);
269
+ intermediate = min_accumulator(intermediate, kClampMax);
270
+
271
+ // Convert to destination numeric type
272
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
273
+
274
+ return destination_converter(intermediate);
275
+ }
276
+ };
277
+
278
+ /////////////////////////////////////////////////////////////////////////////////////////////////
279
+
280
+ // Conditional guards to enable partial specialization for packed integers
281
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
282
+
283
+ /// Applies a linear combination operator to an array of elements then clamps the output before
284
+ /// converting to the output element type.
285
+ ///
286
+ /// D = alpha * accumulator + beta * source + uniform
287
+ ///
288
+ template <
289
+ typename ElementOutput_, ///< Data type used to load and store tensors
290
+ int Count, ///< Number of elements computed per operation
291
+ ScaleType::Kind Scale, ///< Control Alpha and Beta scaling
292
+ FloatRoundStyle Round
293
+ >
294
+ class LinearCombinationClamp<ElementOutput_, Count, int, float, Scale, Round> {
295
+ public:
296
+
297
+ using ElementOutput = ElementOutput_;
298
+ using ElementAccumulator = int;
299
+ using ElementCompute = float;
300
+
301
+ static_assert(
302
+ cutlass::platform::numeric_limits<ElementOutput>::is_integer,
303
+ "This elementwise op expects the output to be int.");
304
+
305
+ static int const kCount = Count;
306
+
307
+ using FragmentOutput = Array<ElementOutput, kCount>;
308
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
309
+ using ComputeFragment = Array<ElementCompute, kCount>;
310
+
311
+ static FloatRoundStyle const kRound = Round;
312
+
313
+ static bool const kIsHeavy = detail::LinearCombinationClampIsHeavy();
314
+
315
+ /// Host-constructable parameters structure
316
+ struct Params {
317
+
318
+ ElementCompute alpha; ///< scales accumulators
319
+ ElementCompute beta; ///< scales source tensor
320
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
321
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
322
+
323
+ //
324
+ // Methods
325
+ //
326
+
327
+ CUTLASS_HOST_DEVICE
328
+ Params():
329
+ alpha(ElementCompute(1)),
330
+ beta(ElementCompute(0)),
331
+ alpha_ptr(nullptr),
332
+ beta_ptr(nullptr) { }
333
+
334
+ CUTLASS_HOST_DEVICE
335
+ Params(
336
+ ElementCompute alpha,
337
+ ElementCompute beta
338
+ ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
339
+
340
+ }
341
+
342
+ CUTLASS_HOST_DEVICE
343
+ Params(
344
+ ElementCompute alpha
345
+ ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {
346
+
347
+ }
348
+
349
+ CUTLASS_HOST_DEVICE
350
+ Params(
351
+ ElementCompute const *alpha_ptr,
352
+ ElementCompute const *beta_ptr
353
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
354
+
355
+ }
356
+
357
+ CUTLASS_HOST_DEVICE
358
+ Params(
359
+ ElementCompute const *alpha_ptr
360
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {
361
+
362
+ }
363
+ };
364
+
365
+ private:
366
+
367
+ //
368
+ // Data members
369
+ //
370
+
371
+ ElementCompute alpha_;
372
+ ElementCompute beta_;
373
+
374
+ public:
375
+
376
+ /// Constructs the function object, possibly loading from pointers in host memory
377
+ CUTLASS_HOST_DEVICE
378
+ LinearCombinationClamp(Params const &params) {
379
+
380
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
381
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
382
+ }
383
+
384
+ /// Returns true if source is needed
385
+ CUTLASS_HOST_DEVICE
386
+ bool is_source_needed() const {
387
+ if (Scale == ScaleType::NoBetaScaling) return true;
388
+
389
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
390
+
391
+ if (Scale == ScaleType::Nothing) return false;
392
+
393
+ return beta_ != ElementCompute(0);
394
+ }
395
+
396
+ /// Functionally required for serial reduction in the epilogue
397
+ CUTLASS_HOST_DEVICE
398
+ void set_k_partition(int k_partition, int k_partition_count) {
399
+ if (k_partition) {
400
+ beta_ = ElementCompute(1);
401
+ }
402
+ }
403
+
404
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
405
+ CUTLASS_HOST_DEVICE
406
+ FragmentOutput operator()(
407
+ FragmentAccumulator const &accumulator,
408
+ FragmentOutput const &source,
409
+ ElementCompute uniform = ElementCompute(0)) const {
410
+
411
+ // Convert source to interal compute numeric type
412
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
413
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
414
+
415
+ ComputeFragment converted_source = source_converter(source);
416
+ ComputeFragment converted_accumulator = accumulator_converter(accumulator);
417
+
418
+ // Compute linear scaling in floating point
419
+ ComputeFragment intermediate;
420
+
421
+ multiplies<ComputeFragment> mul_add_source;
422
+ multiply_add<ComputeFragment> mul_add_accumulator;
423
+
424
+ // Float min-max
425
+ if (Scale == ScaleType::NoBetaScaling) {
426
+ intermediate = converted_source;
427
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
428
+ } else if (Scale == ScaleType::Nothing) {
429
+ intermediate = converted_accumulator;
430
+ } else {
431
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
432
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
433
+ }
434
+
435
+ //
436
+ // Convert float => ElementOutput_ with clamping
437
+ //
438
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
439
+
440
+ return destination_converter(intermediate);
441
+ }
442
+
443
+ /// Computes linear scaling: D = alpha * accumulator
444
+ CUTLASS_HOST_DEVICE
445
+ FragmentOutput operator()(FragmentAccumulator const &accumulator) const {
446
+
447
+ // Convert source to interal compute numeric type
448
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
449
+
450
+ ComputeFragment converted_accumulator = accumulator_converter(accumulator);
451
+
452
+ // Compute linear scaling in floating point
453
+ ComputeFragment intermediate;
454
+
455
+ multiplies<ComputeFragment> mul_add_accumulator;
456
+
457
+ // Float min-max
458
+ if (Scale == ScaleType::Nothing) {
459
+ intermediate = converted_accumulator;
460
+ } else {
461
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
462
+ }
463
+
464
+ //
465
+ // Convert float => ElementOutput_ with clamping
466
+ //
467
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
468
+
469
+ return destination_converter(intermediate);
470
+ }
471
+ };
472
+
473
+ #endif // Conditional guards to enable partial specialization for packed integers
474
+
475
+ ////////////////////////////////////////////////////////////////////////////////
476
+
477
+ /// Applies a linear combination operator to an array of elements then clamps
478
+ /// the output before converting to the output element type.
479
+ ///
480
+ /// D = alpha * accumulator + beta * source + uniform
481
+ ///
482
+ /// Note: The below method only when problem_size_K <= 256 for signed int8 gemm
483
+ /// or problem_size_K <= 128 for unsigned int8 gemm. The default approach is
484
+ /// above.
485
+ template <
486
+ /// Data type used to load and store< tensors
487
+ typename ElementOutput_,
488
+ /// Number of elements computed per operation
489
+ int Count,
490
+ ///< Control Alpha and Beta scaling
491
+ ScaleType::Kind Scale = ScaleType::Default,
492
+ /// Rounding mode
493
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest>
494
+ class FastLinearCombinationClamp {
495
+ public:
496
+ using ElementOutput = ElementOutput_;
497
+ using ElementAccumulator = int;
498
+ using ElementCompute = float;
499
+
500
+ static_assert(
501
+ cutlass::platform::numeric_limits<ElementOutput>::is_integer,
502
+ "This elementwise op expects the output to be int.");
503
+
504
+ static int const kCount = Count;
505
+
506
+ using FragmentOutput = Array<ElementOutput, kCount>;
507
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
508
+ using ComputeFragment = Array<ElementCompute, kCount>;
509
+
510
+ static FloatRoundStyle const kRound = Round;
511
+
512
+ static bool const kIsHeavy = false;
513
+
514
+ /// Host-constructable parameters structure
515
+ struct Params {
516
+ /// scales accumulators
517
+ ElementCompute alpha;
518
+ /// scales source tensor
519
+ ElementCompute beta;
520
+ /// pointer to accumulator scalar - if not null, loads it from memory
521
+ ElementCompute const *alpha_ptr;
522
+ /// pointer to source scalar - if not null, loads it from memory
523
+ ElementCompute const *beta_ptr;
524
+
525
+ //
526
+ // Methods
527
+ //
528
+
529
+ CUTLASS_HOST_DEVICE
530
+ Params()
531
+ : alpha(ElementCompute(1)),
532
+ beta(ElementCompute(0)),
533
+ alpha_ptr(nullptr),
534
+ beta_ptr(nullptr) {}
535
+
536
+ CUTLASS_HOST_DEVICE
537
+ Params(ElementCompute alpha, ElementCompute beta)
538
+ : alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {}
539
+
540
+ CUTLASS_HOST_DEVICE
541
+ Params(ElementCompute alpha)
542
+ : alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {}
543
+
544
+ CUTLASS_HOST_DEVICE
545
+ Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
546
+ : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
547
+
548
+ CUTLASS_HOST_DEVICE
549
+ Params(ElementCompute const *alpha_ptr)
550
+ : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {}
551
+ };
552
+
553
+ private:
554
+ //
555
+ // Data members
556
+ //
557
+
558
+ ElementCompute alpha_;
559
+ ElementCompute beta_;
560
+
561
+ public:
562
+ /// Constructs the function object, possibly loading from pointers in host
563
+ /// memory
564
+ CUTLASS_HOST_DEVICE
565
+ FastLinearCombinationClamp(Params const &params) {
566
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
567
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
568
+ }
569
+
570
+ /// Returns true if source is needed
571
+ CUTLASS_HOST_DEVICE
572
+ bool is_source_needed() const {
573
+ if (Scale == ScaleType::NoBetaScaling) return true;
574
+
575
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
576
+
577
+ if (Scale == ScaleType::Nothing) return false;
578
+
579
+ return beta_ != ElementCompute(0);
580
+ }
581
+
582
+ /// Functionally required for serial reduction in the epilogue
583
+ CUTLASS_HOST_DEVICE
584
+ void set_k_partition(int k_partition, int k_partition_count) {
585
+ if (k_partition) {
586
+ beta_ = ElementCompute(1);
587
+ }
588
+ }
589
+
590
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
591
+ CUTLASS_HOST_DEVICE
592
+ FragmentOutput operator()(FragmentAccumulator const &accumulator,
593
+ FragmentOutput const &source,
594
+ ElementCompute uniform = ElementCompute(0)) const {
595
+ // Convert source to interal compute numeric type
596
+ FastNumericArrayConverter<ElementCompute, ElementOutput, kCount, Round>
597
+ source_converter;
598
+ FastNumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
599
+ accumulator_converter;
600
+
601
+ ComputeFragment converted_source = source_converter(source);
602
+ ComputeFragment converted_accumulator = accumulator_converter(accumulator);
603
+
604
+ // Compute linear scaling in floating point
605
+ ComputeFragment intermediate;
606
+
607
+ multiplies<ComputeFragment> mul_add_source;
608
+ multiply_add<ComputeFragment> mul_add_accumulator;
609
+
610
+ minimum<ComputeFragment> min_accumulator;
611
+ maximum<ComputeFragment> max_accumulator;
612
+
613
+ // Float min-max
614
+ if (Scale == ScaleType::NoBetaScaling) {
615
+ intermediate = converted_source;
616
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
617
+ } else if (Scale == ScaleType::Nothing) {
618
+ intermediate = converted_accumulator;
619
+ } else {
620
+ intermediate =
621
+ mul_add_source(beta_, converted_source); // X = beta * C + uniform
622
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator,
623
+ intermediate); // D = alpha * Accum + X
624
+ }
625
+
626
+ /// Clamping constant value
627
+ ElementCompute const kClamp =
628
+ ElementCompute(1 << (sizeof_bits<ElementOutput>::value - 1));
629
+
630
+ intermediate = max_accumulator(intermediate, -kClamp);
631
+ intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1));
632
+
633
+ // Convert to destination numeric type
634
+ FastNumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
635
+ destination_converter;
636
+
637
+ return destination_converter(intermediate);
638
+ }
639
+
640
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
641
+ CUTLASS_HOST_DEVICE
642
+ FragmentOutput operator()(FragmentAccumulator const &accumulator) const {
643
+
644
+ // Convert source to interal compute numeric type
645
+ FastNumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
646
+ accumulator_converter;
647
+
648
+ ComputeFragment converted_accumulator = accumulator_converter(accumulator);
649
+
650
+ // Compute linear scaling in floating point
651
+ ComputeFragment intermediate;
652
+
653
+ multiplies<ComputeFragment> mul_accumulator;
654
+
655
+ minimum<ComputeFragment> min_accumulator;
656
+ maximum<ComputeFragment> max_accumulator;
657
+
658
+ // Float min-max
659
+ if (Scale == ScaleType::Nothing) {
660
+ intermediate = converted_accumulator;
661
+ } else {
662
+ intermediate = mul_accumulator(alpha_, converted_accumulator);
663
+ }
664
+
665
+ /// Clamping constant value
666
+ ElementCompute const kClamp =
667
+ ElementCompute(1 << (sizeof_bits<ElementOutput>::value - 1));
668
+
669
+ intermediate = max_accumulator(intermediate, -kClamp);
670
+ intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1));
671
+
672
+ // Convert to destination numeric type
673
+ FastNumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
674
+ destination_converter;
675
+
676
+ return destination_converter(intermediate);
677
+ }
678
+ };
679
+
680
+ ////////////////////////////////////////////////////////////////////////////////
681
+
682
+ } // namespace thread
683
+ } // namespace epilogue
684
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+
33
+ \brief Functor performing linear combination followed by dGelu operation
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/half.h"
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/numeric_types.h"
41
+ #include "cutlass/array.h"
42
+ #include "cutlass/constants.h"
43
+ #include "cutlass/fast_math.h"
44
+ #include "cutlass/functional.h"
45
+ #include "cutlass/numeric_conversion.h"
46
+ #include "cutlass/epilogue/thread/activation.h"
47
+
48
+ /////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ namespace cutlass {
51
+ namespace epilogue {
52
+ namespace thread {
53
+
54
+ /////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
+ /// Applies a linear combination operator to an array of elements.
57
+ ///
58
+ /// D = alpha * accumulator + beta * source + uniform
59
+ ///
60
+ template <
61
+ typename ElementCompute_, ///< Data type returned by this functor
62
+ typename ElementAccumulator_, ///< Data type of accumulators
63
+ typename ElementSource_, ///< Data type of source tensor
64
+ typename ElementTensor_, ///< Data type of additional tensor
65
+ int Count, ///< Number of elements computed per operation
66
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
67
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
68
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
69
+ >
70
+ class LinearCombinationDGelu {
71
+ public:
72
+
73
+ using ElementOutput = ElementSource_;
74
+ using ElementCompute = ElementCompute_;
75
+ using ElementAccumulator = ElementAccumulator_;
76
+ using ElementSource = ElementSource_;
77
+ using ElementTensor = ElementTensor_;
78
+
79
+ static bool const kIsHeavy = true;
80
+
81
+ static int const kCount = Count;
82
+
83
+ using FragmentCompute = Array<ElementCompute, kCount>;
84
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
85
+ using FragmentSource = Array<ElementSource, kCount>;
86
+ using FragmentTensor = Array<ElementTensor, kCount>;
87
+
88
+ static FloatRoundStyle const kRound = Round;
89
+
90
+ /// Host-constructable parameters structure
91
+ struct Params {
92
+
93
+ ElementCompute alpha; ///< scales accumulators
94
+ ElementCompute beta; ///< scales source tensor
95
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
96
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
97
+ ElementCompute threshold; ///< minimum value that is output
98
+ //
99
+ // Methods
100
+ //
101
+
102
+ CUTLASS_HOST_DEVICE
103
+ Params():
104
+ alpha(ElementCompute(1)),
105
+ beta(ElementCompute(0)),
106
+ threshold(ElementCompute(0)),
107
+ alpha_ptr(nullptr),
108
+ beta_ptr(nullptr) { }
109
+
110
+ CUTLASS_HOST_DEVICE
111
+ Params(
112
+ ElementCompute alpha,
113
+ ElementCompute beta,
114
+ ElementCompute threshold = ElementCompute(0)
115
+ ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
116
+
117
+ }
118
+
119
+ CUTLASS_HOST_DEVICE
120
+ Params(
121
+ ElementCompute const *alpha_ptr,
122
+ ElementCompute const *beta_ptr,
123
+ ElementCompute threshold = ElementCompute(0)
124
+ ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
125
+
126
+ }
127
+ };
128
+
129
+ private:
130
+
131
+ //
132
+ // Data members
133
+ //
134
+
135
+ ElementCompute alpha_;
136
+ ElementCompute beta_;
137
+ ElementCompute threshold_;
138
+ bool participates_in_reduction_;
139
+
140
+ public:
141
+
142
+ /// Constructs the function object, possibly loading from pointers in host memory
143
+ CUTLASS_HOST_DEVICE
144
+ LinearCombinationDGelu(Params const &params) {
145
+
146
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
147
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
148
+ threshold_ = params.threshold;
149
+ participates_in_reduction_ = true;
150
+ }
151
+
152
+ /// Returns true if source is needed
153
+ CUTLASS_HOST_DEVICE
154
+ bool is_source_needed() const {
155
+ return beta_ != ElementCompute(0);
156
+ }
157
+
158
+ /// Returns true if the threadblock computes the reduction
159
+ CUTLASS_HOST_DEVICE
160
+ bool participates_in_reduction() const {
161
+ return participates_in_reduction_;
162
+ }
163
+
164
+ /// Functionally required for serial reduction in the epilogue
165
+ CUTLASS_HOST_DEVICE
166
+ void set_k_partition(int k_partition, int k_partition_count) {
167
+ if (k_partition) {
168
+ beta_ = ElementCompute(1);
169
+ }
170
+
171
+ if (k_partition != k_partition_count - 1) {
172
+ // set to NaN to make ReLU no-op for all except last k partitions
173
+ int64_t allones = -1;
174
+ threshold_ = reinterpret_cast<ElementCompute const &>(allones);
175
+ // Avoid computing the reduction if this isn't the final Split-K slice
176
+ participates_in_reduction_ = false;
177
+ }
178
+ }
179
+
180
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
181
+ CUTLASS_HOST_DEVICE
182
+ FragmentCompute operator()(
183
+ FragmentAccumulator const &accumulator,
184
+ FragmentSource const &source,
185
+ FragmentTensor const &tensor) const {
186
+
187
+ // Convert source to interal compute numeric type
188
+ NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
189
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
190
+
191
+ FragmentCompute converted_source = source_converter(source);
192
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
193
+
194
+ // Perform binary operations
195
+ FragmentCompute intermediate;
196
+
197
+ multiplies<FragmentCompute> mul_add_source;
198
+ multiply_add<FragmentCompute> mul_add_accumulator;
199
+
200
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
201
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
202
+
203
+ dGELU<ElementCompute> gelu_op;
204
+
205
+ // dGelu
206
+ CUTLASS_PRAGMA_UNROLL
207
+ for (int i = 0; i < kCount; ++i) {
208
+ intermediate[i] = gelu_op(intermediate[i], ElementCompute(tensor[i]));
209
+ }
210
+
211
+ return intermediate;
212
+ }
213
+
214
+ /// Computes linear scaling: D = alpha * accumulator
215
+ CUTLASS_HOST_DEVICE
216
+ FragmentCompute operator()(
217
+ FragmentAccumulator const &accumulator,
218
+ FragmentTensor const &tensor) const {
219
+
220
+ // Convert source to interal compute numeric type
221
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
222
+
223
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
224
+
225
+ // Perform binary operations
226
+ FragmentCompute intermediate;
227
+
228
+ multiplies<FragmentCompute> mul_accumulator;
229
+
230
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
231
+
232
+ dGELU<ElementCompute> gelu_op;
233
+
234
+ // dGelu with conversion
235
+ CUTLASS_PRAGMA_UNROLL
236
+ for (int i = 0; i < kCount; ++i) {
237
+ intermediate[i] = gelu_op(intermediate[i], ElementCompute(tensor[i]));
238
+ }
239
+
240
+ return intermediate;
241
+ }
242
+ };
243
+
244
+ /////////////////////////////////////////////////////////////////////////////////////////////////
245
+
246
+ } // namespace thread
247
+ } // namespace epilogue
248
+ } // namespace cutlass
249
+
250
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination with a maximum operation used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/half.h"
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/array.h"
41
+ #include "cutlass/functional.h"
42
+ #include "cutlass/numeric_conversion.h"
43
+ #include "cutlass/epilogue/thread/activation.h"
44
+
45
+ /////////////////////////////////////////////////////////////////////////////////////////////////
46
+
47
+ namespace cutlass {
48
+ namespace epilogue {
49
+ namespace thread {
50
+
51
+ /////////////////////////////////////////////////////////////////////////////////////////////////
52
+
53
+ /// Applies a linear combination operator to an array of elements.
54
+ ///
55
+ /// D = alpha * accumulator + beta * source + uniform
56
+ ///
57
+ template <
58
+ typename ElementCompute_, ///< Data type returned by this functor
59
+ typename ElementAccumulator_, ///< Data type of accumulators
60
+ typename ElementSource_, ///< Data type of source tensor
61
+ typename ElementTensor_, ///< Data type of additional tensor
62
+ int Count, ///< Number of elements computed per operation
63
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
64
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
65
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
66
+ >
67
+ class LinearCombinationDRelu {
68
+ public:
69
+
70
+ using ElementOutput = ElementSource_;
71
+ using ElementCompute = ElementCompute_;
72
+ using ElementAccumulator = ElementAccumulator_;
73
+ using ElementSource = ElementSource_;
74
+ using ElementTensor = ElementTensor_;
75
+
76
+ static int const kCount = Count;
77
+
78
+ using FragmentCompute = Array<ElementCompute, kCount>;
79
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
80
+ using FragmentSource = Array<ElementSource, kCount>;
81
+ using FragmentTensor = Array<ElementTensor, kCount>;
82
+
83
+ static FloatRoundStyle const kRound = Round;
84
+
85
+ /// Host-constructable parameters structure
86
+ struct Params {
87
+
88
+ ElementCompute alpha; ///< scales accumulators
89
+ ElementCompute beta; ///< scales source tensor
90
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
91
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
92
+ ElementCompute threshold; ///< minimum value that is output
93
+ //
94
+ // Methods
95
+ //
96
+
97
+ CUTLASS_HOST_DEVICE
98
+ Params():
99
+ alpha(ElementCompute(1)),
100
+ beta(ElementCompute(0)),
101
+ threshold(ElementCompute(0)),
102
+ alpha_ptr(nullptr),
103
+ beta_ptr(nullptr) { }
104
+
105
+ CUTLASS_HOST_DEVICE
106
+ Params(
107
+ ElementCompute alpha,
108
+ ElementCompute beta,
109
+ ElementCompute threshold = ElementCompute(0)
110
+ ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
111
+
112
+ }
113
+
114
+ CUTLASS_HOST_DEVICE
115
+ Params(
116
+ ElementCompute const *alpha_ptr,
117
+ ElementCompute const *beta_ptr,
118
+ ElementCompute threshold = ElementCompute(0)
119
+ ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
120
+
121
+ }
122
+ };
123
+
124
+ private:
125
+
126
+ //
127
+ // Data members
128
+ //
129
+
130
+ ElementCompute alpha_;
131
+ ElementCompute beta_;
132
+ ElementTensor threshold_;
133
+ bool participates_in_reduction_;
134
+
135
+ public:
136
+
137
+ /// Constructs the function object, possibly loading from pointers in host memory
138
+ CUTLASS_HOST_DEVICE
139
+ LinearCombinationDRelu(Params const &params) {
140
+
141
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
142
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
143
+ threshold_ = ElementTensor(params.threshold);
144
+ participates_in_reduction_ = true;
145
+ }
146
+
147
+ /// Returns true if source is needed
148
+ CUTLASS_HOST_DEVICE
149
+ bool is_source_needed() const {
150
+ return beta_ != ElementCompute(0);
151
+ }
152
+
153
+ /// Returns true if the threadblock computes the reduction
154
+ CUTLASS_HOST_DEVICE
155
+ bool participates_in_reduction() const {
156
+ return participates_in_reduction_;
157
+ }
158
+
159
+ /// Functionally required for serial reduction in the epilogue
160
+ CUTLASS_DEVICE
161
+ void set_k_partition(int k_partition, int k_partition_count) {
162
+ if (k_partition) {
163
+ beta_ = ElementCompute(1);
164
+ }
165
+
166
+ if (k_partition != k_partition_count - 1) {
167
+ // set to NaN to make ReLU no-op for all except last k partitions
168
+ int64_t allones = -1;
169
+ threshold_ = reinterpret_cast<ElementTensor const &>(allones);
170
+ participates_in_reduction_ = false;
171
+ }
172
+ }
173
+
174
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
175
+ CUTLASS_HOST_DEVICE
176
+ FragmentCompute operator()(
177
+ FragmentAccumulator const &accumulator,
178
+ FragmentSource const &source,
179
+ FragmentTensor const &tensor) const {
180
+
181
+ // Convert source to interal compute numeric type
182
+ NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
183
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
184
+
185
+ FragmentCompute converted_source = source_converter(source);
186
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
187
+
188
+ // Perform binary operations
189
+ FragmentCompute intermediate;
190
+
191
+ multiplies<FragmentCompute> mul_add_source;
192
+ multiply_add<FragmentCompute> mul_add_accumulator;
193
+
194
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C
195
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
196
+
197
+ // dReLU = (cond ? dy : 0)
198
+ CUTLASS_PRAGMA_UNROLL
199
+ for (int i = 0; i < kCount; ++i) {
200
+ ElementTensor cond = tensor[i];
201
+ if (cond <= threshold_) {
202
+ intermediate[i] = ElementCompute();
203
+ }
204
+ }
205
+
206
+ return intermediate;
207
+ }
208
+
209
+ /// Computes linear scaling: D = alpha * accumulator
210
+ CUTLASS_HOST_DEVICE
211
+ FragmentCompute operator()(
212
+ FragmentAccumulator const &accumulator,
213
+ FragmentTensor const &tensor) const {
214
+
215
+ // Convert source to interal compute numeric type
216
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
217
+
218
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
219
+
220
+ // Perform binary operations
221
+ FragmentCompute intermediate;
222
+
223
+ multiplies<FragmentCompute> mul_accumulator;
224
+
225
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
226
+
227
+ // dReLU = (cond ? dy : 0)
228
+ CUTLASS_PRAGMA_UNROLL
229
+ for (int i = 0; i < kCount; ++i) {
230
+ ElementTensor cond = tensor[i];
231
+ if (cond <= threshold_) {
232
+ intermediate[i] = ElementCompute();
233
+ }
234
+ }
235
+
236
+ return intermediate;
237
+ }
238
+ };
239
+
240
+
241
+ /////////////////////////////////////////////////////////////////////////////////////////////////
242
+
243
+ /// Applies a linear combination operator to an array of elements.
244
+ ///
245
+ /// D = alpha * accumulator + beta * source + uniform
246
+ ///
247
+ template <
248
+ typename ElementCompute_, ///< Data type returned by this functor
249
+ typename ElementAccumulator_, ///< Data type of accumulators
250
+ typename ElementSource_, ///< Data type of source tensor
251
+ int Count, ///< Number of elements computed per operation
252
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
253
+ >
254
+ class LinearCombinationDReluConditionalBits {
255
+ public:
256
+
257
+ using ElementOutput = ElementSource_;
258
+ using ElementCompute = ElementCompute_;
259
+ using ElementAccumulator = ElementAccumulator_;
260
+ using ElementSource = ElementSource_;
261
+ using ElementTensor = uint1b_t;
262
+
263
+ static bool const kIsHeavy = false;
264
+
265
+ static int const kCount = Count;
266
+
267
+ using FragmentCompute = Array<ElementCompute, kCount>;
268
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
269
+ using FragmentSource = Array<ElementSource, kCount>;
270
+ using FragmentTensor = Array<ElementTensor, kCount>;
271
+
272
+ static FloatRoundStyle const kRound = Round;
273
+
274
+ /// Host-constructable parameters structure
275
+ struct Params {
276
+
277
+ ElementCompute alpha; ///< scales accumulators
278
+ ElementCompute beta; ///< scales source tensor
279
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
280
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
281
+ //
282
+ // Methods
283
+ //
284
+
285
+ CUTLASS_HOST_DEVICE
286
+ Params():
287
+ alpha(ElementCompute(1)),
288
+ beta(ElementCompute(0)),
289
+ alpha_ptr(nullptr),
290
+ beta_ptr(nullptr) { }
291
+
292
+ CUTLASS_HOST_DEVICE
293
+ Params(
294
+ ElementCompute alpha,
295
+ ElementCompute beta
296
+ ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
297
+
298
+ }
299
+
300
+ CUTLASS_HOST_DEVICE
301
+ Params(
302
+ ElementCompute const *alpha_ptr,
303
+ ElementCompute const *beta_ptr
304
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
305
+
306
+ }
307
+ };
308
+
309
+ private:
310
+
311
+ //
312
+ // Data members
313
+ //
314
+
315
+ ElementCompute alpha_;
316
+ ElementCompute beta_;
317
+ FragmentTensor predicate_mask_;
318
+ bool participates_in_reduction_;
319
+
320
+ public:
321
+
322
+ /// Constructs the function object, possibly loading from pointers in host memory
323
+ CUTLASS_HOST_DEVICE
324
+ LinearCombinationDReluConditionalBits(Params const &params) {
325
+
326
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
327
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
328
+ participates_in_reduction_ = true;
329
+ predicate_mask_.clear();
330
+ }
331
+
332
+ /// Returns true if source is needed
333
+ CUTLASS_HOST_DEVICE
334
+ bool is_source_needed() const {
335
+ return beta_ != ElementCompute(0);
336
+ }
337
+
338
+ /// Returns true if the threadblock computes the reduction
339
+ CUTLASS_HOST_DEVICE
340
+ bool participates_in_reduction() const {
341
+ return participates_in_reduction_;
342
+ }
343
+
344
+ /// Functionally required for serial reduction in the epilogue
345
+ CUTLASS_HOST_DEVICE
346
+ void set_k_partition(int k_partition, int k_partition_count) {
347
+ predicate_mask_.clear();
348
+
349
+ if (k_partition) {
350
+ beta_ = ElementCompute(1);
351
+ }
352
+
353
+ if (k_partition != k_partition_count - 1) {
354
+ // Avoid computing the reduction if this isn't the final Split-K slice
355
+ participates_in_reduction_ = false;
356
+
357
+ bit_not<FragmentTensor> not_op;
358
+ predicate_mask_ = not_op(predicate_mask_);
359
+ }
360
+ }
361
+
362
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
363
+ CUTLASS_DEVICE
364
+ FragmentCompute operator()(
365
+ FragmentAccumulator const &accumulator,
366
+ FragmentSource const &source,
367
+ FragmentTensor const &tensor) const {
368
+
369
+ // Convert source to interal compute numeric type
370
+ NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
371
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
372
+
373
+ FragmentCompute converted_source = source_converter(source);
374
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
375
+
376
+ // Perform binary operations
377
+ FragmentCompute intermediate;
378
+
379
+ multiplies<FragmentCompute> mul_add_source;
380
+ multiply_add<FragmentCompute> mul_add_accumulator;
381
+
382
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
383
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
384
+
385
+ bit_or<FragmentTensor> or_op;
386
+
387
+ FragmentTensor predicates = or_op(tensor, predicate_mask_);
388
+
389
+ // Obtain from packed bits
390
+ bool conditions[kCount];
391
+ UnpackPredicates<kCount> unpack_predicates;
392
+
393
+ unpack_predicates(conditions, predicates);
394
+
395
+ // dReLU = (cond ? dy : 0)
396
+ CUTLASS_PRAGMA_UNROLL
397
+ for (int i = 0; i < kCount; ++i) {
398
+ if (!conditions[i]) {
399
+ intermediate[i] = ElementCompute();
400
+ }
401
+ }
402
+
403
+ return intermediate;
404
+ }
405
+
406
+ /// Computes linear scaling: D = alpha * accumulator
407
+ CUTLASS_HOST_DEVICE
408
+ FragmentCompute operator()(
409
+ FragmentAccumulator const &accumulator,
410
+ FragmentTensor const &tensor) const {
411
+
412
+ // Convert source to interal compute numeric type
413
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
414
+
415
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
416
+
417
+ // Perform binary operations
418
+ FragmentCompute intermediate;
419
+
420
+ multiplies<FragmentCompute> mul_accumulator;
421
+
422
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
423
+
424
+ bit_or<FragmentTensor> or_op;
425
+
426
+ FragmentTensor predicates = or_op(tensor, predicate_mask_);
427
+
428
+ // Obtain from packed bits
429
+ bool conditions[kCount];
430
+ UnpackPredicates<kCount> unpack_predicates;
431
+
432
+ unpack_predicates(conditions, predicates);
433
+
434
+ // dReLU = (cond ? dy : 0)
435
+ CUTLASS_PRAGMA_UNROLL
436
+ for (int i = 0; i < kCount; ++i) {
437
+ if (!conditions[i]) {
438
+ intermediate[i] = ElementCompute();
439
+ }
440
+ }
441
+
442
+ return intermediate;
443
+ }
444
+ };
445
+
446
+ /////////////////////////////////////////////////////////////////////////////////////////////////
447
+
448
+ } // namespace thread
449
+ } // namespace epilogue
450
+ } // namespace cutlass
451
+
452
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination with GELU operations used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/epilogue/thread/activation.h"
39
+ #include "cutlass/epilogue/thread/linear_combination_generic.h"
40
+
41
+ /////////////////////////////////////////////////////////////////////////////////////////////////
42
+
43
+ namespace cutlass {
44
+ namespace epilogue {
45
+ namespace thread {
46
+
47
+ /////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
+ /// Applies a linear combination operator followed by the GELU activation to an array of elements.
50
+ ///
51
+ /// D = gelu(alpha * accumulator + beta * source + uniform)
52
+ ///
53
+ template <
54
+ typename ElementOutput_, ///< Data type used to load and store tensors
55
+ int Count, ///< Number of elements computed per operation
56
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
57
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
58
+ typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
59
+ typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination
60
+ ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling
61
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
62
+ >
63
+ using LinearCombinationGELU = LinearCombinationGeneric<GELU, ElementOutput_, Count, ElementAccumulator_,
64
+ ElementCompute_, Scale, Round, true>;
65
+
66
+ /////////////////////////////////////////////////////////////////////////////////////////////////
67
+
68
+ } // namespace thread
69
+ } // namespace epilogue
70
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination operations used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/numeric_types.h"
39
+ #include "cutlass/array.h"
40
+ #include "cutlass/functional.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/epilogue/thread/scale_type.h"
43
+
44
+ /////////////////////////////////////////////////////////////////////////////////////////////////
45
+
46
+ namespace cutlass {
47
+ namespace epilogue {
48
+ namespace thread {
49
+
50
+ /////////////////////////////////////////////////////////////////////////////////////////////////
51
+
52
+ template <class Activation, class = void>
53
+ struct GenericActivationTraits {
54
+ static constexpr bool IsArgumentsNeeded = false;
55
+ struct Arguments {};
56
+ };
57
+
58
+ template <class Activation>
59
+ struct GenericActivationTraits<Activation, decltype(typename Activation::Arguments(), void())> {
60
+ static constexpr bool IsArgumentsNeeded = true;
61
+ using Arguments = typename Activation::Arguments;
62
+ };
63
+
64
+ template <typename T>
65
+ struct LinearCombinationGenericParams {
66
+ T alpha; ///< scales accumulators
67
+ T beta; ///< scales source tensor
68
+ T const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
69
+ T const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
70
+
71
+ //
72
+ // Methods
73
+ //
74
+
75
+ CUTLASS_HOST_DEVICE
76
+ LinearCombinationGenericParams():
77
+ alpha(T(1)),
78
+ beta(T(0)),
79
+ alpha_ptr(nullptr),
80
+ beta_ptr(nullptr) { }
81
+
82
+ CUTLASS_HOST_DEVICE
83
+ LinearCombinationGenericParams(
84
+ T alpha,
85
+ T beta = T(0)
86
+ ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { }
87
+
88
+ CUTLASS_HOST_DEVICE
89
+ LinearCombinationGenericParams(
90
+ T const *alpha_ptr,
91
+ T const *beta_ptr = nullptr
92
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { }
93
+ };
94
+
95
+ /////////////////////////////////////////////////////////////////////////////////////////////////
96
+
97
+ /// Applies a linear combination operator followed by an activation function to an array of elements.
98
+ ///
99
+ /// D = activation(alpha * accumulator + beta * source + uniform)
100
+ ///
101
+ template <
102
+ template<typename T> class ActivationFunctor,
103
+ typename ElementOutput_, ///< Data type used to load and store tensors
104
+ int Count, ///< Number of elements computed per operation
105
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
106
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
107
+ typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
108
+ typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination
109
+ ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling
110
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
111
+ bool IsHeavy = false
112
+ >
113
+ class LinearCombinationGeneric {
114
+ public:
115
+
116
+ using ElementOutput = ElementOutput_;
117
+ using ElementAccumulator = ElementAccumulator_;
118
+ using ElementCompute = ElementCompute_;
119
+
120
+ static bool const kIsHeavy = IsHeavy;
121
+ static int const kCount = Count;
122
+ static const ScaleType::Kind kScale = Scale;
123
+
124
+ using FragmentOutput = Array<ElementOutput, kCount>;
125
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
126
+ using FragmentSource = Array<ElementOutput, kCount>;
127
+ using FragmentCompute = Array<ElementCompute, kCount>;
128
+
129
+ static FloatRoundStyle const kRound = Round;
130
+
131
+ /// Host-constructable parameters structure
132
+ struct Params
133
+ : LinearCombinationGenericParams<ElementCompute>,
134
+ GenericActivationTraits<ActivationFunctor<ElementCompute>>::Arguments {
135
+ using LinearCombinationGenericParams<ElementCompute>::LinearCombinationGenericParams;
136
+ };
137
+
138
+ private:
139
+
140
+ //
141
+ // Data members
142
+ //
143
+
144
+ Params params_;
145
+ bool skip_elementwise_;
146
+
147
+ public:
148
+
149
+ /// Constructs the function object, possibly loading from pointers in host memory
150
+ CUTLASS_HOST_DEVICE
151
+ LinearCombinationGeneric(Params const &params) {
152
+ params_ = params;
153
+ params_.alpha = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
154
+ params_.beta = (params.beta_ptr ? *params.beta_ptr : params.beta);
155
+ skip_elementwise_ = false;
156
+ }
157
+
158
+ /// Returns true if source is needed
159
+ CUTLASS_HOST_DEVICE
160
+ bool is_source_needed() const {
161
+ if (Scale == ScaleType::NoBetaScaling) return true;
162
+
163
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
164
+
165
+ if (Scale == ScaleType::Nothing) return false;
166
+
167
+ return params_.beta != ElementCompute(0);
168
+ }
169
+
170
+ /// Functionally required for serial reduction in the epilogue
171
+ CUTLASS_HOST_DEVICE
172
+ void set_k_partition(int k_partition, int k_partition_count) {
173
+ if (k_partition) {
174
+ params_.beta = ElementCompute(1);
175
+ }
176
+
177
+ if (k_partition != k_partition_count - 1) {
178
+ skip_elementwise_ = true;
179
+ }
180
+ }
181
+
182
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
183
+ CUTLASS_HOST_DEVICE
184
+ FragmentOutput operator()(
185
+ FragmentAccumulator const &accumulator,
186
+ FragmentOutput const &source) const {
187
+
188
+ // Convert source to interal compute numeric type
189
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
190
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
191
+
192
+ FragmentCompute converted_source = source_converter(source);
193
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
194
+
195
+ // Perform binary operations
196
+
197
+ FragmentCompute intermediate;
198
+
199
+ multiplies<FragmentCompute> mul_add_source;
200
+ multiply_add<FragmentCompute> mul_add_accumulator;
201
+ ActivationFunctor<FragmentCompute> activation;
202
+
203
+ if (Scale == ScaleType::NoBetaScaling) {
204
+ intermediate = converted_source;
205
+ intermediate = mul_add_accumulator(params_.alpha, converted_accumulator, intermediate); // D = alpha * Accum + X
206
+ } else if (Scale == ScaleType::Nothing) {
207
+ intermediate = converted_accumulator;
208
+ } else {
209
+ intermediate = mul_add_source(params_.beta, converted_source); // X = beta * C + uniform
210
+ intermediate = mul_add_accumulator(params_.alpha, converted_accumulator, intermediate); // D = alpha * Accum + X
211
+ }
212
+
213
+ if constexpr (GenericActivationTraits<ActivationFunctor<ElementCompute>>::IsArgumentsNeeded) {
214
+ intermediate = skip_elementwise_ ? intermediate : activation(intermediate, params_);
215
+ } else {
216
+ intermediate = skip_elementwise_ ? intermediate : activation(intermediate);
217
+ }
218
+
219
+ // Convert to destination numeric type
220
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
221
+
222
+ return destination_converter(intermediate);
223
+ }
224
+
225
+ /// Computes linear scaling: D = alpha * accumulator
226
+ CUTLASS_HOST_DEVICE
227
+ FragmentOutput operator()(
228
+ FragmentAccumulator const &accumulator) const {
229
+
230
+ // Convert source to interal compute numeric type
231
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
232
+
233
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
234
+
235
+ // Perform binary operations
236
+
237
+ FragmentCompute intermediate;
238
+
239
+ multiplies<FragmentCompute> mul_add_accumulator;
240
+ ActivationFunctor<FragmentCompute> activation;
241
+
242
+ if (Scale == ScaleType::Nothing) {
243
+ intermediate = converted_accumulator;
244
+ } else {
245
+ intermediate = mul_add_accumulator(params_.alpha, converted_accumulator); // D = alpha * Accum
246
+ }
247
+
248
+ if constexpr (GenericActivationTraits<ActivationFunctor<FragmentCompute>>::IsArgumentsNeeded) {
249
+ intermediate = skip_elementwise_ ? intermediate : activation(intermediate, params_);
250
+ } else {
251
+ intermediate = skip_elementwise_ ? intermediate : activation(intermediate);
252
+ }
253
+
254
+ // Convert to destination numeric type
255
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
256
+
257
+ return destination_converter(intermediate);
258
+ }
259
+ };
260
+
261
+ /////////////////////////////////////////////////////////////////////////////////////////////////
262
+
263
+ } // namespace thread
264
+ } // namespace epilogue
265
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Functor performing linear combination operations with a generic element-wise activation
34
+ function. Scaling factors are applied to operands A, B, and C. The pre-activation auxiliary
35
+ output is also returned.
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #include "cutlass/cutlass.h"
41
+ #include "cutlass/numeric_types.h"
42
+ #include "cutlass/array.h"
43
+ #include "cutlass/functional.h"
44
+ #include "cutlass/numeric_conversion.h"
45
+ #include "cutlass/epilogue/thread/scale_type.h"
46
+ #include "cutlass/epilogue/thread/linear_combination_generic.h"
47
+
48
+ /////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ namespace cutlass {
51
+ namespace epilogue {
52
+ namespace thread {
53
+
54
+ /////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
/// Applies a linear combination operator to an array of elements.
///
/// Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias
/// D = activation(Aux)
///
/// The alpha/beta scalars are folded together with the operand scaling factors
/// once at construction time, so the per-element loops only see the combined
/// coefficients. Absolute-maximum (amax) bookkeeping for D and Aux is exposed
/// through the get_ptr_*_abs_max() accessors; this functor itself does not
/// compute the amax.
template <
  template<typename T> class ActivationFunctor,
  typename ElementOutput_,                        ///< Data type used to load and store tensors
  typename ElementAuxOutput_,                     ///< Data type used to store auxiliary output
  int Count,                                      ///< Number of elements computed per operation
                                                  ///< Usually it is 128/sizeof_bits<ElementOutput_>,
                                                  ///< but we use 64 or 32 sometimes when there are not enough data to store
  typename ElementAccumulator_ = ElementOutput_,  ///< Accumulator data type
  typename ElementCompute_ = ElementOutput_,      ///< Data type used to compute linear combination
  ScaleType::Kind Scale = ScaleType::Default,     ///< Control Alpha and Beta scaling
  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
  bool IsHeavy = false
>
class LinearCombinationGenericWithScalingAndAbsMax {
public:

  using ElementOutput = ElementOutput_;
  using ElementAuxOutput = ElementAuxOutput_;
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;
  // Scaling factors share the accumulator's numeric type.
  using ElementScalingFactor = ElementAccumulator_;

  /// Data type used for absolute maximum value
  using ElementAbsmax = float;

  // Scaling + amax tracking is only required when the (aux) output is a
  // narrow FP8 type (e4m3 / e5m2).
  static bool const kIsScalingAndAmaxAuxOutputNeeded = (platform::is_same<ElementAuxOutput, cutlass::float_e4m3_t>::value ||
                                                        platform::is_same<ElementAuxOutput, cutlass::float_e5m2_t>::value);
  static bool const kIsScalingAndAmaxOutputNeeded = (platform::is_same<ElementOutput, cutlass::float_e4m3_t>::value ||
                                                     platform::is_same<ElementOutput, cutlass::float_e5m2_t>::value);

  static bool const kIsHeavy = IsHeavy;
  static int const kCount = Count;
  static const ScaleType::Kind kScale = Scale;

  using FragmentOutput = Array<ElementOutput, kCount>;
  using FragmentAuxOutput = Array<ElementAuxOutput, kCount>;
  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
  using FragmentCompute = Array<ElementCompute, kCount>;

  static FloatRoundStyle const kRound = Round;

  /// Host-constructable parameters structure
  struct Params {
    /// alpha/beta (with optional device pointers) plus any activation-specific
    /// arguments, inherited from the generic parameter structures.
    struct ActivationParams
      : LinearCombinationGenericParams<ElementCompute>,
        GenericActivationTraits<ActivationFunctor<ElementCompute>>::Arguments {
      using LinearCombinationGenericParams<ElementCompute>::LinearCombinationGenericParams;
    };

    ActivationParams activation;
    ElementScalingFactor const* scale_a_ptr = nullptr;    ///< pointer to a scalar - if not null, loads it from memory
    ElementScalingFactor const* scale_b_ptr = nullptr;    ///< pointer to b scalar - if not null, loads it from memory
    ElementScalingFactor const* scale_c_ptr = nullptr;    ///< pointer to c scalar - if not null, loads it from memory
    ElementScalingFactor const* scale_d_ptr = nullptr;    ///< pointer to d scalar - if not null, loads it from memory
    ElementScalingFactor const* scale_aux_ptr = nullptr;  ///< pointer to aux scalar - if not null, loads it from memory

    ElementAbsmax * abs_max_aux_ptr = nullptr;            ///< pointer to location to store amax of Aux
    ElementAbsmax * abs_max_D_ptr = nullptr;              ///< pointer to location to store amax of D

    CUTLASS_HOST_DEVICE
    Params() :
      scale_a_ptr(nullptr),
      scale_b_ptr(nullptr),
      scale_c_ptr(nullptr),
      scale_d_ptr(nullptr),
      scale_aux_ptr(nullptr),
      abs_max_aux_ptr(nullptr),
      abs_max_D_ptr(nullptr) {}

    CUTLASS_HOST_DEVICE
    Params(ActivationParams activation_params,
           ElementScalingFactor const* scale_a_ptr,
           ElementScalingFactor const* scale_b_ptr,
           ElementScalingFactor const* scale_c_ptr,
           ElementScalingFactor const* scale_d_ptr,
           ElementScalingFactor const* scale_aux_ptr,
           ElementAbsmax * abs_max_aux_ptr,
           ElementAbsmax * abs_max_D_ptr) :
      activation(activation_params),
      scale_a_ptr(scale_a_ptr),
      scale_b_ptr(scale_b_ptr),
      scale_c_ptr(scale_c_ptr),
      scale_d_ptr(scale_d_ptr),
      scale_aux_ptr(scale_aux_ptr),
      abs_max_aux_ptr(abs_max_aux_ptr),
      abs_max_D_ptr(abs_max_D_ptr) {}
  };

private:

  //
  // Data members
  //

  Params params_;
  // Set when this is a non-final split-K partition: the activation must only
  // run once, on the final partial result.
  bool skip_elementwise_;

  // Scaling factors for output and auxiliary output (resolved from pointers,
  // default 1 when the pointer is null)
  ElementCompute scale_d_;
  ElementCompute scale_aux_;

public:

  /// Constructs the function object, possibly loading from pointers in host memory.
  /// Folds scale_a * scale_b into alpha and scale_c into beta so the
  /// per-element operators only apply two coefficients.
  CUTLASS_HOST_DEVICE
  LinearCombinationGenericWithScalingAndAbsMax(Params const &params) :
    params_(params),
    skip_elementwise_(false),
    scale_d_(ElementCompute(params.scale_d_ptr ? *(params.scale_d_ptr) : ElementScalingFactor(1))),
    scale_aux_(ElementCompute(params.scale_aux_ptr ? *(params.scale_aux_ptr) : ElementScalingFactor(1)))
  {
    // Pointer-provided alpha/beta take precedence over the in-struct values.
    params_.activation.alpha = (params.activation.alpha_ptr ? *params.activation.alpha_ptr : params.activation.alpha);
    params_.activation.beta = (params.activation.beta_ptr ? *params.activation.beta_ptr : params.activation.beta);
    auto scale_a =
      ElementCompute(params.scale_a_ptr ? *(params.scale_a_ptr) : ElementScalingFactor(1));
    auto scale_b =
      ElementCompute(params.scale_b_ptr ? *(params.scale_b_ptr) : ElementScalingFactor(1));
    auto scale_c =
      ElementCompute(params.scale_c_ptr ? *(params.scale_c_ptr) : ElementScalingFactor(1));

    multiplies<ElementCompute> multiply;
    // NOTE: reads the *resolved* values just written into params_ via the
    // originals in params; both reflect the same resolved alpha/beta here.
    params_.activation.alpha = multiply(params.activation.alpha, multiply(scale_a, scale_b));
    params_.activation.beta = multiply(params.activation.beta, scale_c);
  }

  /// Returns true if the source operand C must be loaded
  CUTLASS_HOST_DEVICE
  bool is_source_needed() const {
    if (Scale == ScaleType::NoBetaScaling) return true;

    if (Scale == ScaleType::OnlyAlphaScaling) return false;

    if (Scale == ScaleType::Nothing) return false;

    return params_.activation.beta != ElementCompute(0);
  }

  /// Functionally required for serial reduction in the epilogue
  CUTLASS_HOST_DEVICE
  void set_k_partition(int k_partition, int k_partition_count) {
    if (k_partition) {
      params_.activation.beta = ElementCompute(1);
    }

    // Only the final partition should perform the activation function
    // and scale the output and auxiliary output values.
    if (k_partition != k_partition_count - 1) {
      skip_elementwise_ = true;
      scale_d_ = ElementCompute(1.);
      scale_aux_ = ElementCompute(1.);
    }
  }

  /// Computes linear scaling:
  ///   Aux = (alpha * scale_a * scale_b * accumulator) + (beta * scale_c * source) + bias
  ///   D = activation(Aux)
  /// Results are produced in compute precision; downstream code converts and
  /// applies scale_d_/scale_aux_.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentCompute& output,
    FragmentCompute& aux_output,
    FragmentAccumulator const &accumulator,
    FragmentCompute const& bias,
    FragmentOutput const &source) {

    // Convert source to internal compute numeric type
    NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;

    FragmentCompute converted_source = source_converter(source);
    FragmentCompute converted_accumulator = accumulator_converter(accumulator);

    // Perform binary operations

    FragmentCompute intermediate;

    multiplies<FragmentCompute> multiply;
    plus<FragmentCompute> add;
    multiply_add<FragmentCompute> mul_add_accumulator;
    ActivationFunctor<FragmentCompute> activation;

    if (Scale == ScaleType::NoBetaScaling) {
      intermediate = converted_source;
      intermediate = mul_add_accumulator(params_.activation.alpha, converted_accumulator, intermediate);
    } else if (Scale == ScaleType::Nothing) {
      intermediate = converted_accumulator;
    } else {
      intermediate = multiply(params_.activation.beta, converted_source);
      intermediate = mul_add_accumulator(params_.activation.alpha, converted_accumulator, intermediate);
    }

    intermediate = add(intermediate, bias);

    // Aux is the pre-activation value
    aux_output = intermediate;
    // NOTE(review): this overload queries GenericActivationTraits on
    // ActivationFunctor<ElementCompute> while the bias-only overload below
    // queries ActivationFunctor<FragmentCompute> — presumably equivalent for
    // all activations used here; confirm against activation.h.
    if constexpr (GenericActivationTraits<ActivationFunctor<ElementCompute>>::IsArgumentsNeeded) {
      output = skip_elementwise_ ? intermediate : activation(intermediate, params_.activation);
    } else {
      output = skip_elementwise_ ? intermediate : activation(intermediate);
    }
  }

  /// Computes linear scaling (no source operand):
  ///   Aux = (alpha * scale_a * scale_b * accumulator) + bias
  ///   D = activation(Aux)
  CUTLASS_DEVICE
  void operator()(
    FragmentCompute& output,
    FragmentCompute& aux_output,
    FragmentAccumulator const &accumulator,
    FragmentCompute const& bias) {

    // Convert accumulator to internal compute numeric type
    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;

    FragmentCompute converted_accumulator = accumulator_converter(accumulator);

    // Perform binary operations

    FragmentCompute intermediate;

    multiplies<FragmentCompute> multiply;
    plus<FragmentCompute> add;
    ActivationFunctor<FragmentCompute> activation;

    if (Scale == ScaleType::Nothing) {
      intermediate = converted_accumulator;
    } else {
      intermediate = multiply(params_.activation.alpha, converted_accumulator);
    }

    intermediate = add(intermediate, bias);

    // Aux is the pre-activation value
    aux_output = intermediate;
    if constexpr (GenericActivationTraits<ActivationFunctor<FragmentCompute>>::IsArgumentsNeeded) {
      output = skip_elementwise_ ? intermediate : activation(intermediate, params_.activation);
    } else {
      output = skip_elementwise_ ? intermediate : activation(intermediate);
    }
  }

  /// Location to which the amax of D should be written (may be null)
  CUTLASS_HOST_DEVICE
  ElementAbsmax* get_ptr_output_abs_max() const {
    return params_.abs_max_D_ptr;
  }

  /// Location to which the amax of Aux should be written (may be null)
  CUTLASS_HOST_DEVICE
  ElementAbsmax* get_ptr_aux_output_abs_max() const {
    return params_.abs_max_aux_ptr;
  }

  /// Scaling factor to apply to D (1 on non-final split-K partitions)
  CUTLASS_HOST_DEVICE
  ElementCompute get_scale_d() const {
    return scale_d_;
  }

  /// Scaling factor to apply to Aux (1 on non-final split-K partitions)
  CUTLASS_HOST_DEVICE
  ElementCompute get_scale_aux() const {
    return scale_aux_;
  }
};
320
+
321
+ /////////////////////////////////////////////////////////////////////////////////////////////////
322
+
323
+ } // namespace thread
324
+ } // namespace epilogue
325
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination with HardSwish operations used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/epilogue/thread/activation.h"
39
+ #include "cutlass/epilogue/thread/linear_combination_generic.h"
40
+
41
+ /////////////////////////////////////////////////////////////////////////////////////////////////
42
+
43
+ namespace cutlass {
44
+ namespace epilogue {
45
+ namespace thread {
46
+
47
+ /////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
/// Applies a linear combination operator followed by the HardSwish activation to an array of elements.
///
/// D = hardswish(alpha * accumulator + beta * source + uniform)
///
/// Convenience alias: all behavior (alpha/beta scaling modes, rounding,
/// source handling, split-K semantics) comes from LinearCombinationGeneric
/// instantiated with the HardSwish functor from
/// cutlass/epilogue/thread/activation.h.
template <
  typename ElementOutput_,                        ///< Data type used to load and store tensors
  int Count,                                      ///< Number of elements computed per operation
                                                  ///< Usually it is 128/sizeof_bits<ElementOutput_>,
                                                  ///< but we use 64 or 32 sometimes when there are not enough data to store
  typename ElementAccumulator_ = ElementOutput_,  ///< Accumulator data type
  typename ElementCompute_ = ElementOutput_,      ///< Data type used to compute linear combination
  ScaleType::Kind Scale = ScaleType::Default,     ///< Control Alpha and Beta scaling
  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
>
using LinearCombinationHardSwish = LinearCombinationGeneric<HardSwish, ElementOutput_, Count, ElementAccumulator_,
                                                            ElementCompute_, Scale, Round>;
65
+ /////////////////////////////////////////////////////////////////////////////////////////////////
66
+
67
+ } // namespace thread
68
+ } // namespace epilogue
69
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ #include "cutlass/cutlass.h"
35
+ #include "cutlass/numeric_types.h"
36
+ #include "cutlass/array.h"
37
+ #include "cutlass/functional.h"
38
+ #include "cutlass/numeric_conversion.h"
39
+ #include "cutlass/epilogue/thread/activation.h"
40
+ #include "cutlass/epilogue/thread/scale_type.h"
41
+
42
+ /////////////////////////////////////////////////////////////////////////////////////////////////
43
+
44
+ namespace cutlass {
45
+ namespace epilogue {
46
+ namespace thread {
47
+
48
+ /////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ /// Applies a linear combination operator to an array of elements.
51
+ ///
52
+ /// D = alpha * accumulator + beta * source + uniform
53
+ ///
54
+ template <
55
+ typename ElementOutput_, ///< Data type used to load and store tensors
56
+ int Count, ///< Number of elements computed per operation
57
+ typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
58
+ typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination
59
+ ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling
60
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
61
+ >
62
+ class LinearCombinationLeakyRelu {
63
+ public:
64
+
65
+ using ElementOutput = ElementOutput_;
66
+ using ElementAccumulator = ElementAccumulator_;
67
+ using ElementCompute = ElementCompute_;
68
+
69
+ static int const kCount = Count;
70
+ static const ScaleType::Kind kScale = Scale;
71
+
72
+ using FragmentOutput = Array<ElementOutput, kCount>;
73
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
74
+ using ComputeFragment = Array<ElementCompute, kCount>;
75
+ using FragmentSource = Array<ElementOutput, kCount>;
76
+
77
+ static FloatRoundStyle const kRound = Round;
78
+
79
+ /// Host-constructable parameters structure
80
+ struct Params {
81
+
82
+ ElementCompute alpha; ///< scales accumulators
83
+ ElementCompute beta_bias; ///< scales bias tensor
84
+ ElementCompute leaky_alpha; ///< leaky_alpha
85
+ //
86
+ // Methods
87
+ //
88
+
89
+ CUTLASS_HOST_DEVICE
90
+ Params():
91
+ alpha(ElementCompute(1)),
92
+ beta_bias(ElementCompute(0)),
93
+ leaky_alpha(ElementCompute(1))
94
+ { }
95
+
96
+ CUTLASS_HOST_DEVICE
97
+ Params(
98
+ ElementCompute alpha,
99
+ ElementCompute beta_bias,
100
+ ElementCompute leaky_alpha = ElementCompute(1)
101
+ ): alpha(alpha), beta_bias(beta_bias), leaky_alpha(leaky_alpha) {
102
+
103
+ }
104
+
105
+ };
106
+
107
+ private:
108
+
109
+ //
110
+ // Data members
111
+ //
112
+
113
+ ElementCompute alpha_;
114
+ ElementCompute beta_bias_;
115
+ ElementCompute leaky_alpha_recip_;
116
+
117
+ public:
118
+
119
+ /// Constructs the function object, possibly loading from pointers in host memory
120
+ CUTLASS_HOST_DEVICE
121
+ LinearCombinationLeakyRelu(Params const &params) {
122
+ alpha_ = (params.alpha);
123
+ beta_bias_ = (params.beta_bias);
124
+ leaky_alpha_recip_ = (ElementCompute(params.leaky_alpha));
125
+ }
126
+
127
+ /// Returns true if source is needed
128
+ CUTLASS_HOST_DEVICE
129
+ bool is_source_needed() const {
130
+ if (Scale == ScaleType::NoBetaScaling) return true;
131
+
132
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
133
+
134
+ if (Scale == ScaleType::Nothing) return false;
135
+
136
+ return beta_bias_ != ElementCompute(0);
137
+ }
138
+
139
+ /// Functionally required for serial reduction in the epilogue
140
+ CUTLASS_HOST_DEVICE
141
+ void set_k_partition(int k_partition) {
142
+ if (k_partition) {
143
+ beta_bias_ = ElementCompute(1);
144
+ }
145
+ }
146
+ CUTLASS_HOST_DEVICE
147
+ void set_k_partition(int k_partition, int k_partition_count) {
148
+ if (k_partition) {
149
+ beta_bias_ = ElementCompute(1);
150
+ }
151
+ }
152
+
153
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
154
+ CUTLASS_HOST_DEVICE
155
+ FragmentOutput operator()(
156
+ FragmentAccumulator const &accumulator,
157
+ FragmentOutput const &source) const {
158
+
159
+ // Convert source to interal compute numeric type
160
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
161
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
162
+
163
+ ComputeFragment converted_source = source_converter(source);
164
+ ComputeFragment converted_accumulator = accumulator_converter(accumulator);
165
+
166
+ // Perform binary operations
167
+ ComputeFragment intermediate;
168
+
169
+ multiplies<ComputeFragment> mul_add_source;
170
+ multiply_add<ComputeFragment> mul_add_accumulator;
171
+
172
+ LeakyReLU<ComputeFragment> leakyrelu;
173
+
174
+ if (Scale == ScaleType::NoBetaScaling) {
175
+ intermediate = converted_source;
176
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
177
+ } else if (Scale == ScaleType::Nothing) {
178
+ intermediate = converted_accumulator;
179
+ } else {
180
+ intermediate = mul_add_source(beta_bias_, converted_source); // X = beta * C + uniform
181
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
182
+ }
183
+ // Compute threshold optionally
184
+ intermediate = leakyrelu(intermediate, leaky_alpha_recip_);
185
+
186
+ // Convert to destination numeric type
187
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
188
+
189
+ return destination_converter(intermediate);
190
+ }
191
+
192
+ /// Computes linear scaling: D = alpha * accumulator
193
+ CUTLASS_HOST_DEVICE
194
+ FragmentOutput operator()(
195
+ FragmentAccumulator const &accumulator) const {
196
+
197
+ // Convert source to interal compute numeric type
198
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
199
+
200
+ ComputeFragment converted_accumulator = accumulator_converter(accumulator);
201
+
202
+ // Perform binary operations
203
+ ComputeFragment intermediate;
204
+
205
+ multiplies<ComputeFragment> mul_accumulator;
206
+ LeakyReLU<ComputeFragment> leakyrelu;
207
+ //printf("in doing with bias");
208
+ if (Scale == ScaleType::Nothing) {
209
+ intermediate = converted_accumulator;
210
+ } else {
211
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
212
+ }
213
+
214
+ // Compute threshold optionally
215
+ intermediate = leakyrelu(intermediate, leaky_alpha_recip_);
216
+
217
+
218
+ // Convert to destination numeric type
219
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
220
+
221
+ return destination_converter(intermediate);
222
+ }
223
+ };
224
+
225
+ /////////////////////////////////////////////////////////////////////////////////////////////////
226
+
227
+ } // namespace thread
228
+ } // namespace epilogue
229
+ } // namespace cutlass
230
+
231
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief
33
+ */
34
+
35
+ #pragma once
36
+
37
+ /////////////////////////////////////////////////////////////////////////////////////////////////
38
+
39
+ namespace cutlass {
40
+ namespace epilogue {
41
+ namespace thread {
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ struct LinearCombinationParams {
46
+ uint64_t alpha_data[2];
47
+ uint64_t beta_data[2];
48
+
49
+ CUTLASS_HOST_DEVICE
50
+ LinearCombinationParams()
51
+ : alpha_data {0lu, 0lu}, beta_data {0lu, 0lu}
52
+ { }
53
+
54
+ template <typename ElementCompute>
55
+ CUTLASS_HOST_DEVICE
56
+ LinearCombinationParams(ElementCompute alpha, ElementCompute beta)
57
+ : alpha_data {0lu, 0lu}, beta_data {0lu, 0lu}
58
+ {
59
+ #if defined(__CUDA_ARCH__)
60
+ reinterpret_cast<ElementCompute&>(alpha_data) = alpha;
61
+ reinterpret_cast<ElementCompute&>(beta_data) = beta;
62
+ #else
63
+ memcpy( alpha_data, &alpha, sizeof(ElementCompute) );
64
+ memcpy( beta_data, &beta, sizeof(ElementCompute) );
65
+ #endif
66
+ }
67
+ };
68
+
69
+ /////////////////////////////////////////////////////////////////////////////////////////////////
70
+
71
+ } // namespace thread
72
+ } // namespace epilogue
73
+ } // namespace cutlass
74
+
75
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination operations on planar-complex arrays
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/numeric_types.h"
39
+ #include "cutlass/complex.h"
40
+ #include "cutlass/array_planar_complex.h"
41
+ #include "cutlass/functional.h"
42
+ #include "cutlass/numeric_conversion.h"
43
+ #include "cutlass/epilogue/thread/scale_type.h"
44
+
45
+ /////////////////////////////////////////////////////////////////////////////////////////////////
46
+
47
+ namespace cutlass {
48
+ namespace epilogue {
49
+ namespace thread {
50
+
51
+ /////////////////////////////////////////////////////////////////////////////////////////////////
52
+
53
+ /// Applies a linear combination operator to arrays of planar-complex elements.
54
+ ///
55
+ /// D = alpha * accumulator + beta * source + uniform
56
+ ///
57
+ /// Note, as with most CUTLASS components for planar complex, the template arguments describe
58
+ /// the underlying real data type.
59
+ template <
60
+ typename ElementOutput_, ///< Data type used to load and store tensors
61
+ int Count, ///< Number of elements computed per operation
62
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
63
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
64
+ typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
65
+ typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination
66
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
67
+ ScaleType::Kind Scale = ScaleType::Default ///< Control Alpha and Beta scaling
68
+ >
69
+ class LinearCombinationPlanarComplex {
70
+ public:
71
+
72
+ using ElementOutput = ElementOutput_;
73
+ using ElementAccumulator = ElementAccumulator_;
74
+ using ElementCompute = ElementCompute_;
75
+ using ElementScalar = complex<ElementCompute>;
76
+
77
+ static int const kCount = Count;
78
+ static const ScaleType::Kind kScale = Scale;
79
+
80
+ using FragmentOutput = ArrayPlanarComplex<ElementOutput, kCount>;
81
+ using FragmentAccumulator = ArrayPlanarComplex<ElementAccumulator, kCount>;
82
+ using ComputeFragment = ArrayPlanarComplex<ElementCompute, kCount>;
83
+
84
+ static FloatRoundStyle const kRound = Round;
85
+
86
+ /// Host-constructable parameters structure
87
+ struct Params {
88
+
89
+ ElementScalar alpha{ElementCompute(1)}; ///< scales accumulators
90
+ ElementScalar beta{ElementCompute(0)}; ///< scales source tensor
91
+ ElementScalar const* alpha_ptr{nullptr}; ///< pointer to accumulator scalar - if not null, loads it from memory
92
+ ElementScalar const* beta_ptr{nullptr}; ///< pointer to source scalar - if not null, loads it from memory
93
+
94
+ //
95
+ // Methods
96
+ //
97
+
98
+ Params() = default;
99
+
100
+ CUTLASS_HOST_DEVICE
101
+ Params(
102
+ ElementScalar alpha,
103
+ ElementScalar beta
104
+ ): alpha(alpha), beta(beta)
105
+ {}
106
+
107
+ CUTLASS_HOST_DEVICE
108
+ Params(
109
+ ElementScalar const *alpha_ptr,
110
+ ElementScalar const *beta_ptr
111
+ ): alpha_ptr(alpha_ptr), beta_ptr(beta_ptr)
112
+ {}
113
+ };
114
+
115
+ private:
116
+
117
+ //
118
+ // Data members
119
+ //
120
+
121
+ ElementScalar alpha_;
122
+ ElementScalar beta_;
123
+
124
+ public:
125
+
126
+ /// Constructs the function object, possibly loading from pointers in host memory
127
+ CUTLASS_HOST_DEVICE
128
+ LinearCombinationPlanarComplex(Params const &params) {
129
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
130
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
131
+ }
132
+
133
+ /// Returns true if source is needed
134
+ CUTLASS_HOST_DEVICE
135
+ bool is_source_needed() const {
136
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
137
+
138
+ return beta_.real() != ElementCompute(0) || beta_.imag() != ElementCompute(0);
139
+ }
140
+
141
+ /// Functionally required for serial reduction in the epilogue
142
+ CUTLASS_HOST_DEVICE
143
+ void set_k_partition(int k_partition, int k_partition_count) {
144
+ if (k_partition) {
145
+ beta_ = ElementCompute(1);
146
+ }
147
+ }
148
+
149
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
150
+ CUTLASS_HOST_DEVICE
151
+ FragmentOutput operator()(
152
+ FragmentAccumulator const &accumulator,
153
+ FragmentOutput const &source) const {
154
+
155
+ // Convert source to interal compute numeric type
156
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
157
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
158
+
159
+ ComputeFragment converted_source{
160
+ source_converter(source.real),
161
+ source_converter(source.imag)};
162
+
163
+ ComputeFragment converted_accumulator{
164
+ accumulator_converter(accumulator.real),
165
+ accumulator_converter(accumulator.imag)};
166
+
167
+ multiplies<Array<ElementCompute, kCount> > mul_op;
168
+ multiply_add<Array<ElementCompute, kCount> > mul_add_op;
169
+
170
+ // Perform binary operations
171
+
172
+ // complex multiply: I = beta * C
173
+ ComputeFragment intermediate {
174
+ mul_op(beta_.real(), converted_source.real),
175
+ mul_op(beta_.real(), converted_source.imag)
176
+ };
177
+
178
+ intermediate.real = mul_add_op(-beta_.imag(), converted_source.imag, intermediate.real);
179
+ intermediate.imag = mul_add_op( beta_.imag(), converted_source.real, intermediate.imag);
180
+
181
+ // complex multiply-add: I = alpha * AB + I
182
+ intermediate.real = mul_add_op(alpha_.real(), converted_accumulator.real, intermediate.real);
183
+ intermediate.imag = mul_add_op(alpha_.real(), converted_accumulator.imag, intermediate.imag);
184
+
185
+ intermediate.real = mul_add_op(-alpha_.imag(), converted_accumulator.imag, intermediate.real);
186
+ intermediate.imag = mul_add_op( alpha_.imag(), converted_accumulator.real, intermediate.imag);
187
+
188
+ // Convert to destination numeric type
189
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
190
+
191
+ return FragmentOutput{
192
+ destination_converter(intermediate.real),
193
+ destination_converter(intermediate.imag)};
194
+ }
195
+
196
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
197
+ CUTLASS_HOST_DEVICE
198
+ FragmentOutput operator()(
199
+ FragmentAccumulator const &accumulator) const {
200
+
201
+ // Convert source to interal compute numeric type
202
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
203
+
204
+ ComputeFragment converted_accumulator{
205
+ accumulator_converter(accumulator.real),
206
+ accumulator_converter(accumulator.imag)};
207
+
208
+ // Perform binary operations
209
+ multiplies<Array<ElementCompute, kCount> > mul_op;
210
+ multiply_add<Array<ElementCompute, kCount> > mul_add_op;
211
+
212
+ // complex multiply-add: I = alpha * AB + I
213
+ ComputeFragment intermediate {
214
+ mul_op(alpha_.real(), converted_accumulator.real),
215
+ mul_op(alpha_.real(), converted_accumulator.imag)
216
+ };
217
+
218
+ intermediate.real = mul_add_op(-alpha_.imag(), converted_accumulator.imag, intermediate.real);
219
+ intermediate.imag = mul_add_op( alpha_.imag(), converted_accumulator.real, intermediate.imag);
220
+
221
+ // Convert to destination numeric type
222
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
223
+
224
+ return FragmentOutput{
225
+ destination_converter(intermediate.real),
226
+ destination_converter(intermediate.imag)};
227
+ }
228
+ };
229
+
230
+ /////////////////////////////////////////////////////////////////////////////////////////////////
231
+
232
+ } // namespace thread
233
+ } // namespace epilogue
234
+ } // namespace cutlass
235
+
236
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination with a maximum operation used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/half.h"
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/array.h"
41
+ #include "cutlass/functional.h"
42
+ #include "cutlass/numeric_conversion.h"
43
+ #include "cutlass/epilogue/thread/activation.h"
44
+ #include "cutlass/epilogue/thread/scale_type.h"
45
+
46
+ /////////////////////////////////////////////////////////////////////////////////////////////////
47
+
48
+ namespace cutlass {
49
+ namespace epilogue {
50
+ namespace thread {
51
+
52
+ /////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ namespace detail {
55
+
56
/// Single source of truth for whether to unroll for `LinearCombinationRelu()`
/// (the original comment mistakenly referenced `LinearCombinationClamp()`).
constexpr bool LinearCombinationReluIsHeavy() {
  return false;
}
60
+
61
+ }
62
+
63
+ /////////////////////////////////////////////////////////////////////////////////////////////////
64
+
65
+ /// Applies a linear combination operator to an array of elements.
66
+ ///
67
+ /// D = alpha * accumulator + beta * source + uniform
68
+ ///
69
+ template <
70
+ typename ElementOutput_, ///< Data type used to load and store tensors
71
+ int Count, ///< Number of elements computed per operation
72
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
73
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
74
+ typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
75
+ typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination
76
+ ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling
77
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
78
+ >
79
+ class LinearCombinationRelu {
80
+ public:
81
+
82
+ using ElementOutput = ElementOutput_;
83
+ using ElementAccumulator = ElementAccumulator_;
84
+ using ElementCompute = ElementCompute_;
85
+
86
+ static int const kCount = Count;
87
+ static const ScaleType::Kind kScale = Scale;
88
+
89
+ using FragmentOutput = Array<ElementOutput, kCount>;
90
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
91
+ using FragmentCompute = Array<ElementCompute, kCount>;
92
+ using FragmentScaleBias = Array<ElementCompute, kCount>;
93
+ using FragmentSource = Array<ElementOutput, kCount>;
94
+
95
+ static FloatRoundStyle const kRound = Round;
96
+
97
+ static bool const kIsHeavy = detail::LinearCombinationReluIsHeavy();
98
+
99
+ /// Host-constructable parameters structure
100
+ struct Params {
101
+
102
+ ElementCompute alpha; ///< scales accumulators
103
+ ElementCompute beta; ///< scales source tensor
104
+ ElementCompute threshold; ///< minimum value that is output
105
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
106
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
107
+ //
108
+ // Methods
109
+ //
110
+
111
+ CUTLASS_HOST_DEVICE
112
+ Params():
113
+ alpha(ElementCompute(1)),
114
+ beta(ElementCompute(0)),
115
+ threshold(ElementCompute(0)),
116
+ alpha_ptr(nullptr),
117
+ beta_ptr(nullptr) { }
118
+
119
+ CUTLASS_HOST_DEVICE
120
+ Params(
121
+ ElementCompute alpha,
122
+ ElementCompute beta = ElementCompute(0),
123
+ ElementCompute threshold = ElementCompute(0)
124
+ ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
125
+
126
+ }
127
+
128
+ CUTLASS_HOST_DEVICE
129
+ Params(
130
+ ElementCompute const *alpha_ptr,
131
+ ElementCompute const *beta_ptr = nullptr,
132
+ ElementCompute threshold = ElementCompute(0)
133
+ ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
134
+
135
+ }
136
+ };
137
+
138
+ private:
139
+
140
+ //
141
+ // Data members
142
+ //
143
+
144
+ ElementCompute alpha_;
145
+ ElementCompute beta_;
146
+ ElementCompute threshold_;
147
+
148
+ public:
149
+
150
+ /// Constructs the function object, possibly loading from pointers in host memory
151
+ CUTLASS_HOST_DEVICE
152
+ LinearCombinationRelu(Params const &params) {
153
+
154
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
155
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
156
+ threshold_ = params.threshold;
157
+ }
158
+
159
+ /// Returns true if source is needed
160
+ CUTLASS_HOST_DEVICE
161
+ bool is_source_needed() const {
162
+ if (Scale == ScaleType::NoBetaScaling) return true;
163
+
164
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
165
+
166
+ if (Scale == ScaleType::OnlyAlphaPerChannelScaling) return false;
167
+
168
+ if (Scale == ScaleType::Nothing) return false;
169
+
170
+ return beta_ != ElementCompute(0);
171
+ }
172
+
173
+ /// Functionally required for serial reduction in the epilogue
174
+ CUTLASS_HOST_DEVICE
175
+ void set_k_partition(int k_partition, int k_partition_count) {
176
+ if (k_partition) {
177
+ beta_ = ElementCompute(1);
178
+ }
179
+
180
+ if (k_partition != k_partition_count - 1) {
181
+ // set to NaN to make ReLU no-op for all except last k partitions
182
+ int64_t allones = -1;
183
+ threshold_ = reinterpret_cast<ElementCompute const &>(allones);
184
+ }
185
+ }
186
+
187
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
188
+ CUTLASS_HOST_DEVICE
189
+ FragmentOutput operator()(
190
+ FragmentAccumulator const &accumulator,
191
+ FragmentOutput const &source) const {
192
+
193
+ // Convert source to interal compute numeric type
194
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
195
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
196
+
197
+ FragmentCompute converted_source = source_converter(source);
198
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
199
+
200
+ // Perform binary operations
201
+ FragmentCompute intermediate;
202
+
203
+ multiplies<FragmentCompute> mul_add_source;
204
+ multiply_add<FragmentCompute> mul_add_accumulator;
205
+ ReLu<FragmentCompute> relu;
206
+
207
+ if (Scale == ScaleType::NoBetaScaling) {
208
+ intermediate = converted_source;
209
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
210
+ } else if (Scale == ScaleType::Nothing) {
211
+ intermediate = converted_accumulator;
212
+ } else {
213
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
214
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
215
+ }
216
+
217
+ // Compute threshold optionally
218
+ intermediate = relu(threshold_, intermediate);
219
+
220
+ // Convert to destination numeric type
221
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
222
+
223
+ return destination_converter(intermediate);
224
+ }
225
+
226
+ /// Computes linear scaling: D = alpha * accumulator
227
+ CUTLASS_HOST_DEVICE
228
+ FragmentOutput operator()(
229
+ FragmentAccumulator const &accumulator) const {
230
+
231
+ // Convert source to interal compute numeric type
232
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
233
+
234
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
235
+
236
+ // Perform binary operations
237
+ FragmentCompute intermediate;
238
+
239
+ multiplies<FragmentCompute> mul_accumulator;
240
+ ReLu<FragmentCompute> relu;
241
+
242
+ if (Scale == ScaleType::Nothing) {
243
+ intermediate = converted_accumulator;
244
+ } else {
245
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
246
+ }
247
+
248
+ // Compute threshold optionally
249
+ intermediate = relu(threshold_, intermediate);
250
+
251
+ // Convert to destination numeric type
252
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
253
+
254
+ return destination_converter(intermediate);
255
+ }
256
+
257
+ /// Computes per-channel linear scaling and bias : D = scale * accumulator + bias
258
+ /// Scale and Bias are from input Fragment
259
+ CUTLASS_HOST_DEVICE
260
+ FragmentOutput operator()(
261
+ FragmentAccumulator const &accumulator,
262
+ FragmentScaleBias const &scale,
263
+ FragmentScaleBias const &bias) const {
264
+
265
+ // Convert source to interal compute numeric type
266
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
267
+
268
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
269
+
270
+ // Perform per-channel scale and bias
271
+ FragmentCompute intermediate;
272
+
273
+ multiply_add<FragmentCompute> mul_add_accumulator;
274
+
275
+ if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
276
+ intermediate = mul_add_accumulator(scale, converted_accumulator, bias); // D = scale * Accum + bias
277
+ else
278
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias); // D = alpha * Accum + bias
279
+
280
+ ReLu<FragmentCompute> relu;
281
+
282
+ // Compute threshold optionally
283
+ intermediate = relu(threshold_, intermediate);
284
+
285
+ // Convert to destination numeric type
286
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
287
+
288
+ return destination_converter(intermediate);
289
+ }
290
+ };
291
+
292
+ /////////////////////////////////////////////////////////////////////////////////////////////////
293
+
294
+ // Conditional guards to enable partial specialization for packed integers
295
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
296
+
297
+ /// Applies a linear combination operator to an array of elements.
298
+ ///
299
+ /// D = alpha * accumulator + beta * source + uniform
300
+ ///
301
+ /// Special handling for int types
302
+
303
+ template <
304
+ typename ElementOutput_, ///< Data type used to load and store tensors
305
+ int Count, ///< Number of elements computed per operation
306
+ ScaleType::Kind Scale, ///< Control Alpha and Beta scaling
307
+ FloatRoundStyle Round
308
+ >
309
+ class LinearCombinationRelu <ElementOutput_, Count, int, float, Scale, Round> {
310
+ public:
311
+
312
+ using ElementOutput = ElementOutput_;
313
+ using ElementAccumulator = int;
314
+ using ElementCompute = float;
315
+
316
+ static bool const kIsHeavy = detail::LinearCombinationReluIsHeavy();
317
+
318
+ static int const kCount = Count;
319
+ static const ScaleType::Kind kScale = Scale;
320
+
321
+ using FragmentOutput = Array<ElementOutput, kCount>;
322
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
323
+ using FragmentCompute = Array<ElementCompute, kCount>;
324
+ using FragmentScaleBias = Array<ElementCompute, kCount>;
325
+ using FragmentSource = Array<ElementOutput, kCount>;
326
+
327
+ static FloatRoundStyle const kRound = Round;
328
+
329
+ /// Host-constructable parameters structure
330
+ struct Params {
331
+
332
+ ElementCompute alpha; ///< scales accumulators
333
+ ElementCompute beta; ///< scales source tensor
334
+ ElementCompute threshold; ///< minimum value that is output
335
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
336
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
337
+ //
338
+ // Methods
339
+ //
340
+
341
+ CUTLASS_HOST_DEVICE
342
+ Params():
343
+ alpha(ElementCompute(1)),
344
+ beta(ElementCompute(0)),
345
+ threshold(ElementCompute(0)),
346
+ alpha_ptr(nullptr),
347
+ beta_ptr(nullptr) { }
348
+
349
+ CUTLASS_HOST_DEVICE
350
+ Params(
351
+ ElementCompute alpha,
352
+ ElementCompute beta = ElementCompute(0),
353
+ ElementCompute threshold = ElementCompute(0)
354
+ ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
355
+
356
+ }
357
+
358
+ CUTLASS_HOST_DEVICE
359
+ Params(
360
+ ElementCompute const *alpha_ptr,
361
+ ElementCompute const *beta_ptr = nullptr,
362
+ ElementCompute threshold = ElementCompute(0)
363
+ ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
364
+
365
+ }
366
+ };
367
+
368
+ private:
369
+
370
+ //
371
+ // Data members
372
+ //
373
+
374
+ ElementCompute alpha_;
375
+ ElementCompute beta_;
376
+ ElementCompute threshold_;
377
+
378
+ public:
379
+
380
+ /// Constructs the function object, possibly loading from pointers in host memory
381
+ CUTLASS_HOST_DEVICE
382
+ LinearCombinationRelu(Params const &params) {
383
+
384
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
385
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
386
+ threshold_ = params.threshold;
387
+ }
388
+
389
+ /// Returns true if source is needed
390
+ CUTLASS_HOST_DEVICE
391
+ bool is_source_needed() const {
392
+ if (Scale == ScaleType::NoBetaScaling) return true;
393
+
394
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
395
+
396
+ if (Scale == ScaleType::OnlyAlphaPerChannelScaling) return false;
397
+
398
+ if (Scale == ScaleType::Nothing) return false;
399
+
400
+ return beta_ != ElementCompute(0);
401
+ }
402
+
403
+ /// Functionally required for serial reduction in the epilogue
404
+ CUTLASS_HOST_DEVICE
405
+ void set_k_partition(int k_partition, int k_partition_count) {
406
+ if (k_partition) {
407
+ beta_ = ElementCompute(1);
408
+ }
409
+
410
+ if (k_partition != k_partition_count - 1) {
411
+ // set to NaN to make ReLU no-op for all except last k partitions
412
+ int64_t allones = -1;
413
+ threshold_ = reinterpret_cast<ElementCompute const &>(allones);
414
+ }
415
+ }
416
+
417
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
418
+ CUTLASS_HOST_DEVICE
419
+ FragmentOutput operator()(
420
+ FragmentAccumulator const &accumulator,
421
+ FragmentOutput const &source) const {
422
+
423
+ // Convert source to interal compute numeric type
424
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
425
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
426
+
427
+ FragmentCompute converted_source = source_converter(source);
428
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
429
+
430
+ // Perform binary operations
431
+ FragmentCompute intermediate;
432
+
433
+ multiplies<FragmentCompute> mul_add_source;
434
+ multiply_add<FragmentCompute> mul_add_accumulator;
435
+ ReLu<FragmentCompute> relu;
436
+
437
+ if (Scale == ScaleType::NoBetaScaling) {
438
+ intermediate = converted_source;
439
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
440
+ } else if (Scale == ScaleType::Nothing) {
441
+ intermediate = converted_accumulator;
442
+ } else {
443
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
444
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
445
+ }
446
+
447
+ // Compute threshold optionally
448
+ intermediate = relu(threshold_, intermediate);
449
+
450
+ if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
451
+ // Convert floats back to INT
452
+ FragmentAccumulator scaled_accumulator;
453
+
454
+ NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
455
+
456
+ scaled_accumulator = compute_converter(intermediate);
457
+
458
+ // Convert to destination numeric type
459
+ NumericArrayConverter<ElementOutput, int, kCount, Round>
460
+ destination_converter;
461
+
462
+ return destination_converter(scaled_accumulator);
463
+ } else {
464
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
465
+ destination_converter;
466
+ return destination_converter(intermediate);
467
+ }
468
+ }
469
+
470
+ /// Computes linear scaling: D = alpha * accumulator
471
+ CUTLASS_HOST_DEVICE
472
+ FragmentOutput operator()(
473
+ FragmentAccumulator const &accumulator) const {
474
+
475
+ // Convert source to interal compute numeric type
476
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
477
+
478
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
479
+
480
+ // Perform binary operations
481
+ FragmentCompute intermediate;
482
+
483
+ multiplies<FragmentCompute> mul_accumulator;
484
+ ReLu<FragmentCompute> relu;
485
+
486
+ if (Scale == ScaleType::Nothing) {
487
+ intermediate = converted_accumulator;
488
+ } else {
489
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
490
+ }
491
+
492
+ // Compute threshold optionally
493
+ intermediate = relu(threshold_, intermediate);
494
+
495
+ if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
496
+ // Convert floats back to INT
497
+ FragmentAccumulator scaled_accumulator;
498
+
499
+ NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
500
+
501
+ scaled_accumulator = compute_converter(intermediate);
502
+
503
+ // Convert to destination numeric type
504
+ NumericArrayConverter<ElementOutput, int, kCount, Round>
505
+ destination_converter;
506
+
507
+ return destination_converter(scaled_accumulator);
508
+ } else {
509
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
510
+ destination_converter;
511
+ return destination_converter(intermediate);
512
+ }
513
+ }
514
+
515
+ /// Computes per-channel linear scaling and bias : D = scale * accumulator + bias
516
+ /// Scale and Bias are from input Fragment
517
+ CUTLASS_HOST_DEVICE
518
+ FragmentOutput operator()(
519
+ FragmentAccumulator const &accumulator,
520
+ FragmentScaleBias const &scale,
521
+ FragmentScaleBias const &bias) const {
522
+
523
+ // Convert source to interal compute numeric type
524
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
525
+
526
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
527
+
528
+ // Perform per-channel scale and bias
529
+ FragmentCompute intermediate;
530
+
531
+ multiply_add<FragmentCompute> mul_add_accumulator;
532
+
533
+ if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
534
+ intermediate = mul_add_accumulator(scale, converted_accumulator, bias); // D = scale * Accum + bias
535
+ else
536
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias); // D = alpha * Accum + bias
537
+
538
+ ReLu<FragmentCompute> relu;
539
+
540
+ // Compute threshold optionally
541
+ intermediate = relu(threshold_, intermediate);
542
+
543
+ if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
544
+ // Convert floats back to INT
545
+ FragmentAccumulator scaled_accumulator;
546
+
547
+ NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
548
+
549
+ scaled_accumulator = compute_converter(intermediate);
550
+
551
+ // Convert to destination numeric type
552
+ NumericArrayConverter<ElementOutput, int, kCount, Round>
553
+ destination_converter;
554
+
555
+ return destination_converter(scaled_accumulator);
556
+ } else {
557
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
558
+ destination_converter;
559
+ return destination_converter(intermediate);
560
+ }
561
+ }
562
+ };
563
+
564
+ #endif // Conditional guards to enable partial specialization for packed integers
565
+
566
+ /////////////////////////////////////////////////////////////////////////////////////////////////
567
+
568
+ } // namespace thread
569
+ } // namespace epilogue
570
+ } // namespace cutlass
571
+
572
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h ADDED
@@ -0,0 +1,543 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination with a relu operation used by epilogues.
33
+ This one only supports relu0 and tries to folding relu into other instructions. Thus,
34
+ serial splitk is not supported by this one. For example, relu can be folded into
35
+ hfma2/hmul2 for sm80+
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #include "cutlass/half.h"
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+ #include "cutlass/functional.h"
45
+ #include "cutlass/numeric_conversion.h"
46
+ #include "cutlass/epilogue/thread/activation.h"
47
+ #include "cutlass/epilogue/thread/scale_type.h"
48
+
49
+ /////////////////////////////////////////////////////////////////////////////////////////////////
50
+
51
+ namespace cutlass {
52
+ namespace epilogue {
53
+ namespace thread {
54
+
55
+ /////////////////////////////////////////////////////////////////////////////////////////////////
56
+
57
+ namespace detail {
58
+
59
+ /// Single source of truth for whether to unroll for `LinearCombinationClamp()`
60
+ constexpr bool LinearCombinationRelu0IsHeavy() {
61
+ return false;
62
+ }
63
+
64
+ }
65
+
66
+ /////////////////////////////////////////////////////////////////////////////////////////////////
67
+
68
+ /// Applies a linear combination operator to an array of elements.
69
+ ///
70
+ /// D = alpha * accumulator + beta * source + uniform
71
+ ///
72
+ template <
73
+ typename ElementOutput_, ///< Data type used to load and store tensors
74
+ int Count, ///< Number of elements computed per operation
75
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
76
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
77
+ typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
78
+ typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination
79
+ ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling
80
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
81
+ >
82
+ class LinearCombinationRelu0 {
83
+ public:
84
+
85
+ using ElementOutput = ElementOutput_;
86
+ using ElementAccumulator = ElementAccumulator_;
87
+ using ElementCompute = ElementCompute_;
88
+
89
+ static int const kCount = Count;
90
+ static const ScaleType::Kind kScale = Scale;
91
+
92
+ using FragmentOutput = Array<ElementOutput, kCount>;
93
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
94
+ using FragmentCompute = Array<ElementCompute, kCount>;
95
+ using FragmentScaleBias = Array<ElementCompute, kCount>;
96
+ using FragmentSource = Array<ElementOutput, kCount>;
97
+
98
+ static FloatRoundStyle const kRound = Round;
99
+
100
+ static bool const kIsHeavy = detail::LinearCombinationRelu0IsHeavy();
101
+
102
+ /// Host-constructable parameters structure
103
+ struct Params {
104
+
105
+ ElementCompute alpha; ///< scales accumulators
106
+ ElementCompute beta; ///< scales source tensor
107
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
108
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
109
+ //
110
+ // Methods
111
+ //
112
+
113
+ CUTLASS_HOST_DEVICE
114
+ Params():
115
+ alpha(ElementCompute(1)),
116
+ beta(ElementCompute(0)),
117
+ alpha_ptr(nullptr),
118
+ beta_ptr(nullptr) { }
119
+
120
+ CUTLASS_HOST_DEVICE
121
+ Params(
122
+ ElementCompute alpha,
123
+ ElementCompute beta = ElementCompute(0)
124
+ ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
125
+
126
+ }
127
+
128
+ CUTLASS_HOST_DEVICE
129
+ Params(
130
+ ElementCompute const *alpha_ptr,
131
+ ElementCompute const *beta_ptr = nullptr
132
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
133
+
134
+ }
135
+ };
136
+
137
+ private:
138
+
139
+ //
140
+ // Data members
141
+ //
142
+
143
+ ElementCompute alpha_;
144
+ ElementCompute beta_;
145
+
146
+ public:
147
+
148
+ /// Constructs the function object, possibly loading from pointers in host memory
149
+ CUTLASS_HOST_DEVICE
150
+ LinearCombinationRelu0(Params const &params) {
151
+
152
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
153
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
154
+ }
155
+
156
+ /// Returns true if source is needed
157
+ CUTLASS_HOST_DEVICE
158
+ bool is_source_needed() const {
159
+ if (Scale == ScaleType::NoBetaScaling) return true;
160
+
161
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
162
+
163
+ if (Scale == ScaleType::Nothing) return false;
164
+
165
+ return beta_ != ElementCompute(0);
166
+ }
167
+
168
+ /// This is used for serial reduction which is not supported by Relu0
169
+ CUTLASS_HOST_DEVICE
170
+ void set_k_partition(int k_partition, int k_partition_count) {
171
+ assert(k_partition == 0);
172
+ }
173
+
174
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
175
+ CUTLASS_HOST_DEVICE
176
+ FragmentOutput operator()(
177
+ FragmentAccumulator const &accumulator,
178
+ FragmentOutput const &source) const {
179
+
180
+ // Convert source to interal compute numeric type
181
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
182
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
183
+
184
+ FragmentCompute converted_source = source_converter(source);
185
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
186
+
187
+ // Perform binary operations
188
+ FragmentCompute intermediate;
189
+
190
+ multiplies<FragmentCompute> mul_add_source;
191
+ multiply_add_relu0<FragmentCompute> mul_add_relu0_accumulator;
192
+ ReLu<FragmentCompute> relu;
193
+
194
+ if (Scale == ScaleType::NoBetaScaling) {
195
+ intermediate = converted_source;
196
+ intermediate = mul_add_relu0_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
197
+ } else if (Scale == ScaleType::Nothing) {
198
+ intermediate = converted_accumulator;
199
+
200
+ // Compute threshold optionally
201
+ intermediate = relu(intermediate);
202
+ } else {
203
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
204
+ intermediate = mul_add_relu0_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
205
+ }
206
+
207
+ // Convert to destination numeric type
208
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
209
+
210
+ return destination_converter(intermediate);
211
+ }
212
+
213
+ /// Computes linear scaling: D = alpha * accumulator
214
+ CUTLASS_HOST_DEVICE
215
+ FragmentOutput operator()(
216
+ FragmentAccumulator const &accumulator) const {
217
+
218
+ // Convert source to interal compute numeric type
219
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
220
+
221
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
222
+
223
+ // Perform binary operations
224
+ FragmentCompute intermediate;
225
+
226
+ multiplies<FragmentCompute> mul_accumulator;
227
+ ReLu<FragmentCompute> relu;
228
+
229
+ if (Scale == ScaleType::Nothing) {
230
+ intermediate = converted_accumulator;
231
+ } else {
232
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
233
+ }
234
+
235
+ // Compute threshold optionally
236
+ intermediate = relu(intermediate);
237
+
238
+ // Convert to destination numeric type
239
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
240
+
241
+ return destination_converter(intermediate);
242
+ }
243
+
244
+ /// Computes per-channel linear scaling and bias : D = scale * accumulator + bias
245
+ /// Scale and Bias are from input Fragment
246
+ CUTLASS_HOST_DEVICE
247
+ FragmentOutput operator()(
248
+ FragmentAccumulator const &accumulator,
249
+ FragmentScaleBias const &scale,
250
+ FragmentScaleBias const &bias) const {
251
+
252
+ // Convert source to interal compute numeric type
253
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
254
+
255
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
256
+
257
+ // Perform per-channel scale and bias
258
+ FragmentCompute intermediate;
259
+
260
+ multiply_add<FragmentCompute> mul_add_accumulator;
261
+
262
+ if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
263
+ intermediate = mul_add_accumulator(scale, converted_accumulator, bias); // D = scale * Accum + bias
264
+ else
265
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias); // D = alpha * Accum + bias
266
+
267
+ ReLu<FragmentCompute> relu;
268
+
269
+ // Compute threshold optionally
270
+ intermediate = relu(intermediate);
271
+
272
+ // Convert to destination numeric type
273
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;
274
+
275
+ return destination_converter(intermediate);
276
+ }
277
+ };
278
+
279
+ /////////////////////////////////////////////////////////////////////////////////////////////////
280
+
281
+ // Conditional guards to enable partial specialization for packed integers
282
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
283
+
284
+ /// Applies a linear combination operator to an array of elements.
285
+ ///
286
+ /// D = alpha * accumulator + beta * source + uniform
287
+ ///
288
+ /// Special handling for int types
289
+
290
+ template <
291
+ typename ElementOutput_, ///< Data type used to load and store tensors
292
+ int Count, ///< Number of elements computed per operation
293
+ ScaleType::Kind Scale, ///< Control Alpha and Beta scaling
294
+ FloatRoundStyle Round
295
+ >
296
+ class LinearCombinationRelu0 <ElementOutput_, Count, int, float, Scale, Round> {
297
+ public:
298
+
299
+ using ElementOutput = ElementOutput_;
300
+ using ElementAccumulator = int;
301
+ using ElementCompute = float;
302
+
303
+ static bool const kIsHeavy = detail::LinearCombinationRelu0IsHeavy();
304
+
305
+ static int const kCount = Count;
306
+ static const ScaleType::Kind kScale = Scale;
307
+
308
+ using FragmentOutput = Array<ElementOutput, kCount>;
309
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
310
+ using FragmentCompute = Array<ElementCompute, kCount>;
311
+ using FragmentScaleBias = Array<ElementCompute, kCount>;
312
+ using FragmentSource = Array<ElementOutput, kCount>;
313
+
314
+ static FloatRoundStyle const kRound = Round;
315
+
316
+ /// Host-constructable parameters structure
317
+ struct Params {
318
+
319
+ ElementCompute alpha; ///< scales accumulators
320
+ ElementCompute beta; ///< scales source tensor
321
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
322
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
323
+ //
324
+ // Methods
325
+ //
326
+
327
+ CUTLASS_HOST_DEVICE
328
+ Params():
329
+ alpha(ElementCompute(1)),
330
+ beta(ElementCompute(0)),
331
+ alpha_ptr(nullptr),
332
+ beta_ptr(nullptr) { }
333
+
334
+ CUTLASS_HOST_DEVICE
335
+ Params(
336
+ ElementCompute alpha,
337
+ ElementCompute beta = ElementCompute(0)
338
+ ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {
339
+
340
+ }
341
+
342
+ CUTLASS_HOST_DEVICE
343
+ Params(
344
+ ElementCompute const *alpha_ptr,
345
+ ElementCompute const *beta_ptr = nullptr
346
+ ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
347
+
348
+ }
349
+ };
350
+
351
+ private:
352
+
353
+ //
354
+ // Data members
355
+ //
356
+
357
+ ElementCompute alpha_;
358
+ ElementCompute beta_;
359
+
360
+ public:
361
+
362
+ /// Constructs the function object, possibly loading from pointers in host memory
363
+ CUTLASS_HOST_DEVICE
364
+ LinearCombinationRelu0(Params const &params) {
365
+
366
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
367
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
368
+ }
369
+
370
+ /// Returns true if source is needed
371
+ CUTLASS_HOST_DEVICE
372
+ bool is_source_needed() const {
373
+ if (Scale == ScaleType::NoBetaScaling) return true;
374
+
375
+ if (Scale == ScaleType::OnlyAlphaScaling) return false;
376
+
377
+ if (Scale == ScaleType::Nothing) return false;
378
+
379
+ return beta_ != ElementCompute(0);
380
+ }
381
+
382
+ /// This is used for serial reduction which is not supported by Relu0
383
+ CUTLASS_HOST_DEVICE
384
+ void set_k_partition(int k_partition, int k_partition_count) {
385
+ assert(k_partition == 0);
386
+ }
387
+
388
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
389
+ CUTLASS_HOST_DEVICE
390
+ FragmentOutput operator()(
391
+ FragmentAccumulator const &accumulator,
392
+ FragmentOutput const &source) const {
393
+
394
+ // Convert source to interal compute numeric type
395
+ NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;
396
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
397
+
398
+ FragmentCompute converted_source = source_converter(source);
399
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
400
+
401
+ // Perform binary operations
402
+ FragmentCompute intermediate;
403
+
404
+ multiplies<FragmentCompute> mul_add_source;
405
+ multiply_add<FragmentCompute> mul_add_accumulator;
406
+ ReLu<FragmentCompute> relu;
407
+
408
+ if (Scale == ScaleType::NoBetaScaling) {
409
+ intermediate = converted_source;
410
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
411
+ } else if (Scale == ScaleType::Nothing) {
412
+ intermediate = converted_accumulator;
413
+ } else {
414
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
415
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
416
+ }
417
+
418
+ // Compute threshold optionally
419
+ intermediate = relu(intermediate);
420
+
421
+ if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
422
+ // Convert floats back to INT
423
+ FragmentAccumulator scaled_accumulator;
424
+
425
+ NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
426
+
427
+ scaled_accumulator = compute_converter(intermediate);
428
+
429
+ // Convert to destination numeric type
430
+ NumericArrayConverter<ElementOutput, int, kCount, Round>
431
+ destination_converter;
432
+
433
+ return destination_converter(scaled_accumulator);
434
+ } else {
435
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
436
+ destination_converter;
437
+ return destination_converter(intermediate);
438
+ }
439
+ }
440
+
441
+ /// Computes linear scaling: D = alpha * accumulator
442
+ CUTLASS_HOST_DEVICE
443
+ FragmentOutput operator()(
444
+ FragmentAccumulator const &accumulator) const {
445
+
446
+ // Convert source to interal compute numeric type
447
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
448
+
449
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
450
+
451
+ // Perform binary operations
452
+ FragmentCompute intermediate;
453
+
454
+ multiplies<FragmentCompute> mul_accumulator;
455
+ ReLu<FragmentCompute> relu;
456
+
457
+ if (Scale == ScaleType::Nothing) {
458
+ intermediate = converted_accumulator;
459
+ } else {
460
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
461
+ }
462
+
463
+ // Compute threshold optionally
464
+ intermediate = relu(intermediate);
465
+
466
+ if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
467
+ // Convert floats back to INT
468
+ FragmentAccumulator scaled_accumulator;
469
+
470
+ NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
471
+
472
+ scaled_accumulator = compute_converter(intermediate);
473
+
474
+ // Convert to destination numeric type
475
+ NumericArrayConverter<ElementOutput, int, kCount, Round>
476
+ destination_converter;
477
+
478
+ return destination_converter(scaled_accumulator);
479
+ } else {
480
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
481
+ destination_converter;
482
+ return destination_converter(intermediate);
483
+ }
484
+ }
485
+
486
+ /// Computes per-channel linear scaling and bias : D = scale * accumulator + bias
487
+ /// Scale and Bias are from input Fragment
488
+ CUTLASS_HOST_DEVICE
489
+ FragmentOutput operator()(
490
+ FragmentAccumulator const &accumulator,
491
+ FragmentScaleBias const &scale,
492
+ FragmentScaleBias const &bias) const {
493
+
494
+ // Convert source to interal compute numeric type
495
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
496
+
497
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
498
+
499
+ // Perform per-channel scale and bias
500
+ FragmentCompute intermediate;
501
+
502
+ multiply_add<FragmentCompute> mul_add_accumulator;
503
+
504
+ if(Scale == ScaleType::OnlyAlphaPerChannelScaling)
505
+ intermediate = mul_add_accumulator(scale, converted_accumulator, bias); // D = scale * Accum + bias
506
+ else
507
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, bias); // D = alpha * Accum + bias
508
+
509
+ ReLu<FragmentCompute> relu;
510
+
511
+ // Compute threshold optionally
512
+ intermediate = relu(intermediate);
513
+
514
+ if (cutlass::platform::numeric_limits<ElementOutput>::is_integer) {
515
+ // Convert floats back to INT
516
+ FragmentAccumulator scaled_accumulator;
517
+
518
+ NumericArrayConverter<int, ElementCompute, kCount, Round> compute_converter;
519
+
520
+ scaled_accumulator = compute_converter(intermediate);
521
+
522
+ // Convert to destination numeric type
523
+ NumericArrayConverter<ElementOutput, int, kCount, Round>
524
+ destination_converter;
525
+
526
+ return destination_converter(scaled_accumulator);
527
+ } else {
528
+ NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
529
+ destination_converter;
530
+ return destination_converter(intermediate);
531
+ }
532
+ }
533
+ };
534
+
535
+ #endif // Conditional guards to enable partial specialization for packed integers
536
+
537
+ /////////////////////////////////////////////////////////////////////////////////////////////////
538
+
539
+ } // namespace thread
540
+ } // namespace epilogue
541
+ } // namespace cutlass
542
+
543
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Epilogue functor specialized for residual blocks in deep neural networks.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/array.h"
39
+ #include "cutlass/functional.h"
40
+ #include "cutlass/numeric_conversion.h"
41
+ #include "cutlass/epilogue/thread/detail.hpp"
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ namespace cutlass {
46
+ namespace epilogue {
47
+ namespace thread {
48
+
49
/// Models a residual block of the form:
///   UnaryOp(BinaryOp2(BinaryOp1(ActivationOp(TensorOp(X) + bias), residual1), residual2))
///
/// Primary template: consumes TWO residual source tensors (residual1, residual2).
/// The single-source case is handled by the partial specialization selected when
/// BinaryOp2_ is detail::NoOp.
template <typename ElementOutput_, typename ElementAccumulator_,
          typename ElementCompute_, typename ElementC_, int ElementsPerAccess,
          template <typename T> class ActivationOp_,
          template <typename T> class BinaryOp1_,
          template <typename T> class UnaryOp_,
          template <typename T> class BinaryOp2_ = detail::NoOp,
          bool StoreT_ = false,
          typename ElementVector_ = ElementC_>
class LinearCombinationResidualBlock {
public:
  // Two residual sources are consumed by this (primary) template.
  static bool const kIsSingleSource = false;

  // NOTE(review): ElementOutput deliberately aliases ElementC_ (the residual /
  // source element type), not ElementOutput_; ElementOutput_ is surfaced as
  // ElementZ below.
  using ElementOutput = ElementC_;
  using ElementC = ElementC_;
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;
  using ElementVector = ElementVector_;
  static int const kElementsPerAccess = ElementsPerAccess;
  static int const kCount = kElementsPerAccess;

  // Elementwise operators are instantiated over arrays of compute-type elements,
  // so each application processes a full fragment at once.
  using UnaryOp = UnaryOp_<Array<ElementCompute, kCount>>;
  using BinaryOp1 = BinaryOp1_<Array<ElementCompute, kCount>>;
  using BinaryOp2 = BinaryOp2_<Array<ElementCompute, kCount>>;
  using ActivationOp = ActivationOp_<Array<ElementCompute, kCount>>;

  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
  using FragmentC = Array<ElementC, kElementsPerAccess>;
  using FragmentOutput = Array<ElementOutput, kElementsPerAccess>;

  using ElementZ = ElementOutput_;
  using ElementT = ElementZ;
  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
  using FragmentT = Array<ElementT, kElementsPerAccess>;

  static bool const kIsHeavy = true;
  static bool const kStoreZ = true;   // the Z result is always written out
  static bool const kStoreT = StoreT_;

  /// Host-constructable parameters structure
  struct Params {

    ElementCompute alpha;                     ///< scales accumulators
    ElementCompute beta;                      ///< scales residual input
    ElementCompute const *alpha_ptr{nullptr}; ///< pointer to accumulator scalar - if not null, loads it from memory
    ElementCompute const *beta_ptr{nullptr};  ///< pointer to residual scalar - if not null, loads it from memory

    CUTLASS_HOST_DEVICE
    Params() : alpha(ElementCompute(1)), beta(ElementCompute(1)) {}

    CUTLASS_HOST_DEVICE
    Params(ElementCompute alpha, ElementCompute beta)
        : alpha(alpha), beta(beta) {}

    CUTLASS_HOST_DEVICE
    Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
  };

private:

  ElementCompute alpha_;
  ElementCompute beta_;
  // When true (set on non-final split-k partitions), the final UnaryOp is
  // skipped so partial results stay reducible across partitions.
  bool skip_elementwise_;

public:

  /// Constructor from Params; scalars are dereferenced from the pointers when provided.
  CUTLASS_HOST_DEVICE
  LinearCombinationResidualBlock(Params const &params) {
    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
    skip_elementwise_ = false;
  }

  /// The "source" tensor corresponds to the residual input
  CUTLASS_HOST_DEVICE
  bool is_source_needed() const { return true; }

  /// Functionally required for serial reduction in the epilogue
  /// IMPORTANT: Split-k is supported only when ActivationOp is Identity.
  CUTLASS_HOST_DEVICE
  void set_k_partition(int k_partition, int k_partition_count) {
    // Partitions after the first accumulate onto the previous partial result,
    // which must be folded in with unit scale.
    if (k_partition) {
      beta_ = ElementCompute(1);
    }

    // Only the final partition applies the elementwise UnaryOp.
    if (k_partition != k_partition_count - 1) {
      skip_elementwise_ = true;
    }
  }

  /// Applies the operation UnaryOp(BinaryOp2(BinaryOp1(ActivationOp(AB + bias), residual1), residual2))
  CUTLASS_HOST_DEVICE
  void operator()(FragmentOutput &frag_Z, FragmentOutput &, FragmentAccumulator const &AB,
                  FragmentC const &residual1, FragmentC const &residual2,
                  FragmentCompute const &bias) const {
    UnaryOp unary_op;
    BinaryOp1 binary_op1;
    BinaryOp2 binary_op2;
    ActivationOp activation;

    // Convert every operand to the compute type before combining.
    FragmentCompute tmp_Accum =
        NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
    FragmentCompute tmp_residual1 =
        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual1);
    FragmentCompute tmp_residual2 =
        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual2);

    FragmentCompute z =
        binary_op2(binary_op1(activation(alpha_ * tmp_Accum + bias), beta_ * tmp_residual1), beta_ * tmp_residual2);
    FragmentCompute result_Z = skip_elementwise_ ? z : unary_op(z);

    NumericArrayConverter<ElementOutput, ElementCompute, kElementsPerAccess> convert_z;
    frag_Z = convert_z(result_Z);
  }

  /// Should never be called
  CUTLASS_HOST_DEVICE
  void operator()(FragmentOutput &, FragmentOutput &, FragmentAccumulator const &,
                  FragmentCompute const &) const {}
};
172
+
173
/// Models a residual block of the form:
///   UnaryOp(BinaryOp(ActivationOp(TensorOp(X) + bias), residual))
///
/// Single-source partial specialization, selected when BinaryOp2_ is
/// detail::NoOp (i.e. only one residual tensor is consumed).
template <typename ElementOutput_, typename ElementAccumulator_,
          typename ElementCompute_, typename ElementC_, int ElementsPerAccess,
          template <typename T> class ActivationOp_,
          template <typename T> class BinaryOp1_,
          template <typename T> class UnaryOp_,
          bool StoreT_,
          typename ElementVector_>
class LinearCombinationResidualBlock<ElementOutput_, ElementAccumulator_,
                                     ElementCompute_, ElementC_, ElementsPerAccess,
                                     ActivationOp_, BinaryOp1_, UnaryOp_,
                                     detail::NoOp, StoreT_, ElementVector_> {
public:
  // Exactly one residual source is consumed by this specialization.
  static bool const kIsSingleSource = true;

  // NOTE(review): as in the primary template, ElementOutput aliases ElementC_
  // (the residual / source element type); ElementOutput_ is surfaced as ElementZ.
  using ElementOutput = ElementC_;
  using ElementC = ElementC_;
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;
  using ElementVector = ElementVector_;
  static int const kElementsPerAccess = ElementsPerAccess;
  static int const kCount = kElementsPerAccess;

  // Elementwise operators are instantiated over arrays of compute-type elements.
  using UnaryOp = UnaryOp_<Array<ElementCompute, kCount>>;
  using BinaryOp = BinaryOp1_<Array<ElementCompute, kCount>>;
  using ActivationOp = ActivationOp_<Array<ElementCompute, kCount>>;

  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
  using FragmentC = Array<ElementC, kElementsPerAccess>;
  using FragmentOutput = Array<ElementOutput, kElementsPerAccess>;

  using ElementZ = ElementOutput_;
  using ElementT = ElementZ;
  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
  using FragmentT = Array<ElementT, kElementsPerAccess>;

  static bool const kIsHeavy = true;
  static bool const kStoreZ = true;   // the Z result is always written out
  static bool const kStoreT = StoreT_;

  /// Host-constructable parameters structure
  struct Params {

    ElementCompute alpha;                     ///< scales accumulators
    ElementCompute beta;                      ///< scales residual input
    ElementCompute const *alpha_ptr{nullptr}; ///< pointer to accumulator scalar - if not null, loads it from memory
    ElementCompute const *beta_ptr{nullptr};  ///< pointer to residual scalar - if not null, loads it from memory

    CUTLASS_HOST_DEVICE
    Params() : alpha(ElementCompute(1)), beta(ElementCompute(1)) {}

    CUTLASS_HOST_DEVICE
    Params(ElementCompute alpha, ElementCompute beta)
        : alpha(alpha), beta(beta) {}

    CUTLASS_HOST_DEVICE
    Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr)
        : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {}
  };

private:

  ElementCompute alpha_;
  ElementCompute beta_;
  // When true (set on non-final split-k partitions), the final UnaryOp is
  // skipped so partial results stay reducible across partitions.
  bool skip_elementwise_;

public:

  /// Constructor from Params; scalars are dereferenced from the pointers when provided.
  CUTLASS_HOST_DEVICE
  LinearCombinationResidualBlock(Params const &params) {
    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
    skip_elementwise_ = false;
  }

  /// The "source" tensor corresponds to the residual input
  CUTLASS_HOST_DEVICE
  bool is_source_needed() const { return true; }

  /// Functionally required for serial reduction in the epilogue
  /// IMPORTANT: Split-k is supported only when ActivationOp is Identity.
  CUTLASS_HOST_DEVICE
  void set_k_partition(int k_partition, int k_partition_count) {
    // Partitions after the first accumulate onto the previous partial result,
    // which must be folded in with unit scale.
    if (k_partition) {
      beta_ = ElementCompute(1);
    }

    // Only the final partition applies the elementwise UnaryOp.
    if (k_partition != k_partition_count - 1) {
      skip_elementwise_ = true;
    }
  }

  /// Applies the operation UnaryOp(BinaryOp(ActivationOp(AB + bias), residual))
  CUTLASS_HOST_DEVICE
  void operator()(FragmentOutput &frag_Z, FragmentOutput &, FragmentAccumulator const &AB,
                  FragmentC const &residual,
                  FragmentCompute const &bias) const {
    UnaryOp unary_op;
    BinaryOp binary_op;
    ActivationOp activation;

    // Convert every operand to the compute type before combining.
    FragmentCompute tmp_Accum =
        NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
    FragmentCompute tmp_residual =
        NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(residual);

    FragmentCompute z =
        binary_op(activation(alpha_ * tmp_Accum + bias), beta_ * tmp_residual);
    FragmentCompute result_Z = skip_elementwise_ ? z : unary_op(z);

    NumericArrayConverter<ElementOutput, ElementCompute, kElementsPerAccess> convert_z;
    frag_Z = convert_z(result_Z);
  }

  /// Should never be called
  CUTLASS_HOST_DEVICE
  void operator()(FragmentOutput &, FragmentOutput &, FragmentAccumulator const &,
                  FragmentCompute const &) const {}
};
294
+
295
+ /////////////////////////////////////////////////////////////////////////////////////////////////
296
+
297
+ } // namespace thread
298
+ } // namespace epilogue
299
+ } // namespace cutlass
300
+
301
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination with Sigmoid operations used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/epilogue/thread/activation.h"
39
+ #include "cutlass/epilogue/thread/linear_combination_generic.h"
40
+
41
+ /////////////////////////////////////////////////////////////////////////////////////////////////
42
+
43
+ namespace cutlass {
44
+ namespace epilogue {
45
+ namespace thread {
46
+
47
+ /////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
/// Applies a linear combination operator followed by the Sigmoid activation, to an array of elements.
///
/// D = sigmoid(alpha * accumulator + beta * source + uniform)
///
template <
  typename ElementOutput_,                       ///< Data type used to load and store tensors
  int Count,                                     ///< Number of elements computed per operation
                                                 ///< Usually it is 128/sizeof_bits<ElementOutput_>,
                                                 ///< but we use 64 or 32 sometimes when there are not enough data to store
  typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
  typename ElementCompute_ = ElementOutput_,     ///< Data type used to compute linear combination
  ScaleType::Kind Scale = ScaleType::Default,    ///< Control Alpha and Beta scaling
  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
>
// Trailing `true` marks the functor as "heavy" (IsHeavy) for the generic epilogue.
using LinearCombinationSigmoid = LinearCombinationGeneric<Sigmoid, ElementOutput_, Count, ElementAccumulator_,
                                                          ElementCompute_, Scale, Round, true>;
65
+
66
+ /////////////////////////////////////////////////////////////////////////////////////////////////
67
+
68
+ } // namespace thread
69
+ } // namespace epilogue
70
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing linear combination with SiLU operations used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/epilogue/thread/activation.h"
39
+ #include "cutlass/epilogue/thread/linear_combination_generic.h"
40
+
41
+ /////////////////////////////////////////////////////////////////////////////////////////////////
42
+
43
+ namespace cutlass {
44
+ namespace epilogue {
45
+ namespace thread {
46
+
47
+ /////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
/// Applies a linear combination operator followed by the SiLU activation to an array of elements.
///
/// D = silu(alpha * accumulator + beta * source + uniform)
///
template <
  typename ElementOutput_,                       ///< Data type used to load and store tensors
  int Count,                                     ///< Number of elements computed per operation
                                                 ///< Usually it is 128/sizeof_bits<ElementOutput_>,
                                                 ///< but we use 64 or 32 sometimes when there are not enough data to store
  typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type
  typename ElementCompute_ = ElementOutput_,     ///< Data type used to compute linear combination
  ScaleType::Kind Scale = ScaleType::Default,    ///< Control Alpha and Beta scaling
  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
>
// Trailing `true` marks the functor as "heavy" (IsHeavy) for the generic epilogue.
using LinearCombinationSilu = LinearCombinationGeneric<SiLu, ElementOutput_, Count, ElementAccumulator_,
                                                       ElementCompute_, Scale, Round, true>;
65
+ /////////////////////////////////////////////////////////////////////////////////////////////////
66
+
67
+ } // namespace thread
68
+ } // namespace epilogue
69
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Functor performing linear combination operation, bias addition, and tensor-tensor
34
+ elementwise operations
35
+ */
36
+
37
+ #pragma once
38
+
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/array.h"
41
+ #include "cutlass/functional.h"
42
+ #include "cutlass/numeric_conversion.h"
43
+ #include "cutlass/numeric_types.h"
44
+ #include "cutlass/epilogue/thread/activation.h"
45
+ #include "cutlass/epilogue/thread/detail.hpp"
46
+ #include "cutlass/epilogue/thread/scale_type.h"
47
+
48
+ /////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ namespace cutlass {
51
+ namespace epilogue {
52
+ namespace thread {
53
+
54
+ namespace detail {
55
+
56
+ /// Returns whether a source operand is needed for a combination of binary operation and scale
57
+ /// type. Simple specialized checks are made for cases in which 0 is an identity element of
58
+ /// the binary operation.
59
+ template <class BinaryOp, class ElementCompute, ScaleType::Kind Scale>
60
+ CUTLASS_HOST_DEVICE
61
+ bool is_binary_op_source_needed(ElementCompute scale) {
62
+ if constexpr (cute::is_same_v<BinaryOp, NoOp<ElementCompute>>) {
63
+ return false;
64
+ }
65
+ else if constexpr (cute::is_same_v<BinaryOp, plus<ElementCompute>> || cute::is_same_v<BinaryOp, minus<ElementCompute>>) {
66
+ // Cases for binary operators for which 0 is an identity element
67
+ if constexpr (Scale == ScaleType::NoBetaScaling) return true;
68
+
69
+ if constexpr (Scale == ScaleType::OnlyAlphaScaling) return false;
70
+
71
+ if constexpr (Scale == ScaleType::Nothing) return false;
72
+
73
+ return scale != ElementCompute(0);
74
+ }
75
+
76
+ return true;
77
+ }
78
+
79
+ } // namespace detail
80
+
81
+ /////////////////////////////////////////////////////////////////////////////////////////////////
82
+
83
/** Compute a tensor-tensor broadcast epilogue.
 *
 * @param ElementOutput_ Data type used to load and store tensors
 * @param ElementAccumulator_ Accumulator data type
 * @param ElementCompute_ Data type used to compute linear combination
 * @param ElementBias_ Data type of Bias elements
 * @param ActivationFunctor_ Fused Activation
 * @param BinaryOp0_ Binary operation to perform on O0 and C0. detail::NoOp means no operation
 * @param BinaryOp1_ Binary operation to perform on O1 and C1. detail::NoOp means no operation
 * @param UnaryOp_ Unary operation to perform on final result
 * @param Scale Controls the type of Alpha and Beta scaling to perform
 * @param Round How values should be rounded in conversions
 * @param ElementSource_ Data type used for source operands
 *
 * Computes the following:
 *   O0 = alpha * accumulator + bias
 *   O1 = BinaryOp0(O0, beta * C0)
 *   O2 = BinaryOp1(O1, beta * C1)
 *   D  = UnaryOp(O2)
 */
template <
  class ElementOutput_,
  class ElementAccumulator_ = ElementOutput_,
  class ElementCompute_ = ElementOutput_,
  class ElementBias_ = ElementCompute_,
  template <class T> class ActivationFunctor_ = Identity,
  template <class T> class BinaryOp0_ = plus,
  template <class T> class BinaryOp1_ = detail::NoOp,
  template <class T> class UnaryOp_ = Identity,
  ScaleType::Kind Scale = ScaleType::Default,
  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
  class ElementSource_ = ElementOutput_
>
class LinearCombinationTensorBroadcast {
public:

  using ElementOutput = ElementOutput_;
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;
  using ElementScalar = ElementCompute;
  using ElementBias = ElementBias_;
  using ElementC = ElementSource_;
  using ElementD = ElementOutput_;
  using ElementScalingFactor = ElementAccumulator_;

  // Unlike the fragment-based functors, the operators here act on single
  // compute-type elements (see kCount = 1 below).
  using UnaryOp = UnaryOp_<ElementCompute>;
  using BinaryOp0 = BinaryOp0_<ElementCompute>;
  using BinaryOp1 = BinaryOp1_<ElementCompute>;
  using ActivationFunctor = ActivationFunctor_<ElementCompute>;

  // Scalar (one element per invocation) epilogue.
  static constexpr int kCount = 1;
  static constexpr ScaleType::Kind kScale = Scale;

  using FragmentOutput = Array<ElementOutput, kCount>;
  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
  using ComputeFragment = Array<ElementCompute, kCount>;
  using FragmentBias = Array<ElementBias, kCount>;

  static constexpr FloatRoundStyle kRound = Round;
  using NoOpType = detail::NoOp<ElementCompute>;
  // Compile-time flags used to elide unused stages in operator() below.
  static constexpr bool IsBinaryOp0Enabled = !cute::is_same_v<BinaryOp0, NoOpType>;
  static constexpr bool IsBinaryOp1Enabled = !cute::is_same_v<BinaryOp1, NoOpType>;
  // Identity is treated the same as NoOp for the final unary stage.
  static constexpr bool IsUnaryOpEnabled = !cute::is_same_v<UnaryOp, NoOpType> && !cute::is_same_v<UnaryOp, Identity<ElementCompute>>;

  /// Host-constructable parameters structure
  struct Params {

    ElementCompute alpha{};                    ///< scales accumulators
    ElementCompute beta{};                     ///< scales source tensor
    ElementCompute const* alpha_ptr = nullptr; ///< pointer to accumulator scalar - if not null, loads it from memory
    ElementCompute const* beta_ptr = nullptr;  ///< pointer to source scalar - if not null, loads it from memory

    //
    // Methods
    //
    Params() = default;

    CUTLASS_HOST_DEVICE
    Params(ElementCompute const* alpha_ptr, ElementCompute const* beta_ptr)
        : alpha_ptr(alpha_ptr),
          beta_ptr(beta_ptr) {}

    CUTLASS_HOST_DEVICE
    Params(ElementCompute const* alpha_ptr)
        : alpha_ptr(alpha_ptr) {}

    CUTLASS_HOST_DEVICE
    Params(ElementCompute alpha,
           ElementCompute beta)
        : alpha(alpha),
          beta(beta) {}
  };

private:
  //
  // Data members
  //

  ElementCompute alpha_;
  ElementCompute beta_;

public:

  /// Constructs the function object, possibly loading from pointers in host memory
  CUTLASS_HOST_DEVICE
  LinearCombinationTensorBroadcast(Params const& params)
      : alpha_(params.alpha_ptr ? *params.alpha_ptr : params.alpha),
        beta_(params.beta_ptr ? *params.beta_ptr : params.beta) {}

  /// Returns true if source 0 is needed
  CUTLASS_HOST_DEVICE
  bool is_source0_needed() const {
    return detail::is_binary_op_source_needed<BinaryOp0, ElementCompute, Scale>(beta_);
  }

  /// Returns true if source 1 is needed
  CUTLASS_HOST_DEVICE
  bool is_source1_needed() const {
    return detail::is_binary_op_source_needed<BinaryOp1, ElementCompute, Scale>(beta_);
  }

  //
  // Specialization for scalar
  //
  /// Computes D = UnaryOp(BinaryOp1(BinaryOp0(act(alpha * accumulator + bias), beta * source0), beta * source1))
  /// for a single element; disabled stages are compiled out via if constexpr.
  CUTLASS_HOST_DEVICE
  ElementD operator()(ElementAccumulator const accumulator, ElementC const source0, ElementC source1, ElementBias const bias) {
    // Convert everything to Compute type, do compute, and then store to output type
    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
    NumericConverter<ElementCompute, ElementBias, Round> bias_converter;
    NumericConverter<ElementCompute, ElementC, Round> source_converter;
    NumericConverter<ElementD, ElementCompute, Round> destination_converter;

    ActivationFunctor act;
    multiplies<ElementCompute> mul;
    multiply_add<ElementCompute> madd;

    ElementCompute intermediate = accumulator_converter(accumulator);
    intermediate = madd(alpha_, intermediate, bias_converter(bias));
    intermediate = act(intermediate);

    // Apply BinaryOp0, if needed
    if constexpr (IsBinaryOp0Enabled) {
      BinaryOp0 bin0;
      ElementCompute converted_source = source_converter(source0);
      intermediate = bin0(intermediate, mul(beta_, converted_source));
    }

    // Apply BinaryOp1, if needed
    if constexpr (IsBinaryOp1Enabled) {
      BinaryOp1 bin1;
      ElementCompute converted_source = source_converter(source1);
      intermediate = bin1(intermediate, mul(beta_, converted_source));
    }

    // Apply UnaryOp, if needed
    if constexpr (IsUnaryOpEnabled) {
      UnaryOp unary;
      intermediate = unary(intermediate);
    }

    return destination_converter(intermediate);
  }
};
246
+
247
+ /////////////////////////////////////////////////////////////////////////////////////////////////
248
+
249
+ } // namespace thread
250
+ } // namespace epilogue
251
+ } // namespace cutlass
252
+
253
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+
33
+ \brief Functor performing linear combination with elementwise
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/half.h"
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/numeric_types.h"
41
+ #include "cutlass/array.h"
42
+ #include "cutlass/constants.h"
43
+ #include "cutlass/fast_math.h"
44
+ #include "cutlass/functional.h"
45
+ #include "cutlass/numeric_conversion.h"
46
+ #include "cutlass/epilogue/thread/activation.h"
47
+
48
+ /////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ namespace cutlass {
51
+ namespace epilogue {
52
+ namespace thread {
53
+
54
+ /////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
+ /// Applies a linear combination operator to an array of elements.
57
+ ///
58
+ /// D = alpha * accumulator + beta * source + uniform
59
+ ///
60
+ template <
61
+ typename ElementCompute_, ///< Data type returned by this functor
62
+ typename ElementAccumulator_, ///< Data type of accumulators
63
+ typename ElementSource_, ///< Data type of source tensor
64
+ typename ElementTensor_, ///< Data type of additional tensor
65
+ int Count, ///< Number of elements computed per operation
66
+ ///< Usually it is 128/sizeof_bits<ElementOutput_>,
67
+ ///< but we use 64 or 32 sometimes when there are not enough data to store
68
+ FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
69
+ >
70
+ class LinearCombinationWithElementwise {
71
+ public:
72
+
73
+ using ElementOutput = ElementSource_;
74
+ using ElementCompute = ElementCompute_;
75
+ using ElementAccumulator = ElementAccumulator_;
76
+ using ElementSource = ElementSource_;
77
+ using ElementTensor = ElementTensor_;
78
+
79
+ static bool const kIsHeavy = true;
80
+
81
+ static int const kCount = Count;
82
+
83
+ using FragmentCompute = Array<ElementCompute, kCount>;
84
+ using FragmentAccumulator = Array<ElementAccumulator, kCount>;
85
+ using FragmentSource = Array<ElementSource, kCount>;
86
+ using FragmentTensor = Array<ElementTensor, kCount>;
87
+
88
+ static FloatRoundStyle const kRound = Round;
89
+
90
+ /// Host-constructable parameters structure
91
+ struct Params {
92
+
93
+ ElementCompute alpha; ///< scales accumulators
94
+ ElementCompute beta; ///< scales source tensor
95
+ ElementCompute threshold; ///< minimum value that is output
96
+ ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory
97
+ ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory
98
+ //
99
+ // Methods
100
+ //
101
+
102
+ CUTLASS_HOST_DEVICE
103
+ Params():
104
+ alpha(ElementCompute(1)),
105
+ beta(ElementCompute(0)),
106
+ threshold(ElementCompute(0)),
107
+ alpha_ptr(nullptr),
108
+ beta_ptr(nullptr) { }
109
+
110
+ CUTLASS_HOST_DEVICE
111
+ Params(
112
+ ElementCompute alpha,
113
+ ElementCompute beta,
114
+ ElementCompute threshold = ElementCompute(0)
115
+ ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {
116
+
117
+ }
118
+
119
+ CUTLASS_HOST_DEVICE
120
+ Params(
121
+ ElementCompute const *alpha_ptr,
122
+ ElementCompute const *beta_ptr,
123
+ ElementCompute threshold = ElementCompute(0)
124
+ ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {
125
+
126
+ }
127
+ };
128
+
129
+ private:
130
+
131
+ //
132
+ // Data members
133
+ //
134
+
135
+ ElementCompute alpha_;
136
+ ElementCompute beta_;
137
+ ElementCompute threshold_;
138
+ bool participates_in_reduction_;
139
+
140
+ public:
141
+
142
+ /// Constructs the function object, possibly loading from pointers in host memory
143
+ CUTLASS_HOST_DEVICE
144
+ LinearCombinationWithElementwise(Params const &params) {
145
+
146
+ alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
147
+ beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
148
+ threshold_ = params.threshold;
149
+ participates_in_reduction_ = true;
150
+ }
151
+
152
+ /// Returns true if source is needed
153
+ CUTLASS_HOST_DEVICE
154
+ bool is_source_needed() const {
155
+ return beta_ != ElementCompute(0);
156
+ }
157
+
158
+ /// Returns true if the threadblock computes the reduction
159
+ CUTLASS_HOST_DEVICE
160
+ bool participates_in_reduction() const {
161
+ return participates_in_reduction_;
162
+ }
163
+
164
+ /// Functionally required for serial reduction in the epilogue
165
+ CUTLASS_HOST_DEVICE
166
+ void set_k_partition(int k_partition, int k_partition_count) {
167
+ if (k_partition) {
168
+ beta_ = ElementCompute(1);
169
+ }
170
+
171
+ if (k_partition != k_partition_count - 1) {
172
+ // set to NaN to make ReLU no-op for all except last k partitions
173
+ int64_t allones = -1;
174
+ threshold_ = reinterpret_cast<ElementCompute const &>(allones);
175
+ // Avoid computing the reduction if this isn't the final Split-K slice
176
+ participates_in_reduction_ = false;
177
+ }
178
+ }
179
+
180
+ /// Computes linear scaling: D = alpha * accumulator + beta * source
181
+ CUTLASS_HOST_DEVICE
182
+ FragmentCompute operator()(
183
+ FragmentAccumulator const &accumulator,
184
+ FragmentSource const &source,
185
+ FragmentTensor const &tensor) const {
186
+
187
+ // Convert source to interal compute numeric type
188
+ NumericArrayConverter<ElementCompute, ElementSource, kCount, Round> source_converter;
189
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
190
+
191
+ FragmentCompute converted_source = source_converter(source);
192
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
193
+
194
+ // Perform binary operations
195
+ FragmentCompute intermediate;
196
+
197
+ multiplies<FragmentCompute> mul_add_source;
198
+ multiply_add<FragmentCompute> mul_add_accumulator;
199
+
200
+ intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform
201
+ intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X
202
+
203
+ return intermediate;
204
+ }
205
+
206
+ /// Computes linear scaling: D = alpha * accumulator
207
+ CUTLASS_HOST_DEVICE
208
+ FragmentCompute operator()(
209
+ FragmentAccumulator const &accumulator,
210
+ FragmentTensor const &tensor) const {
211
+
212
+ // Convert source to interal compute numeric type
213
+ NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;
214
+
215
+ FragmentCompute converted_accumulator = accumulator_converter(accumulator);
216
+
217
+ // Perform binary operations
218
+ FragmentCompute intermediate;
219
+
220
+ multiplies<FragmentCompute> mul_accumulator;
221
+
222
+ intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum
223
+
224
+ return intermediate;
225
+ }
226
+ };
227
+
228
+ /////////////////////////////////////////////////////////////////////////////////////////////////
229
+
230
+ } // namespace thread
231
+ } // namespace epilogue
232
+ } // namespace cutlass
233
+
234
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/reduction_op.h ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Functor performing reduction operations used by epilogues.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/numeric_types.h"
39
+ #include "cutlass/array.h"
40
+ #include "cutlass/functional.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ namespace cutlass {
46
+ namespace epilogue {
47
+ namespace thread {
48
+
49
+ /////////////////////////////////////////////////////////////////////////////////////////////////
50
+
51
+ /// Applies a reduction sum to an array of elements.
52
+ ///
53
+ ///
54
+ template <
55
+ typename Element_, ///< Data type used to load and store tensors
56
+ int Count ///< Number of elements computed per operation
57
+ >
58
+ class ReductionOpPlus {
59
+ public:
60
+
61
+ using Element = Element_;
62
+ static int const kCount = Count;
63
+
64
+ using Fragment = Array<Element, kCount>;
65
+ using Operator = plus<Fragment>;
66
+
67
+ /// Host-constructable parameters structure
68
+ struct Params { };
69
+
70
+ private:
71
+
72
+ /// reduction operator
73
+ Operator operator_;
74
+
75
+ public:
76
+
77
+ /// Constructs the function object, possibly loading from pointers in host memory
78
+ CUTLASS_HOST_DEVICE
79
+ ReductionOpPlus(Params const &params) {
80
+
81
+ }
82
+
83
+ /// Computes Compute =>
84
+ CUTLASS_HOST_DEVICE
85
+ Fragment operator()(
86
+ Fragment const &lhs,
87
+ Fragment const &rhs) const {
88
+
89
+ return operator_(lhs, rhs);
90
+ }
91
+ };
92
+
93
+ /////////////////////////////////////////////////////////////////////////////////////////////////
94
+
95
+ } // namespace thread
96
+ } // namespace epilogue
97
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/scale_type.h ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Enum defines the behaviors of the epilogue.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+
39
+ /////////////////////////////////////////////////////////////////////////////////////////////////
40
+
41
+ namespace cutlass {
42
+ namespace epilogue {
43
+ namespace thread {
44
+
45
+ /////////////////////////////////////////////////////////////////////////////////////////////////
46
+
47
+ /// Specifies internal data type for computation
48
+ /// Note :
49
+ /// 1. Scalar means alpha/beta is a single value from host(constant param) or device memory.
50
+ /// 2. Vector means alpha/beta is a vector always from device memory.
51
+ struct ScaleType {
52
+ enum Kind {
53
+ Default, // D = scalar_alpha x Acc + scalar_beta x C
54
+ NoBetaScaling, // D = scalar_alpha x Acc + C
55
+ OnlyAlphaScaling, // D = scalar_alpha x Acc
56
+ PerChannelScaling, // D = vector_alpha x Acc + vector_beta x C
57
+ OnlyAlphaPerChannelScaling, // D = vector_alpha x Acc
58
+ Nothing // D = Acc
59
+ };
60
+ };
61
+
62
+ /////////////////////////////////////////////////////////////////////////////////////////////////
63
+
64
+ } // namespace thread
65
+ } // namespace epilogue
66
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped complex GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+
45
+ #include "cutlass/gemm/gemm.h"
46
+
47
+ #include "cutlass/epilogue/thread/linear_combination.h"
48
+ #include "cutlass/epilogue/thread/linear_combination_relu.h"
49
+ #include "cutlass/epilogue/thread/linear_combination_gelu.h"
50
+ #include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
51
+ #include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
52
+
53
+ #include "cutlass/epilogue/thread/conversion_op.h"
54
+ #include "cutlass/epilogue/thread/reduction_op.h"
55
+
56
+ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
57
+
58
+ #include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
59
+ #include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h"
60
+ #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
61
+ #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
62
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
63
+ #include "cutlass/epilogue/threadblock/shared_load_iterator.h"
64
+
65
+ #include "cutlass/epilogue/threadblock/epilogue.h"
66
+
67
+ ////////////////////////////////////////////////////////////////////////////////
68
+
69
+ namespace cutlass {
70
+ namespace epilogue {
71
+ namespace threadblock {
72
+
73
+ /////////////////////////////////////////////////////////////////////////////////////////////////
74
+ /// Specialization and defines sensible defaults for epilogues for complex*complex case
75
+ // 4 real-valued mma operations (Complex)
76
+ // A = (ar + j ai), B (br +j bi), D = AB
77
+ // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
78
+ /////////////////////////////////////////////////////////////////////////////////////////////////
79
+ template <
80
+ /// Epilogue Shape
81
+ typename Shape_,
82
+ /// Warp-level mma operator
83
+ typename WarpMmaTensorOp_,
84
+ /// Number of k partitions
85
+ int PartitionsK,
86
+ /// Epilogue output operator
87
+ typename OutputOp_,
88
+ /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load()
89
+ int ElementsPerAccess,
90
+ /// Multiply-add operator
91
+ /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
92
+ typename Operator_ = arch::OpMultiplyAddComplex
93
+ >
94
+ struct DefaultEpilogueComplexTensorOp {
95
+
96
+ using Shape = Shape_;
97
+ using WarpMmaTensorOp = WarpMmaTensorOp_;
98
+ static int const kPartitionsK = PartitionsK;
99
+ using OutputOp = OutputOp_;
100
+ static int const kElementsPerAccess = ElementsPerAccess;
101
+ using Operator = Operator_;
102
+
103
+ using ElementOutput = typename OutputOp::ElementOutput;
104
+ using LayoutC = typename WarpMmaTensorOp::LayoutC;
105
+ using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
106
+
107
+ //
108
+ // Thread map
109
+ //
110
+
111
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
112
+ Shape,
113
+ typename WarpMmaTensorOp::Shape,
114
+ kPartitionsK,
115
+ ElementOutput,
116
+ kElementsPerAccess
117
+ >::Type;
118
+
119
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
120
+ OutputTileThreadMap,
121
+ ElementOutput
122
+ >;
123
+
124
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
125
+ typename WarpMmaTensorOp::Shape,
126
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
127
+ typename WarpMmaTensorOp::Policy::Operator::ElementC,
128
+ typename WarpMmaTensorOp::Policy::Operator::FragmentC,
129
+ LayoutC
130
+ >;
131
+
132
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
133
+ typename WarpMmaTensorOp::Shape,
134
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
135
+ ElementAccumulator,
136
+ LayoutC
137
+ >;
138
+
139
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
140
+ typename OutputTileThreadMap::CompactedThreadMap,
141
+ ElementAccumulator
142
+ >;
143
+
144
+ /// Hard-coded padding elements added
145
+ using Padding = cutlass::MatrixShape<0, 0>;
146
+
147
+ //
148
+ // Define the epilogue
149
+ //
150
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
151
+ Shape,
152
+ WarpMmaTensorOp,
153
+ kPartitionsK,
154
+ OutputTileIterator,
155
+ AccumulatorFragmentIterator,
156
+ WarpTileIterator,
157
+ SharedLoadIterator,
158
+ OutputOp,
159
+ Padding
160
+ >;
161
+ };
162
+
163
+ /////////////////////////////////////////////////////////////////////////////////////////////////
164
+ /// Partial specialization and defines sensible defaults for epilogues for complex*complex case
165
+ // 3 real-valued mma operations (Gaussian Complex)
166
+ // A = (ar + j ai), B = (br +j bi), D = AB
167
+ // P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi)
168
+ // D = dr + j di = (P1 - P3) + j (P1 + P2)
169
+ /////////////////////////////////////////////////////////////////////////////////////////////////
170
+ template <
171
+ typename Shape_,
172
+ typename WarpMmaTensorOp_,
173
+ int PartitionsK,
174
+ typename OutputOp_,
175
+ int ElementsPerAccess
176
+ >
177
+ struct DefaultEpilogueComplexTensorOp <Shape_, WarpMmaTensorOp_, PartitionsK,
178
+ OutputOp_, ElementsPerAccess,
179
+ arch::OpMultiplyAddGaussianComplex
180
+ > {
181
+
182
+ using Shape = Shape_;
183
+ using WarpMmaTensorOp = WarpMmaTensorOp_;
184
+ static int const kPartitionsK = PartitionsK;
185
+ using OutputOp = OutputOp_;
186
+ static int const kElementsPerAccess = ElementsPerAccess;
187
+ using Operator = arch::OpMultiplyAddGaussianComplex;
188
+
189
+ using ElementOutput = typename OutputOp::ElementOutput;
190
+ using LayoutC = typename WarpMmaTensorOp::LayoutC;
191
+ using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
192
+
193
+ //
194
+ // Thread map
195
+ //
196
+
197
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
198
+ Shape,
199
+ typename WarpMmaTensorOp::Shape,
200
+ kPartitionsK,
201
+ ElementOutput,
202
+ kElementsPerAccess
203
+ >::Type;
204
+
205
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
206
+ OutputTileThreadMap,
207
+ ElementOutput
208
+ >;
209
+
210
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp<
211
+ typename WarpMmaTensorOp::Shape,
212
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
213
+ typename WarpMmaTensorOp::Policy::Operator::ElementC,
214
+ typename WarpMmaTensorOp::Policy::Operator::FragmentC,
215
+ LayoutC
216
+ >;
217
+
218
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
219
+ typename WarpMmaTensorOp::Shape,
220
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
221
+ ElementAccumulator,
222
+ LayoutC
223
+ >;
224
+
225
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
226
+ typename OutputTileThreadMap::CompactedThreadMap,
227
+ ElementAccumulator
228
+ >;
229
+
230
+ /// Hard-coded padding elements added
231
+ using Padding = cutlass::MatrixShape<0, 0>;
232
+
233
+ //
234
+ // Define the epilogue
235
+ //
236
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
237
+ Shape,
238
+ WarpMmaTensorOp,
239
+ kPartitionsK,
240
+ OutputTileIterator,
241
+ AccumulatorFragmentIterator,
242
+ WarpTileIterator,
243
+ SharedLoadIterator,
244
+ OutputOp,
245
+ Padding
246
+ >;
247
+ };
248
+
249
+ ////////////////////////////////////////////////////////////////////////////////
250
+
251
+ } // namespace threadblock
252
+ } // namespace epilogue
253
+ } // namespace cutlass
254
+
255
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped complex GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+
38
+ */
39
+
40
+ #pragma once
41
+
42
+ #include "cutlass/cutlass.h"
43
+ #include "cutlass/numeric_types.h"
44
+ #include "cutlass/array.h"
45
+
46
+ #include "cutlass/gemm/gemm.h"
47
+
48
+ #include "cutlass/epilogue/thread/linear_combination.h"
49
+ #include "cutlass/epilogue/thread/linear_combination_relu.h"
50
+ #include "cutlass/epilogue/thread/linear_combination_gelu.h"
51
+ #include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
52
+ #include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
53
+
54
+ #include "cutlass/epilogue/thread/conversion_op.h"
55
+ #include "cutlass/epilogue/thread/reduction_op.h"
56
+
57
+ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
58
+
59
+ #include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
60
+ #include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h"
61
+ #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
62
+ #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
63
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h"
64
+ #include "cutlass/epilogue/threadblock/shared_load_iterator.h"
65
+
66
+ #include "cutlass/epilogue/threadblock/epilogue.h"
67
+
68
+ ////////////////////////////////////////////////////////////////////////////////
69
+
70
+ namespace cutlass {
71
+ namespace epilogue {
72
+ namespace threadblock {
73
+
74
+ /////////////////////////////////////////////////////////////////////////////////////////////////
75
+ /// Specialization and defines sensible defaults for epilogues for complex*complex case
76
+ // 4 real-valued mma operations (Complex)
77
+ // A = (ar + j ai), B (br +j bi), D = AB
78
+ // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br)
79
+ /////////////////////////////////////////////////////////////////////////////////////////////////
80
+ template <
81
+ /// Epilogue Shape
82
+ typename Shape_,
83
+ /// Warp-level mma operator
84
+ typename WarpMmaTensorOp_,
85
+ /// Number of k partitions
86
+ int PartitionsK,
87
+ /// Epilogue output operator
88
+ typename OutputOp_,
89
+ /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load()
90
+ int ElementsPerAccess,
91
+ /// Multiply-add operator
92
+ /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
93
+ typename Operator_ = arch::OpMultiplyAddComplex,
94
+ /// Is for a symmetric kernel
95
+ BlasMode BlasMode_ = BlasMode::kGemm
96
+ >
97
+ struct DefaultEpilogueComplexTensorOpBlas3 {
98
+
99
+ using Shape = Shape_;
100
+ using WarpMmaTensorOp = WarpMmaTensorOp_;
101
+ static int const kPartitionsK = PartitionsK;
102
+ using OutputOp = OutputOp_;
103
+ static int const kElementsPerAccess = ElementsPerAccess;
104
+ using Operator = Operator_;
105
+ static BlasMode const kBlasMode = BlasMode_;
106
+
107
+ using ElementOutput = typename OutputOp::ElementOutput;
108
+ using LayoutC = typename WarpMmaTensorOp::LayoutC;
109
+ using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
110
+
111
+ //
112
+ // Thread map
113
+ //
114
+
115
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
116
+ Shape,
117
+ typename WarpMmaTensorOp::Shape,
118
+ kPartitionsK,
119
+ ElementOutput,
120
+ kElementsPerAccess
121
+ >::Type;
122
+
123
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
124
+ OutputTileThreadMap,
125
+ ElementOutput
126
+ , kBlasMode
127
+ >;
128
+
129
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
130
+ typename WarpMmaTensorOp::Shape,
131
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
132
+ typename WarpMmaTensorOp::Policy::Operator::ElementC,
133
+ typename WarpMmaTensorOp::Policy::Operator::FragmentC,
134
+ LayoutC
135
+ >;
136
+
137
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
138
+ typename WarpMmaTensorOp::Shape,
139
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
140
+ ElementAccumulator,
141
+ LayoutC
142
+ >;
143
+
144
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
145
+ typename OutputTileThreadMap::CompactedThreadMap,
146
+ ElementAccumulator
147
+ >;
148
+
149
+ /// Hard-coded padding elements added
150
+ using Padding = cutlass::MatrixShape<0, 0>;
151
+
152
+ //
153
+ // Define the epilogue
154
+ //
155
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
156
+ Shape,
157
+ WarpMmaTensorOp,
158
+ kPartitionsK,
159
+ OutputTileIterator,
160
+ AccumulatorFragmentIterator,
161
+ WarpTileIterator,
162
+ SharedLoadIterator,
163
+ OutputOp,
164
+ Padding
165
+ >;
166
+ };
167
+
168
+ /////////////////////////////////////////////////////////////////////////////////////////////////
169
+ /// Partial specialization and defines sensible defaults for epilogues for complex*complex case
170
+ // 3 real-valued mma operations (Gaussian Complex)
171
+ // A = (ar + j ai), B = (br +j bi), D = AB
172
+ // P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi)
173
+ // D = dr + j di = (P1 - P3) + j (P1 + P2)
174
+ /////////////////////////////////////////////////////////////////////////////////////////////////
175
+ template <
176
+ typename Shape_,
177
+ typename WarpMmaTensorOp_,
178
+ int PartitionsK,
179
+ typename OutputOp_,
180
+ int ElementsPerAccess,
181
+ BlasMode BlasMode_
182
+ >
183
+ struct DefaultEpilogueComplexTensorOpBlas3 <Shape_, WarpMmaTensorOp_, PartitionsK,
184
+ OutputOp_, ElementsPerAccess,
185
+ arch::OpMultiplyAddGaussianComplex
186
+ , BlasMode_
187
+ > {
188
+
189
+ using Shape = Shape_;
190
+ using WarpMmaTensorOp = WarpMmaTensorOp_;
191
+ static int const kPartitionsK = PartitionsK;
192
+ using OutputOp = OutputOp_;
193
+ static int const kElementsPerAccess = ElementsPerAccess;
194
+ using Operator = arch::OpMultiplyAddGaussianComplex;
195
+ static BlasMode const kBlasMode = BlasMode_;
196
+
197
+ using ElementOutput = typename OutputOp::ElementOutput;
198
+ using LayoutC = typename WarpMmaTensorOp::LayoutC;
199
+ using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
200
+
201
+ //
202
+ // Thread map
203
+ //
204
+
205
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
206
+ Shape,
207
+ typename WarpMmaTensorOp::Shape,
208
+ kPartitionsK,
209
+ ElementOutput,
210
+ kElementsPerAccess
211
+ >::Type;
212
+
213
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
214
+ OutputTileThreadMap,
215
+ ElementOutput,
216
+ kBlasMode
217
+ >;
218
+
219
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp<
220
+ typename WarpMmaTensorOp::Shape,
221
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
222
+ typename WarpMmaTensorOp::Policy::Operator::ElementC,
223
+ typename WarpMmaTensorOp::Policy::Operator::FragmentC,
224
+ LayoutC
225
+ >;
226
+
227
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
228
+ typename WarpMmaTensorOp::Shape,
229
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
230
+ ElementAccumulator,
231
+ LayoutC
232
+ >;
233
+
234
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
235
+ typename OutputTileThreadMap::CompactedThreadMap,
236
+ ElementAccumulator
237
+ >;
238
+
239
+ /// Hard-coded padding elements added
240
+ using Padding = cutlass::MatrixShape<0, 0>;
241
+
242
+ //
243
+ // Define the epilogue
244
+ //
245
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
246
+ Shape,
247
+ WarpMmaTensorOp,
248
+ kPartitionsK,
249
+ OutputTileIterator,
250
+ AccumulatorFragmentIterator,
251
+ WarpTileIterator,
252
+ SharedLoadIterator,
253
+ OutputOp,
254
+ Padding
255
+ >;
256
+ };
257
+
258
+ ////////////////////////////////////////////////////////////////////////////////
259
+
260
+ } // namespace threadblock
261
+ } // namespace epilogue
262
+ } // namespace cutlass
263
+
264
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Direct store epilogue
33
+ */
34
+
35
+ #pragma once
36
+
37
+ ////////////////////////////////////////////////////////////////////////////////
38
+
39
+ #include "cutlass/epilogue/threadblock/epilogue_direct_store.h"
40
+ #include "cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h"
41
+
42
+ ////////////////////////////////////////////////////////////////////////////////
43
+
44
+ namespace cutlass {
45
+ namespace epilogue {
46
+ namespace threadblock {
47
+
48
+ ////////////////////////////////////////////////////////////////////////////////
49
+
50
+ /// Given a properly constructed epilogue, returns a direct store epilogue
51
+ template <typename EpilogueTensorOp>
52
+ struct DefaultEpilogueDirectStore {
53
+
54
+ using OutputTileIterator = DirectStoreEpilogueIterator<typename EpilogueTensorOp::OutputTileIterator::Element>;
55
+
56
+ using Epilogue = EpilogueDirectStore<
57
+ typename EpilogueTensorOp::Shape,
58
+ typename EpilogueTensorOp::WarpMmaOperator,
59
+ EpilogueTensorOp::kPartitionsK,
60
+ OutputTileIterator,
61
+ typename EpilogueTensorOp::AccumulatorFragmentIterator,
62
+ typename EpilogueTensorOp::WarpTileIterator,
63
+ typename EpilogueTensorOp::SharedLoadIterator,
64
+ typename EpilogueTensorOp::OutputOp
65
+ >;
66
+ };
67
+
68
+ ////////////////////////////////////////////////////////////////////////////////
69
+
70
+ } // namespace threadblock
71
+ } // namespace epilogue
72
+ } // namespace cutlass
73
+
74
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Constructs a default epilogue for planar complex outputs.
33
+
34
+ This template reuses components for real-valued epilogues and applies them to planar complex
35
+ output matrices.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+ #include "cutlass/array_planar_complex.h"
45
+
46
+ #include "cutlass/arch/arch.h"
47
+
48
+ #include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
49
+ #include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
50
+ #include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
51
+ #include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
52
+
53
+ #include "cutlass/epilogue/threadblock/epilogue_planar_complex.h"
54
+
55
+ /////////////////////////////////////////////////////////////////////////////////////////////////
56
+
57
+ namespace cutlass {
58
+ namespace epilogue {
59
+ namespace threadblock {
60
+
61
+ /////////////////////////////////////////////////////////////////////////////////////////////////
62
+
63
+ /// Defines sensible defaults for epilogues.
64
+ template <
65
+ typename ThreadblockShape_,
66
+ typename WarpMma_,
67
+ typename OpcodeClass_,
68
+ typename ArchTag_,
69
+ int PartitionsK,
70
+ typename OutputOp_,
71
+ int ElementsPerAccess
72
+ >
73
+ struct DefaultEpiloguePlanarComplex;
74
+
75
+ /////////////////////////////////////////////////////////////////////////////////////////////////
76
+
77
+ /// Defines sensible defaults for epilogues.
78
+ template <
79
+ typename ThreadblockShape_,
80
+ typename WarpMmaOperator_,
81
+ int PartitionsK,
82
+ typename OutputOp_,
83
+ int ElementsPerAccess
84
+ >
85
+ struct DefaultEpiloguePlanarComplex<
86
+ ThreadblockShape_,
87
+ WarpMmaOperator_,
88
+ arch::OpClassTensorOp,
89
+ arch::Sm70,
90
+ PartitionsK,
91
+ OutputOp_,
92
+ ElementsPerAccess> {
93
+
94
+ using RealEpilogue = DefaultEpilogueVoltaTensorOp<
95
+ ThreadblockShape_,
96
+ WarpMmaOperator_,
97
+ PartitionsK,
98
+ OutputOp_,
99
+ ElementsPerAccess
100
+ >;
101
+
102
+ using Epilogue = EpiloguePlanarComplex<
103
+ ThreadblockShape_,
104
+ WarpMmaOperator_,
105
+ PartitionsK,
106
+ typename RealEpilogue::OutputTileIterator,
107
+ typename RealEpilogue::AccumulatorFragmentIterator,
108
+ typename RealEpilogue::WarpTileIterator,
109
+ typename RealEpilogue::SharedLoadIterator,
110
+ OutputOp_,
111
+ typename RealEpilogue::Padding
112
+ >;
113
+ };
114
+
115
+ /////////////////////////////////////////////////////////////////////////////////////////////////
116
+
117
+ /// Defines sensible defaults for epilogues.
118
+ template <
119
+ typename ThreadblockShape_,
120
+ typename WarpMmaOperator_,
121
+ int PartitionsK,
122
+ typename OutputOp_,
123
+ int ElementsPerAccess
124
+ >
125
+ struct DefaultEpiloguePlanarComplex<
126
+ ThreadblockShape_,
127
+ WarpMmaOperator_,
128
+ arch::OpClassTensorOp,
129
+ arch::Sm75,
130
+ PartitionsK,
131
+ OutputOp_,
132
+ ElementsPerAccess> {
133
+
134
+ using RealEpilogue = DefaultEpilogueTensorOp<
135
+ ThreadblockShape_,
136
+ WarpMmaOperator_,
137
+ PartitionsK,
138
+ OutputOp_,
139
+ ElementsPerAccess
140
+ >;
141
+
142
+ using Epilogue = EpiloguePlanarComplex<
143
+ ThreadblockShape_,
144
+ WarpMmaOperator_,
145
+ PartitionsK,
146
+ typename RealEpilogue::OutputTileIterator,
147
+ typename RealEpilogue::AccumulatorFragmentIterator,
148
+ typename RealEpilogue::WarpTileIterator,
149
+ typename RealEpilogue::SharedLoadIterator,
150
+ OutputOp_,
151
+ typename RealEpilogue::Padding
152
+ >;
153
+ };
154
+
155
+ /////////////////////////////////////////////////////////////////////////////////////////////////
156
+
157
+ /// Defines sensible defaults for epilogues.
158
+ template <
159
+ typename ThreadblockShape_,
160
+ typename WarpMmaOperator_,
161
+ int PartitionsK,
162
+ typename OutputOp_,
163
+ int ElementsPerAccess
164
+ >
165
+ struct DefaultEpiloguePlanarComplex<
166
+ ThreadblockShape_,
167
+ WarpMmaOperator_,
168
+ arch::OpClassTensorOp,
169
+ arch::Sm80,
170
+ PartitionsK,
171
+ OutputOp_,
172
+ ElementsPerAccess> {
173
+
174
+ using RealEpilogue = DefaultEpilogueTensorOp<
175
+ ThreadblockShape_,
176
+ WarpMmaOperator_,
177
+ PartitionsK,
178
+ OutputOp_,
179
+ ElementsPerAccess
180
+ >;
181
+
182
+ using Epilogue = EpiloguePlanarComplex<
183
+ ThreadblockShape_,
184
+ WarpMmaOperator_,
185
+ PartitionsK,
186
+ typename RealEpilogue::OutputTileIterator,
187
+ typename RealEpilogue::AccumulatorFragmentIterator,
188
+ typename RealEpilogue::WarpTileIterator,
189
+ typename RealEpilogue::SharedLoadIterator,
190
+ OutputOp_,
191
+ typename RealEpilogue::Padding
192
+ >;
193
+ };
194
+
195
+ /////////////////////////////////////////////////////////////////////////////////////////////////
196
+
197
+ /// Defines sensible defaults for epilogues.
198
+ template <
199
+ typename ThreadblockShape_,
200
+ typename WarpMmaOperator_,
201
+ typename ArchTag_,
202
+ int PartitionsK,
203
+ typename OutputOp_,
204
+ int ElementsPerAccess
205
+ >
206
+ struct DefaultEpiloguePlanarComplex<
207
+ ThreadblockShape_,
208
+ WarpMmaOperator_,
209
+ arch::OpClassSimt,
210
+ ArchTag_,
211
+ PartitionsK,
212
+ OutputOp_,
213
+ ElementsPerAccess> {
214
+
215
+ using RealEpilogue = DefaultEpilogueSimt<
216
+ ThreadblockShape_,
217
+ WarpMmaOperator_,
218
+ OutputOp_,
219
+ ElementsPerAccess
220
+ >;
221
+
222
+ using Epilogue = EpiloguePlanarComplex<
223
+ ThreadblockShape_,
224
+ WarpMmaOperator_,
225
+ PartitionsK,
226
+ typename RealEpilogue::OutputTileIterator,
227
+ typename RealEpilogue::AccumulatorFragmentIterator,
228
+ typename RealEpilogue::WarpTileIterator,
229
+ typename RealEpilogue::SharedLoadIterator,
230
+ OutputOp_,
231
+ typename RealEpilogue::Padding
232
+ >;
233
+ };
234
+
235
+ /////////////////////////////////////////////////////////////////////////////////////////////////
236
+
237
+ } // namespace threadblock
238
+ } // namespace epilogue
239
+ } // namespace cutlass
240
+
241
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using SIMT.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+
45
+ #include "cutlass/arch/mma.h"
46
+
47
+ #include "cutlass/gemm/gemm.h"
48
+ #include "cutlass/gemm/warp/mma.h"
49
+
50
+ #include "cutlass/epilogue/thread/linear_combination.h"
51
+ #include "cutlass/epilogue/thread/linear_combination_clamp.h"
52
+ #include "cutlass/epilogue/thread/linear_combination_relu.h"
53
+ #include "cutlass/epilogue/thread/linear_combination_gelu.h"
54
+ #include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
55
+ #include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
56
+ #include "cutlass/epilogue/thread/conversion_op.h"
57
+ #include "cutlass/epilogue/thread/reduction_op.h"
58
+
59
+ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
60
+
61
+ #include "cutlass/epilogue/warp/fragment_iterator_simt.h"
62
+ #include "cutlass/epilogue/warp/tile_iterator_simt.h"
63
+ #include "cutlass/epilogue/threadblock/default_thread_map_simt.h"
64
+ #include "cutlass/transform/pitch_linear_thread_map.h"
65
+
66
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
67
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h"
68
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
69
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
70
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h"
71
+ #include "cutlass/epilogue/threadblock/shared_load_iterator.h"
72
+ #include "cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h"
73
+ #include "cutlass/epilogue/threadblock/epilogue.h"
74
+ #include "cutlass/epilogue/threadblock/epilogue_depthwise.h"
75
+
76
+ #include "cutlass/layout/permute.h"
77
+
78
+ /////////////////////////////////////////////////////////////////////////////////////////////////
79
+
80
+ namespace cutlass {
81
+ namespace epilogue {
82
+ namespace threadblock {
83
+
84
+ /////////////////////////////////////////////////////////////////////////////////////////////////
85
+
86
+ /// Defines sensible defaults for epilogues for SimtOps.
87
+ template <
88
+ typename Shape_,
89
+ typename WarpMmaSimt_,
90
+ typename OutputOp_,
91
+ int ElementsPerAccess,
92
+ bool ScatterD = false,
93
+ typename PermuteDLayout = layout::NoPermute,
94
+ conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
95
+ int Rank = 4
96
+ >
97
+ struct DefaultEpilogueSimt {
98
+
99
+ using Shape = Shape_;
100
+ using WarpMmaSimt = WarpMmaSimt_;
101
+ using OutputOp = OutputOp_;
102
+ static int const kElementsPerAccess = ElementsPerAccess;
103
+ static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;
104
+
105
+ using ElementOutput = typename OutputOp::ElementOutput;
106
+ using LayoutC = typename WarpMmaSimt::LayoutC;
107
+ using ElementAccumulator = typename WarpMmaSimt::ElementC;
108
+ static conv::StrideSupport const kStrideSupport = StrideSupport;
109
+ static int const kRank = Rank;
110
+
111
+ //
112
+ // Thread map
113
+ //
114
+
115
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
116
+ Shape,
117
+ typename WarpMmaSimt::Shape,
118
+ typename WarpMmaSimt::Policy,
119
+ kPartitionsK,
120
+ ElementOutput,
121
+ kElementsPerAccess
122
+ >::Type;
123
+
124
+ static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
125
+
126
+ using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
127
+ OutputTileThreadMap,
128
+ ElementOutput,
129
+ ScatterD,
130
+ PermuteDLayout,
131
+ UseCUDAStore
132
+ >;
133
+
134
+ using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
135
+ OutputTileThreadMap,
136
+ ElementOutput,
137
+ ScatterD,
138
+ PermuteDLayout,
139
+ UseCUDAStore,
140
+ kRank
141
+ >;
142
+
143
+ using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
144
+ PackedOutputTileIterator,
145
+ StridedOutputTileIterator>::type;
146
+
147
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
148
+ typename WarpMmaSimt::Shape,
149
+ typename WarpMmaSimt::ThreadMma,
150
+ layout::RowMajor,
151
+ typename WarpMmaSimt::Policy
152
+ >;
153
+
154
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
155
+ typename WarpMmaSimt::Shape,
156
+ typename WarpMmaSimt::ThreadMma,
157
+ ElementAccumulator,
158
+ layout::RowMajor,
159
+ typename WarpMmaSimt::Policy
160
+ >;
161
+
162
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
163
+ typename OutputTileThreadMap::CompactedThreadMap,
164
+ ElementAccumulator
165
+ >;
166
+
167
+ /// Hard-coded padding elements added
168
+ using Padding = typename WarpTileIterator::Padding;
169
+
170
+ //
171
+ // Define the epilogue
172
+ //
173
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
174
+ Shape,
175
+ WarpMmaSimt,
176
+ kPartitionsK,
177
+ OutputTileIterator,
178
+ AccumulatorFragmentIterator,
179
+ WarpTileIterator,
180
+ SharedLoadIterator,
181
+ OutputOp,
182
+ Padding
183
+ >;
184
+ };
185
+
186
+ /////////////////////////////////////////////////////////////////////////////////////////////////
187
+
188
+ /// Defines sensible defaults for epilogues for SimtOps.
189
+ template <
190
+ typename Shape_,
191
+ typename WarpMmaSimt_,
192
+ typename OutputOp_,
193
+ int ElementsPerAccess
194
+ >
195
+ struct DefaultEpilogueSimtStridedDgrad {
196
+
197
+ using Shape = Shape_;
198
+ using WarpMmaSimt = WarpMmaSimt_;
199
+ using OutputOp = OutputOp_;
200
+ static int const kElementsPerAccess = ElementsPerAccess;
201
+ static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;
202
+
203
+ using ElementOutput = typename OutputOp::ElementOutput;
204
+ using LayoutC = typename WarpMmaSimt::LayoutC;
205
+ using ElementAccumulator = typename WarpMmaSimt::ElementC;
206
+
207
+ //
208
+ // Thread map
209
+ //
210
+
211
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
212
+ Shape,
213
+ typename WarpMmaSimt::Shape,
214
+ typename WarpMmaSimt::Policy,
215
+ kPartitionsK,
216
+ ElementOutput,
217
+ kElementsPerAccess
218
+ >::Type;
219
+
220
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
221
+ OutputTileThreadMap,
222
+ ElementOutput
223
+ >;
224
+
225
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
226
+ typename WarpMmaSimt::Shape,
227
+ typename WarpMmaSimt::ThreadMma,
228
+ layout::RowMajor,
229
+ typename WarpMmaSimt::Policy
230
+ >;
231
+
232
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
233
+ typename WarpMmaSimt::Shape,
234
+ typename WarpMmaSimt::ThreadMma,
235
+ ElementAccumulator,
236
+ layout::RowMajor,
237
+ typename WarpMmaSimt::Policy
238
+ >;
239
+
240
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
241
+ typename OutputTileThreadMap::CompactedThreadMap,
242
+ ElementAccumulator
243
+ >;
244
+
245
+ /// Hard-coded padding elements added
246
+ using Padding = typename WarpTileIterator::Padding;
247
+
248
+ //
249
+ // Define the epilogue
250
+ //
251
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
252
+ Shape,
253
+ WarpMmaSimt,
254
+ kPartitionsK,
255
+ OutputTileIterator,
256
+ AccumulatorFragmentIterator,
257
+ WarpTileIterator,
258
+ SharedLoadIterator,
259
+ OutputOp,
260
+ Padding
261
+ >;
262
+ };
263
+
264
+ /////////////////////////////////////////////////////////////////////////////////////////////////
265
+
266
+ /// Defines sensible defaults for epilogues for SimtOps.
267
+ template <
268
+ int Rank,
269
+ typename Shape_,
270
+ typename WarpMmaSimt_,
271
+ typename OutputOp_,
272
+ int ElementsPerAccess
273
+ >
274
+ struct DefaultEpilogueSimtAffineRankN {
275
+
276
+ using Shape = Shape_;
277
+ using WarpMmaSimt = WarpMmaSimt_;
278
+ using OutputOp = OutputOp_;
279
+ static int const kElementsPerAccess = ElementsPerAccess;
280
+ static const int kPartitionsK = Shape::kK / WarpMmaSimt::Shape::kK;
281
+
282
+ using ElementOutput = typename OutputOp::ElementOutput;
283
+ using LayoutC = typename WarpMmaSimt::LayoutC;
284
+ using ElementAccumulator = typename WarpMmaSimt::ElementC;
285
+
286
+ //
287
+ // Thread map
288
+ //
289
+
290
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapSimt<
291
+ Shape,
292
+ typename WarpMmaSimt::Shape,
293
+ typename WarpMmaSimt::Policy,
294
+ kPartitionsK,
295
+ ElementOutput,
296
+ kElementsPerAccess
297
+ >::Type;
298
+
299
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
300
+ OutputTileThreadMap,
301
+ ElementOutput,
302
+ Rank
303
+ >;
304
+
305
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
306
+ typename WarpMmaSimt::Shape,
307
+ typename WarpMmaSimt::ThreadMma,
308
+ layout::RowMajor,
309
+ typename WarpMmaSimt::Policy
310
+ >;
311
+
312
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimt<
313
+ typename WarpMmaSimt::Shape,
314
+ typename WarpMmaSimt::ThreadMma,
315
+ ElementAccumulator,
316
+ layout::RowMajor,
317
+ typename WarpMmaSimt::Policy
318
+ >;
319
+
320
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
321
+ typename OutputTileThreadMap::CompactedThreadMap,
322
+ ElementAccumulator
323
+ >;
324
+
325
+ /// Hard-coded padding elements added
326
+ using Padding = typename WarpTileIterator::Padding;
327
+
328
+ //
329
+ // Define the epilogue
330
+ //
331
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
332
+ Shape,
333
+ WarpMmaSimt,
334
+ kPartitionsK,
335
+ OutputTileIterator,
336
+ AccumulatorFragmentIterator,
337
+ WarpTileIterator,
338
+ SharedLoadIterator,
339
+ OutputOp,
340
+ Padding
341
+ >;
342
+ };
343
+
344
+ /////////////////////////////////////////////////////////////////////////////////////////////////
345
+ /////////////////////////////////////////////////////////////////////////////////////////////////
346
+
347
+ /// Defines sensible defaults for epilogues for SimtOps.
348
+ template <typename Shape_, // ThreadBlock Shape
349
+ typename WarpMmaSimt_, // mma_depthwise_simt
350
+ typename OutputOp_,
351
+ int ElementsPerAccess_,
352
+ typename ThreadOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1>,
353
+ typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1> >
354
+ struct DefaultDirectConvEpilogueSimt {
355
+ using Shape = Shape_;
356
+ using WarpMmaSimt = WarpMmaSimt_;
357
+ using WarpShape = typename WarpMmaSimt::Shape;
358
+ using OutputOp = OutputOp_;
359
+ using ThreadOutputShape = ThreadOutputShape_;
360
+ using ThreadBlockOutputShape = ThreadBlockOutputShape_;
361
+ static int const kElementsPerAccess = ElementsPerAccess_;
362
+
363
+
364
+ using ElementOutput = typename OutputOp::ElementOutput;
365
+ using LayoutC = typename WarpMmaSimt::LayoutC;
366
+ using ElementAccumulator = typename WarpMmaSimt::ElementC;
367
+
368
+ /// Number of threads total
369
+ using WarpCount = gemm::GemmShape<
370
+ Shape::kM / WarpShape::kM,
371
+ Shape::kN / WarpShape::kN
372
+ >;
373
+
374
+ static int const kWarpSize = cutlass::gemm::warp::WarpSize<arch::OpClassSimt>::value;
375
+
376
+ static int const kThreads = WarpCount::kCount * kWarpSize;
377
+
378
+ //
379
+ // Thread map
380
+ //
381
+
382
+ using OutputTileThreadMap = cutlass::transform::PitchLinearStripminedThreadMap<
383
+ layout::PitchLinearShape<ThreadBlockOutputShape::kC, ThreadBlockOutputShape::kNHW>,
384
+ kThreads,
385
+ kElementsPerAccess
386
+ >;
387
+
388
+
389
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorDirectConv<
390
+ OutputTileThreadMap,
391
+ ElementOutput,
392
+ ThreadOutputShape,
393
+ ThreadBlockOutputShape
394
+ >;
395
+
396
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
397
+ typename WarpMmaSimt::Shape,
398
+ typename WarpMmaSimt::ThreadMma,
399
+ layout::RowMajor,
400
+ typename WarpMmaSimt::Policy
401
+ >;
402
+
403
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorSimtDirect2dConv<
404
+ typename WarpMmaSimt::Shape,
405
+ ThreadOutputShape,
406
+ ThreadBlockOutputShape,
407
+ typename WarpMmaSimt::ThreadMma,
408
+ ElementAccumulator,
409
+ layout::RowMajor,
410
+ typename WarpMmaSimt::Policy
411
+ >;
412
+
413
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorPitchLinear<
414
+ OutputTileThreadMap,
415
+ ElementAccumulator
416
+ >;
417
+
418
+ /// Hard-coded padding elements added
419
+ using Padding = typename WarpTileIterator::Padding;
420
+ //
421
+ // Define the epilogue
422
+ //
423
+ using Epilogue = cutlass::epilogue::threadblock::EpilogueDepthwise<
424
+ Shape,
425
+ ThreadOutputShape,
426
+ ThreadBlockOutputShape,
427
+ WarpMmaSimt,
428
+ OutputTileIterator,
429
+ AccumulatorFragmentIterator,
430
+ WarpTileIterator,
431
+ SharedLoadIterator,
432
+ OutputOp,
433
+ Padding
434
+ >;
435
+ };
436
+
437
+ /////////////////////////////////////////////////////////////////////////////////////////////////
438
+
439
+ } // namespace threadblock
440
+ } // namespace epilogue
441
+ } // namespace cutlass
442
+
443
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h ADDED
@@ -0,0 +1,904 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+
45
+ #include "cutlass/platform/platform.h"
46
+
47
+ #include "cutlass/gemm/gemm.h"
48
+
49
+ #include "cutlass/epilogue/thread/linear_combination.h"
50
+ #include "cutlass/epilogue/thread/linear_combination_clamp.h"
51
+ #include "cutlass/epilogue/thread/linear_combination_relu.h"
52
+ #include "cutlass/epilogue/thread/linear_combination_relu0.h"
53
+ #include "cutlass/epilogue/thread/linear_combination_gelu.h"
54
+ #include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
55
+ #include "cutlass/epilogue/thread/linear_combination_hardswish.h"
56
+ #include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
57
+
58
+ #include "cutlass/epilogue/thread/conversion_op.h"
59
+ #include "cutlass/epilogue/thread/reduction_op.h"
60
+
61
+ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
62
+
63
+ #include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
64
+ #include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
65
+ #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
66
+ #include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
67
+ #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
68
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
69
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h"
70
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
71
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
72
+ #include "cutlass/epilogue/threadblock/shared_load_iterator.h"
73
+ #include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
74
+
75
+ #include "cutlass/epilogue/threadblock/epilogue.h"
76
+ #include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
77
+
78
+ #include "cutlass/layout/permute.h"
79
+
80
+ ////////////////////////////////////////////////////////////////////////////////
81
+
82
+ namespace cutlass {
83
+ namespace epilogue {
84
+ namespace threadblock {
85
+
86
+ ////////////////////////////////////////////////////////////////////////////////
87
+
88
+ namespace detail {
89
+
90
/// Selects the shared-memory staging iterators for a TensorOp epilogue.
///
/// Generic case: accumulators are written to and read back from shared
/// memory in the accumulator's native element type.
///
/// ElementOutput      - element type written to global memory
/// ElementAccumulator - element type produced by the warp-level MMA
/// ElementsPerAccess  - vector width of global-memory accesses
/// ThreadMap          - compacted output-tile thread map
template <
  typename ElementOutput,
  typename ElementAccumulator,
  int ElementsPerAccess,
  typename ThreadblockShape,
  typename WarpShape,
  typename InstructionShape,
  typename ThreadMap
>
struct DefaultIteratorsTensorOp {

  // Warp-level iterator that stores accumulator fragments to shared memory.
  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
    WarpShape,
    InstructionShape,
    ElementAccumulator,
    layout::RowMajor
  >;

  // Threadblock-level iterator that loads the staged accumulators back.
  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
    ThreadMap,
    ElementAccumulator
  >;

  // Generic case stages one fragment per epilogue iteration (no double buffering).
  static int const kFragmentsPerIteration = 1;
};
115
+
116
+ /// Partial specialization for float <= float x 4
117
+ template <
118
+ typename ThreadblockShape,
119
+ typename WarpShape,
120
+ typename InstructionShape,
121
+ typename ThreadMap
122
+ >
123
+ struct DefaultIteratorsTensorOp<float, float, 4, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
124
+
125
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
126
+ WarpShape,
127
+ InstructionShape,
128
+ float,
129
+ layout::RowMajor
130
+ >;
131
+
132
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
133
+ ThreadMap,
134
+ float
135
+ >;
136
+
137
+ static int const kFragmentsPerIteration = 2;
138
+ };
139
+
140
+ /// Partial specialization for int32_t <= int32_t
141
+ template <
142
+ int ElementsPerAccess,
143
+ typename ThreadblockShape,
144
+ typename WarpShape,
145
+ typename InstructionShape,
146
+ typename ThreadMap
147
+ >
148
+ struct DefaultIteratorsTensorOp<int32_t, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
149
+
150
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
151
+ WarpShape,
152
+ InstructionShape,
153
+ int32_t,
154
+ layout::RowMajor
155
+ >;
156
+
157
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
158
+ ThreadMap,
159
+ int32_t
160
+ >;
161
+
162
+ static int const kFragmentsPerIteration = 1;
163
+ };
164
+
165
+ /// Partial specialization for float <= int32_t
166
+ template <
167
+ int ElementsPerAccess,
168
+ typename ThreadblockShape,
169
+ typename WarpShape,
170
+ typename InstructionShape,
171
+ typename ThreadMap
172
+ >
173
+ struct DefaultIteratorsTensorOp<float, int32_t, ElementsPerAccess, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
174
+
175
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
176
+ WarpShape,
177
+ InstructionShape,
178
+ int32_t,
179
+ layout::RowMajor
180
+ >;
181
+
182
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
183
+ ThreadMap,
184
+ int32_t
185
+ >;
186
+
187
+ static int const kFragmentsPerIteration = 1;
188
+ };
189
+
190
/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts.
template <
  typename ThreadblockShape,
  typename WarpShape,
  typename InstructionShape,
  typename ThreadMap
>
struct DefaultIteratorsTensorOp<
  half_t,
  float,
  8,
  ThreadblockShape,
  WarpShape,
  InstructionShape,
  ThreadMap> {

  // "Mixed" iterators stage 32-bit accumulators destined for 16-bit outputs
  // with an access pattern chosen to avoid shared-memory bank conflicts.
  // NOTE(review): the integer parameters (32, 16, 8, 8) presumably encode
  // accumulator bits, output bits, elements per access, and access count --
  // confirm against the TileIteratorTensorOpMixed template documentation.
  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
    WarpShape,
    InstructionShape,
    float,
    32,
    16,
    8,
    8
  >;

  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
    ThreadMap,
    float,
    32,
    16,
    8,
    8
  >;

  // Two fragments are staged per epilogue iteration for this configuration.
  static int const kFragmentsPerIteration = 2;
};
227
+
228
/// Partial specialization for bfloat16_t <= int32_t x 8 epilogues avoids shared memory bank conflicts.
/// (Comment fixed: this specialization is for bfloat16_t output, not half.)
template <
  typename ThreadblockShape,
  typename WarpShape,
  typename InstructionShape,
  typename ThreadMap
>
struct DefaultIteratorsTensorOp<
  bfloat16_t,
  int32_t,
  8,
  ThreadblockShape,
  WarpShape,
  InstructionShape,
  ThreadMap> {

  // Mixed iterators stage 32-bit integer accumulators destined for 16-bit
  // outputs with a bank-conflict-avoiding shared-memory layout.
  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
    WarpShape,
    InstructionShape,
    int32_t,
    32,
    16,
    8,
    8
  >;

  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
    ThreadMap,
    int32_t,
    32,
    16,
    8,
    8
  >;

  // Two fragments are staged per epilogue iteration for this configuration.
  static int const kFragmentsPerIteration = 2;
};
265
+
266
/// Partial specialization for half <= int32_t x 8 epilogues avoids shared memory bank conflicts.
template <
  typename ThreadblockShape,
  typename WarpShape,
  typename InstructionShape,
  typename ThreadMap
>
struct DefaultIteratorsTensorOp<
  half_t,
  int32_t,
  8,
  ThreadblockShape,
  WarpShape,
  InstructionShape,
  ThreadMap> {

  // Mixed iterators stage 32-bit integer accumulators destined for 16-bit
  // outputs with a bank-conflict-avoiding shared-memory layout.
  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
    WarpShape,
    InstructionShape,
    int32_t,
    32,
    16,
    8,
    8
  >;

  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
    ThreadMap,
    int32_t,
    32,
    16,
    8,
    8
  >;

  // Two fragments are staged per epilogue iteration for this configuration.
  static int const kFragmentsPerIteration = 2;
};
303
+
304
+ /// Partial specialization for int8/int4b_t <= int32 x 16/8 epilogues avoids shared memory bank conflicts.
305
+ /// Threadblock::kN = 256 still has bank conflicts.
306
+ template <
307
+ typename ElementOutput,
308
+ int ElementsPerAccess,
309
+ typename ThreadblockShape,
310
+ typename WarpShape,
311
+ typename InstructionShape,
312
+ typename ThreadMap
313
+ >
314
+ struct DefaultIteratorsTensorOp<
315
+ ElementOutput,
316
+ int32_t,
317
+ ElementsPerAccess,
318
+ ThreadblockShape,
319
+ WarpShape,
320
+ InstructionShape,
321
+ ThreadMap> {
322
+
323
+ static_assert(platform::is_same<ElementOutput, cutlass::int4b_t>::value ||
324
+ platform::is_same<ElementOutput, cutlass::uint4b_t>::value ||
325
+ platform::is_same<ElementOutput, int8_t>::value ||
326
+ platform::is_same<ElementOutput, uint8_t>::value,
327
+ "ElementOutput needs to be 4 or 8 bit (unsigned) int.");
328
+
329
+ static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
330
+ "ElementsPerAccess needs to be 16 or 8.");
331
+
332
+ using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
333
+ WarpShape,
334
+ InstructionShape,
335
+ int32_t,
336
+ 32,
337
+ cutlass::sizeof_bits<ElementOutput>::value,
338
+ ElementsPerAccess,
339
+ 8
340
+ >;
341
+
342
+ using WarpTileIteratorNotMixed = cutlass::epilogue::warp::TileIteratorTensorOp<
343
+ WarpShape,
344
+ InstructionShape,
345
+ int32_t,
346
+ layout::RowMajor
347
+ >;
348
+
349
+ using WarpTileIterator = typename platform::conditional<
350
+ (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
351
+ WarpTileIteratorNotMixed,
352
+ WarpTileIteratorMixed>::type;
353
+
354
+ using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
355
+ ThreadMap,
356
+ int32_t,
357
+ 32,
358
+ cutlass::sizeof_bits<ElementOutput>::value,
359
+ ElementsPerAccess,
360
+ 8
361
+ >;
362
+
363
+ using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
364
+ ThreadMap,
365
+ int32_t
366
+ >;
367
+
368
+ using SharedLoadIterator = typename platform::conditional<
369
+ (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
370
+ SharedLoadIteratorNotMixed,
371
+ SharedLoadIteratorMixed>::type;
372
+
373
+ static int const kFragmentsPerIteration = 1;
374
+ };
375
+
376
+ /// Partial specialization for float_e4m3_t <= float x 16/8 epilogues avoids shared memory bank conflicts.
377
+ /// Threadblock::kN = 256 still has bank conflicts.
378
+ template <
379
+ int ElementsPerAccess,
380
+ typename ThreadblockShape,
381
+ typename WarpShape,
382
+ typename InstructionShape,
383
+ typename ThreadMap
384
+ >
385
+ struct DefaultIteratorsTensorOp<
386
+ cutlass::float_e4m3_t,
387
+ float,
388
+ ElementsPerAccess,
389
+ ThreadblockShape,
390
+ WarpShape,
391
+ InstructionShape,
392
+ ThreadMap> {
393
+
394
+ using ElementOutput = cutlass::float_e4m3_t;
395
+
396
+ static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
397
+ "ElementsPerAccess needs to be 16 or 8.");
398
+
399
+ using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
400
+ WarpShape,
401
+ InstructionShape,
402
+ float,
403
+ 32,
404
+ cutlass::sizeof_bits<ElementOutput>::value,
405
+ ElementsPerAccess,
406
+ 8
407
+ >;
408
+
409
+ using WarpTileIteratorNotMixed = cutlass::epilogue::warp::TileIteratorTensorOp<
410
+ WarpShape,
411
+ InstructionShape,
412
+ float,
413
+ layout::RowMajor
414
+ >;
415
+
416
+ using WarpTileIterator = typename platform::conditional<
417
+ (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
418
+ WarpTileIteratorNotMixed,
419
+ WarpTileIteratorMixed>::type;
420
+
421
+ using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
422
+ ThreadMap,
423
+ float,
424
+ 32,
425
+ cutlass::sizeof_bits<ElementOutput>::value,
426
+ ElementsPerAccess,
427
+ 8
428
+ >;
429
+
430
+ using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
431
+ ThreadMap,
432
+ float
433
+ >;
434
+
435
+ using SharedLoadIterator = typename platform::conditional<
436
+ (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
437
+ SharedLoadIteratorNotMixed,
438
+ SharedLoadIteratorMixed>::type;
439
+
440
+ static int const kFragmentsPerIteration = 1;
441
+ };
442
+
443
+ /// Partial specialization for float_e5m2_t <= float x 16/8 epilogues avoids shared memory bank conflicts.
444
+ /// Threadblock::kN = 256 still has bank conflicts.
445
+ template <
446
+ int ElementsPerAccess,
447
+ typename ThreadblockShape,
448
+ typename WarpShape,
449
+ typename InstructionShape,
450
+ typename ThreadMap
451
+ >
452
+ struct DefaultIteratorsTensorOp<
453
+ cutlass::float_e5m2_t,
454
+ float,
455
+ ElementsPerAccess,
456
+ ThreadblockShape,
457
+ WarpShape,
458
+ InstructionShape,
459
+ ThreadMap> {
460
+
461
+ using ElementOutput = cutlass::float_e5m2_t;
462
+
463
+ static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8 || ElementsPerAccess == 4),
464
+ "ElementsPerAccess needs to be 16 or 8.");
465
+
466
+ using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
467
+ WarpShape,
468
+ InstructionShape,
469
+ float,
470
+ 32,
471
+ cutlass::sizeof_bits<ElementOutput>::value,
472
+ ElementsPerAccess,
473
+ 8
474
+ >;
475
+
476
+ using WarpTileIteratorNotMixed = cutlass::epilogue::warp::TileIteratorTensorOp<
477
+ WarpShape,
478
+ InstructionShape,
479
+ float,
480
+ layout::RowMajor
481
+ >;
482
+
483
+ using WarpTileIterator = typename platform::conditional<
484
+ (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
485
+ WarpTileIteratorNotMixed,
486
+ WarpTileIteratorMixed>::type;
487
+
488
+ using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
489
+ ThreadMap,
490
+ float,
491
+ 32,
492
+ cutlass::sizeof_bits<ElementOutput>::value,
493
+ ElementsPerAccess,
494
+ 8
495
+ >;
496
+
497
+ using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
498
+ ThreadMap,
499
+ float
500
+ >;
501
+
502
+ using SharedLoadIterator = typename platform::conditional<
503
+ (ThreadblockShape::kN == 256) || (ThreadblockShape::kN == 128 && ElementsPerAccess == 8) || (ElementsPerAccess == 4),
504
+ SharedLoadIteratorNotMixed,
505
+ SharedLoadIteratorMixed>::type;
506
+
507
+ static int const kFragmentsPerIteration = 1;
508
+ };
509
+
510
+ } // namespace detail
511
+
512
+ ////////////////////////////////////////////////////////////////////////////////
513
+
514
/// Defines sensible defaults for epilogues for TensorOps.
///
/// Composes the thread map, output-tile iterator, accumulator fragment
/// iterator, and shared-memory staging iterators into a complete
/// threadblock-scoped Epilogue type.
template <
  typename Shape_,
  typename WarpMmaTensorOp_,
  int PartitionsK,
  typename OutputOp_,
  int ElementsPerAccess,
  bool ScatterD = false,
  typename PermuteDLayout = layout::NoPermute,
  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
  int Rank = 4
>
struct DefaultEpilogueTensorOp {

  using Shape = Shape_;
  using WarpMmaTensorOp = WarpMmaTensorOp_;
  static int const kPartitionsK = PartitionsK;
  using OutputOp = OutputOp_;
  static int const kElementsPerAccess = ElementsPerAccess;

  using ElementOutput = typename OutputOp::ElementOutput;
  using LayoutC = typename WarpMmaTensorOp::LayoutC;
  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
  static conv::StrideSupport const kStrideSupport = StrideSupport;
  static int const kRank = Rank;

  //
  // Thread map
  //

  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
    Shape,
    typename WarpMmaTensorOp::Shape,
    kPartitionsK,
    ElementOutput,
    kElementsPerAccess
  >::Type;

  // CUDA store path is enabled only for double-precision outputs.
  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;

  // Output iterator for packed (unit-stride) tensors.
  using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    OutputTileThreadMap,
    ElementOutput,
    ScatterD,
    PermuteDLayout,
    UseCUDAStore
  >;

  // Output iterator for strided (convolution-style, rank-N) tensors.
  using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
    OutputTileThreadMap,
    ElementOutput,
    ScatterD,
    PermuteDLayout,
    UseCUDAStore,
    kRank
  >;

  // Select the packed iterator for unit-stride support, otherwise strided.
  using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
    PackedOutputTileIterator,
    StridedOutputTileIterator>::type;

  // Complex-valued outputs use the complex fragment iterator; real-valued
  // outputs use the plain one.
  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
                                        typename WarpMmaTensorOp::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
                                        LayoutC>,
                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
                                        typename WarpMmaTensorOp::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
                                        LayoutC> >::type;

  /// Support several implementations depending on structure of epilogue
  using DefaultIterators = detail::DefaultIteratorsTensorOp<
    ElementOutput,
    ElementAccumulator,
    kElementsPerAccess,
    Shape,
    typename WarpMmaTensorOp::Shape,
    typename WarpMmaTensorOp::Policy::Operator::Shape,
    typename OutputTileThreadMap::CompactedThreadMap
  >;

  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;

  /// Hard-coded padding elements added
  // NOTE(review): padding presumably mitigates shared-memory bank conflicts
  // when staging accumulators -- confirm against the Epilogue implementation.
  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;

  // Double buffering is disabled whenever split-K uses more than one partition.
  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);

  //
  // Define the epilogue
  //
  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
    Shape,
    WarpMmaTensorOp,
    kPartitionsK,
    OutputTileIterator,
    AccumulatorFragmentIterator,
    WarpTileIterator,
    SharedLoadIterator,
    OutputOp,
    Padding,
    kFragmentsPerIteration
  >;
};
624
+
625
+ ////////////////////////////////////////////////////////////////////////////////
626
+
627
/// Defines sensible defaults for epilogues for TensorOps used by strided
/// dgrad convolutions. Identical in structure to DefaultEpilogueTensorOp
/// except the output iterator is PredicatedTileIteratorStridedDgrad.
template <
  typename Shape_,
  typename WarpMmaTensorOp_,
  int PartitionsK,
  typename OutputOp_,
  int ElementsPerAccess
>
struct DefaultEpilogueTensorOpStridedDgrad {

  using Shape = Shape_;
  using WarpMmaTensorOp = WarpMmaTensorOp_;
  static int const kPartitionsK = PartitionsK;
  using OutputOp = OutputOp_;
  static int const kElementsPerAccess = ElementsPerAccess;

  using ElementOutput = typename OutputOp::ElementOutput;
  using LayoutC = typename WarpMmaTensorOp::LayoutC;
  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;

  //
  // Thread map
  //

  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
    Shape,
    typename WarpMmaTensorOp::Shape,
    kPartitionsK,
    ElementOutput,
    kElementsPerAccess
  >::Type;

  // Output iterator specialized for strided dgrad output tensors.
  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
    OutputTileThreadMap,
    ElementOutput
  >;

  // Complex-valued outputs use the complex fragment iterator; real-valued
  // outputs use the plain one.
  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
                                        typename WarpMmaTensorOp::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
                                        LayoutC>,
                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
                                        typename WarpMmaTensorOp::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
                                        LayoutC> >::type;

  /// Support several implementations depending on structure of epilogue
  using DefaultIterators = detail::DefaultIteratorsTensorOp<
    ElementOutput,
    ElementAccumulator,
    kElementsPerAccess,
    Shape,
    typename WarpMmaTensorOp::Shape,
    typename WarpMmaTensorOp::Policy::Operator::Shape,
    typename OutputTileThreadMap::CompactedThreadMap
  >;

  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;

  /// Hard-coded padding elements added
  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;

  // Double buffering is disabled whenever split-K uses more than one partition.
  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);

  //
  // Define the epilogue
  //
  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
    Shape,
    WarpMmaTensorOp,
    kPartitionsK,
    OutputTileIterator,
    AccumulatorFragmentIterator,
    WarpTileIterator,
    SharedLoadIterator,
    OutputOp,
    Padding,
    kFragmentsPerIteration
  >;
};
713
+
714
+
715
+ ////////////////////////////////////////////////////////////////////////////////
716
+
717
/// Defines sensible defaults for epilogues for TensorOps writing to
/// rank-N affine output tensors.
template <
  int Rank,
  typename Shape_,
  typename WarpMmaTensorOp_,
  int PartitionsK,
  typename OutputOp_,
  int ElementsPerAccess
>
struct DefaultEpilogueTensorOpAffineRankN {

  using Shape = Shape_;
  using WarpMmaTensorOp = WarpMmaTensorOp_;
  static int const kPartitionsK = PartitionsK;
  using OutputOp = OutputOp_;
  static int const kElementsPerAccess = ElementsPerAccess;

  using ElementOutput = typename OutputOp::ElementOutput;
  using LayoutC = typename WarpMmaTensorOp::LayoutC;
  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;

  //
  // Thread map
  //

  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
    Shape,
    typename WarpMmaTensorOp::Shape,
    kPartitionsK,
    ElementOutput,
    kElementsPerAccess
  >::Type;

  // Output iterator for rank-N affine layouts.
  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
    OutputTileThreadMap,
    ElementOutput,
    Rank
  >;

  // Map to the row major iterator since the iterator selection for affineN is the same.
  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
                                        typename WarpMmaTensorOp::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
                                        layout::RowMajor>,
                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
                                        typename WarpMmaTensorOp::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
                                        layout::RowMajor> >::type;

  /// Support several implementations depending on structure of epilogue
  using DefaultIterators = detail::DefaultIteratorsTensorOp<
    ElementOutput,
    ElementAccumulator,
    kElementsPerAccess,
    Shape,
    typename WarpMmaTensorOp::Shape,
    typename WarpMmaTensorOp::Policy::Operator::Shape,
    typename OutputTileThreadMap::CompactedThreadMap
  >;

  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;

  /// Hard-coded padding elements added
  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;

  // Double buffering is disabled whenever split-K uses more than one partition.
  static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1);

  //
  // Define the epilogue
  //
  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
    Shape,
    WarpMmaTensorOp,
    kPartitionsK,
    OutputTileIterator,
    AccumulatorFragmentIterator,
    WarpTileIterator,
    SharedLoadIterator,
    OutputOp,
    Padding,
    kFragmentsPerIteration
  >;
};
806
+
807
+ ////////////////////////////////////////////////////////////////////////////////
808
/// Defines sensible defaults for epilogues for TensorOps which use an
/// interleaved output layout. For this case, shared memory is not needed.
template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
          bool isSplitK = false>
struct DefaultInterleavedEpilogueTensorOp {
  using Shape = Shape_;
  using WarpMmaTensorOp = WarpMmaTensorOp_;
  static int const kPartitionsK = PartitionsK;
  using OutputOp = OutputOp_;
  static int const kElementsPerAccess = ElementsPerAccess;

  using ElementOutput = typename OutputOp::ElementOutput;
  using LayoutC = typename WarpMmaTensorOp::LayoutC;
  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;

  //
  // Thread map
  //
  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
      DefaultInterleavedThreadMapTensorOp<
          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
          kElementsPerAccess, InterleavedK>::Type;

  // Output iterator for the interleaved layout.
  using OutputTileIterator =
      cutlass::epilogue::threadblock::InterleavedPredicatedTileIterator<
          OutputTileThreadMap, ElementOutput, InterleavedK>;

  using AccumulatorFragmentIterator =
      cutlass::epilogue::warp::FragmentIteratorTensorOp<
          typename WarpMmaTensorOp::Shape,
          typename WarpMmaTensorOp::Policy::Operator::Shape,
          typename WarpMmaTensorOp::Policy::Operator::ElementC,
          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
          LayoutC>;

  //
  // Define the epilogue (InterleavedEpilogue writes accumulators directly,
  // without staging through shared memory)
  //
  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
};
851
+
852
+ ////////////////////////////////////////////////////////////////////////////////
853
+
854
/// Defines sensible defaults for convolution epilogues for TensorOps which
/// use an interleaved output layout. For this case, shared memory is not needed.
template <typename Shape_, typename WarpMmaTensorOp_, int PartitionsK,
          typename OutputOp_, int ElementsPerAccess, int InterleavedK,
          bool isSplitK = false>
struct DefaultInterleavedConvEpilogue {
  using Shape = Shape_;
  using WarpMmaTensorOp = WarpMmaTensorOp_;
  static int const kPartitionsK = PartitionsK;
  using OutputOp = OutputOp_;
  static int const kElementsPerAccess = ElementsPerAccess;

  using ElementOutput = typename OutputOp::ElementOutput;
  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;

  //
  // Thread map
  //
  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::
      DefaultInterleavedConvThreadMapTensorOp<
          Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput,
          kElementsPerAccess, InterleavedK>::Type;

  // Output iterator for interleaved convolution output tensors.
  using OutputTileIterator =
      cutlass::epilogue::threadblock::InterleavedConvPredicatedTileIterator<
          OutputTileThreadMap, ElementOutput, InterleavedK>;

  using AccumulatorFragmentIterator =
      cutlass::epilogue::warp::FragmentIteratorTensorOp<
          typename WarpMmaTensorOp::Shape,
          typename WarpMmaTensorOp::Policy::Operator::Shape,
          typename WarpMmaTensorOp::Policy::Operator::ElementC,
          typename WarpMmaTensorOp::Policy::Operator::FragmentC,
          // can reuse the gemm version here to do element selection
          layout::ColumnMajorInterleaved<InterleavedK>>;

  //
  // Define the epilogue
  //
  using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue<
      Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator,
      AccumulatorFragmentIterator, OutputOp, InterleavedK>;
};
897
+
898
+ ////////////////////////////////////////////////////////////////////////////////
899
+
900
+ } // namespace threadblock
901
+ } // namespace epilogue
902
+ } // namespace cutlass
903
+
904
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+
38
+ */
39
+
40
+ #pragma once
41
+
42
+ #include "cutlass/cutlass.h"
43
+ #include "cutlass/numeric_types.h"
44
+ #include "cutlass/array.h"
45
+
46
+ #include "cutlass/gemm/gemm.h"
47
+
48
+ #include "cutlass/epilogue/thread/linear_combination.h"
49
+ #include "cutlass/epilogue/thread/linear_combination_clamp.h"
50
+ #include "cutlass/epilogue/thread/linear_combination_relu.h"
51
+ #include "cutlass/epilogue/thread/linear_combination_gelu.h"
52
+ #include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
53
+ #include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
54
+
55
+ #include "cutlass/epilogue/thread/conversion_op.h"
56
+ #include "cutlass/epilogue/thread/reduction_op.h"
57
+
58
+ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
59
+
60
+ #include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
61
+ #include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
62
+ #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
63
+ #include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
64
+ #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
65
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h"
66
+ #include "cutlass/epilogue/threadblock/shared_load_iterator.h"
67
+ #include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
68
+
69
+ #include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
70
+ #include "cutlass/epilogue/threadblock/epilogue.h"
71
+ #include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
72
+
73
+ ////////////////////////////////////////////////////////////////////////////////
74
+
75
+ namespace cutlass {
76
+ namespace epilogue {
77
+ namespace threadblock {
78
+
79
+ ////////////////////////////////////////////////////////////////////////////////
80
+
81
+ /// Defines sensible defaults for epilogues for TensorOps.
82
+ template <
83
+ typename Shape_,
84
+ typename WarpMmaTensorOp_,
85
+ int PartitionsK,
86
+ typename OutputOp_,
87
+ int ElementsPerAccess,
88
+ /// Is for a symmetric kernel
89
+ BlasMode BlasMode_ = BlasMode::kGemm
90
+ >
91
+ struct DefaultEpilogueTensorOpBlas3 {
92
+
93
+ using Shape = Shape_;
94
+ using WarpMmaTensorOp = WarpMmaTensorOp_;
95
+ static int const kPartitionsK = PartitionsK;
96
+ using OutputOp = OutputOp_;
97
+ static int const kElementsPerAccess = ElementsPerAccess;
98
+ static BlasMode const kBlasMode = BlasMode_;
99
+
100
+ using ElementOutput = typename OutputOp::ElementOutput;
101
+ using LayoutC = typename WarpMmaTensorOp::LayoutC;
102
+ using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
103
+
104
+ //
105
+ // Thread map
106
+ //
107
+
108
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
109
+ Shape,
110
+ typename WarpMmaTensorOp::Shape,
111
+ kPartitionsK,
112
+ ElementOutput,
113
+ kElementsPerAccess
114
+ >::Type;
115
+
116
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
117
+ OutputTileThreadMap,
118
+ ElementOutput,
119
+ kBlasMode
120
+ >;
121
+
122
+ using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
123
+ cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
124
+ typename WarpMmaTensorOp::Shape,
125
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
126
+ typename WarpMmaTensorOp::Policy::Operator::ElementC,
127
+ typename WarpMmaTensorOp::Policy::Operator::FragmentC,
128
+ LayoutC>,
129
+ cutlass::epilogue::warp::FragmentIteratorTensorOp<
130
+ typename WarpMmaTensorOp::Shape,
131
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
132
+ typename WarpMmaTensorOp::Policy::Operator::ElementC,
133
+ typename WarpMmaTensorOp::Policy::Operator::FragmentC,
134
+ LayoutC> >::type;
135
+
136
+ /// Support several implementations depending on structure of epilogue
137
+ using DefaultIterators = detail::DefaultIteratorsTensorOp<
138
+ ElementOutput,
139
+ ElementAccumulator,
140
+ kElementsPerAccess,
141
+ Shape,
142
+ typename WarpMmaTensorOp::Shape,
143
+ typename WarpMmaTensorOp::Policy::Operator::Shape,
144
+ typename OutputTileThreadMap::CompactedThreadMap
145
+ >;
146
+
147
+ using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
148
+ using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
149
+
150
+ /// Hard-coded padding elements added
151
+ using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
152
+
153
+ //
154
+ // Define the epilogue
155
+ //
156
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
157
+ Shape,
158
+ WarpMmaTensorOp,
159
+ kPartitionsK,
160
+ OutputTileIterator,
161
+ AccumulatorFragmentIterator,
162
+ WarpTileIterator,
163
+ SharedLoadIterator,
164
+ OutputOp,
165
+ Padding
166
+ >;
167
+ };
168
+
169
+ ////////////////////////////////////////////////////////////////////////////////
170
+
171
+ } // namespace threadblock
172
+ } // namespace epilogue
173
+ } // namespace cutlass
174
+
175
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops on Volta.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+
45
+ #include "cutlass/gemm/gemm.h"
46
+
47
+ #include "cutlass/epilogue/thread/linear_combination.h"
48
+ #include "cutlass/epilogue/thread/linear_combination_clamp.h"
49
+ #include "cutlass/epilogue/thread/linear_combination_relu.h"
50
+ #include "cutlass/epilogue/thread/linear_combination_gelu.h"
51
+ #include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
52
+ #include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
53
+
54
+ #include "cutlass/epilogue/thread/conversion_op.h"
55
+ #include "cutlass/epilogue/thread/reduction_op.h"
56
+
57
+ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
58
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h"
59
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
60
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h"
61
+ #include "cutlass/epilogue/threadblock/shared_load_iterator.h"
62
+
63
+ #include "cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h"
64
+ #include "cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h"
65
+ #include "cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h"
66
+
67
+ #include "cutlass/epilogue/threadblock/epilogue.h"
68
+
69
+ #include "cutlass/layout/permute.h"
70
+
71
+ /////////////////////////////////////////////////////////////////////////////////////////////////
72
+
73
+ namespace cutlass {
74
+ namespace epilogue {
75
+ namespace threadblock {
76
+
77
+ /////////////////////////////////////////////////////////////////////////////////////////////////
78
+
79
+ /// Defines sensible defaults for epilogues for TensorOps.
80
+ template <
81
+ typename Shape_,
82
+ typename WarpMmaTensorOp_,
83
+ int PartitionsK,
84
+ typename OutputOp_,
85
+ int ElementsPerAccess,
86
+ bool ScatterD = false,
87
+ typename PermuteDLayout = layout::NoPermute
88
+ >
89
+ struct DefaultEpilogueVoltaTensorOp {
90
+
91
+ using Shape = Shape_;
92
+ using WarpMmaTensorOp = WarpMmaTensorOp_;
93
+ static int const kPartitionsK = PartitionsK;
94
+ using OutputOp = OutputOp_;
95
+ static int const kElementsPerAccess = ElementsPerAccess;
96
+
97
+ using ElementOutput = typename OutputOp::ElementOutput;
98
+ using LayoutC = typename WarpMmaTensorOp::LayoutC;
99
+ using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
100
+
101
+ //
102
+ // Thread map
103
+ //
104
+
105
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
106
+ Shape,
107
+ typename WarpMmaTensorOp::Shape,
108
+ kPartitionsK,
109
+ ElementOutput,
110
+ kElementsPerAccess,
111
+ ElementAccumulator
112
+ >::Type;
113
+
114
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
115
+ OutputTileThreadMap,
116
+ ElementOutput,
117
+ ScatterD,
118
+ PermuteDLayout
119
+ >;
120
+
121
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
122
+ typename WarpMmaTensorOp::Shape,
123
+ gemm::GemmShape<32, 32, 4>,
124
+ ElementAccumulator,
125
+ LayoutC
126
+ >;
127
+
128
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
129
+ typename WarpMmaTensorOp::Shape,
130
+ gemm::GemmShape<32, 32, 4>,
131
+ ElementAccumulator,
132
+ LayoutC
133
+ >;
134
+
135
+ static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
136
+
137
+ static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
138
+
139
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
140
+ typename OutputTileThreadMap::CompactedThreadMap,
141
+ ElementAccumulator,
142
+ kSharedMemAlignment
143
+ >;
144
+
145
+ /// Hard-coded padding elements added
146
+ using Padding = typename WarpTileIterator::Padding;
147
+
148
+ //
149
+ // Define the epilogue
150
+ //
151
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
152
+ Shape,
153
+ WarpMmaTensorOp,
154
+ kPartitionsK,
155
+ OutputTileIterator,
156
+ AccumulatorFragmentIterator,
157
+ WarpTileIterator,
158
+ SharedLoadIterator,
159
+ OutputOp,
160
+ Padding
161
+ >;
162
+ };
163
+
164
+ /////////////////////////////////////////////////////////////////////////////////////////////////
165
+
166
+ /// Defines sensible defaults for epilogues for TensorOps.
167
+ template <
168
+ typename Shape_,
169
+ typename WarpMmaTensorOp_,
170
+ int PartitionsK,
171
+ typename OutputOp_,
172
+ int ElementsPerAccess
173
+ >
174
+ struct DefaultEpilogueVoltaTensorOpStridedDgrad {
175
+
176
+ using Shape = Shape_;
177
+ using WarpMmaTensorOp = WarpMmaTensorOp_;
178
+ static int const kPartitionsK = PartitionsK;
179
+ using OutputOp = OutputOp_;
180
+ static int const kElementsPerAccess = ElementsPerAccess;
181
+
182
+ using ElementOutput = typename OutputOp::ElementOutput;
183
+ using LayoutC = typename WarpMmaTensorOp::LayoutC;
184
+ using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
185
+
186
+ //
187
+ // Thread map
188
+ //
189
+
190
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
191
+ Shape,
192
+ typename WarpMmaTensorOp::Shape,
193
+ kPartitionsK,
194
+ ElementOutput,
195
+ kElementsPerAccess,
196
+ ElementAccumulator
197
+ >::Type;
198
+
199
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
200
+ OutputTileThreadMap,
201
+ ElementOutput
202
+ >;
203
+
204
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
205
+ typename WarpMmaTensorOp::Shape,
206
+ gemm::GemmShape<32, 32, 4>,
207
+ ElementAccumulator,
208
+ LayoutC
209
+ >;
210
+
211
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
212
+ typename WarpMmaTensorOp::Shape,
213
+ gemm::GemmShape<32, 32, 4>,
214
+ ElementAccumulator,
215
+ LayoutC
216
+ >;
217
+
218
+ static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
219
+
220
+ static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
221
+
222
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
223
+ typename OutputTileThreadMap::CompactedThreadMap,
224
+ ElementAccumulator,
225
+ kSharedMemAlignment
226
+ >;
227
+
228
+ /// Hard-coded padding elements added
229
+ using Padding = typename WarpTileIterator::Padding;
230
+
231
+ //
232
+ // Define the epilogue
233
+ //
234
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
235
+ Shape,
236
+ WarpMmaTensorOp,
237
+ kPartitionsK,
238
+ OutputTileIterator,
239
+ AccumulatorFragmentIterator,
240
+ WarpTileIterator,
241
+ SharedLoadIterator,
242
+ OutputOp,
243
+ Padding
244
+ >;
245
+ };
246
+
247
+ /////////////////////////////////////////////////////////////////////////////////////////////////
248
+
249
+ /// Defines sensible defaults for epilogues for TensorOps.
250
+ template <
251
+ int Rank,
252
+ typename Shape_,
253
+ typename WarpMmaTensorOp_,
254
+ int PartitionsK,
255
+ typename OutputOp_,
256
+ int ElementsPerAccess
257
+ >
258
+ struct DefaultEpilogueVoltaTensorOpAffineRankN {
259
+
260
+ using Shape = Shape_;
261
+ using WarpMmaTensorOp = WarpMmaTensorOp_;
262
+ static int const kPartitionsK = PartitionsK;
263
+ using OutputOp = OutputOp_;
264
+ static int const kElementsPerAccess = ElementsPerAccess;
265
+
266
+ using ElementOutput = typename OutputOp::ElementOutput;
267
+ using LayoutC = typename WarpMmaTensorOp::LayoutC;
268
+ using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
269
+
270
+ //
271
+ // Thread map
272
+ //
273
+
274
+ using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapVoltaTensorOp<
275
+ Shape,
276
+ typename WarpMmaTensorOp::Shape,
277
+ kPartitionsK,
278
+ ElementOutput,
279
+ kElementsPerAccess,
280
+ ElementAccumulator
281
+ >::Type;
282
+
283
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorAffineRankN<
284
+ OutputTileThreadMap,
285
+ ElementOutput,
286
+ Rank
287
+ >;
288
+
289
+ using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
290
+ typename WarpMmaTensorOp::Shape,
291
+ gemm::GemmShape<32, 32, 4>,
292
+ ElementAccumulator,
293
+ LayoutC
294
+ >;
295
+
296
+ using WarpTileIterator = cutlass::epilogue::warp::TileIteratorVoltaTensorOp<
297
+ typename WarpMmaTensorOp::Shape,
298
+ gemm::GemmShape<32, 32, 4>,
299
+ ElementAccumulator,
300
+ LayoutC
301
+ >;
302
+
303
+ static int const kSharedMemAlignment = sizeof_bits<ElementAccumulator>::value * WarpTileIterator::kElementsPerAccess / 8;
304
+
305
+ static_assert(kSharedMemAlignment == 8, "Shared memory alignment must be 8B");
306
+
307
+ using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
308
+ typename OutputTileThreadMap::CompactedThreadMap,
309
+ ElementAccumulator,
310
+ kSharedMemAlignment
311
+ >;
312
+
313
+ /// Hard-coded padding elements added
314
+ using Padding = typename WarpTileIterator::Padding;
315
+
316
+ //
317
+ // Define the epilogue
318
+ //
319
+ using Epilogue = cutlass::epilogue::threadblock::Epilogue<
320
+ Shape,
321
+ WarpMmaTensorOp,
322
+ kPartitionsK,
323
+ OutputTileIterator,
324
+ AccumulatorFragmentIterator,
325
+ WarpTileIterator,
326
+ SharedLoadIterator,
327
+ OutputOp,
328
+ Padding
329
+ >;
330
+ };
331
+
332
+ /////////////////////////////////////////////////////////////////////////////////////////////////
333
+ } // namespace threadblock
334
+ } // namespace epilogue
335
+ } // namespace cutlass
336
+
337
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Default configuration for epilogue computing absolute maximum of output and auxiliary outputs.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/array.h"
41
+
42
+ #include "cutlass/gemm/gemm.h"
43
+
44
+ #include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
45
+ #include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
46
+ #include "cutlass/epilogue/threadblock/epilogue.h"
47
+ #include "cutlass/epilogue/threadblock/epilogue_with_absmax.h"
48
+
49
+ #include "cutlass/layout/permute.h"
50
+
51
+ ////////////////////////////////////////////////////////////////////////////////
52
+
53
+ namespace cutlass {
54
+ namespace epilogue {
55
+ namespace threadblock {
56
+
57
+ ////////////////////////////////////////////////////////////////////////////////
58
+
59
+ /// Defines sensible defaults for absolute-maximum-computing epilogues with TensorOps
60
+ template <
61
+ typename Shape,
62
+ typename WarpMmaTensorOp,
63
+ int PartitionsK,
64
+ typename ElementOutput,
65
+ typename ElementAuxOutput,
66
+ typename ElementVector,
67
+ typename OutputOp,
68
+ int ElementsPerAccess,
69
+ bool ScatterD = false,
70
+ typename PermuteDLayout = layout::NoPermute
71
+ >
72
+ struct DefaultEpilogueWithAbsMax {
73
+
74
+ /// Use defaults related to the existing epilogue
75
+ using Base = DefaultEpilogueTensorOp<
76
+ Shape,
77
+ WarpMmaTensorOp,
78
+ PartitionsK,
79
+ OutputOp,
80
+ ElementsPerAccess
81
+ >;
82
+
83
+ //
84
+ // Stores the output
85
+ //
86
+ using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
87
+ typename Base::OutputTileThreadMap,
88
+ ElementOutput,
89
+ ScatterD,
90
+ PermuteDLayout
91
+ >;
92
+
93
+ //
94
+ // Stores the auxiliary output
95
+ //
96
+ using AuxOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
97
+ typename Base::OutputTileThreadMap,
98
+ ElementAuxOutput,
99
+ ScatterD,
100
+ PermuteDLayout
101
+ >;
102
+
103
+ /// Define the epilogue
104
+ using Epilogue = EpilogueWithAbsMax<
105
+ Shape,
106
+ WarpMmaTensorOp,
107
+ PartitionsK,
108
+ OutputTileIterator,
109
+ AuxOutputTileIterator,
110
+ ElementVector,
111
+ typename Base::AccumulatorFragmentIterator,
112
+ typename Base::WarpTileIterator,
113
+ typename Base::SharedLoadIterator,
114
+ OutputOp,
115
+ typename Base::Padding,
116
+ Base::kFragmentsPerIteration
117
+ >;
118
+ };
119
+
120
+ ////////////////////////////////////////////////////////////////////////////////
121
+
122
+ } // namespace threadblock
123
+ } // namespace epilogue
124
+ } // namespace cutlass
125
+
126
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+
45
+ #include "cutlass/gemm/gemm.h"
46
+
47
+ #include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
48
+ #include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
49
+ #include "cutlass/epilogue/threadblock/epilogue.h"
50
+ #include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h"
51
+ #include "cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h"
52
+
53
+ #include "cutlass/layout/permute.h"
54
+
55
+ ////////////////////////////////////////////////////////////////////////////////
56
+
57
+ namespace cutlass {
58
+ namespace epilogue {
59
+ namespace threadblock {
60
+ ////////////////////////////////////////////////////////////////////////////////
61
+
62
+ /// Defines sensible defaults for epilogues for SimtOps.
63
+ template <
64
+ typename Shape,
65
+ typename WarpMmaSimt,
66
+ typename ElementOutput,
67
+ typename ElementTensor,
68
+ typename ElementVector,
69
+ typename OutputOp,
70
+ int ElementsPerAccess,
71
+ bool ScatterD = false,
72
+ typename PermuteDLayout = layout::NoPermute,
73
+ conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
74
+ int Rank = 4
75
+ >
76
+ struct DefaultEpilogueWithBroadcastSimt {
77
+
78
+ static conv::StrideSupport const kStrideSupport = StrideSupport;
79
+ static int const kRank = Rank;
80
+
81
+ static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
82
+
83
+ /// Use defaults related to the existing epilogue
84
+ using Base = DefaultEpilogueSimt<
85
+ Shape,
86
+ WarpMmaSimt,
87
+ OutputOp,
88
+ ElementsPerAccess
89
+ >;
90
+
91
+ using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
92
+ typename Base::OutputTileThreadMap,
93
+ ElementOutput,
94
+ ScatterD,
95
+ PermuteDLayout,
96
+ UseCUDAStore
97
+ >;
98
+
99
+ using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
100
+ typename Base::OutputTileThreadMap,
101
+ ElementOutput,
102
+ ScatterD,
103
+ PermuteDLayout,
104
+ UseCUDAStore,
105
+ kRank
106
+ >;
107
+
108
+ //
109
+ // Stores the result z = (y = GEMM(A, B, C), broadcast)
110
+ //
111
+ using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
112
+ PackedOutputTileIterator,
113
+ StridedOutputTileIterator>::type;
114
+
115
+ //
116
+ // Additional tensor tile iterator - stores t = Elementwise(z)
117
+ //
118
+ using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
119
+ typename Base::OutputTileThreadMap,
120
+ ElementTensor
121
+ >;
122
+ /// Define the epilogue
123
+ using Epilogue = EpilogueWithBroadcast<
124
+ Shape,
125
+ WarpMmaSimt,
126
+ Base::kPartitionsK,
127
+ OutputTileIterator,
128
+ TensorTileIterator,
129
+ ElementVector,
130
+ typename Base::AccumulatorFragmentIterator,
131
+ typename Base::WarpTileIterator,
132
+ typename Base::SharedLoadIterator,
133
+ OutputOp,
134
+ typename Base::Padding
135
+ >;
136
+ };
137
+ ////////////////////////////////////////////////////////////////////////////////
138
+
139
/// Defines sensible defaults for strided dgrad epilogues for SimtOps.
///
/// Metafunction only: composes the existing strided-dgrad SIMT epilogue
/// defaults with strided-dgrad tile iterators and exposes the result as
/// `Epilogue`. The composed epilogue writes a primary output z and an
/// additional elementwise tensor t (see alias comments below).
template <
  typename Shape,          ///< Threadblock-scoped tile shape
  typename WarpMmaSimt,    ///< Warp-level SIMT MMA operator
  typename ElementOutput,  ///< Element type of the primary output z
  typename ElementTensor,  ///< Element type of the auxiliary tensor t
  typename ElementVector,  ///< Element type of the broadcast vector
  typename OutputOp,       ///< Thread-level output operator
  int ElementsPerAccess,   ///< Vector width of global memory accesses
  bool ScatterD = false,   ///< NOTE(review): kept for interface parity; not forwarded to the iterators below
  typename PermuteDLayout = layout::NoPermute  ///< NOTE(review): likewise not forwarded here
>
struct DefaultEpilogueWithBroadcastSimtStridedDgrad {

  /// Use defaults related to the existing epilogue
  using Base = DefaultEpilogueSimtStridedDgrad<
    Shape,
    WarpMmaSimt,
    OutputOp,
    ElementsPerAccess
  >;

  //
  // Stores the result z = (y = GEMM(A, B, C), broadcast)
  //
  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
    typename Base::OutputTileThreadMap,
    ElementOutput
  >;

  //
  // Additional tensor tile iterator - stores t = Elementwise(z)
  //
  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
    typename Base::OutputTileThreadMap,
    ElementTensor
  >;

  /// Define the epilogue: reuses the fragment/warp/shared-memory iterators
  /// from Base and adds the broadcast vector element type.
  using Epilogue = EpilogueWithBroadcast<
    Shape,
    WarpMmaSimt,
    Base::kPartitionsK,
    OutputTileIterator,
    TensorTileIterator,
    ElementVector,
    typename Base::AccumulatorFragmentIterator,
    typename Base::WarpTileIterator,
    typename Base::SharedLoadIterator,
    OutputOp,
    typename Base::Padding
  >;
};
192
+ ////////////////////////////////////////////////////////////////////////////////
193
+
194
/// Defines sensible defaults for epilogues for TensorOps.
///
/// Metafunction only: composes the tensor-op epilogue defaults with predicated
/// tile iterators (optionally scattered/permuted) and exposes the result as
/// `Epilogue`, which writes a primary output z and an auxiliary tensor t.
template <
  typename Shape,            ///< Threadblock-scoped tile shape
  typename WarpMmaTensorOp,  ///< Warp-level tensor-op MMA operator
  int PartitionsK,           ///< Number of partitions of the K dimension
  typename ElementOutput,    ///< Element type of the primary output z
  typename ElementTensor,    ///< Element type of the auxiliary tensor t
  typename ElementVector,    ///< Element type of the broadcast vector
  typename OutputOp,         ///< Thread-level output operator
  int ElementsPerAccess,     ///< Vector width of global memory accesses
  bool ScatterD = false,     ///< If true, D is written through a scatter index
  typename PermuteDLayout = layout::NoPermute  ///< Optional permutation applied to D
>
struct DefaultEpilogueWithBroadcastTensorOp {

  /// Use defaults related to the existing epilogue
  using Base = DefaultEpilogueTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    ElementsPerAccess
  >;

  //
  // Stores the result z = (y = GEMM(A, B, C), broadcast)
  //
  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    ElementOutput,
    ScatterD,
    PermuteDLayout
  >;

  //
  // Additional tensor tile iterator - stores t = Elementwise(z)
  //
  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    ElementTensor
  >;

  /// Define the epilogue; forwards Base::kFragmentsPerIteration so the
  /// double-buffering behavior of the underlying default is preserved.
  using Epilogue = EpilogueWithBroadcast<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputTileIterator,
    TensorTileIterator,
    ElementVector,
    typename Base::AccumulatorFragmentIterator,
    typename Base::WarpTileIterator,
    typename Base::SharedLoadIterator,
    OutputOp,
    typename Base::Padding,
    Base::kFragmentsPerIteration
  >;
};
252
+
253
+ ////////////////////////////////////////////////////////////////////////////////
254
+
255
/// Defines sensible defaults for streamk epilogues for TensorOps.
///
/// Identical in composition to DefaultEpilogueWithBroadcastTensorOp except
/// that the final alias instantiates EpilogueStreamkWithBroadcast, the
/// stream-K variant of the broadcast epilogue.
template <
  typename Shape,            ///< Threadblock-scoped tile shape
  typename WarpMmaTensorOp,  ///< Warp-level tensor-op MMA operator
  int PartitionsK,           ///< Number of partitions of the K dimension
  typename ElementOutput,    ///< Element type of the primary output z
  typename ElementTensor,    ///< Element type of the auxiliary tensor t
  typename ElementVector,    ///< Element type of the broadcast vector
  typename OutputOp,         ///< Thread-level output operator
  int ElementsPerAccess,     ///< Vector width of global memory accesses
  bool ScatterD = false,     ///< If true, D is written through a scatter index
  typename PermuteDLayout = layout::NoPermute  ///< Optional permutation applied to D
>
struct DefaultStreamkEpilogueWithBroadcastTensorOp {

  /// Use defaults related to the existing epilogue
  using Base = DefaultEpilogueTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    ElementsPerAccess
  >;

  //
  // Stores the result z = (y = GEMM(A, B, C), broadcast)
  //
  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    ElementOutput,
    ScatterD,
    PermuteDLayout
  >;

  //
  // Additional tensor tile iterator - stores t = Elementwise(z)
  //
  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    ElementTensor
  >;

  /// Define the epilogue (stream-K variant)
  using Epilogue = EpilogueStreamkWithBroadcast<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputTileIterator,
    TensorTileIterator,
    ElementVector,
    typename Base::AccumulatorFragmentIterator,
    typename Base::WarpTileIterator,
    typename Base::SharedLoadIterator,
    OutputOp,
    typename Base::Padding,
    Base::kFragmentsPerIteration
  >;
};
313
+
314
+ ////////////////////////////////////////////////////////////////////////////////
315
+
316
/// Defines sensible defaults for epilogues for VoltaTensorOps.
///
/// Metafunction only: mirrors DefaultEpilogueWithBroadcastTensorOp but builds
/// on the Volta tensor-op defaults; note it exposes no ScatterD/PermuteDLayout
/// parameters and no kFragmentsPerIteration forwarding.
template <
  typename Shape,            ///< Threadblock-scoped tile shape
  typename WarpMmaTensorOp,  ///< Warp-level Volta tensor-op MMA operator
  int PartitionsK,           ///< Number of partitions of the K dimension
  typename ElementOutput,    ///< Element type of the primary output z
  typename ElementTensor,    ///< Element type of the auxiliary tensor t
  typename ElementVector,    ///< Element type of the broadcast vector
  typename OutputOp,         ///< Thread-level output operator
  int ElementsPerAccess      ///< Vector width of global memory accesses
>
struct DefaultEpilogueWithBroadcastVoltaTensorOp {

  /// Use defaults related to the existing epilogue
  using Base = DefaultEpilogueVoltaTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    ElementsPerAccess
  >;

  //
  // Stores the result z = (y = GEMM(A, B, C), broadcast)
  //
  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    ElementOutput
  >;

  //
  // Additional tensor tile iterator - stores t = Elementwise(z)
  //
  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    ElementTensor
  >;

  /// Define the epilogue
  using Epilogue = EpilogueWithBroadcast<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputTileIterator,
    TensorTileIterator,
    ElementVector,
    typename Base::AccumulatorFragmentIterator,
    typename Base::WarpTileIterator,
    typename Base::SharedLoadIterator,
    OutputOp,
    typename Base::Padding
  >;
};
369
+
370
+ ////////////////////////////////////////////////////////////////////////////////
371
+
372
+ } // namespace threadblock
373
+ } // namespace epilogue
374
+ } // namespace cutlass
375
+
376
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+
33
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
34
+
35
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
36
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
37
+
38
+ */
39
+
40
+ #pragma once
41
+
42
+ #include "cutlass/cutlass.h"
43
+ #include "cutlass/numeric_types.h"
44
+ #include "cutlass/array.h"
45
+
46
+ #include "cutlass/gemm/gemm.h"
47
+
48
+ #include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
49
+ #include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
50
+ #include "cutlass/epilogue/threadblock/epilogue.h"
51
+ #include "cutlass/epilogue/threadblock/epilogue_with_reduction.h"
52
+
53
+ #include "cutlass/layout/permute.h"
54
+
55
+ ////////////////////////////////////////////////////////////////////////////////
56
+
57
+ namespace cutlass {
58
+ namespace epilogue {
59
+ namespace threadblock {
60
+
61
+ ////////////////////////////////////////////////////////////////////////////////
62
+
63
/// Defines sensible defaults for epilogues for TensorOps.
///
/// Metafunction only: composes the tensor-op epilogue defaults with an extra
/// tensor iterator (element type taken from OutputOp::ElementTensor) and a
/// caller-supplied ReductionOp, exposing the result as EpilogueWithReduction.
template <
  typename Shape,            ///< Threadblock-scoped tile shape
  typename WarpMmaTensorOp,  ///< Warp-level tensor-op MMA operator
  int PartitionsK,           ///< Number of partitions of the K dimension
  typename ElementOutput,    ///< Element type of the primary output D
  typename OutputOp,         ///< Thread-level output operator (supplies ElementTensor)
  typename ReductionOp,      ///< Reduction operator applied by the epilogue
  int ElementsPerAccess,     ///< Vector width of global memory accesses
  bool ScatterD = false,     ///< If true, D is written through a scatter index
  typename PermuteDLayout = layout::NoPermute  ///< Optional permutation applied to D
>
struct DefaultEpilogueWithReductionTensorOp {

  /// Use defaults related to the existing epilogue
  using Base = DefaultEpilogueTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    ElementsPerAccess
  >;

  /// Additional tensor tile iterator
  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    typename OutputOp::ElementTensor
  >;

  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    ElementOutput,
    ScatterD,
    PermuteDLayout
  >;

  /// Define the epilogue. The reduction element type is the warp MMA's
  /// accumulator element (WarpMmaTensorOp::ElementC); the output op is taken
  /// from Base rather than the template parameter directly.
  using Epilogue = EpilogueWithReduction<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputTileIterator,
    TensorTileIterator,
    typename WarpMmaTensorOp::ElementC,
    typename Base::AccumulatorFragmentIterator,
    typename Base::WarpTileIterator,
    typename Base::SharedLoadIterator,
    typename Base::OutputOp,
    ReductionOp,
    typename Base::Padding
  >;
};
115
+
116
+ ////////////////////////////////////////////////////////////////////////////////
117
+
118
/// Defines sensible defaults for epilogues for VoltaTensorOps.
/// (Comment corrected from "TensorOps": this is the Volta variant, built on
/// DefaultEpilogueVoltaTensorOp; otherwise identical in composition to
/// DefaultEpilogueWithReductionTensorOp.)
template <
  typename Shape,            ///< Threadblock-scoped tile shape
  typename WarpMmaTensorOp,  ///< Warp-level Volta tensor-op MMA operator
  int PartitionsK,           ///< Number of partitions of the K dimension
  typename ElementOutput,    ///< Element type of the primary output D
  typename OutputOp,         ///< Thread-level output operator (supplies ElementTensor)
  typename ReductionOp,      ///< Reduction operator applied by the epilogue
  int ElementsPerAccess,     ///< Vector width of global memory accesses
  bool ScatterD = false,     ///< If true, D is written through a scatter index
  typename PermuteDLayout = layout::NoPermute  ///< Optional permutation applied to D
>
struct DefaultEpilogueWithReductionVoltaTensorOp {

  /// Use defaults related to the existing epilogue
  using Base = DefaultEpilogueVoltaTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    ElementsPerAccess
  >;

  /// Additional tensor tile iterator
  using TensorTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    typename OutputOp::ElementTensor
  >;

  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    typename Base::OutputTileThreadMap,
    ElementOutput,
    ScatterD,
    PermuteDLayout
  >;

  /// Define the epilogue (reduction element type is the accumulator element)
  using Epilogue = EpilogueWithReduction<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputTileIterator,
    TensorTileIterator,
    typename WarpMmaTensorOp::ElementC,
    typename Base::AccumulatorFragmentIterator,
    typename Base::WarpTileIterator,
    typename Base::SharedLoadIterator,
    typename Base::OutputOp,
    ReductionOp,
    typename Base::Padding
  >;
};
170
+
171
+ ////////////////////////////////////////////////////////////////////////////////
172
+
173
+ } // namespace threadblock
174
+ } // namespace epilogue
175
+ } // namespace cutlass
176
+
177
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using WMMA.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+
45
+ #include "cutlass/gemm/gemm.h"
46
+
47
+ #include "cutlass/epilogue/thread/linear_combination.h"
48
+ #include "cutlass/epilogue/thread/linear_combination_clamp.h"
49
+ #include "cutlass/epilogue/thread/linear_combination_relu.h"
50
+ #include "cutlass/epilogue/thread/linear_combination_gelu.h"
51
+ #include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
52
+ #include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
53
+
54
+ #include "cutlass/epilogue/thread/conversion_op.h"
55
+ #include "cutlass/epilogue/thread/reduction_op.h"
56
+
57
+ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
58
+
59
+ #include "cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h"
60
+ #include "cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h"
61
+ #include "cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h"
62
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
63
+ #include "cutlass/epilogue/threadblock/shared_load_iterator.h"
64
+
65
+ #include "cutlass/epilogue/threadblock/epilogue.h"
66
+
67
+ #include "cutlass/layout/permute.h"
68
+
69
+ ////////////////////////////////////////////////////////////////////////////////
70
+
71
+ namespace cutlass {
72
+ namespace epilogue {
73
+ namespace threadblock {
74
+
75
+ ////////////////////////////////////////////////////////////////////////////////
76
+
77
/// Defines sensible defaults for epilogues for WMMA TensorOps.
///
/// Unlike the other defaults in this family, this metafunction assembles the
/// full component stack itself (thread map, fragment/warp/shared-load
/// iterators) from the WMMA-specific helpers rather than delegating to a Base.
template <
  typename Shape_,            ///< Threadblock-scoped tile shape
  typename WarpMmaTensorOp_,  ///< Warp-level WMMA MMA operator
  int PartitionsK,            ///< Number of partitions of the K dimension
  typename OutputOp_,         ///< Thread-level output operator (supplies ElementOutput)
  int ElementsPerAccess,      ///< Vector width of global memory accesses
  bool ScatterD = false,      ///< If true, D is written through a scatter index
  typename PermuteDLayout = layout::NoPermute  ///< Optional permutation applied to D
>
struct DefaultEpilogueWmmaTensorOp {

  using Shape = Shape_;
  using WarpMmaTensorOp = WarpMmaTensorOp_;
  static int const kPartitionsK = PartitionsK;
  using OutputOp = OutputOp_;
  static int const kElementsPerAccess = ElementsPerAccess;

  /// Output element comes from the output op; layout and accumulator element
  /// come from the warp-level operator.
  using ElementOutput = typename OutputOp::ElementOutput;
  using LayoutC = typename WarpMmaTensorOp::LayoutC;
  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;

  //
  // Thread map
  //

  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapWmmaTensorOp<
    Shape,
    typename WarpMmaTensorOp::Shape,
    typename WarpMmaTensorOp::Policy::Operator::Shape,
    kPartitionsK,
    ElementOutput,
    kElementsPerAccess
  >::Type;

  /// Iterator that writes the output tile to global memory
  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
    OutputTileThreadMap,
    ElementOutput,
    ScatterD,
    PermuteDLayout
  >;

  /// Iterates over the warp-level accumulator fragment held in registers
  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorWmmaTensorOp<
    typename WarpMmaTensorOp::Shape,
    typename WarpMmaTensorOp::Policy::Operator::Shape,
    typename WarpMmaTensorOp::Policy::Operator::ElementC,
    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
    LayoutC
  >;

  /// Warp-scoped iterator used to stage accumulators through shared memory
  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorWmmaTensorOp<
    typename WarpMmaTensorOp::Shape,
    typename WarpMmaTensorOp::Policy::Operator::Shape,
    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
    LayoutC
  >;

  /// Loads staged accumulators back from shared memory (compacted thread map)
  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
    typename OutputTileThreadMap::CompactedThreadMap,
    ElementAccumulator
  >;

  /// Hard-coded padding elements added
  using Padding = typename WarpTileIterator::Padding;

  //
  // Define the epilogue
  //
  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
    Shape,
    WarpMmaTensorOp,
    kPartitionsK,
    OutputTileIterator,
    AccumulatorFragmentIterator,
    WarpTileIterator,
    SharedLoadIterator,
    OutputOp,
    Padding
  >;
};
157
+
158
+
159
+ ////////////////////////////////////////////////////////////////////////////////
160
+
161
+ } // namespace threadblock
162
+ } // namespace epilogue
163
+ } // namespace cutlass
164
+
165
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief
33
+
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
39
+ #include "cutlass/gemm/gemm.h"
40
+
41
+ /////////////////////////////////////////////////////////////////////////////////////////////////
42
+
43
+ namespace cutlass {
44
+ namespace epilogue {
45
+ namespace threadblock {
46
+
47
+ /////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
/// Defines the optimal thread map for SIMT accumulator layouts
///
/// Metafunction: derives warp/thread counts from the threadblock and warp
/// shapes plus the SIMT MMA policy, and exposes the resulting
/// OutputTileOptimalThreadMap as `Type`.
template <
  typename ThreadblockShape_,  ///< Threadblock-scoped tile shape
  typename WarpShape_,         ///< Warp-scoped tile shape
  typename MmaSimtPolicy_,     ///< SIMT MMA policy (warp arrangement and lane MMA shape)
  int PartitionsK,             ///< Number of partitions of the K dimension
  typename Element_,           ///< Element type being written
  int ElementsPerAccess        ///< Vector width of global memory accesses
>
struct DefaultThreadMapSimt {

  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  using MmaSimtPolicy = MmaSimtPolicy_;
  static int const kPartitionsK = PartitionsK;
  using Element = Element_;
  static int const kElementsPerAccess = ElementsPerAccess;

  //
  // Definitions
  //

  /// Internal compile-time quantities used to build the thread map
  struct Detail {

    static int const kWarpSize = 32;

    // Threadblock extents must be whole multiples of the warp tile
    static_assert(
      !(ThreadblockShape::kM % WarpShape::kM) &&
      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");

    /// Number of warps
    using WarpCount = gemm::GemmShape<
      ThreadblockShape::kM / WarpShape::kM,
      ThreadblockShape::kN / WarpShape::kN,
      kPartitionsK
    >;

    /// Computes number of thread-level matrix multiplies are needed to span a warp
    static int const kGroupCount =
      WarpShape::kM / (MmaSimtPolicy::WarpShape::kRow * MmaSimtPolicy::LaneMmaShape::kM);

    /// Number of participating threads
    static int const kThreads = WarpCount::kCount * kWarpSize;

    /// Number of iterations (rows per lane MMA times groups per warp)
    static int const kIterations = MmaSimtPolicy::LaneMmaShape::kM * kGroupCount;
  };

  //
  // ThreadMap
  //

  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
  using Type = OutputTileOptimalThreadMap<
    OutputTileShape<    // Shape
      ThreadblockShape::kN,
      1,
      MmaSimtPolicy::WarpShape::kRow,
      Detail::WarpCount::kM,
      1>,
    OutputTileShape<    // Count
      1,
      MmaSimtPolicy::LaneMmaShape::kM,
      Detail::kGroupCount,
      1,
      Detail::kIterations>,
    Detail::kThreads,
    kElementsPerAccess,
    sizeof_bits<Element>::value
  >;
};
120
+
121
+ /////////////////////////////////////////////////////////////////////////////////////////////////
122
+
123
+ } // namespace threadblock
124
+ } // namespace epilogue
125
+ } // namespace cutlass
126
+
127
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief
33
+
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
39
+ #include "cutlass/gemm/gemm.h"
40
+ #include "cutlass/layout/pitch_linear.h"
41
+
42
+ ////////////////////////////////////////////////////////////////////////////////
43
+
44
+ namespace cutlass {
45
+ namespace epilogue {
46
+ namespace threadblock {
47
+
48
+ ////////////////////////////////////////////////////////////////////////////////
49
+
50
/// Defines the optimal thread map for TensorOp accumulator layouts
///
/// Metafunction: derives warp and thread counts from the threadblock/warp
/// shapes and exposes the resulting OutputTileOptimalThreadMap as `Type`.
template <
  typename ThreadblockShape_,  ///< Threadblock-scoped tile shape
  typename WarpShape_,         ///< Warp-scoped tile shape
  int PartitionsK,             ///< Number of partitions of the K dimension
  typename Element_,           ///< Element type being written
  int ElementsPerAccess        ///< Vector width of global memory accesses
>
struct DefaultThreadMapTensorOp {

  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  static int const kPartitionsK = PartitionsK;
  using Element = Element_;
  static int const kElementsPerAccess = ElementsPerAccess;

  //
  // Definitions
  //

  /// Internal compile-time quantities used to build the thread map
  struct Detail {

    /// Tensor Operations fundamentally perform operations on 8 rows
    static int const kTensorOpRows = 8;
    static int const kWarpSize = 32;

    // Threadblock extents must be whole multiples of the warp tile
    static_assert(
      !(ThreadblockShape::kM % WarpShape::kM) &&
      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");

    /// Number of warps
    using WarpCount = gemm::GemmShape<
      ThreadblockShape::kM / WarpShape::kM,
      ThreadblockShape::kN / WarpShape::kN,
      kPartitionsK
    >;

    /// Number of participating threads
    static int const kThreads = WarpCount::kCount * kWarpSize;
  };

  //
  // ThreadMap
  //

  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
  using Type = OutputTileOptimalThreadMap <
    OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
    OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
    Detail::kThreads,
    kElementsPerAccess,
    sizeof_bits<Element>::value
  >;
};
104
+
105
+ ////////////////////////////////////////////////////////////////////////////////
106
+
107
/// Defines the optimal thread map for TensorOp accumulator layouts
/// (interleaved variant): produces an InterleavedOutputTileThreadMap for
/// outputs stored with an InterleavedK column interleave.
template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK,
          typename Element_, int ElementsPerAccess, int InterleavedK>
struct DefaultInterleavedThreadMapTensorOp {
  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  static int const kPartitionsK = PartitionsK;
  using Element = Element_;
  static int const kElementsPerAccess = ElementsPerAccess;
  static int const kInterleavedK = InterleavedK;  // interleave factor of the output layout

  //
  // Definitions
  //

  /// Internal compile-time quantities used to build the thread map
  struct Detail {
    /// Tensor Operations fundamentally perform operations on 8 rows
    static int const kTensorOpRows = 8;
    static int const kWarpSize = 32;

    // Threadblock extents must be whole multiples of the warp tile
    static_assert(!(ThreadblockShape::kM % WarpShape::kM) &&
                      !(ThreadblockShape::kN % WarpShape::kN),
                  "Divisibility");

    /// Number of warps
    using WarpCount =
        gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
                        ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;

    /// Number of participating threads
    static int const kThreads = WarpCount::kCount * kWarpSize;
  };

  //
  // ThreadMap
  //

  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept
  /// InterleavedOutputTileThreadMap
  using Type = InterleavedOutputTileThreadMap<
      layout::PitchLinearShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
      layout::PitchLinearShape<WarpShape::kM / Detail::kTensorOpRows,
                               WarpShape::kN / InterleavedK>,
      Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
};
152
+
153
+
154
+ ////////////////////////////////////////////////////////////////////////////////
155
+
156
/// Defines the optimal thread map for TensorOp accumulator layouts
/// (interleaved convolution variant): same derivation as
/// DefaultInterleavedThreadMapTensorOp but yields an
/// InterleavedConvOutputTileThreadMap parameterized by MatrixShape.
template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK,
          typename Element_, int ElementsPerAccess, int InterleavedK>
struct DefaultInterleavedConvThreadMapTensorOp {
  using ThreadblockShape = ThreadblockShape_;
  using WarpShape = WarpShape_;
  static int const kPartitionsK = PartitionsK;
  using Element = Element_;
  static int const kElementsPerAccess = ElementsPerAccess;
  static int const kInterleavedK = InterleavedK;  // interleave factor of the output layout

  //
  // Definitions
  //

  /// Internal compile-time quantities used to build the thread map
  struct Detail {
    /// Tensor Operations fundamentally perform operations on 8 rows
    static int const kTensorOpRows = 8;
    static int const kWarpSize = 32;

    // Threadblock extents must be whole multiples of the warp tile
    static_assert(!(ThreadblockShape::kM % WarpShape::kM) &&
                      !(ThreadblockShape::kN % WarpShape::kN),
                  "Divisibility");

    /// Number of warps
    using WarpCount =
        gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
                        ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;

    /// Number of participating threads
    static int const kThreads = WarpCount::kCount * kWarpSize;
  };

  //
  // ThreadMap
  //

  /// ThreadMap to be used by epilogue::MaskedTileIterator satisfying concept
  /// InterleavedOutputTileThreadMap
  using Type = InterleavedConvOutputTileThreadMap<
      MatrixShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
      MatrixShape<WarpShape::kM / Detail::kTensorOpRows,
                  WarpShape::kN / InterleavedK>,
      Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
};
201
+
202
+ ////////////////////////////////////////////////////////////////////////////////
203
+
204
+ } // namespace threadblock
205
+ } // namespace epilogue
206
+ } // namespace cutlass
207
+
208
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief
33
+
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
39
+ #include "cutlass/gemm/gemm.h"
40
+
41
+ /////////////////////////////////////////////////////////////////////////////////////////////////
42
+
43
+ namespace cutlass {
44
+ namespace epilogue {
45
+ namespace threadblock {
46
+
47
+ /////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
+ /// Defines the optimal thread map for TensorOp accumulator layouts
50
/// Defines the optimal thread map for Volta TensorOp accumulator layouts.
/// Partially specialized on the accumulator element type (half_t / float).
template <
  typename ThreadblockShape,    ///< Threadblock-scoped tile shape
  typename WarpShape,           ///< Warp-scoped tile shape
  int PartitionsK,              ///< Number of partitions along the K dimension
  typename ElementOutput,       ///< Output element data type
  int ElementsPerAccess,        ///< Number of elements per vectorized access
  typename ElementAccumulator   ///< Accumulator element data type
>
struct DefaultThreadMapVoltaTensorOp;
59
+
60
+ /////////////////////////////////////////////////////////////////////////////////////////////////
61
+
62
+ /// Defines the optimal thread map for TensorOp accumulator layouts
63
+ template <
64
+ typename ThreadblockShape_,
65
+ typename WarpShape_,
66
+ int PartitionsK,
67
+ typename ElementOutput_,
68
+ int ElementsPerAccess
69
+ >
70
+ struct DefaultThreadMapVoltaTensorOp<
71
+ ThreadblockShape_,
72
+ WarpShape_,
73
+ PartitionsK,
74
+ ElementOutput_,
75
+ ElementsPerAccess,
76
+ half_t> {
77
+
78
+ using ThreadblockShape = ThreadblockShape_;
79
+ using WarpShape = WarpShape_;
80
+ static int const kPartitionsK = PartitionsK;
81
+ using ElementOutput = ElementOutput_;
82
+ static int const kElementsPerAccess = ElementsPerAccess;
83
+ using ElementAccumulator = half_t;
84
+
85
+ //
86
+ // Definitions
87
+ //
88
+
89
+ struct Detail {
90
+
91
+ static int const kTensorOpRows = 16;
92
+ static int const kWarpSize = 32;
93
+ static int const kInterleavedTilesM = WarpShape::kM / 32;
94
+
95
+ static_assert(
96
+ !(ThreadblockShape::kM % WarpShape::kM) &&
97
+ !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
98
+
99
+ /// Number of warps
100
+ using WarpCount = gemm::GemmShape<
101
+ ThreadblockShape::kM / WarpShape::kM,
102
+ ThreadblockShape::kN / WarpShape::kN,
103
+ kPartitionsK
104
+ >;
105
+
106
+ /// Number of participating threads
107
+ static int const kThreads = WarpCount::kCount * kWarpSize;
108
+
109
+ using Shape = cutlass::epilogue::threadblock::OutputTileShape<
110
+ ThreadblockShape::kN, // column
111
+ 4, // row
112
+ 4, // group
113
+ WarpCount::kM, // cluster
114
+ 1 // tile
115
+ >;
116
+
117
+ /// Number of iterations per subspace
118
+ using Count = cutlass::epilogue::threadblock::OutputTileShape<
119
+ 1, // column
120
+ 2, // row
121
+ kInterleavedTilesM, // group
122
+ 1, // cluster
123
+ WarpShape::kM / kTensorOpRows // iterations
124
+ >;
125
+ };
126
+
127
+ //
128
+ // ThreadMap
129
+ //
130
+
131
+ /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
132
+ using Type = OutputTileOptimalThreadMap <
133
+ typename Detail::Shape,
134
+ typename Detail::Count,
135
+ Detail::kThreads,
136
+ kElementsPerAccess,
137
+ sizeof_bits<ElementOutput>::value
138
+ >;
139
+ };
140
+
141
+ /////////////////////////////////////////////////////////////////////////////////////////////////
142
+
143
+ /// Defines the optimal thread map for TensorOp accumulator layouts
144
+ template <
145
+ typename ThreadblockShape_,
146
+ typename WarpShape_,
147
+ int PartitionsK,
148
+ typename ElementOutput_,
149
+ int ElementsPerAccess
150
+ >
151
+ struct DefaultThreadMapVoltaTensorOp<
152
+ ThreadblockShape_,
153
+ WarpShape_,
154
+ PartitionsK,
155
+ ElementOutput_,
156
+ ElementsPerAccess,
157
+ float> {
158
+
159
+ using ThreadblockShape = ThreadblockShape_;
160
+ using WarpShape = WarpShape_;
161
+ static int const kPartitionsK = PartitionsK;
162
+ using ElementOutput = ElementOutput_;
163
+ static int const kElementsPerAccess = ElementsPerAccess;
164
+ using ElementAccumulator = float;
165
+
166
+ //
167
+ // Definitions
168
+ //
169
+
170
+ struct Detail {
171
+
172
+ static int const kTensorOpRows = 16;
173
+ static int const kWarpSize = 32;
174
+ static int const kInterleavedTilesM = WarpShape::kM / 32;
175
+
176
+ static_assert(
177
+ !(ThreadblockShape::kM % WarpShape::kM) &&
178
+ !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
179
+
180
+ /// Number of warps
181
+ using WarpCount = gemm::GemmShape<
182
+ ThreadblockShape::kM / WarpShape::kM,
183
+ ThreadblockShape::kN / WarpShape::kN,
184
+ kPartitionsK
185
+ >;
186
+
187
+ /// Number of participating threads
188
+ static int const kThreads = WarpCount::kCount * kWarpSize;
189
+
190
+ using Shape = cutlass::epilogue::threadblock::OutputTileShape<
191
+ ThreadblockShape::kN, // column
192
+ 4, // row
193
+ 4, // group
194
+ WarpCount::kM, // cluster
195
+ 1 // tile
196
+ >;
197
+
198
+ /// Number of iterations per subspace
199
+ using Count = cutlass::epilogue::threadblock::OutputTileShape<
200
+ 1, // column
201
+ 2, // row
202
+ kInterleavedTilesM, // group
203
+ 1, // cluster
204
+ WarpShape::kM / kTensorOpRows // iterations
205
+ >;
206
+ };
207
+
208
+ //
209
+ // ThreadMap
210
+ //
211
+
212
+ /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
213
+ using Type = OutputTileOptimalThreadMap <
214
+ typename Detail::Shape,
215
+ typename Detail::Count,
216
+ Detail::kThreads,
217
+ kElementsPerAccess,
218
+ sizeof_bits<ElementOutput>::value
219
+ >;
220
+ };
221
+
222
+ /////////////////////////////////////////////////////////////////////////////////////////////////
223
+
224
+ } // namespace threadblock
225
+ } // namespace epilogue
226
+ } // namespace cutlass
227
+
228
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief
33
+
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
39
+ #include "cutlass/gemm/gemm.h"
40
+ #include "cutlass/layout/pitch_linear.h"
41
+
42
+ ////////////////////////////////////////////////////////////////////////////////
43
+
44
+ namespace cutlass {
45
+ namespace epilogue {
46
+ namespace threadblock {
47
+
48
+ ////////////////////////////////////////////////////////////////////////////////
49
+
50
+ /// Defines the optimal thread map for Wmma TensorOp accumulator layouts
51
+ template <
52
+ typename ThreadblockShape_,
53
+ typename WarpShape_,
54
+ typename InstructionShape_,
55
+ int PartitionsK,
56
+ typename Element_,
57
+ int ElementsPerAccess
58
+ >
59
+ struct DefaultThreadMapWmmaTensorOp {
60
+
61
+ using ThreadblockShape = ThreadblockShape_;
62
+ using WarpShape = WarpShape_;
63
+ using InstructionShape = InstructionShape_;
64
+ static int const kPartitionsK = PartitionsK;
65
+ using Element = Element_;
66
+ static int const kElementsPerAccess = ElementsPerAccess;
67
+
68
+ //
69
+ // Definitions
70
+ //
71
+
72
+ struct Detail {
73
+
74
+ /// Wmma Tensor Operations fundamentally perform operations on InstructionShape::kM rows
75
+ static int const kTensorOpRows = InstructionShape::kM;
76
+ static int const kWarpSize = 32;
77
+
78
+ static_assert(
79
+ !(ThreadblockShape::kM % WarpShape::kM) &&
80
+ !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
81
+
82
+ /// Number of warps
83
+ using WarpCount = gemm::GemmShape<
84
+ ThreadblockShape::kM / WarpShape::kM,
85
+ ThreadblockShape::kN / WarpShape::kN,
86
+ kPartitionsK
87
+ >;
88
+
89
+ /// Number of participating threads
90
+ static int const kThreads = WarpCount::kCount * kWarpSize;
91
+ };
92
+
93
+ //
94
+ // ThreadMap
95
+ //
96
+
97
+ /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
98
+ using Type = OutputTileOptimalThreadMap <
99
+ OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
100
+ OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
101
+ Detail::kThreads,
102
+ kElementsPerAccess,
103
+ sizeof_bits<Element>::value
104
+ >;
105
+ };
106
+
107
+ ////////////////////////////////////////////////////////////////////////////////
108
+
109
+ } // namespace threadblock
110
+ } // namespace epilogue
111
+ } // namespace cutlass
112
+
113
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+ #include "cutlass/layout/matrix.h"
45
+ #include "cutlass/layout/tensor.h"
46
+ #include "cutlass/matrix_shape.h"
47
+ #include "cutlass/tensor_ref.h"
48
+ #include "cutlass/transform/pitch_linear_thread_map.h"
49
+ #include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
50
+ #include "cutlass/arch/arch.h"
51
+ #include "cutlass/arch/memory.h"
52
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
53
+
54
+ ////////////////////////////////////////////////////////////////////////////////
55
+
56
+ namespace cutlass {
57
+
58
+ ////////////////////////////////////////////////////////////////////////////////
59
+
60
+ namespace epilogue {
61
+ namespace threadblock {
62
+
63
+ ////////////////////////////////////////////////////////////////////////////////
64
+
65
+ template <typename Element_>
66
+ class DirectStoreEpilogueIterator {
67
+ public:
68
+
69
+ using Element = Element_;
70
+
71
+ using Layout = layout::RowMajor;
72
+ using TensorRef = TensorRef<Element, Layout>;
73
+ using ConstTensorRef = typename TensorRef::ConstTensorRef;
74
+
75
+ using Index = typename Layout::Index;
76
+ using LongIndex = typename Layout::LongIndex;
77
+ using TensorCoord = MatrixCoord;
78
+
79
+ static int const kElementsPerAccess = 1;
80
+
81
+ /// Uses a non-template class
82
+ struct Params : PredicatedTileIteratorParams {
83
+ using Base = PredicatedTileIteratorParams;
84
+
85
+ CUTLASS_HOST_DEVICE
86
+ Params() { }
87
+
88
+ CUTLASS_HOST_DEVICE
89
+ Params(Layout const &layout) {
90
+ stride = layout.stride(0) * sizeof(Element);
91
+ }
92
+
93
+ CUTLASS_HOST_DEVICE
94
+ Params(Base const &base) :
95
+ Base(base) { }
96
+ };
97
+
98
+ public:
99
+
100
+ //
101
+ // Data members
102
+ //
103
+
104
+ Element *pointer; // pointer to the output matrix
105
+
106
+ LongIndex stride; // stride in elements between rows
107
+
108
+ TensorCoord extent; // extent of output matrix
109
+
110
+ int thread_idx; // thread index
111
+
112
+ TensorCoord threadblock_offset;
113
+
114
+ public:
115
+
116
+ /// Constructor
117
+ CUTLASS_DEVICE
118
+ DirectStoreEpilogueIterator(
119
+ PredicatedTileIteratorParams const & params,
120
+ Element *pointer_,
121
+ TensorCoord extent_,
122
+ int thread_idx_,
123
+ TensorCoord threadblock_offset_ = TensorCoord(),
124
+ int const * indices = nullptr
125
+ ):
126
+ pointer(pointer_),
127
+ stride(params.stride / sizeof(Element)),
128
+ extent(extent_),
129
+ thread_idx(thread_idx_),
130
+ threadblock_offset(threadblock_offset_)
131
+ {
132
+
133
+ }
134
+ };
135
+
136
+ ///////////////////////////////////////////////////////////////////////////////
137
+
138
+ } // namespace threadblock
139
+ } // namespace epilogue
140
+ } // namespace cutlass
141
+
142
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue.h ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ The shared memory resource is time-sliced across warps.
38
+ */
39
+
40
+ #pragma once
41
+ #include "cutlass/cutlass.h"
42
+ #include CUDA_STD_HEADER(cassert)
43
+
44
+ #include "cutlass/numeric_types.h"
45
+ #include "cutlass/array.h"
46
+ #include "cutlass/layout/vector.h"
47
+ #include "cutlass/layout/tensor.h"
48
+ #include "cutlass/tensor_coord.h"
49
+ #include "cutlass/aligned_buffer.h"
50
+ #include "cutlass/functional.h"
51
+
52
+ #include "cutlass/gemm/gemm.h"
53
+
54
+ #include "cutlass/transform/pitch_linear_thread_map.h"
55
+ #include "cutlass/transform/threadblock/regular_tile_iterator.h"
56
+
57
+ #include "cutlass/epilogue/threadblock/epilogue_base.h"
58
+ #include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
59
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
60
+
61
+ ////////////////////////////////////////////////////////////////////////////////
62
+
63
+ namespace cutlass {
64
+ namespace epilogue {
65
+ namespace threadblock {
66
+
67
+
68
+ ////////////////////////////////////////////////////////////////////////////////
69
+
70
+ /// Epilogue operator
71
+ template <
72
+ typename Shape_, ///< Shape of threadblock tile (concept: GemmShape)
73
+ typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
74
+ int PartitionsK, ///< Number of partitions of the K dimension
75
+ typename OutputTileIterator_, ///< Tile iterator reading and writing output tensors
76
+ typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators
77
+ typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM
78
+ typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading from SMEM
79
+ typename OutputOp_, ///< Output operator
80
+ typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
81
+ int FragmentsPerPartition = 1, ///< Used to coarsten the epilogue granularity
82
+ int IterationsUnroll = ///< Used to reduce binary size when epilogue op is large
83
+ (!IsEpilogueFunctorHeavy<OutputOp_>::value)
84
+ >
85
+ class Epilogue :
86
+ public EpilogueBase<
87
+ Shape_,
88
+ typename WarpMmaOperator_::Shape,
89
+ PartitionsK,
90
+ AccumulatorFragmentIterator_,
91
+ WarpTileIterator_,
92
+ Padding_,
93
+ FragmentsPerPartition>,
94
+ public EpilogueBaseStreamK<
95
+ Shape_,
96
+ PartitionsK,
97
+ WarpMmaOperator_,
98
+ AccumulatorFragmentIterator_>
99
+ {
100
+
101
+ public:
102
+
103
+ using Base = EpilogueBase<
104
+ Shape_,
105
+ typename WarpMmaOperator_::Shape,
106
+ PartitionsK,
107
+ AccumulatorFragmentIterator_,
108
+ WarpTileIterator_,
109
+ Padding_,
110
+ FragmentsPerPartition>;
111
+
112
+ using BaseStreamK = EpilogueBaseStreamK<
113
+ Shape_,
114
+ PartitionsK,
115
+ WarpMmaOperator_,
116
+ AccumulatorFragmentIterator_>;
117
+
118
+ using Shape = Shape_;
119
+ using WarpMmaOperator = WarpMmaOperator_;
120
+ static int const kPartitionsK = PartitionsK;
121
+ using OutputTileIterator = OutputTileIterator_;
122
+ using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
123
+ using WarpTileIterator = WarpTileIterator_;
124
+ using SharedLoadIterator = SharedLoadIterator_;
125
+ using OutputOp = OutputOp_;
126
+ using Padding = Padding_;
127
+ using Layout = layout::RowMajor;
128
+ using LongIndex = typename Layout::LongIndex;
129
+
130
+ /// Number of warps per block
131
+ using WarpCount = typename Base::WarpCount;
132
+
133
+ /// Number of threads per block
134
+ static int const kBlockThreads = 32 * WarpCount::kCount;
135
+
136
+ /// Per-thread accumulator tile type
137
+ using AccumulatorTile = typename Base::AccumulatorTile;
138
+
139
+ /// Numerical accumulation element type
140
+ using ElementAccumulator = typename WarpMmaOperator::ElementC;
141
+
142
+ /// Fragment type used by the accumulator tile's fragment iterator
143
+ using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
144
+
145
+ /// Output element
146
+ using ElementOutput = typename OutputTileIterator::Element;
147
+
148
+ /// Output access size
149
+ static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
150
+
151
+ /// Tensor reference to destination tensor
152
+ using TensorRef = typename OutputTileIterator::TensorRef;
153
+
154
+ /// Tensor reference to sync tensor
155
+ using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
156
+
157
+ /// Const tensor reference to source tensor
158
+ using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
159
+
160
+ /// Vector type used by the global output iterator
161
+ using OutputAccessType = Array<
162
+ typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
163
+
164
+ /// Vector type used by the shared output iterator
165
+ using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
166
+
167
+ static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
168
+
169
+ static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
170
+
171
+
172
+ public:
173
+
174
+ static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
175
+ "Mismatch between shared load iterator and output tile iterator.");
176
+
177
+ static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
178
+
179
+ static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
180
+ "Divisibility");
181
+
182
+ static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1.");
183
+
184
+
185
+ public:
186
+
187
+ /// Aspect for when epilogue source is not needed
188
+ struct SourceAspectNotNeeded
189
+ {
190
+ /// Constructor
191
+ CUTLASS_DEVICE
192
+ SourceAspectNotNeeded()
193
+ {}
194
+
195
+ // No-op
196
+ CUTLASS_DEVICE
197
+ void load() { }
198
+
199
+ /// Invoke the output functor over each vector of output
200
+ CUTLASS_DEVICE
201
+ void apply_output_operator(
202
+ typename OutputTileIterator::Fragment &output_fragment,
203
+ OutputOp const &output_op,
204
+ typename SharedLoadIterator::Fragment const &aligned_accum_fragment)
205
+ {
206
+ OutputAccessType *output_frag_ptr =
207
+ reinterpret_cast<OutputAccessType *>(&output_fragment);
208
+
209
+ AccumulatorAccessType const *compute_frag_ptr =
210
+ reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
211
+
212
+ int const kOutputOpIterations =
213
+ OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
214
+
215
+ CUTLASS_PRAGMA_UNROLL
216
+ for (int i = 0; i < kOutputOpIterations; ++i)
217
+ {
218
+ // Call the output operator
219
+ output_frag_ptr[i] = output_op(compute_frag_ptr[i]);
220
+ }
221
+ }
222
+ };
223
+
224
+
225
+ /// Aspect for when epilogue source is needed
226
+ struct SourceAspectNeeded
227
+ {
228
+ OutputTileIterator source_iterator;
229
+
230
+ typename OutputTileIterator::Fragment source_fragment;
231
+
232
+ /// Invoke the output functor over each vector of output
233
+ CUTLASS_DEVICE
234
+ static void apply_output_operator(
235
+ typename OutputTileIterator::Fragment &output_fragment,
236
+ OutputOp const &output_op,
237
+ typename SharedLoadIterator::Fragment const &aligned_accum_fragment,
238
+ typename OutputTileIterator::Fragment const &source_fragment)
239
+ {
240
+ OutputAccessType *output_frag_ptr =
241
+ reinterpret_cast<OutputAccessType *>(&output_fragment);
242
+
243
+ AccumulatorAccessType const *compute_frag_ptr =
244
+ reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);
245
+
246
+ OutputAccessType const *source_frag_ptr =
247
+ reinterpret_cast<OutputAccessType const *>(&source_fragment);
248
+
249
+ int const kOutputOpIterations =
250
+ OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
251
+
252
+ CUTLASS_PRAGMA_UNROLL
253
+ for (int i = 0; i < kOutputOpIterations; ++i)
254
+ {
255
+ // Call the output operator
256
+ output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
257
+ }
258
+ }
259
+
260
+ /// Constructor
261
+ CUTLASS_DEVICE
262
+ SourceAspectNeeded(OutputTileIterator source_iterator) :
263
+ source_iterator(source_iterator)
264
+ {
265
+ source_fragment.clear();
266
+ }
267
+
268
+ // Load addend source fragment from global memory
269
+ CUTLASS_DEVICE
270
+ void load() {
271
+ source_iterator.load(source_fragment);
272
+ ++source_iterator;
273
+ }
274
+
275
+ /// Invoke the output functor over each vector of output
276
+ CUTLASS_DEVICE
277
+ void apply_output_operator(
278
+ typename OutputTileIterator::Fragment &output_fragment,
279
+ OutputOp const &output_op,
280
+ typename SharedLoadIterator::Fragment const &aligned_accum_fragment)
281
+ {
282
+ apply_output_operator(output_fragment, output_op, aligned_accum_fragment, source_fragment);
283
+ }
284
+ };
285
+
286
+
287
+ private:
288
+
289
+ /// Loads fragment from shared memory aligned with output tensor
290
+ SharedLoadIterator shared_load_iterator_;
291
+
292
+ /// Thread index in the threadblock
293
+ int thread_idx;
294
+
295
+ /// Warp index in the threadblock
296
+ int warp_idx;
297
+
298
+ public:
299
+
300
+ /// Constructor
301
+ CUTLASS_DEVICE
302
+ Epilogue(
303
+ typename Base::SharedStorage &shared_storage, ///< Shared storage object
304
+ int thread_idx, ///< ID of a thread within the threadblock
305
+ int warp_idx, ///< ID of warp within threadblock
306
+ int lane_idx) ///< Id of thread within warp
307
+ :
308
+ Base(shared_storage, thread_idx, warp_idx, lane_idx),
309
+ BaseStreamK(thread_idx),
310
+ shared_load_iterator_(shared_storage.reference(), thread_idx),
311
+ thread_idx(thread_idx),
312
+ warp_idx(warp_idx)
313
+ {}
314
+
315
+
316
+ /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
317
+ /// performing epilogue computations, writing to output
318
+ CUTLASS_DEVICE
319
+ void reduce(
320
+ int peer_idx_begin,
321
+ int peer_idx_end,
322
+ int reduce_fragment_idx,
323
+ void *element_workspace,
324
+ OutputOp const &output_op, ///< Output operator
325
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
326
+ OutputTileIterator source_iterator) ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
327
+ {
328
+ // Reduce peer accumulator fragments into one fragment
329
+ AccumulatorFragment accum_fragment;
330
+ BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);
331
+
332
+ // Store fragment to shared memory
333
+ this->warp_tile_iterator_.store(accum_fragment);
334
+
335
+ __syncthreads();
336
+
337
+ // Initialize/load source-fragment data
338
+ typename OutputTileIterator::Fragment source_fragment;
339
+ source_fragment.clear();
340
+
341
+ if (output_op.is_source_needed())
342
+ {
343
+ source_iterator += reduce_fragment_idx;
344
+ source_iterator.load(source_fragment);
345
+ }
346
+
347
+ // Load fragment from shared memory
348
+ typename SharedLoadIterator::Fragment aligned_accum_fragment;
349
+ shared_load_iterator_.load(aligned_accum_fragment);
350
+
351
+ // Add fragments shared by other k partitions
352
+ if (kPartitionsK > 1)
353
+ {
354
+ plus <typename SharedLoadIterator::Fragment> add_fragments;
355
+
356
+ CUTLASS_PRAGMA_UNROLL
357
+ for ( int i = 1; i < kPartitionsK; ++i) {
358
+ typename SharedLoadIterator::Fragment aligned_addend_fragment;
359
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
360
+ shared_load_iterator_.load(aligned_addend_fragment);
361
+ aligned_accum_fragment = add_fragments(aligned_accum_fragment, aligned_addend_fragment);
362
+ }
363
+ }
364
+
365
+ // Compute the output result
366
+ typename OutputTileIterator::Fragment output_fragment;
367
+
368
+ // Apply the output operator
369
+ SourceAspectNeeded::apply_output_operator(
370
+ output_fragment,
371
+ output_op,
372
+ aligned_accum_fragment,
373
+ source_fragment);
374
+
375
+ // Store the final result
376
+ destination_iterator += reduce_fragment_idx;
377
+ destination_iterator.store(output_fragment);
378
+ }
379
+
380
+
381
+ /// Perform the epilogue computations and stream the result to global memory.
382
+ CUTLASS_DEVICE
383
+ void operator()(
384
+ OutputOp const &output_op, ///< Output operator
385
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
386
+ AccumulatorTile const &accumulators) ///< Complete warp-level accumulator tile
387
+ {
388
+ operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
389
+ }
390
+
391
+
392
+ /// Perform the epilogue computations and stream the result to global memory. Implements
393
+ /// two alternative codepaths, depending on whether the output op requires addend data to be loaded.
394
+ CUTLASS_DEVICE
395
+ void operator()(
396
+ OutputOp const &output_op, ///< Output operator
397
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
398
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
399
+ OutputTileIterator source_iterator ) ///< Tile iterator for addend source
400
+ {
401
+ if (output_op.is_source_needed())
402
+ {
403
+ operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
404
+ }
405
+ else
406
+ {
407
+ operator()(output_op, destination_iterator, accumulators, SourceAspectNotNeeded());
408
+ }
409
+ }
410
+
411
+
412
+ /// Perform the epilogue computations and stream the result to global memory. Implements a
413
+ /// single codepath, regardless of whether the output op requires addend data to be loaded
414
+ CUTLASS_DEVICE
415
+ void unified(
416
+ OutputOp const &output_op, ///< Output operator
417
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
418
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
419
+ OutputTileIterator source_iterator ) ///< Tile iterator for addend source
420
+ {
421
+ if (!output_op.is_source_needed())
422
+ {
423
+ source_iterator.clear_mask();
424
+ __syncthreads(); // Dummy (CUDA 11.0)
425
+ }
426
+
427
+ operator()(output_op, destination_iterator, accumulators, SourceAspectNeeded(source_iterator));
428
+ }
429
+
430
+ template<class Seq>
431
+ struct acc2smem;
432
+
433
+ template <size_t... Seq>
434
+ struct acc2smem<cutlass::index_sequence<Seq...>> {
435
+ template<int Advance>
436
+ CUTLASS_DEVICE
437
+ static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
438
+ WarpTileIterator &warp_tile_iterator) {
439
+ CUTLASS_PRAGMA_UNROLL
440
+ for (int i = 0; i < Advance; i++) {
441
+ ++accum_fragment_iterator;
442
+ }
443
+
444
+ typename AccumulatorFragmentIterator::Fragment accum_fragment;
445
+
446
+ accum_fragment_iterator.load(accum_fragment);
447
+ ++accum_fragment_iterator;
448
+ warp_tile_iterator.store(accum_fragment);
449
+ }
450
+
451
+ CUTLASS_DEVICE
452
+ static void push(size_t pos,
453
+ AccumulatorFragmentIterator const &iterator_begin,
454
+ WarpTileIterator &warp_tile_iterator) {
455
+ int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
456
+ }
457
+ };
458
+
459
+
460
+ /// Streams the result to global memory
461
+ template <typename SourceAspect>
462
+ CUTLASS_DEVICE
463
+ void operator()(
464
+ OutputOp const &output_op, ///< Output operator
465
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
466
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
467
+ SourceAspect source)
468
+ {
469
+ // Iterator over warp-level accumulator fragment
470
+ AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
471
+
472
+ //
473
+ // Iterate over accumulator tile
474
+ //
475
+
476
+ #ifdef __clang__
477
+ #pragma clang diagnostic push
478
+ #pragma clang diagnostic ignored "-Wcuda-compat"
479
+ // Turn off clangs warning about loop unroll argument using parens.
480
+ #endif
481
+
482
+ #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
483
+ for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter)
484
+ {
485
+ //
486
+ // Load the source
487
+ //
488
+
489
+ source.load();
490
+ //
491
+ // Convert and store fragment
492
+ //
493
+
494
+ __syncthreads();
495
+
496
+ acc2smem<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
497
+ iter, accum_fragment_iterator, this->warp_tile_iterator_);
498
+
499
+ __syncthreads();
500
+
501
+ //
502
+ // Load fragments from shared memory
503
+ //
504
+
505
+ typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
506
+ shared_load_iterator_.load(aligned_accum_fragment[0]);
507
+
508
+ if (kPartitionsK > 1) {
509
+ plus <typename SharedLoadIterator::Fragment> add_fragments;
510
+
511
+ CUTLASS_PRAGMA_UNROLL
512
+ for ( int i = 1; i < kPartitionsK; ++i) {
513
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
514
+ shared_load_iterator_.load(aligned_accum_fragment[i]);
515
+ aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
516
+ }
517
+
518
+ shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
519
+ }
520
+
521
+ //
522
+ // Compute the output result
523
+ //
524
+
525
+ typename OutputTileIterator::Fragment output_fragment;
526
+ source.apply_output_operator(output_fragment, output_op, aligned_accum_fragment[0]);
527
+
528
+ //
529
+ // Store the final result
530
+ //
531
+
532
+ destination_iterator.store(output_fragment);
533
+ ++destination_iterator;
534
+ }
535
+
536
+ #ifdef __clang__
537
+ #pragma clang diagnostic pop
538
+ #endif
539
+ }
540
+ };
541
+
542
+ ////////////////////////////////////////////////////////////////////////////////
543
+
544
+ } // namespace threadblock
545
+ } // namespace epilogue
546
+ } // namespace cutlass
547
+
548
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+ #include "cutlass/cutlass.h"
41
+ #if !defined(__CUDACC_RTC__)
42
+ #include <type_traits>
43
+ #include <utility>
44
+ #endif
45
+ #include CUDA_STD_HEADER(cassert)
46
+
47
+ #include "cutlass/matrix_shape.h"
48
+ #include "cutlass/numeric_types.h"
49
+ #include "cutlass/array.h"
50
+ #include "cutlass/layout/vector.h"
51
+ #include "cutlass/layout/tensor.h"
52
+ #include "cutlass/tensor_coord.h"
53
+ #include "cutlass/aligned_buffer.h"
54
+
55
+ #include "cutlass/gemm/gemm.h"
56
+
57
+ #include "cutlass/transform/pitch_linear_thread_map.h"
58
+
59
+ ////////////////////////////////////////////////////////////////////////////////
60
+
61
+ namespace cutlass {
62
+ namespace epilogue {
63
+ namespace threadblock {
64
+
65
+ ////////////////////////////////////////////////////////////////////////////////
66
+
67
+ //
68
+ // This is used for metaprogramming epilogue functors. If they define
69
+ // `static bool const kIsHeavy = true;`, then the epilogue functor itself is
70
+ // not inlined. This results in smaller code and is advantageous if the epilogue
71
+ // functor consists of many instructions.
72
+ //
73
+ // If the epilogue functor does not define `kIsHeavy` or if it is `false`, then
74
+ // the behavior from CUTLASS 2.5 and before is retained. The epilogue is fully
75
+ // unrolled and inlined.
76
+ //
77
+
78
+ template<class>
79
+ struct TypeSink { typedef void type; };
80
+
81
+ template<class T> using TypeSinkT = typename TypeSink<T>::type;
82
+
83
+ template<class T, class=void> struct IsEpilogueFunctorHeavy {
84
+ static bool const value = false;
85
+ };
86
+
87
+ template<class T> struct IsEpilogueFunctorHeavy<T, TypeSinkT< decltype( T::kIsHeavy ) > > {
88
+ static bool const value = T::kIsHeavy;
89
+ };
90
+
91
+ ////////////////////////////////////////////////////////////////////////////////
92
+
93
+ /// Base class for epilogues defining warp-level
94
+ template <
95
+ typename Shape_, ///< Shape of threadblock tile (concept: GemmShape)
96
+ typename WarpShape_, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
97
+ int PartitionsK, ///< Number of partitions of the K dimension
98
+ typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators
99
+ typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM
100
+ typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
101
+ int FragmentsPerIteration = 1
102
+ >
103
+ class EpilogueBase {
104
+ public:
105
+
106
+ using Shape = Shape_;
107
+ using WarpShape = WarpShape_;
108
+ static int const kPartitionsK = PartitionsK;
109
+ using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
110
+ using WarpTileIterator = WarpTileIterator_;
111
+ using Padding = Padding_;
112
+
113
+ /// Output layout is always row-major
114
+ using Layout = layout::RowMajor;
115
+
116
+ /// The complete warp-level accumulator tile
117
+ using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
118
+
119
+ /// Accumulator element
120
+ using ElementAccumulator = typename AccumulatorTile::Element;
121
+
122
+ /// Number of warps
123
+ using WarpCount = gemm::GemmShape<
124
+ Shape::kM / WarpShape::kM,
125
+ Shape::kN / WarpShape::kN,
126
+ kPartitionsK
127
+ >;
128
+
129
+ /// Use this to control the granularity of one epilogue 'iteration'
130
+ static int const kFragmentsPerIteration = FragmentsPerIteration;
131
+
132
+ public:
133
+
134
+ /// Shared storage allocation needed by the epilogue
135
+ struct SharedStorage {
136
+
137
+ //
138
+ // Type definitions
139
+ //
140
+
141
+ /// Element type of shared memory
142
+ using Element = typename WarpTileIterator::Element;
143
+
144
+ /// Tensor reference to shared memory allocation
145
+ using TensorRef = typename WarpTileIterator::TensorRef;
146
+
147
+ /// Layout of shared memory allocation
148
+ using Layout = typename WarpTileIterator::Layout;
149
+
150
+ /// Logical shape of the shared memory tile written to by all warps.
151
+ using Shape = MatrixShape<
152
+ WarpCount::kM * WarpTileIterator::Shape::kRow * WarpCount::kK,
153
+ WarpCount::kN * WarpTileIterator::Shape::kColumn
154
+ >;
155
+
156
+ /// Shape of the shared memory allocation for the epilogue
157
+ using StorageShape = MatrixShape<
158
+ (Shape::kRow + Padding::kRow) * kFragmentsPerIteration,
159
+ Shape::kColumn + Padding::kColumn
160
+ >;
161
+
162
+ //
163
+ // Data members
164
+ //
165
+
166
+ AlignedBuffer<Element, StorageShape::kCount> storage;
167
+
168
+ //
169
+ // Methods
170
+ //
171
+
172
+ /// Returns a pointer to the shared memory buffer
173
+ CUTLASS_DEVICE
174
+ Element *data() {
175
+ return storage.data();
176
+ }
177
+
178
+ /// Returns a tensor reference to the shared memory buffer
179
+ CUTLASS_DEVICE
180
+ TensorRef reference() {
181
+ return TensorRef(
182
+ storage.data(),
183
+ Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
184
+ }
185
+ };
186
+
187
+ protected:
188
+
189
+ //
190
+ // Data members
191
+ //
192
+
193
+ SharedStorage &shared_storage_;
194
+
195
+ /// Stores a warp's fragment of accumulators to SMEM
196
+ WarpTileIterator warp_tile_iterator_;
197
+
198
+ public:
199
+
200
+ /// Constructor
201
+ CUTLASS_DEVICE
202
+ EpilogueBase(
203
+ SharedStorage &shared_storage, ///< Shared storage object
204
+ int thread_idx, ///< ID of a thread within the threadblock
205
+ int warp_idx, ///< ID of warp within threadblock
206
+ int lane_idx ///< Id of thread within warp
207
+ ):
208
+ shared_storage_(shared_storage),
209
+ warp_tile_iterator_(shared_storage.reference(), lane_idx) {
210
+
211
+ // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
212
+ //
213
+ // _m: the warp's position within the threadblock along the M dimension
214
+ // _n: the warp's position within the threadblock along the N dimension
215
+ // _k: the warp's position within the threadblock along the K dimension
216
+
217
+ int warp_k = warp_idx / (WarpCount::kM * WarpCount::kN);
218
+ int warp_mn = warp_idx % (WarpCount::kM * WarpCount::kN);
219
+ int warp_m = warp_mn % WarpCount::kM;
220
+ int warp_n = warp_mn / WarpCount::kM;
221
+
222
+ MatrixCoord warp_offset{warp_k * WarpCount::kM + warp_m, warp_n};
223
+
224
+ warp_tile_iterator_.add_tile_offset(warp_offset);
225
+ }
226
+ };
227
+
228
+ ////////////////////////////////////////////////////////////////////////////////
229
+
230
+ } // namespace threadblock
231
+ } // namespace epilogue
232
+ } // namespace cutlass
233
+
234
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Basic subset of epilogue functionality for supporting StreamK decompositions
33
+ */
34
+
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/functional.h"
40
+ #include "cutlass/block_striped.h"
41
+
42
+ ////////////////////////////////////////////////////////////////////////////////
43
+
44
+ namespace cutlass {
45
+ namespace epilogue {
46
+ namespace threadblock {
47
+
48
+ ////////////////////////////////////////////////////////////////////////////////
49
+
50
+
51
+ /// StreamK epilogue functionality for cross-block accumulator fragment reduction
52
+ template <
53
+ typename Shape, ///< Shape of threadblock tile (concept: GemmShape)
54
+ int PartitionsK,
55
+ typename WarpMmaOperator, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
56
+ typename AccumulatorFragmentIterator> ///< Iterator for enumerating fragments within the per-thread tile of raw accumulators
57
+ class EpilogueBaseStreamK
58
+ {
59
+
60
+ protected:
61
+
62
+ /// The per-thread tile of raw accumulators
63
+ using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
64
+
65
+ /// Number of warps
66
+ using WarpCount = gemm::GemmShape<
67
+ Shape::kM / WarpMmaOperator::Shape::kM,
68
+ Shape::kN / WarpMmaOperator::Shape::kN,
69
+ PartitionsK>;
70
+
71
+ /// Number of threads per block
72
+ static int const kBlockThreads = 32 * WarpCount::kCount;
73
+
74
+ /// Numerical accumulation element type
75
+ using ElementAccumulator = typename WarpMmaOperator::ElementC;
76
+
77
+ /// Fragment type used by the accumulator tile's fragment iterator
78
+ using AccumulatorFragment = typename AccumulatorFragmentIterator::Fragment;
79
+
80
+ public:
81
+
82
+ /// Number of AccumulatorTile fragments per thread
83
+ static int const kAccumulatorFragments = AccumulatorFragmentIterator::Policy::kIterations;
84
+
85
+ protected:
86
+
87
+ /// Number of AccumulatorTile fragments per block output tile
88
+ static int const kOutputTileFragments = kBlockThreads * kAccumulatorFragments;
89
+
90
+ /// Block-striped transfer utility for sharing AccumulatorFragment
91
+ using BlockStripedT = BlockStriped<kBlockThreads, AccumulatorFragment>;
92
+
93
+ /// AccumulatorFragment stride in the shared workspace between different peer blocks (each thread block can share accumulators for up to two block output tiles)
94
+ static const int kPeerFragmentStride = kOutputTileFragments * 2;
95
+
96
+ public:
97
+
98
+ /// Workspace bytes per thread block
99
+ static size_t const kWorkspaceBytesPerBlock =sizeof(AccumulatorFragment) * kPeerFragmentStride;
100
+
101
+ public:
102
+
103
+ /// Thread index in the threadblock
104
+ int thread_idx;
105
+
106
+ public:
107
+
108
+ /// Constructor
109
+ CUTLASS_DEVICE
110
+ EpilogueBaseStreamK(
111
+ int thread_idx) ///< ID of a thread within the threadblock
112
+ :
113
+ thread_idx(thread_idx)
114
+ {}
115
+
116
+
117
+ /// Aggregates the accumulator sets shared by peer blocks in the global workspace
118
+ CUTLASS_DEVICE
119
+ void reduce(
120
+ AccumulatorFragment &accum_fragment, ///< [out] sum of all shared accumulator fragments for these peer partials
121
+ int peer_idx_begin,
122
+ int peer_idx_end,
123
+ int reduce_fragment_idx,
124
+ void *workspace_ptr)
125
+ {
126
+ plus<AccumulatorFragment> add_fragments;
127
+
128
+ AccumulatorFragment *fragment_workspace = reinterpret_cast<AccumulatorFragment *>(workspace_ptr);
129
+
130
+ int fragment_offset = (peer_idx_begin * kPeerFragmentStride) + (reduce_fragment_idx * kBlockThreads);
131
+
132
+ // Load first peer fragment
133
+ BlockStripedT::load(accum_fragment, fragment_workspace + fragment_offset, this->thread_idx);
134
+
135
+ fragment_offset += kPeerFragmentStride; // Move to next peer
136
+ fragment_offset += kOutputTileFragments; // Move to the set of fragments for this peer's "non-started" output tile
137
+
138
+ // Reduce fragments from additional peers
139
+ #pragma unroll 2
140
+ for (; fragment_offset < peer_idx_end * kPeerFragmentStride; fragment_offset += kPeerFragmentStride)
141
+ {
142
+ // Load peer fragment
143
+ AccumulatorFragment addend_fragment;
144
+ BlockStripedT::load(addend_fragment, fragment_workspace + fragment_offset, this->thread_idx);
145
+
146
+ // Add peer fragment
147
+ accum_fragment = add_fragments(accum_fragment, addend_fragment);
148
+ }
149
+ }
150
+
151
+
152
+ /// Shares the accumulator set with peers in the global workspace
153
+ CUTLASS_DEVICE
154
+ void share(
155
+ int peer_idx,
156
+ void *workspace_ptr,
157
+ AccumulatorTile const &accumulators,
158
+ bool started_tile) ///< Whether this thread block computed the first work volume for the current output tile
159
+ {
160
+ AccumulatorFragment *fragment_workspace = reinterpret_cast<AccumulatorFragment *>(workspace_ptr);
161
+
162
+ int fragment_offset = peer_idx * kPeerFragmentStride;
163
+
164
+ if (!started_tile) {
165
+ // Move to the set of fragments for the "non-started" output tile
166
+ fragment_offset += kOutputTileFragments;
167
+ }
168
+
169
+ AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
170
+
171
+ // Convert raw accumulator tile to fragments and store
172
+ CUTLASS_PRAGMA_UNROLL
173
+ for (int iter = 0; iter < kAccumulatorFragments; ++iter)
174
+ {
175
+ // Acquire reordered accumulator fragment
176
+ AccumulatorFragment accum_fragment;
177
+ accum_fragment_iterator.load(accum_fragment);
178
+ ++accum_fragment_iterator;
179
+
180
+ // Store accumulator fragment
181
+ BlockStripedT::store(fragment_workspace + fragment_offset, accum_fragment, this->thread_idx);
182
+
183
+ fragment_offset += kBlockThreads;
184
+ }
185
+ }
186
+
187
+ };
188
+
189
+
190
+
191
+ ////////////////////////////////////////////////////////////////////////////////
192
+
193
+ } // namespace threadblock
194
+ } // namespace epilogue
195
+ } // namespace cutlass
196
+
197
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for Depthwise convoltuion
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/array.h"
42
+ #include "cutlass/cutlass.h"
43
+ #include "cutlass/epilogue/thread/conversion_op.h"
44
+ #include "cutlass/epilogue/thread/linear_combination.h"
45
+ #include "cutlass/epilogue/thread/reduction_op.h"
46
+ #include "cutlass/gemm/gemm.h"
47
+ #include "cutlass/numeric_types.h"
48
+
49
+ /////////////////////////////////////////////////////////////////////////////////////////////////
50
+
51
+ namespace cutlass {
52
+ namespace epilogue {
53
+ namespace threadblock {
54
+
55
+ ////////////////////////////////////////////////////////////////////////////////
56
+
57
+ /// Epilogue operator
58
/// Epilogue for depthwise convolution kernels.
///
/// Each warp first writes its accumulator tile to shared memory through
/// WarpTileIterator_; after a threadblock-wide barrier, the data is re-read via
/// SharedLoadIterator_ in the order expected by OutputTileIterator_, the
/// elementwise output operator is applied (optionally combining with a source
/// tensor), and the result is streamed to global memory.
template <typename Shape_, ///< Shape of threadblock tile (concept: GemmShape)
          typename ThreadOutputShape_, ///< Size of the matrix to load (concept: TensorNHWC)
          typename ThreadBlockOutputShape_, ///< Size of the matrix to load (concept: TensorNHWC)
          typename WarpMmaOperator_, ///< Warp-level MMA operator (concept:
                                     ///< gemm::warp::MmaTensorOp)
          typename OutputTileIterator_, ///< Tile iterator reading and writing output tensors
          typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators
          typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM
          typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading from SMEM
          typename OutputOp_, ///< Output operator
          typename Padding_ ///< Padding added to SMEM allocation to avoid bank conflicts (concept:
                            ///< MatrixShape)
          >
class EpilogueDepthwise {
 public:
  using Shape = Shape_;
  using WarpShape = typename WarpMmaOperator_::Shape;
  using ThreadOutputShape = ThreadOutputShape_;
  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
  using WarpMmaOperator = WarpMmaOperator_;
  using OutputTileIterator = OutputTileIterator_;
  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
  using WarpTileIterator = WarpTileIterator_;
  using SharedLoadIterator = SharedLoadIterator_;
  using OutputOp = OutputOp_;
  using Padding = Padding_;

  using Layout = layout::RowMajor;
  using LongIndex = typename Layout::LongIndex;

  /// The complete warp-level accumulator tile
  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;

  /// Accumulator element
  using ElementAccumulator = typename WarpTileIterator::Element;

  /// Output element
  using ElementOutput = typename OutputTileIterator::Element;

  /// Output access size
  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;

  /// Tensor reference to destination tensor
  using TensorRef = typename OutputTileIterator::TensorRef;

  /// Tensor reference to sync tensor
  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;

  /// Const tensor reference to source tensor
  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;

  /// Array type used to output
  using OutputAccessType =
      Array<typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;

  /// Array type used by output functor
  using AccumulatorAccessType =
      Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;

  /// Number of warps
  using WarpCount =
      gemm::GemmShape<Shape::kM / WarpShape::kM, Shape::kN / WarpShape::kN>;

 public:
  static_assert(SharedLoadIterator::Fragment::kElements ==
                    OutputTileIterator::Fragment::kElements,
                "Mismatch between shared load iterator and output tile iterator.");

  static_assert(OutputTileIterator::kElementsPerAccess,
                "OutputTileIterator::kElementsPerAccess must not be zero.");

  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
                "Divisibility");

  /// Shared storage allocation needed by the epilogue
  struct SharedStorage {
    //
    // Type definitions
    //

    /// Element type of shared memory
    using Element = typename WarpTileIterator::Element;

    /// Tensor reference to shared memory allocation
    using TensorRef = typename WarpTileIterator::TensorRef;

    /// Layout of shared memory allocation
    using Layout = typename WarpTileIterator::Layout;

    /// Logical shape of the shared memory tile written to by all warps.
    using Shape = MatrixShape<ThreadBlockOutputShape::kNHW, ThreadBlockOutputShape::kC>;

    /// Shape of the shared memory allocation for the epilogue
    using StorageShape = MatrixShape<Shape::kRow, Shape::kColumn>;

    //
    // Data members
    //

    AlignedBuffer<Element, StorageShape::kCount> storage;

    //
    // Methods
    //

    /// Returns a pointer to the shared memory buffer
    CUTLASS_DEVICE
    Element *data() { return storage.data(); }

    /// Returns a tensor reference to the shared memory buffer
    CUTLASS_DEVICE
    TensorRef reference() {
      return TensorRef(storage.data(), Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
    }
  };

 private:
  /// Loads fragment from shared memory aligned with output tensor
  SharedLoadIterator shared_load_iterator_;

  /// Stores a warp's fragment of accumulators to SMEM
  WarpTileIterator warp_tile_iterator_;

  LongIndex warp_offset;
  int thread_idx;
  int warp_idx;
  int lane_idx;
  int warp_m, warp_n; // warp coordinates within a cta
  int tid_m, tid_n;   // thread coordinates within a warp

 public:
  /// Constructor
  CUTLASS_DEVICE
  EpilogueDepthwise(SharedStorage &shared_storage, ///< Shared storage object
                    int thread_idx_, ///< ID of a thread within the threadblock
                    int warp_idx_,   ///< ID of warp within threadblock
                    int lane_idx_    ///< Id of thread within warp
                    )
      : thread_idx(thread_idx_),
        warp_idx(warp_idx_),
        lane_idx(lane_idx_),
        shared_load_iterator_(shared_storage.reference(), thread_idx_),
        warp_tile_iterator_(shared_storage.reference(), thread_idx_, lane_idx_) {}

  /// Streams the result to global memory
  CUTLASS_DEVICE
  void operator()(OutputOp const &output_op, ///< Output operator
                  OutputTileIterator destination_iterator, ///< Tile iterator for destination
                  AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
                  OutputTileIterator source_iterator, ///< Threadblock tile coordinate in GEMM (in
                                                      ///< units of threadblock tiles)
                  const int smem_base_offset) { ///< SMEM base offset for epilogue operation
    // initiate the smem base offset for different output tile.
    warp_tile_iterator_.set_smem_base_address(smem_base_offset);

    shared_load_iterator_.set_smem_base_address(smem_base_offset);

    // Skip reading the source tensor entirely when the output operator does not
    // consume it (e.g. beta == 0).
    if (!output_op.is_source_needed()) {
      compute_source_not_needed_(output_op, destination_iterator, accumulators);
    } else {
      compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator);
    }
  }

 private:
  /// Streams the result to global memory, combining accumulators with the source tensor
  CUTLASS_DEVICE
  void compute_source_needed_(
      OutputOp const &output_op, ///< Output operator
      OutputTileIterator destination_iterator, ///< Tile iterator for destination
      AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
      OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)

    typename OutputTileIterator::Fragment source_fragment;

    // Clear first so out-of-bounds (unloaded) elements hold a defined value.
    source_fragment.clear();

    source_iterator.load(source_fragment);

    // store to smem
    warp_tile_iterator_.store(accumulators);

    // Make every warp's accumulator data visible to the whole threadblock before
    // it is read back in output-tensor order.
    // NOTE(review): there is no barrier after the load below; this assumes the
    // caller varies smem_base_offset across invocations so a subsequent store
    // cannot overwrite data still being read -- TODO confirm with callers.
    __syncthreads();

    typename SharedLoadIterator::Fragment aligned_accum_fragment;

    // load from smem
    shared_load_iterator_.load(aligned_accum_fragment);

    typename OutputTileIterator::Fragment output_fragment;

    apply_output_operator_(output_fragment, output_op, aligned_accum_fragment, source_fragment);

    // Store to GMEM
    destination_iterator.store(output_fragment);
  }

  /// Streams the result to global memory without reading the source tensor
  CUTLASS_DEVICE
  void compute_source_not_needed_(
      OutputOp const &output_op, ///< Output operator
      OutputTileIterator destination_iterator, ///< Tile iterator for destination
      AccumulatorTile const &accumulators) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)

    // store to smem
    warp_tile_iterator_.store(accumulators);

    // Make every warp's accumulator data visible to the whole threadblock before
    // it is read back in output-tensor order.
    __syncthreads();

    typename SharedLoadIterator::Fragment aligned_accum_fragment;

    // load from smem
    shared_load_iterator_.load(aligned_accum_fragment);

    typename OutputTileIterator::Fragment output_fragment;

    apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment);

    // Store to GMEM
    destination_iterator.store(output_fragment);
  }

  /// Helper to invoke the output functor over each vector of output
  CUTLASS_DEVICE
  void apply_output_operator_(
      typename OutputTileIterator::Fragment &output_fragment,
      OutputOp const &output_op, ///< Output operator
      typename SharedLoadIterator::Fragment const &aligned_accum_fragment,
      typename OutputTileIterator::Fragment const &source_fragment) {

    // Reinterpret the flat fragments as arrays of vector-width accesses so the
    // functor is applied once per output vector of kElementsPerAccess elements.
    OutputAccessType *output_frag_ptr =
        reinterpret_cast<OutputAccessType *>(&output_fragment);

    AccumulatorAccessType const *compute_frag_ptr =
        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);

    OutputAccessType const *source_frag_ptr =
        reinterpret_cast<OutputAccessType const *>(&source_fragment);

    int const kOutputOpIterations =
        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kOutputOpIterations; ++i) {
      // Call the output operator
      output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]);
    }
  }

  /// Helper to invoke the output functor over each vector of output
  CUTLASS_DEVICE
  void apply_output_operator_source_not_needed_(
      typename OutputTileIterator::Fragment &output_fragment,
      OutputOp const &output_op, ///< Output operator
      typename SharedLoadIterator::Fragment const &aligned_accum_fragment) {
    // Same vectorized application as apply_output_operator_, without a source term.
    OutputAccessType *output_frag_ptr = reinterpret_cast<OutputAccessType *>(&output_fragment);

    AccumulatorAccessType const *compute_frag_ptr =
        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment);

    int const kOutputOpIterations =
        OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kOutputOpIterations; ++i) {
      // Call the output operator
      output_frag_ptr[i] = output_op(compute_frag_ptr[i]);
    }
  }
};
328
+
329
+ /////////////////////////////////////////////////////////////////////////////////////////////////
330
+
331
+ } // namespace threadblock
332
+ } // namespace epilogue
333
+ } // namespace cutlass
334
+
335
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs and convolution using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+
45
+ #include "cutlass/gemm/gemm.h"
46
+
47
+ #include "cutlass/epilogue/thread/linear_combination.h"
48
+ #include "cutlass/epilogue/thread/conversion_op.h"
49
+ #include "cutlass/epilogue/thread/reduction_op.h"
50
+
51
+ /////////////////////////////////////////////////////////////////////////////////////////////////
52
+
53
+ namespace cutlass {
54
+ namespace epilogue {
55
+ namespace threadblock {
56
+
57
+ ////////////////////////////////////////////////////////////////////////////////
58
+
59
/// Epilogue operator
///
/// Writes accumulators directly to global memory without staging through shared
/// memory. Each thread computes its own global coordinates from its warp/lane
/// position in the tensor-op accumulator layout and performs guarded, vectorized
/// (2-element) loads of the source and stores of the result.
template <
  typename Shape_,                        ///< Shape of threadblock tile (concept: GemmShape)
  typename WarpMmaOperator_,              ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
  int PartitionsK,                        ///< Number of partitions of the K dimension
  typename OutputTileIterator_,           ///< Tile iterator reading and writing output tensors
  typename AccumulatorFragmentIterator_,  ///< Fragment iterator selecting accumulators
  typename WarpTileIterator_,             ///< Warp-scoped tile iterator writing accumulators to SMEM
  typename SharedLoadIterator_,           ///< Threadblock-scoped tile iterator loading from SMEM
  typename OutputOp_                      ///< Output operator
>
class EpilogueDirectStore {
public:

  using Shape = Shape_;
  using WarpMmaOperator = WarpMmaOperator_;
  using WarpShape = typename WarpMmaOperator_::Shape;
  static int const kPartitionsK = PartitionsK;
  using OutputTileIterator = OutputTileIterator_;
  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
  using WarpTileIterator = WarpTileIterator_;
  using OutputOp = OutputOp_;
  using Padding = MatrixShape<0, 0>;

  using Layout = layout::RowMajor;
  using LongIndex = typename Layout::LongIndex;

  /// The complete warp-level accumulator tile
  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;

  /// Accumulator element
  using ElementAccumulator = typename WarpTileIterator::Element;

  /// Output element
  using ElementOutput = typename OutputTileIterator::Element;

  /// Output access size
  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;

  /// Tensor reference to destination tensor
  using TensorRef = typename OutputTileIterator::TensorRef;

  /// Tensor reference to sync tensor
  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;

  /// Const tensor reference to source tensor
  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;

  /// Array type used to output
  using OutputAccessType = Array<
    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;

  /// Array type used by output functor
  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;

  /// Number of warps
  using WarpCount = gemm::GemmShape<
    Shape::kM / WarpShape::kM,
    Shape::kN / WarpShape::kN,
    kPartitionsK
  >;

  /// Use this to control the granularity of one epilogue 'iteration'
  static int const kFragmentsPerIteration = 1;

  // No SMEM staging is performed, so no SMEM tiles or pointer offsets are needed.
  static int constexpr kSmemTiles = 1;
  static int constexpr kSmemPointerOffset = 0;

  /// Shared storage allocation needed by the epilogue -- empty, since this
  /// epilogue stores directly to global memory.
  struct SharedStorage { } ;

private:

  // Assume accumulator tile is multiple interleaved 32x32 tile.
  static int const kElementsPerPartial = 4;
  using EleShapePerPatial = typename platform::conditional<
      platform::is_same<ElementAccumulator, float>::value,
      MatrixShape<2, 2>,
      MatrixShape<1, 4> >::type;
  static int const kElementsPerMma = 8;
  static int const kAccumulatorPatials = 2;
  using QuadShapePerPatialMma = MatrixShape<4, 4>;

  static_assert(OutputOp::kCount >= 2,
    "The direct store epilogue for Tensor Ops requires the output functor have kCount >= 2.");

private:

  LongIndex warp_offset;
  int thread_idx;
  int warp_idx;
  int lane_idx;
  int warp_m, warp_n; // warp coordinates within a cta
  int tid_m, tid_n;   // thread coordinates within a warp

public:

  /// Constructor
  CUTLASS_DEVICE
  EpilogueDirectStore(
    SharedStorage &shared_storage, ///< Shared storage object (unused; kept for interface parity)
    int thread_idx_, ///< ID of a thread within the threadblock
    int warp_idx_,   ///< ID of warp within threadblock
    int lane_idx_    ///< Id of thread within warp
  ):
    thread_idx(thread_idx_),
    warp_idx(warp_idx_),
    lane_idx(lane_idx_)
  {

    // warp offsetting calculations
    warp_offset = warp_idx * WarpShape::kM * WarpShape::kN;
    // NOTE(review): the modulus mixes WarpCount::kM with WarpShape::kN; the total
    // number of MN-tile warps would be WarpCount::kM * WarpCount::kN -- verify
    // against upstream CUTLASS before relying on kPartitionsK > 1.
    int warp_id_mn = warp_idx % (WarpCount::kM * WarpShape::kN);
    warp_m = warp_id_mn % WarpCount::kM;
    warp_n = warp_id_mn / WarpCount::kM;
    MatrixCoord warp_offset_coord(warp_m*WarpShape::kM, warp_n*WarpShape::kN);

    // thread offsetting calculations: each warp is viewed as 8 quads of 4 lanes
    int quad = (lane_idx >> 2);
    int lane_in_quad = (lane_idx & 3);

    // this seems to be the correct layout
    tid_m = quad;             // quad index selects the row within the warp tile
    tid_n = 2 * lane_in_quad; // each lane owns 2 consecutive columns

  }

  /// Streams the result to global memory
  CUTLASS_DEVICE
  void operator()(
    OutputOp const &output_op,               ///< Output operator
    OutputTileIterator destination_iterator, ///< Tile iterator for destination
    AccumulatorTile const &accumulators,     ///< Complete warp-level accumulator tile
    OutputTileIterator source_iterator) {    ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)

    // Skip reading the source tensor when the output operator ignores it (e.g. beta == 0).
    if (!output_op.is_source_needed()) {
      compute_source_not_needed_(output_op, destination_iterator, accumulators);
    }
    else {
      compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator);
    }
  }

private:

  /// Streams the result to global memory, combining accumulators with the source tensor
  CUTLASS_DEVICE
  void compute_source_needed_(
    OutputOp const &output_op,               ///< Output operator
    OutputTileIterator destination_iterator, ///< Tile iterator for destination
    AccumulatorTile const &accumulators,     ///< Complete warp-level accumulator tile
    OutputTileIterator source_iterator) {    ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)

    const int kAccumBlockN = 2; // accumulator columns handled per inner iteration
    const int kThreadsM = 8;    // quads per warp (rows)
    const int kThreadsN = 4;    // lanes per quad (columns)
    const int kBlockM = WarpShape::kM / kThreadsM;

    /// Array type used to output
    using OutputAccessType = AlignedArray<ElementOutput, kAccumBlockN>;

    /// Array type passed to the output operator - unused elements are optimized away
    using OutputFragmentType = Array<ElementOutput, OutputOp::kCount>;

    /// Array type used by output functor
    using AccumulatorAccessType = Array<ElementAccumulator, kAccumBlockN>;

    /// Array type used by output functor
    using AccumulatorFragmentType = Array<ElementAccumulator, OutputOp::kCount>;

    // View the accumulator tile as pairs of elements, matching this thread's 2-column ownership.
    AccumulatorAccessType const *accumulator_pair = reinterpret_cast<AccumulatorAccessType const *>(&accumulators);

    CUTLASS_PRAGMA_UNROLL
    for (int accum_m_idx = 0; accum_m_idx < WarpShape::kM / kThreadsM; accum_m_idx++) {

      int accum_m = kThreadsM * accum_m_idx;
      int mL = destination_iterator.threadblock_offset.row() + WarpShape::kM * warp_m + tid_m + accum_m;
      int nL_base = destination_iterator.threadblock_offset.column() + WarpShape::kN * warp_n + tid_n;

      ElementOutput *output_ptr = destination_iterator.pointer + mL * destination_iterator.stride;
      ElementOutput *source_ptr = source_iterator.pointer + mL * source_iterator.stride;

      int const kIterationsN = WarpShape::kN / kThreadsN / kAccumBlockN;

      CUTLASS_PRAGMA_UNROLL
      for (int accum_n_idx = 0; accum_n_idx < kIterationsN; accum_n_idx++) {

        int accum_idx = accum_m_idx + kBlockM * accum_n_idx;
        // Column stride per iteration is kThreadsN * kAccumBlockN == 8, numerically
        // equal to kThreadsM (also 8), which is the constant used here.
        int accum_n = kThreadsM * accum_n_idx;

        // mL and nL are logical coordinate in 2D mapping of epilogue's 4D output
        int nL = nL_base + accum_n;

        // Predicate both the source load and the result store against the tensor extent.
        bool guard = (mL < destination_iterator.extent.row()) && (nL < destination_iterator.extent.column());

        AccumulatorFragmentType accum_fragment;
        reinterpret_cast<AccumulatorAccessType &>(accum_fragment) = accumulator_pair[accum_idx];

        OutputFragmentType output_fragment;

        if(guard) {
          reinterpret_cast<OutputAccessType &>(output_fragment) =
              *reinterpret_cast<OutputAccessType const *>(source_ptr + nL);
        }

        // Perform output operator
        output_fragment = output_op(accum_fragment, output_fragment);

        if(guard) {
          // Store
          *reinterpret_cast<OutputAccessType *>(output_ptr + nL) = reinterpret_cast<OutputAccessType const &>(output_fragment);
        }
      }
    }
  }

  /// Streams the result to global memory without reading the source tensor
  CUTLASS_DEVICE
  void compute_source_not_needed_(
    OutputOp const &output_op,               ///< Output operator
    OutputTileIterator destination_iterator, ///< Tile iterator for destination
    AccumulatorTile const &accumulators) {   ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)

    const int kAccumBlockN = 2; // accumulator columns handled per inner iteration
    const int kThreadsM = 8;    // quads per warp (rows)
    const int kThreadsN = 4;    // lanes per quad (columns)
    const int kBlockM = WarpShape::kM / kThreadsM;

    /// Array type used to output
    using OutputAccessType = AlignedArray<ElementOutput, kAccumBlockN>;

    /// Array type passed to the output operator - unused elements are optimized away
    using OutputFragmentType = Array<ElementOutput, OutputOp::kCount>;

    /// Array type used by output functor
    using AccumulatorAccessType = Array<ElementAccumulator, kAccumBlockN>;

    /// Array type used by output functor
    using AccumulatorFragmentType = Array<ElementAccumulator, OutputOp::kCount>;

    // View the accumulator tile as pairs of elements, matching this thread's 2-column ownership.
    AccumulatorAccessType const *accumulator_pair = reinterpret_cast<AccumulatorAccessType const *>(&accumulators);

    CUTLASS_PRAGMA_UNROLL
    for (int accum_m_idx = 0; accum_m_idx < WarpShape::kM / kThreadsM; accum_m_idx++) {

      int accum_m = kThreadsM * accum_m_idx;
      int mL = destination_iterator.threadblock_offset.row() + WarpShape::kM * warp_m + tid_m + accum_m;
      int nL_base = destination_iterator.threadblock_offset.column() + WarpShape::kN * warp_n + tid_n;

      ElementOutput *output_ptr = destination_iterator.pointer + mL * destination_iterator.stride;

      int const kIterationsN = WarpShape::kN / kThreadsN / kAccumBlockN;

      CUTLASS_PRAGMA_UNROLL
      for (int accum_n_idx = 0; accum_n_idx < kIterationsN; accum_n_idx++) {

        int accum_idx = accum_m_idx + kBlockM * accum_n_idx;
        // Column stride per iteration is kThreadsN * kAccumBlockN == 8, numerically
        // equal to kThreadsM (also 8), which is the constant used here.
        int accum_n = kThreadsM * accum_n_idx;

        // mL and nL are logical coordinate in 2D mapping of epilogue's 4D output
        int nL = nL_base + accum_n;

        // Predicate the result store against the tensor extent.
        bool guard = (mL < destination_iterator.extent.row()) && (nL < destination_iterator.extent.column());

        AccumulatorFragmentType accum_fragment;
        reinterpret_cast<AccumulatorAccessType &>(accum_fragment) = accumulator_pair[accum_idx];

        OutputFragmentType output_fragment;

        // Perform output operator
        output_fragment = output_op(accum_fragment);

        if(guard) {

          // Store
          *reinterpret_cast<OutputAccessType *>(output_ptr + nL) =
              reinterpret_cast<OutputAccessType const &>(output_fragment);
        }
      }
    }
  }
};
340
+
341
+ /////////////////////////////////////////////////////////////////////////////////////////////////
342
+
343
+ } // namespace threadblock
344
+ } // namespace epilogue
345
+ } // namespace cutlass
346
+
347
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+ #include "cutlass/cutlass.h"
41
+ #include CUDA_STD_HEADER(cassert)
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+ #include "cutlass/layout/vector.h"
45
+ #include "cutlass/layout/tensor.h"
46
+ #include "cutlass/tensor_coord.h"
47
+ #include "cutlass/aligned_buffer.h"
48
+ #include "cutlass/functional.h"
49
+
50
+ #include "cutlass/gemm/gemm.h"
51
+
52
+ #include "cutlass/transform/pitch_linear_thread_map.h"
53
+ #include "cutlass/transform/threadblock/regular_tile_iterator.h"
54
+
55
+ #include "cutlass/epilogue/threadblock/epilogue_base.h"
56
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
57
+ #include "cutlass/numeric_types.h"
58
+
59
+ namespace cutlass {
60
+ namespace epilogue {
61
+ namespace threadblock {
62
+
63
+ ////////////////////////////////////////////////////////////////////////////////
64
+
65
/// Epilogue operator
///
/// Writes out a reduction along the GEMM K dimension (for either operand A's rows
/// or operand B's columns, selected by ReduceKForA_). Each thread combines its
/// per-thread partial sums with those of the other lanes in its quad via warp
/// shuffles, optionally adds a previously stored partial result (serial split-K),
/// and stores the reduced vector to global memory with bounds guards.
template <
  typename ElementAccumulator_,
  typename ElementOutput_,
  typename ThreadBlockShape_,  ///< Shape of threadblock tile (concept: GemmShape)
  typename WarpMmaOperator_,   ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
  bool ReduceKForA_
>
class EpilogueGemmKReduction {

public:

  using ThreadBlockShape = ThreadBlockShape_;
  using WarpMmaOperator = WarpMmaOperator_;
  using WarpShape = typename WarpMmaOperator::Shape;
  using Layout = layout::RowMajor;
  using LongIndex = typename Layout::LongIndex;

  /// Accumulator element
  using ElementAccumulator = ElementAccumulator_;

  /// Output element
  using ElementOutput = ElementOutput_;

  /// Output access size
  static int const kElementsPerAccess = 1;

  static bool const kReduceKForA = ReduceKForA_;

  // Reduction extent follows the M dimension when reducing for A, else the N dimension.
  static int const kThreadBlockSize = kReduceKForA ? ThreadBlockShape::kM : ThreadBlockShape::kN;

  static int const kWarpSize = kReduceKForA ? WarpShape::kM : WarpShape::kN;

  static int const kIterations = kWarpSize / 8;

  using FragmentAccumulator = Array<ElementAccumulator, kIterations>;

private:

  int thread_offset_;      // this thread's starting index in the reduced output vector
  ElementOutput* pointer_; // global-memory pointer offset by thread_offset_
  int col_;                // lane position within its quad (lane_idx % 4)
public:

  /// Constructor
  CUTLASS_DEVICE
  EpilogueGemmKReduction(
      int thread_idx,           ///< ID of a thread within the threadblock
      int warp_idx,             ///< ID of warp within threadblock
      int lane_idx,             ///< Id of thread within warp
      int threadblock_offset,   ///< Threadblock position along the reduced dimension
      ElementOutput* pointer    ///< Base pointer of the reduction output vector
  )
  {
    col_ = lane_idx % 4;
    thread_offset_ = threadblock_offset * kThreadBlockSize
                       + warp_idx * kWarpSize
                       + lane_idx / 4 + col_ * 8;

    pointer_ = pointer + LongIndex(thread_offset_);
  }

  /// Streams the result to global memory
  CUTLASS_DEVICE
  void operator()(
      int size,  ///< Extent of the reduced dimension; bounds-checks the stores
      FragmentAccumulator &gemm_k_with_reduction_accumulation,
      bool LoadForSerialSplitK  ///< When true, accumulate onto the previously stored partial
  ) {
    // NOTE(review): arrays below are sized kIterations / 4 = kWarpSize / 32;
    // this assumes kWarpSize is a multiple of 32 -- TODO confirm.
    bool guard[kIterations / 4];

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kIterations / 4; ++i) {
      guard[i] = ((thread_offset_ + i * 32) < size);
    }

    // Previously stored partials (serial split-K); zero when not loaded so the
    // addition below is a no-op.
    Array<ElementOutput, kIterations / 4> source;
    source.clear();

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kIterations / 4; ++i) {
      ElementOutput *source_ptr = reinterpret_cast<ElementOutput *>(&source);
      cutlass::arch::global_load<ElementOutput, sizeof(ElementOutput)>(
            source_ptr[i],
            (void *)(pointer_ + i * 32),
            guard[i] && LoadForSerialSplitK);

    }

    FragmentAccumulator sum = gemm_k_with_reduction_accumulation;

    // Butterfly reduction across the 4 lanes of each quad (xor over lane bits 0 and 1)
    // so every lane holds the quad-wide sum.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kIterations; ++i) {
      sum[i] += __shfl_xor_sync(0xffffffff, sum[i], 1);
      sum[i] += __shfl_xor_sync(0xffffffff, sum[i], 2);
    }

    Array<ElementAccumulator, kIterations / 4> intermediate;

    // Each lane keeps only the partial belonging to its own quad column (col_),
    // de-interleaving the 4-way redundant sums.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kIterations / 4; ++i) {
      if (col_ == 0) {
        intermediate[i] = sum[0 + i * 4];
      }

      if (col_ == 1) {
        intermediate[i] = sum[1 + i * 4];
      }

      if (col_ == 2) {
        intermediate[i] = sum[2 + i * 4];
      }

      if (col_ == 3) {
        intermediate[i] = sum[3 + i * 4];
      }
    }

    // Accumulate the previously stored partial (zeros when not loaded).
    NumericArrayConverter<ElementAccumulator, ElementOutput, kIterations / 4> source_converter;
    Array<ElementAccumulator, kIterations / 4> converted_source = source_converter(source);

    plus<Array<ElementAccumulator, kIterations / 4>> plus_source;
    intermediate = plus_source(intermediate, converted_source);

    NumericArrayConverter<ElementOutput, ElementAccumulator, kIterations / 4> converter;
    Array<ElementOutput, kIterations / 4> result = converter(intermediate);

    // Guarded scalar stores, one element per 32-column stride.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kIterations / 4; ++i) {
      cutlass::arch::global_store<ElementOutput, sizeof(ElementOutput)>(result[i],
                 (void *)(pointer_ + i * 32), guard[i]);
    }
  }
};
199
+
200
+ ////////////////////////////////////////////////////////////////////////////////
201
+
202
+ } // namespace threadblock
203
+ } // namespace epilogue
204
+ } // namespace cutlass
205
+
206
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
33
+
34
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
35
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
36
+
37
+ */
38
+
39
+ #pragma once
40
+
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+ #include "cutlass/array_planar_complex.h"
45
+ #include "cutlass/layout/vector.h"
46
+ #include "cutlass/layout/tensor.h"
47
+ #include "cutlass/tensor_coord.h"
48
+ #include "cutlass/aligned_buffer.h"
49
+ #include "cutlass/functional.h"
50
+
51
+ #include "cutlass/gemm/gemm.h"
52
+
53
+ #include "cutlass/transform/pitch_linear_thread_map.h"
54
+ #include "cutlass/transform/threadblock/regular_tile_iterator.h"
55
+
56
+ #include "cutlass/epilogue/threadblock/epilogue_base.h"
57
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
58
+
59
+ ////////////////////////////////////////////////////////////////////////////////
60
+
61
+ namespace cutlass {
62
+ namespace epilogue {
63
+ namespace threadblock {
64
+
65
+ ////////////////////////////////////////////////////////////////////////////////
66
+
67
+ /// Epilogue operator for planar-complex output representations.
68
+ ///
69
+ /// Note, as with most CUTLASS components for planar complex, the template arguments describe
70
+ /// the underlying real data type.
71
template <
  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
  int PartitionsK,                          ///< Number of partitions of the K dimension
  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
  typename OutputOp_,                       ///< Output operator
  typename Padding_                         ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
>
class EpiloguePlanarComplex {
public:

  using Shape = Shape_;
  using WarpMmaOperator = WarpMmaOperator_;
  static int const kPartitionsK = PartitionsK;
  using OutputTileIterator = OutputTileIterator_;
  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
  using WarpTileIterator = WarpTileIterator_;
  using SharedLoadIterator = SharedLoadIterator_;
  using OutputOp = OutputOp_;
  using Padding = Padding_;

  /// Output layout is always row-major
  using Layout = layout::RowMajor;
  using LongIndex = typename Layout::LongIndex;

  /// The complete warp-level accumulator tile: a (real, imaginary) pair of fragments
  /// of the underlying real-valued MMA operator.
  using AccumulatorTile = ArrayPlanarComplex<
    typename WarpMmaOperator::FragmentC::Element,
    WarpMmaOperator::FragmentC::kElements
  >;

  /// Accumulator element
  using ElementAccumulator = typename WarpTileIterator::Element;

  /// Output element
  using ElementOutput = typename OutputTileIterator::Element;

  /// Output access size (elements per vectorized global-memory access)
  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;

  /// Tensor reference to destination tensor
  using TensorRef = typename OutputTileIterator::TensorRef;

  /// Tensor reference to sync tensor
  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;

  /// Const tensor reference to source tensor
  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;

  /// Array type used to output
  using OutputAccessType = Array<
    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;

  /// Array type used by output functor
  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;

  /// Shape of each warp-level operation
  using WarpShape = typename WarpMmaOperator::Shape;

  /// Number of warps per threadblock in each dimension
  using WarpCount = gemm::GemmShape<
    Shape::kM / WarpShape::kM,
    Shape::kN / WarpShape::kN,
    kPartitionsK
  >;

  /// Shared memory allocation
  struct SharedStorage {

    //
    // Type definitions
    //

    /// Element type of shared memory
    using Element = typename WarpTileIterator::Element;

    /// Tensor reference to shared memory allocation
    using TensorRef = typename WarpTileIterator::TensorRef;

    /// Layout of shared memory allocation
    using Layout = typename WarpTileIterator::Layout;

    /// Logical shape of the shared memory tile written to by all warps.
    using Shape = MatrixShape<
      WarpCount::kM * WarpTileIterator::Shape::kRow * WarpCount::kK,
      WarpCount::kN * WarpTileIterator::Shape::kColumn
    >;

    /// Shape of the shared memory allocation for the epilogue (logical shape plus padding)
    using StorageShape = MatrixShape<
      Shape::kRow + Padding::kRow,
      Shape::kColumn + Padding::kColumn
    >;

    /// Element offset between the real-part plane and the imaginary-part plane in SMEM
    static int const kImaginaryStride = StorageShape::kCount;

    //
    // Data members
    //

    // Two back-to-back planes: elements [0, kImaginaryStride) hold the real part,
    // [kImaginaryStride, 2 * kImaginaryStride) hold the imaginary part.
    AlignedBuffer<Element, kImaginaryStride * 2> storage;

    //
    // Methods
    //

    /// Returns a pointer to the shared memory buffer
    CUTLASS_DEVICE
    Element *data() {
      return storage.data();
    }

    /// Returns a tensor reference to the shared memory buffer (real-part plane;
    /// the imaginary plane is addressed via a kImaginaryStride pointer offset)
    CUTLASS_DEVICE
    TensorRef reference() {
      return TensorRef(
        storage.data(),
        Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
    }
  };

private:

  //
  // Data members
  //

  SharedStorage &shared_storage_;

  /// Loads fragment from shared memory aligned with output tensor
  SharedLoadIterator shared_load_iterator_;

  /// Stores a warp's fragment of accumulators to SMEM
  WarpTileIterator warp_tile_iterator_;

public:

  /// Constructor
  CUTLASS_DEVICE
  EpiloguePlanarComplex(
    SharedStorage &shared_storage,      ///< Shared storage object
    int thread_idx,                     ///< ID of a thread within the threadblock
    int warp_idx,                       ///< ID of warp within threadblock
    int lane_idx                        ///< Id of thread within warp
  ):
    shared_storage_(shared_storage),
    shared_load_iterator_(shared_storage.reference(), thread_idx),
    warp_tile_iterator_(shared_storage.reference(), lane_idx) {

    // Compute warp location within threadblock tile by mapping the warp_id to three coordinates:
    //
    //   warp_m: the warp's position within the threadblock along the M dimension
    //   warp_n: the warp's position within the threadblock along the N dimension
    //   warp_k: the warp's position within the threadblock along the K dimension
    //
    // warp_idx is decomposed as warp_idx = (warp_k * WarpCount::kN + warp_n) * WarpCount::kM + warp_m.

    int warp_k = warp_idx / (WarpCount::kM * WarpCount::kN);
    int warp_mn = warp_idx % (WarpCount::kM * WarpCount::kN);
    int warp_m = warp_mn % WarpCount::kM;
    int warp_n = warp_mn / WarpCount::kM;

    // K-partitions are stacked along the row dimension of the SMEM tile (see SharedStorage::Shape)
    MatrixCoord warp_offset{warp_k * WarpCount::kM + warp_m, warp_n};

    warp_tile_iterator_.add_tile_offset(warp_offset);
  }

  /// Streams the result to global memory
  CUTLASS_DEVICE
  void operator()(
    OutputOp const &output_op,                    ///< Output operator
    OutputTileIterator destination_iterator_real, ///< Tile iterator for destination (real part)
    OutputTileIterator destination_iterator_imag, ///< Tile iterator for destination (imaginary part)
    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
    OutputTileIterator source_iterator_real,      ///< Tile iterator for the source tensor (real part)
    OutputTileIterator source_iterator_imag) {    ///< Tile iterator for the source tensor (imaginary part)

    typename OutputTileIterator::Fragment source_fragment_real;
    typename OutputTileIterator::Fragment source_fragment_imag;

    // If the output operator ignores the source (e.g. beta == 0), disable the
    // source loads entirely so no global-memory reads are issued.
    if (!output_op.is_source_needed()) {
      source_iterator_real.clear_mask();
      source_iterator_imag.clear_mask();
    }

    source_fragment_real.clear();
    source_fragment_imag.clear();

    //
    // Iterator over warp-level accumulator fragment
    //

    AccumulatorFragmentIterator accum_fragment_iterator_real(accumulators.real);
    AccumulatorFragmentIterator accum_fragment_iterator_imag(accumulators.imag);

    //
    // Iterate over accumulator tile
    //

    CUTLASS_PRAGMA_UNROLL
    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {

      //
      // Load the source
      //

      source_iterator_real.load(source_fragment_real);
      source_iterator_imag.load(source_fragment_imag);

      ++source_iterator_real;
      ++source_iterator_imag;

      //
      // Convert and store fragment
      //

      // Barrier: ensure the SMEM tile from the previous iteration has been fully
      // consumed before any warp overwrites it.
      __syncthreads();

      typename AccumulatorFragmentIterator::Fragment accum_fragment_real;
      typename AccumulatorFragmentIterator::Fragment accum_fragment_imag;

      accum_fragment_iterator_real.load(accum_fragment_real);
      accum_fragment_iterator_imag.load(accum_fragment_imag);

      ++accum_fragment_iterator_real;
      ++accum_fragment_iterator_imag;

      // Real part goes to the first SMEM plane; imaginary part to the second plane
      // at kImaginaryStride elements' offset.
      this->warp_tile_iterator_.store(accum_fragment_real);
      this->warp_tile_iterator_.store_with_pointer_offset(accum_fragment_imag, SharedStorage::kImaginaryStride);

      // Barrier: all warps must finish writing SMEM before any thread reads it back.
      __syncthreads();

      //
      // Load fragments from shared memory
      //

      typename SharedLoadIterator::Fragment aligned_accum_fragment_real[kPartitionsK];
      typename SharedLoadIterator::Fragment aligned_accum_fragment_imag[kPartitionsK];

      shared_load_iterator_.load(aligned_accum_fragment_real[0]);
      shared_load_iterator_.load_with_pointer_offset(aligned_accum_fragment_imag[0], SharedStorage::kImaginaryStride);

      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
      static_assert(kPartitionsK == 1, "Sliced-K not supported for planar complex at this time");

      //
      // Compute the output result
      //

      typename OutputTileIterator::Fragment output_fragment_real;
      typename OutputTileIterator::Fragment output_fragment_imag;

      apply_output_operator_(
        output_fragment_real,
        output_fragment_imag,
        output_op,
        aligned_accum_fragment_real[0],
        aligned_accum_fragment_imag[0],
        source_fragment_real,
        source_fragment_imag);

      //
      // Store the final result
      //

      destination_iterator_real.store(output_fragment_real);
      destination_iterator_imag.store(output_fragment_imag);

      ++destination_iterator_real;
      ++destination_iterator_imag;
    }
  }

private:

  /// Helper to invoke the output functor over each vector of output
  CUTLASS_DEVICE
  void apply_output_operator_(
    typename OutputTileIterator::Fragment &output_fragment_real,
    typename OutputTileIterator::Fragment &output_fragment_imag,
    OutputOp const &output_op,                    ///< Output operator
    typename SharedLoadIterator::Fragment const &aligned_accum_fragment_real,
    typename SharedLoadIterator::Fragment const &aligned_accum_fragment_imag,
    typename OutputTileIterator::Fragment const &source_fragment_real,
    typename OutputTileIterator::Fragment const &source_fragment_imag) {

    // Reinterpret each fragment as an array of vectorized accesses so the output
    // operator is invoked once per access rather than once per element.

    OutputAccessType *output_frag_real_ptr =
      reinterpret_cast<OutputAccessType *>(&output_fragment_real);

    OutputAccessType *output_frag_imag_ptr =
      reinterpret_cast<OutputAccessType *>(&output_fragment_imag);

    AccumulatorAccessType const *compute_frag_real_ptr =
      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment_real);

    AccumulatorAccessType const *compute_frag_imag_ptr =
      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment_imag);

    OutputAccessType const *source_frag_real_ptr =
      reinterpret_cast<OutputAccessType const *>(&source_fragment_real);

    OutputAccessType const *source_frag_imag_ptr =
      reinterpret_cast<OutputAccessType const *>(&source_fragment_imag);

    int const kOutputOpIterations =
      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kOutputOpIterations; ++i) {

      // Call the output operator on a planar-complex pair of accesses
      auto result_fragment = output_op(
        make_ArrayPlanarComplex(compute_frag_real_ptr[i], compute_frag_imag_ptr[i]),
        make_ArrayPlanarComplex(source_frag_real_ptr[i], source_frag_imag_ptr[i])
      );

      output_frag_real_ptr[i] = result_fragment.real;
      output_frag_imag_ptr[i] = result_fragment.imag;
    }
  }

};
394
+
395
+ ////////////////////////////////////////////////////////////////////////////////
396
+
397
+ } // namespace threadblock
398
+ } // namespace epilogue
399
+ } // namespace cutlass
400
+
401
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue for threadblock scoped GEMM/CONV to store accumulator in shared memory after
33
+ applying scale, bias loaded from global memory and element-wise operations.
34
+
35
+ This Epilogue is typically used in fused GEMM/CONV to stage the intermediate accumulator.
36
+
37
+ */
38
+
39
+ #pragma once
40
+ #include "cutlass/cutlass.h"
41
+ #include CUDA_STD_HEADER(cassert)
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/array.h"
44
+ #include "cutlass/layout/vector.h"
45
+ #include "cutlass/layout/tensor.h"
46
+ #include "cutlass/tensor_coord.h"
47
+ #include "cutlass/aligned_buffer.h"
48
+ #include "cutlass/functional.h"
49
+
50
+ #include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
51
+ #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
52
+ ////////////////////////////////////////////////////////////////////////////////
53
+
54
+ namespace cutlass {
55
+ namespace epilogue {
56
+ namespace threadblock {
57
+
58
+ ////////////////////////////////////////////////////////////////////////////////
59
+
60
+ /// Epilogue operator
61
template <
  typename SmemTileIterator_,               ///< Shared memory Tile iterator to output to shared memory
  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
  typename ScaleBiasIterator_,              ///< Iterator to load scale and bias from global memory
  typename OutputOp_                        ///< Output operator
>
class EpilogueSmemAccumulator {

public:

  using SmemTileIterator = SmemTileIterator_;

  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;

  using ScaleBiasIterator = ScaleBiasIterator_;

  using OutputOp = OutputOp_;

  /// Fragment of accumulator tile
  using FragmentAccumulator = typename AccumulatorFragmentIterator::Fragment;

  /// The complete warp-level accumulator tile
  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;

  /// Fragment of Scale and Bias loaded from global memory
  using FragmentScaleBias = typename ScaleBiasIterator::Fragment;

  // True when the output op applies a distinct alpha scale per channel; in that
  // case a scale vector must be loaded alongside the bias vector.
  static const bool PerChannelScale = (OutputOp::kScale ==
      epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);

  /// Constructor
  CUTLASS_DEVICE
  EpilogueSmemAccumulator() {}

  /// Streams the result to shared memory, applying scale and bias vectors
  /// loaded from global memory via the output operator.
  CUTLASS_DEVICE
  void operator()(
    OutputOp const &output_op,          ///< Output operator
    SmemTileIterator smem_iterator,     ///< Tile iterator for destination in shared memory
    AccumulatorTile const &accumulator, ///< Complete warp-level accumulator tile
    ScaleBiasIterator scale_iterator,   ///< iterator for scale vector in global memory
    ScaleBiasIterator bias_iterator) {  ///< iterator for bias vector in global memory


    // Fragment to load scale bias from global memory
    FragmentScaleBias tb_frag_scale;
    FragmentScaleBias tb_frag_bias;

    /// Fragment Iterator to load slice of accumulator tile
    AccumulatorFragmentIterator frag_iterator_accum(accumulator);
    FragmentAccumulator tb_frag_accum;

    /// Epilogue output fragment
    typename SmemTileIterator::Fragment tb_frag_smem;

    /// Load scale and bias from global memory
    // The scale vector is only needed (and only loaded) for per-channel scaling.
    if(PerChannelScale)
      scale_iterator.load(tb_frag_scale);

    bias_iterator.load(tb_frag_bias);

    /// Iterate over the accumulator tile and store to shared memory
    CUTLASS_PRAGMA_UNROLL
    for (int rid = 0; rid < AccumulatorFragmentIterator::TileIterations::kRow; ++rid) {

      CUTLASS_PRAGMA_UNROLL
      for (int cid = 0; cid < AccumulatorFragmentIterator::TileIterations::kColumn; ++cid) {

        using AccumulatorAccessType = typename OutputOp::FragmentAccumulator;
        using ScaleBiasAccessType = typename OutputOp::FragmentScaleBias;
        using FragmentSmemAccessType = typename OutputOp::FragmentOutput;

        // Reinterpret the fragments as arrays of operator-sized accesses so the
        // output op is applied one access at a time.
        ScaleBiasAccessType const * scale_frag_ptr =
            reinterpret_cast<ScaleBiasAccessType const *>(&tb_frag_scale);
        ScaleBiasAccessType const * bias_frag_ptr =
            reinterpret_cast<ScaleBiasAccessType const *>(&tb_frag_bias);

        FragmentSmemAccessType * smem_frag_ptr =
          reinterpret_cast<FragmentSmemAccessType *>(&tb_frag_smem);

        CUTLASS_PRAGMA_UNROLL
        for (int idx = 0; idx < AccumulatorFragmentIterator::kIterationsPerTile; ++idx) {
          frag_iterator_accum.load(tb_frag_accum);
          ++frag_iterator_accum;

          AccumulatorAccessType const * accumulator_frag_ptr =
              reinterpret_cast<AccumulatorAccessType const *>(&tb_frag_accum);
          const int kOutputIterations = FragmentAccumulator::kElements / OutputOp::kCount;

          // Scale/bias are indexed by the tile's column position (cid); the same
          // per-column values apply to every row of the tile.
          CUTLASS_PRAGMA_UNROLL
          for (int it = 0; it < kOutputIterations; it++) {
            smem_frag_ptr[idx * kOutputIterations + it] = output_op(accumulator_frag_ptr[it],
                scale_frag_ptr[cid * kOutputIterations + it], bias_frag_ptr[cid * kOutputIterations + it]);
          }
        }

        smem_iterator.store(tb_frag_smem);
        ++smem_iterator;

      }
    }
  }

  /// Streams the result to shared memory without scale/bias vectors; the output
  /// operator is applied to the accumulator fragments alone.
  CUTLASS_DEVICE
  void operator()(
    OutputOp const &output_op,            ///< Output operator
    SmemTileIterator smem_iterator,       ///< Tile iterator for destination in shared memory
    AccumulatorTile const &accumulator) { ///< Complete warp-level accumulator tile

    /// Fragment Iterator to load slice of accumulator tile
    AccumulatorFragmentIterator frag_iterator_accum(accumulator);
    FragmentAccumulator tb_frag_accum;

    /// Epilogue output fragment
    typename SmemTileIterator::Fragment tb_frag_smem;

    /// Iterate over the accumulator tile and store to shared memory
    CUTLASS_PRAGMA_UNROLL
    for (int rid = 0; rid < AccumulatorFragmentIterator::TileIterations::kRow; ++rid) {

      CUTLASS_PRAGMA_UNROLL
      for (int cid = 0; cid < AccumulatorFragmentIterator::TileIterations::kColumn; ++cid) {

        using AccumulatorAccessType = typename OutputOp::FragmentAccumulator;
        using FragmentSmemAccessType = typename OutputOp::FragmentOutput;

        FragmentSmemAccessType * smem_frag_ptr =
          reinterpret_cast<FragmentSmemAccessType *>(&tb_frag_smem);

        CUTLASS_PRAGMA_UNROLL
        for (int idx = 0; idx < AccumulatorFragmentIterator::kIterationsPerTile; ++idx) {
          frag_iterator_accum.load(tb_frag_accum);
          ++frag_iterator_accum;

          AccumulatorAccessType const * accumulator_frag_ptr =
              reinterpret_cast<AccumulatorAccessType const *>(&tb_frag_accum);
          const int kOutputIterations = FragmentAccumulator::kElements / OutputOp::kCount;

          CUTLASS_PRAGMA_UNROLL
          for (int it = 0; it < kOutputIterations; it++) {
            smem_frag_ptr[idx * kOutputIterations + it] = output_op(accumulator_frag_ptr[it]);
          }
        }

        smem_iterator.store(tb_frag_smem);
        ++smem_iterator;

      }
    }
  }

};
216
+
217
+ ////////////////////////////////////////////////////////////////////////////////
218
+
219
+ } // namespace threadblock
220
+ } // namespace epilogue
221
+ } // namespace cutlass
222
+
223
+ ////////////////////////////////////////////////////////////////////////////////
224
+
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+
33
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
34
+
35
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
36
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
37
+
38
+ */
39
+
40
+ #pragma once
41
+ #include "cutlass/cutlass.h"
42
+
43
+ #include CUDA_STD_HEADER(cassert)
44
+
45
+ #if defined(__CUDACC_RTC__)
46
+ #include CUDA_STD_HEADER(utility)
47
+ #else
48
+ #include <utility>
49
+ #endif
50
+
51
+ #include "cutlass/array.h"
52
+ #include "cutlass/numeric_types.h"
53
+ #include "cutlass/numeric_conversion.h"
54
+ #include "cutlass/tensor_coord.h"
55
+ #include "cutlass/aligned_buffer.h"
56
+ #include "cutlass/functional.h"
57
+ #include "cutlass/fast_math.h"
58
+ #include "cutlass/layout/vector.h"
59
+ #include "cutlass/layout/tensor.h"
60
+
61
+ #include "cutlass/gemm/gemm.h"
62
+
63
+ #include "cutlass/transform/pitch_linear_thread_map.h"
64
+ #include "cutlass/transform/threadblock/regular_tile_iterator.h"
65
+
66
+ #include "cutlass/epilogue/threadblock/epilogue_base.h"
67
+ #include "cutlass/epilogue/threadblock/epilogue_base_streamk.h"
68
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
69
+
70
+ #include "cutlass/numeric_types.h"
71
+
72
+ /////////////////////////////////////////////////////////////////////////////////////////////////
73
+
74
+ namespace cutlass {
75
+ namespace epilogue {
76
+ namespace threadblock {
77
+
78
+ /////////////////////////////////////////////////////////////////////////////////////////////////
79
+
80
+ /// This base class is meant to define the concept required of the
81
+ /// EpilogueStreamkWithBroadcast::OutputOp
82
template <
  typename ElementC_,             ///< Element type of the C (source) operand
  typename ElementAccumulator_,   ///< Element type of the accumulator
  typename ElementCompute_,       ///< Element type used for internal computation
  typename ElementZ_,             ///< Element type of the Z output tensor
  typename ElementT_,             ///< Element type of the auxiliary T output tensor
  int ElementsPerAccess,          ///< Number of elements per vectorized access
  bool StoreZ = true,             ///< Whether the Z tensor is written out
  bool StoreT = true              ///< Whether the T tensor is written out
>
struct EpilogueStreamkWithBroadcastOpBase : EpilogueWithBroadcastOpBase<
  ElementC_,
  ElementAccumulator_,
  ElementCompute_,
  ElementZ_,
  ElementT_,
  ElementsPerAccess,
  StoreZ,
  StoreT
>
{

  /// Parameters structure - required
  struct Params { };

  //
  // Methods
  //

  /// Constructor from Params
  // Params carries no state; the constructor exists only to satisfy the
  // OutputOp concept required by EpilogueStreamkWithBroadcast.
  EpilogueStreamkWithBroadcastOpBase(Params const &params_) { }
};
114
+
115
+ ////////////////////////////////////////////////////////////////////////////////
116
+
117
/// Epilogue operator with bias vector broadcast over columns.
///
/// Computes the following:
///
///
///  Z, T = OutputOp(AB, C, Broadcast)
///
///  if (ElementwiseOp::kStoreZ) {
///    store(converted_u);
///  }
///
///  if (ElementwiseOp::kStoreT) {
///    store(v);
///  }
///
/// Primary template declaration only: the two partial specializations below
/// select between the two-source and single-source variants based on
/// OutputOp_::kIsSingleSource.
template <
  typename Shape_,                        ///< Shape of threadblock tile (concept: GemmShape)
  typename WarpMmaOperator_,              ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
  int PartitionsK,                        ///< Number of partitions of the K dimension
  typename OutputTileIterator_,           ///< Tile iterator reading and writing output tensors (z)
  typename TensorTileIterator_,           ///< Additional tile iterator for tensor-valued operands (t)
  typename ElementVector_,                ///< Pointer to broadcast vector
  typename AccumulatorFragmentIterator_,  ///< Fragment iterator selecting accumulators
  typename WarpTileIterator_,             ///< Warp-scoped tile iterator writing accumulators to SMEM
  typename SharedLoadIterator_,           ///< Threadblock-scoped tile iterator loading from SMEM
  typename OutputOp_,                     ///< Output operator - concept is EpilogueWithBroadcastOp
  typename Padding_,                      ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
  int FragmentsPerPartition = 1,          ///< Used to coarsten the epilogue granularity
  int IterationsUnroll =                  ///< Used to reduce binary size when epilogue op is large
    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
  bool IsSingleSource = OutputOp_::kIsSingleSource  ///< Selects the partial specialization below
>
class EpilogueStreamkWithBroadcast;
150
+
151
+
152
+ /////////////////////////////////////////////////////////////////////////////////////////////////
153
+
154
/// EpilogueStreamkWithBroadcast: Two sources
///
/// Stream-K variant of the broadcast epilogue for output operators that
/// consume two source matrices. Combines EpilogueWithBroadcast (the epilogue
/// computation itself) with EpilogueBaseStreamK (reduction of peer
/// threadblocks' partial accumulators from the global workspace).
template <
  typename Shape_,
  typename WarpMmaOperator_,
  int PartitionsK,
  typename OutputTileIterator_,
  typename TensorTileIterator_,
  typename ElementVector_,
  typename AccumulatorFragmentIterator_,
  typename WarpTileIterator_,
  typename SharedLoadIterator_,
  typename OutputOp_,
  typename Padding_,
  int FragmentsPerPartition,
  int IterationsUnroll
>
class EpilogueStreamkWithBroadcast<
  Shape_,
  WarpMmaOperator_,
  PartitionsK,
  OutputTileIterator_,
  TensorTileIterator_,
  ElementVector_,
  AccumulatorFragmentIterator_,
  WarpTileIterator_,
  SharedLoadIterator_,
  OutputOp_,
  Padding_,
  FragmentsPerPartition,
  IterationsUnroll,
  false
> :
  public EpilogueWithBroadcast<
    Shape_,
    WarpMmaOperator_,
    PartitionsK,
    OutputTileIterator_,
    TensorTileIterator_,
    ElementVector_,
    AccumulatorFragmentIterator_,
    WarpTileIterator_,
    SharedLoadIterator_,
    OutputOp_,
    Padding_,
    FragmentsPerPartition,
    IterationsUnroll,
    false>,
  public EpilogueBaseStreamK<
    Shape_,
    PartitionsK,
    WarpMmaOperator_,
    AccumulatorFragmentIterator_>
{

public:

  /// The underlying (non-stream-K) broadcast epilogue
  using Base = EpilogueWithBroadcast<
    Shape_,
    WarpMmaOperator_,
    PartitionsK,
    OutputTileIterator_,
    TensorTileIterator_,
    ElementVector_,
    AccumulatorFragmentIterator_,
    WarpTileIterator_,
    SharedLoadIterator_,
    OutputOp_,
    Padding_,
    FragmentsPerPartition,
    IterationsUnroll,
    false>;

  /// Stream-K base providing cross-threadblock accumulator reduction
  using BaseStreamK = EpilogueBaseStreamK<
    Shape_,
    PartitionsK,
    WarpMmaOperator_,
    AccumulatorFragmentIterator_>;

  using Shape = Shape_;
  static int const kPartitionsK = PartitionsK;
  using OutputTileIterator = OutputTileIterator_;
  using TensorTileIterator = TensorTileIterator_;
  using ElementVector = ElementVector_;
  using SharedLoadIterator = SharedLoadIterator_;
  using OutputOp = OutputOp_;

  /// Fragment type used by the accumulator tile's fragment iterator
  using AccumulatorFragment = typename Base::AccumulatorFragmentIterator::Fragment;

  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
  using SharedStorage = typename Base::SharedStorage;

public:

  /// Constructor
  CUTLASS_DEVICE
  EpilogueStreamkWithBroadcast(
    SharedStorage &shared_storage,  ///< Shared storage object
    int thread_idx,                 ///< ID of a thread within the threadblock
    int warp_idx,                   ///< ID of warp within threadblock
    int lane_idx                    ///< Id of thread within warp
  ):
    Base(shared_storage, thread_idx, warp_idx, lane_idx),
    BaseStreamK(thread_idx)
  { }


  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
  /// performing epilogue computations, writing to output
  CUTLASS_DEVICE
  void reduce(
    int peer_idx_begin,                       ///< First peer block index contributing to this tile
    int peer_idx_end,                         ///< One past the last contributing peer block index
    int reduce_fragment_idx,                  ///< Index of the fragment to reduce
    void *element_workspace,                  ///< Global workspace holding peer partial accumulators
    OutputOp const &output_op,                ///< Output operator
    ElementVector const * broadcast_ptr,      ///< Broadcast vector
    OutputTileIterator destination_iterator,  ///< Tile iterator for destination
    OutputTileIterator source_iterator1,      ///< Tile iterator for first source accumulator matrix
    OutputTileIterator source_iterator2,      ///< Tile iterator for second source accumulator matrix
    TensorTileIterator tensor_iterator,       ///< Threadblock tile iterator for additional tensor operand
    MatrixCoord const &problem_size =         ///< Problem size needed to guard against out-of-bounds accesses
        MatrixCoord(Shape::kM, Shape::kN),
    MatrixCoord const &threadblock_offset =   ///< Threadblock's initial offset within the problem size space
        MatrixCoord())
  {
    // Reduce peer accumulator fragments into one fragment
    AccumulatorFragment accum_fragment;
    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);

    // Store fragment to shared memory
    this->warp_tile_iterator_.store(accum_fragment);

    // Ensure all warps have committed their fragments to SMEM before the
    // base epilogue reads them back
    __syncthreads();

    // Delegate the actual epilogue computation and output write to the base
    Base::reduce(reduce_fragment_idx, output_op, broadcast_ptr, destination_iterator, source_iterator1, source_iterator2, tensor_iterator, problem_size, threadblock_offset);

  }
};
294
+
295
+ /////////////////////////////////////////////////////////////////////////////////////////////////
296
+
297
/// EpilogueStreamkWithBroadcast: Single source
///
/// Stream-K variant of the broadcast epilogue for output operators that
/// consume a single source matrix. Combines EpilogueWithBroadcast (the
/// epilogue computation itself) with EpilogueBaseStreamK (reduction of peer
/// threadblocks' partial accumulators from the global workspace).
template <
  typename Shape_,
  typename WarpMmaOperator_,
  int PartitionsK,
  typename OutputTileIterator_,
  typename TensorTileIterator_,
  typename ElementVector_,
  typename AccumulatorFragmentIterator_,
  typename WarpTileIterator_,
  typename SharedLoadIterator_,
  typename OutputOp_,
  typename Padding_,
  int FragmentsPerPartition,
  int IterationsUnroll
>
class EpilogueStreamkWithBroadcast<
  Shape_,
  WarpMmaOperator_,
  PartitionsK,
  OutputTileIterator_,
  TensorTileIterator_,
  ElementVector_,
  AccumulatorFragmentIterator_,
  WarpTileIterator_,
  SharedLoadIterator_,
  OutputOp_,
  Padding_,
  FragmentsPerPartition,
  IterationsUnroll,
  true
> :
  public EpilogueWithBroadcast<
    Shape_,
    WarpMmaOperator_,
    PartitionsK,
    OutputTileIterator_,
    TensorTileIterator_,
    ElementVector_,
    AccumulatorFragmentIterator_,
    WarpTileIterator_,
    SharedLoadIterator_,
    OutputOp_,
    Padding_,
    FragmentsPerPartition,
    IterationsUnroll,
    true>,
  public EpilogueBaseStreamK<
    Shape_,
    PartitionsK,
    WarpMmaOperator_,
    AccumulatorFragmentIterator_>
{

public:

  /// The underlying (non-stream-K) broadcast epilogue
  using Base = EpilogueWithBroadcast<
    Shape_,
    WarpMmaOperator_,
    PartitionsK,
    OutputTileIterator_,
    TensorTileIterator_,
    ElementVector_,
    AccumulatorFragmentIterator_,
    WarpTileIterator_,
    SharedLoadIterator_,
    OutputOp_,
    Padding_,
    FragmentsPerPartition,
    IterationsUnroll,
    true>;

  /// Stream-K base providing cross-threadblock accumulator reduction
  using BaseStreamK = EpilogueBaseStreamK<
    Shape_,
    PartitionsK,
    WarpMmaOperator_,
    AccumulatorFragmentIterator_>;

  using Shape = Shape_;
  static int const kPartitionsK = PartitionsK;
  using OutputTileIterator = OutputTileIterator_;
  using TensorTileIterator = TensorTileIterator_;
  using ElementVector = ElementVector_;
  using SharedLoadIterator = SharedLoadIterator_;
  using OutputOp = OutputOp_;

  /// Fragment type used by the accumulator tile's fragment iterator
  using AccumulatorFragment = typename Base::AccumulatorFragmentIterator::Fragment;

  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
  using SharedStorage = typename Base::SharedStorage;

public:

  /// Constructor
  CUTLASS_DEVICE
  EpilogueStreamkWithBroadcast(
    SharedStorage &shared_storage,  ///< Shared storage object
    int thread_idx,                 ///< ID of a thread within the threadblock
    int warp_idx,                   ///< ID of warp within threadblock
    int lane_idx                    ///< Id of thread within warp
  ):
    Base(shared_storage, thread_idx, warp_idx, lane_idx),
    BaseStreamK(thread_idx)
  { }


  /// Aggregates the accumulator sets shared by peer blocks in the global workspace,
  /// performing epilogue computations, writing to output
  CUTLASS_DEVICE
  void reduce(
    int peer_idx_begin,                       ///< First peer block index contributing to this tile
    int peer_idx_end,                         ///< One past the last contributing peer block index
    int reduce_fragment_idx,                  ///< Index of the fragment to reduce
    void *element_workspace,                  ///< Global workspace holding peer partial accumulators
    OutputOp const &output_op,                ///< Output operator
    ElementVector const * broadcast_ptr,      ///< Broadcast vector
    OutputTileIterator destination_iterator,  ///< Tile iterator for destination
    OutputTileIterator source_iterator,       ///< Tile iterator for the source accumulator matrix
    TensorTileIterator tensor_iterator,       ///< Threadblock tile iterator for additional tensor operand
    MatrixCoord const &problem_size =         ///< Problem size needed to guard against out-of-bounds accesses
        MatrixCoord(Shape::kM, Shape::kN),
    MatrixCoord const &threadblock_offset =   ///< Threadblock's initial offset within the problem size space
        MatrixCoord())
  {
    // Reduce peer accumulator fragments into one fragment
    AccumulatorFragment accum_fragment;
    BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);

    // Store fragment to shared memory
    this->warp_tile_iterator_.store(accum_fragment);

    // Ensure all warps have committed their fragments to SMEM before the
    // base epilogue reads them back
    __syncthreads();

    // Delegate the actual epilogue computation and output write to the base
    Base::reduce(reduce_fragment_idx, output_op, broadcast_ptr, destination_iterator, source_iterator, tensor_iterator, problem_size, threadblock_offset);

  }
};
436
+
437
+ ////////////////////////////////////////////////////////////////////////////////
438
+
439
+ } // namespace threadblock
440
+ } // namespace epilogue
441
+ } // namespace cutlass
442
+
443
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Epilogue visitor for threadblock scoped GEMMs that process softmax computations in epilogue.
33
+
34
+ The epilogue finds max values in each row of the row-major output matrix and stores them.
35
+ The max values are also used for a further round of threadblock scoped reduction operation, where
36
+ the partial reduction results are stored in a pre-allocated array and used for further full reduction.
37
+
38
+ */
39
+
40
+ #pragma once
41
+
42
+ /////////////////////////////////////////////////////////////////////////////////////////////////
43
+
44
+ #include "cutlass/cutlass.h"
45
+ #include "cutlass/arch/memory.h"
46
+ #include "cutlass/arch/memory_sm75.h"
47
+ #include "cutlass/numeric_conversion.h"
48
+ #include "cutlass/fast_math.h"
49
+
50
+ namespace cutlass {
51
+ namespace epilogue {
52
+ namespace threadblock {
53
+
54
/// Epilogue visitor for threadblock-scoped GEMMs that performs softmax
/// bookkeeping while the linear-combination result is written out.
///
/// For each output row it maintains a running maximum and a running sum of
/// exp(x - max) using the online-softmax update (see $3.1 of
/// https://arxiv.org/pdf/2205.14135v1.pdf). Per-row partials are combined
/// across the threads covering a row via warp shuffles and stored to the
/// Max / Sum workspaces; a later pass presumably uses them for the full
/// normalization (not visible here).
template <
  typename ThreadblockShape_,       ///< Threadblock tile shape (concept: GemmShape)
  int ThreadCount,                  ///< Number of participating threads
  typename OutputTileIterator_,     ///< Iterator reading source (C) and writing output (D) tiles
  typename ElementAccumulator_,     ///< Accumulator element type
  typename ElementNorm_,            ///< Element type of the stored row maxima
  typename ElementSum_,             ///< Element type of the stored row sums
  typename ElementSoftmaxCompute_,  ///< Element type used for softmax arithmetic
  typename ElementwiseFunctor_,     ///< Elementwise functor applied to accumulators (e.g. linear combination)
  bool UseMasking_ = false          ///< If true, columns beyond extent_real_ are masked to -infinity_
>
class EpilogueVisitorSoftmax {
public:

  using ThreadblockShape = ThreadblockShape_;
  static int const kThreadCount = ThreadCount;

  using OutputTileIterator = OutputTileIterator_;
  using ElementwiseFunctor = ElementwiseFunctor_;

  static int const kIterations = OutputTileIterator::kIterations;
  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;

  using ElementOutput = typename OutputTileIterator::Element;
  using LayoutOutput = cutlass::layout::RowMajor;
  using ElementAccumulator = ElementAccumulator_;

  using ElementNorm = ElementNorm_;
  using ElementSum = ElementSum_;
  using ElementSoftmaxCompute = ElementSoftmaxCompute_;

  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
  using SoftmaxFragment = Array<ElementSoftmaxCompute, kElementsPerAccess>;
  using OutputVector = Array<ElementOutput, kElementsPerAccess>;
  using TensorRefD = TensorRef<ElementOutput, LayoutOutput>;

  /// Number of adjacent threads that cooperate on one output row (the warp
  /// shuffle reductions below operate over this many lanes)
  static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth;

  /// True when a thread visits more than one column fragment per row, which
  /// requires the online running-sum update in visit()
  static bool const kHasMultiStepsInRow = (OutputTileIterator::ThreadMap::Iterations::kColumn > 1);
  static bool const kUseMasking = UseMasking_;

  /// Argument structure
  struct Arguments {

    typename ElementwiseFunctor::Params elementwise;  ///< Parameters for the elementwise functor
    int64_t batch_stride_C;                           ///< Stride between batches of the source matrix
    int64_t batch_stride_D;                           ///< Stride between batches of the output matrix
    int64_t batch_stride_Max;                         ///< Stride between batches of the row-max workspace
    int64_t batch_stride_Sum;                         ///< Stride between batches of the row-sum workspace

    //
    // Methods
    //

    /// Default constructor: zero batch strides
    Arguments():
      batch_stride_C(0),
      batch_stride_D(0),
      batch_stride_Max(0),
      batch_stride_Sum(0)
    {

    }

    /// Non-batched construction: elementwise parameters only
    Arguments(
      typename ElementwiseFunctor::Params elementwise_
    ):
      elementwise(elementwise_),
      batch_stride_C(0),
      batch_stride_D(0),
      batch_stride_Max(0),
      batch_stride_Sum(0)
    {

    }

    /// Batched construction with explicit strides
    Arguments(
      typename ElementwiseFunctor::Params elementwise_,
      int64_t batch_stride_C_,
      int64_t batch_stride_D_,
      int64_t batch_stride_Max_,
      int64_t batch_stride_Sum_
    ):
      elementwise(elementwise_),
      batch_stride_C(batch_stride_C_),
      batch_stride_D(batch_stride_D_),
      batch_stride_Max(batch_stride_Max_),
      batch_stride_Sum(batch_stride_Sum_)
    {

    }

  };

  /// Kernel-side parameters, constructed from Arguments
  struct Params {

    typename ElementwiseFunctor::Params elementwise;
    int64_t batch_stride_C;
    int64_t batch_stride_D;
    int64_t batch_stride_Max;
    int64_t batch_stride_Sum;
    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Params()
    {

    }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      elementwise(args.elementwise),
      batch_stride_C(args.batch_stride_C),
      batch_stride_D(args.batch_stride_D),
      batch_stride_Max(args.batch_stride_Max),
      batch_stride_Sum(args.batch_stride_Sum)
    {

    }
  };

  /// Shared storage (this visitor needs none of its own)
  struct SharedStorage {

  };

private:

  Params const & params_;           // Kernel parameters (strides, functor params)
  SharedStorage & shared_storage_;  // Held by reference; unused by this visitor
  MatrixCoord extent_;              // Logical problem extent used for guards
  MatrixCoord extent_real_;         // Real extent used only when kUseMasking is set
  ElementwiseFunctor elementwise_;  // Elementwise functor instance

  OutputTileIterator iterator_C_;   // Source (C) tile iterator
  OutputTileIterator iterator_D_;   // Output (D) tile iterator
  typename OutputTileIterator::Fragment fragment_C_;
  typename OutputTileIterator::Fragment fragment_D_;

  ElementAccumulator alpha_;        // Resolved scalar alpha (pointer dereferenced if provided)
  ElementAccumulator beta_;         // Resolved scalar beta (pointer dereferenced if provided)

  ElementNorm *ptr_Max_;            // Per-row maxima workspace (written in end_row)
  ElementSum *ptr_Sum_;             // Per-row sums workspace (written in end_row)

  int column_offset_;               // Offset added to the row index when addressing Max/Sum

  ElementSoftmaxCompute accum_max_; // Running row maximum
  ElementSoftmaxCompute accum_sum_; // Running row sum of exp(x - max)

  MatrixCoord thread_offset_;       // This thread's most recent output coordinate

  float infinity_;                  // Magnitude used as "-infinity" for masked elements

public:

  /// Constructor
  ///
  /// Resolves alpha/beta from pointers if provided and disables source loads
  /// when beta is zero.
  /// NOTE(review): members are initialized in declaration order, so
  /// extent_real_ is initialized before elementwise_ despite its position in
  /// this initializer list.
  CUTLASS_DEVICE
  EpilogueVisitorSoftmax(
    Params const &params,
    SharedStorage &shared_storage,
    cutlass::MatrixCoord const &problem_size,
    int thread_idx,
    int warp_idx,
    int lane_idx,
    typename OutputTileIterator::Params params_C,
    typename OutputTileIterator::Params params_D,
    typename OutputTileIterator::Element *ptr_C,
    typename OutputTileIterator::Element *ptr_D,
    ElementNorm *ptr_Max = nullptr,
    ElementSum *ptr_Sum = nullptr,
    cutlass::MatrixCoord const &threadblock_offset = cutlass::MatrixCoord(0, 0),
    int column_offset = 0,
    cutlass::MatrixCoord const &problem_size_real = cutlass::MatrixCoord(0, 0),
    float infinity = 10000.0f
  ):
    params_(params),
    shared_storage_(shared_storage),
    extent_(problem_size),
    elementwise_(params.elementwise),
    iterator_C_(params_C, ptr_C, problem_size, thread_idx, threadblock_offset),
    iterator_D_(params_D, ptr_D, problem_size, thread_idx, threadblock_offset),
    ptr_Max_(ptr_Max),
    ptr_Sum_(ptr_Sum),
    column_offset_(column_offset),
    extent_real_(problem_size_real),
    infinity_(infinity)
  {
    alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha);
    beta_ = (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : params.elementwise.beta);

    // beta == 0 means the source is never read; mask out C loads entirely
    if (beta_ == ElementAccumulator()) {
      iterator_C_.clear_mask();
    }
  }

  /// Helper to indicate split-K behavior (no-op for this visitor)
  CUTLASS_DEVICE
  void set_k_partition(
    int split_k_index,    ///< Index of this threadblock within split-K partitioned scheme
    int split_k_slices) { ///< Total number of split-K slices

  }

  /// Called to set the batch index; advances C/D iterators by the batch strides
  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    iterator_C_.add_pointer_offset(batch_idx * params_.batch_stride_C);
    iterator_D_.add_pointer_offset(batch_idx * params_.batch_stride_D);
  }

  /// Called at the start of the epilogue just before iterating over accumulator slices
  CUTLASS_DEVICE
  void begin_epilogue() {

  }

  /// Called at the start of one step before starting accumulator exchange.
  /// Loads the source fragment unless the functor only applies alpha scaling.
  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    fragment_D_.clear();
    fragment_C_.clear();

    if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
      iterator_C_.load(fragment_C_);
      ++iterator_C_;
    }

  }

  /// Called at the start of a row
  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    // Clear accumulators for max and sum when starting a whole row
    clear_accum_();

  }

  /// Called after accumulators have been exchanged for each accumulator vector.
  ///
  /// Applies the elementwise functor, then folds the fragment into the running
  /// row max/sum using the online-softmax recurrence. Note that the value
  /// written to D is the (converted) elementwise result itself, not the
  /// exponentiated value — the stored max/sum enable normalization later.
  CUTLASS_DEVICE
  void visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorFragment const &accum) {

    using Mul = cutlass::multiplies<SoftmaxFragment>;  // declared but unused below
    using Minus = cutlass::minus<SoftmaxFragment>;
    using Exp = cutlass::fast_exp_op<SoftmaxFragment>;

    Minus minus;
    Exp exponential;

    SoftmaxFragment result;

    NumericArrayConverter<ElementSoftmaxCompute, ElementOutput, kElementsPerAccess> source_converter;
    OutputVector &source_vector = reinterpret_cast<OutputVector *>(&fragment_C_)[frag_idx];

    // Apply the elementwise functor, with or without the source operand
    if (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
      result = source_converter(elementwise_(accum));
    } else {
      result = source_converter(elementwise_(accum, source_vector));
    }

    // Coordinate of this fragment in the output matrix
    thread_offset_ =
      iterator_D_.thread_start() +
      OutputTileIterator::ThreadMap::iteration_offset(frag_idx);

    bool column_guard = (thread_offset_.column() < extent_.column());

    if (kUseMasking) {
      // Replace out-of-bounds elements with -infinity_ so they cannot
      // influence the row maximum or sum
      int elements_in_boundary = extent_real_.column() - thread_offset_.column();
      elements_in_boundary = (elements_in_boundary > kElementsPerAccess) ? kElementsPerAccess : elements_in_boundary;
      elementwise_padding_(result, elements_in_boundary);
    }

    ElementSoftmaxCompute accum_max_prev = accum_max_;

    // Compute the maximum within one row
    if (!column_idx) {
      // This is the first fragment in a new row
      if (column_guard) {
        accum_max_ = maximum_accumulator_(result);
      }
    }
    else {
      // This is an additional fragment in the same row
      if (column_guard) {
        accum_max_ = maximum_accumulator_(result, accum_max_);
      }
    }

    // proactively compute max in warps
    accum_max_ = warp_reduce_max_(accum_max_);

    // Rescale factor for the previously accumulated sum: exp(old_max - new_max)
    ElementSoftmaxCompute updater = fast_exp(accum_max_prev - accum_max_);

    SoftmaxFragment intermediate = exponential(minus(result, accum_max_));

    if (kHasMultiStepsInRow) {
      if (!column_idx) {
        accum_sum_ = (column_guard) ? \
          sum_accumulator_(intermediate) : ElementSoftmaxCompute(0);
      } else {
        // Algorithm in $3.1, https://arxiv.org/pdf/2205.14135v1.pdf
        // S* = S* x updater + sum_row(P'), where updater = exp(M* - M_row)
        accum_sum_ = (column_guard) ? \
          sum_accumulator_(intermediate, accum_sum_ * updater) : accum_sum_ * updater;
      }
    } else {
      accum_sum_ = (column_guard) ? sum_accumulator_(intermediate, accum_sum_) : ElementSoftmaxCompute(0);
    }

    // Convert to the output
    NumericArrayConverter<ElementOutput, ElementSoftmaxCompute, kElementsPerAccess> output_converter;
    OutputVector &output = reinterpret_cast<OutputVector *>(&fragment_D_)[frag_idx];
    output = output_converter(result);
  }

  /// Called at the end of a row.
  ///
  /// Finishes the warp-level sum reduction and has the first thread of each
  /// row-group store the row max and row sum to the workspaces.
  /// NOTE(review): curr_ptr_max/curr_ptr_sum are computed unconditionally;
  /// this assumes ptr_Max_/ptr_Sum_ are valid whenever end_row runs (the
  /// stores themselves are predicated on is_write_thread).
  CUTLASS_DEVICE
  void end_row(int row_idx) {

    using ConvertSumOutput = cutlass::NumericConverter<ElementSum, ElementSoftmaxCompute>;
    using ConvertNormOutput = cutlass::NumericConverter<ElementNorm, ElementSoftmaxCompute>;

    ConvertSumOutput convert_sum_output;
    ConvertNormOutput convert_norm_output;

    // Compute accumulate sum only in the last step
    accum_sum_ = warp_reduce_sum_(accum_sum_);

    bool is_first_thread_in_tile = ((threadIdx.x % kThreadsPerRow) == 0);
    bool row_guard = thread_offset_.row() < extent_.row();
    bool is_write_thread = row_guard && is_first_thread_in_tile;

    int block_batch = blockIdx.z;

    ElementNorm *curr_ptr_max = ptr_Max_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Max;
    ElementSum *curr_ptr_sum = ptr_Sum_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Sum;

    // Predicated stores: only the designated thread of each valid row writes
    arch::global_store<ElementNorm, sizeof(ElementNorm)>(
              convert_norm_output(accum_max_),
              (void *)curr_ptr_max,
              is_write_thread);

    arch::global_store<ElementSum, sizeof(ElementSum)>(
              convert_sum_output(accum_sum_),
              (void *)curr_ptr_sum,
              is_write_thread);

    // Clear accumulators for max and sum when finishing a whole row
    clear_accum_();

  }

  /// Called after all accumulator elements have been visited; writes D
  CUTLASS_DEVICE
  void end_step(int step_idx) {

    iterator_D_.store(fragment_D_);
    ++iterator_D_;
  }

  /// Called after all steps have been completed
  CUTLASS_DEVICE
  void end_epilogue() {

  }

private:

  /// Overwrites elements past the valid boundary with -infinity_ so masked
  /// columns never win the max or contribute to the sum
  CUTLASS_DEVICE
  void elementwise_padding_(SoftmaxFragment &result, int elements_in_boundary) {
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < SoftmaxFragment::kElements; ++i) {
      result[i] = (i < elements_in_boundary) ? result[i] : ElementSoftmaxCompute(-infinity_);
    }
  }

  /// Butterfly (xor-shuffle) reduction of the row sum across the
  /// kThreadsPerRow lanes covering one row
  CUTLASS_DEVICE
  ElementSoftmaxCompute warp_reduce_sum_(ElementSoftmaxCompute sum_) {
    int half_thread_in_row = (kThreadsPerRow >> 1);
    CUTLASS_PRAGMA_UNROLL
    for (int i = half_thread_in_row; i > 0; i >>= 1) {
      ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, sum_, i);
      sum_ += tmp;
    }
    return sum_;
  }

  /// Butterfly (xor-shuffle) reduction of the row max across the
  /// kThreadsPerRow lanes covering one row
  CUTLASS_DEVICE
  ElementSoftmaxCompute warp_reduce_max_(ElementSoftmaxCompute max_) {
    int half_thread_in_row = (kThreadsPerRow >> 1);
    CUTLASS_PRAGMA_UNROLL
    for (int i = half_thread_in_row; i > 0; i >>= 1) {
      ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, max_, i);
      max_ = fast_max(max_, tmp);
    }
    return max_;
  }

  /// Resets the running accumulators: max to -FLT_MAX, sum to zero
  CUTLASS_DEVICE
  void clear_accum_() {

    uint32_t float_max_bits = 0xff7fffff;   // -FLT_MAX
    float min_float = reinterpret_cast<float const &>(float_max_bits);
    accum_max_ = ElementSoftmaxCompute(min_float);
    accum_sum_ = ElementSoftmaxCompute(0);
  }

  /// Sum of all elements of a fragment
  CUTLASS_DEVICE
  ElementSoftmaxCompute sum_accumulator_(SoftmaxFragment const &accum) {
    ElementSoftmaxCompute sum_ = ElementSoftmaxCompute(0);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < SoftmaxFragment::kElements; ++i) {
      sum_ += ElementSoftmaxCompute(accum[i]);
    }

    return sum_;
  }

  /// Sum of all elements of a fragment, added to a running sum
  CUTLASS_DEVICE
  ElementSoftmaxCompute sum_accumulator_(SoftmaxFragment const &accum, ElementSoftmaxCompute sum_) {
    // ElementSoftmaxCompute sum_ = ElementSoftmaxCompute(0);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < SoftmaxFragment::kElements; ++i) {
      sum_ += ElementSoftmaxCompute(accum[i]);
    }

    return sum_;
  }

  /// Maximum over the elements of a fragment
  CUTLASS_DEVICE
  ElementSoftmaxCompute maximum_accumulator_(SoftmaxFragment const &accum) {
    ElementSoftmaxCompute max_ = accum[0];

    CUTLASS_PRAGMA_UNROLL
    for (int i = 1; i < SoftmaxFragment::kElements; ++i) {
      max_ = fast_max(max_, ElementSoftmaxCompute(accum[i]));
    }

    return max_;
  }

  /// Maximum over the elements of a fragment, combined with a running maximum
  CUTLASS_DEVICE
  ElementSoftmaxCompute maximum_accumulator_(SoftmaxFragment const &accum, ElementSoftmaxCompute max_) {

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < SoftmaxFragment::kElements; ++i) {
      max_ = fast_max(max_, ElementSoftmaxCompute(accum[i]));
    }

    return max_;
  }
};
510
+
511
+ } // namespace threadblock
512
+ } // namespace epilogue
513
+ } // namespace cutlass
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h ADDED
@@ -0,0 +1,922 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+
34
+ \brief Threadblock-level epilogue computing:
35
+ Aux = ((alpha * scale_a * scale_b) * accumulator) + ((beta * scale_c) * source) + bias
36
+ D = activation(Aux)
37
+
38
+ if Aux is fp8 type:
39
+ abs_max_output = max( abs(aux) | (for every aux in Aux))
40
+ Aux = scale_aux * Aux
41
+ endif
42
+
43
+ if D is fp8 type:
44
+ abs_max_output = max( abs(d) | (for every d in D))
45
+ D = scale_d * D
46
+ endif
47
+
48
+ Parameter Aux is optionally stored to global memory
49
+ */
50
+
51
+ #pragma once
52
+ #include "cutlass/cutlass.h"
53
+ #include CUDA_STD_HEADER(cassert)
54
+
55
+ #if defined(__CUDACC_RTC__)
56
+ #include CUDA_STD_HEADER(utility)
57
+ #else
58
+ #include <utility>
59
+ #endif
60
+
61
+ #include "cutlass/array.h"
62
+ #include "cutlass/numeric_types.h"
63
+ #include "cutlass/numeric_conversion.h"
64
+ #include "cutlass/tensor_coord.h"
65
+ #include "cutlass/aligned_buffer.h"
66
+ #include "cutlass/functional.h"
67
+ #include "cutlass/fast_math.h"
68
+ #include "cutlass/layout/vector.h"
69
+ #include "cutlass/layout/tensor.h"
70
+
71
+ #include "cutlass/gemm/gemm.h"
72
+
73
+ #include "cutlass/transform/pitch_linear_thread_map.h"
74
+ #include "cutlass/transform/threadblock/regular_tile_iterator.h"
75
+
76
+ #include "cutlass/epilogue/threadblock/epilogue_base.h"
77
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
78
+
79
+ #include "cutlass/numeric_types.h"
80
+
81
+ /////////////////////////////////////////////////////////////////////////////////////////////////
82
+
83
+ namespace cutlass {
84
+ namespace epilogue {
85
+ namespace threadblock {
86
+
87
+ /////////////////////////////////////////////////////////////////////////////////////////////////
88
+
89
+ namespace detail {
90
+
91
+ /// Helper class for keeping track of absolute maximums and performing scaling
92
+ template <
93
+ typename Iterator, // Iterator type used for storing the data for which absolute maximum and scaling
94
+ // will be computed. This type is used for predicating absolute maximum calculations.
95
+ typename Fragment, // Type of input to be computed on
96
+ bool ScalingAndAmaxNeeded // Whether to perform absolute maximum and scaling operations
97
+ >
98
+ struct ScalingAndAmaxHelper;
99
+
100
+ /// Partial specialization that does not perform scaling or calculate an absolute maximum
101
+ template <typename Iterator, typename Fragment>
102
+ struct ScalingAndAmaxHelper<Iterator, Fragment, false> {
103
+ using Element = typename Fragment::Element;
104
+
105
+ CUTLASS_HOST_DEVICE
106
+ ScalingAndAmaxHelper(Element scale) { }
107
+
108
+ CUTLASS_DEVICE
109
+ Fragment operator()(const Iterator& iterator, const Fragment& inp) {
110
+ return inp;
111
+ }
112
+
113
+ CUTLASS_HOST_DEVICE
114
+ Element get_abs_max() const {
115
+ return Element(0.);
116
+ }
117
+
118
+ CUTLASS_HOST_DEVICE
119
+ void set_scaling_factor(Element scale_) { }
120
+ };
121
+
122
+ /// Partial specialization that keeps track of an absolute maximum value of inputs seen
123
+ /// and scales inputs
124
+ template <typename Iterator, typename Fragment>
125
+ struct ScalingAndAmaxHelper<Iterator, Fragment, true> {
126
+ using Element = typename Fragment::Element;
127
+ using AccessType = typename Iterator::AccessType;
128
+ using ThreadMap = typename Iterator::ThreadMap;
129
+
130
+ Element abs_max;
131
+ Element scale;
132
+
133
+ // Operators
134
+ maximum_with_nan_propogation<Element> max_op;
135
+ absolute_value_op<Element> abs_op;
136
+ multiplies<Fragment> multiply;
137
+
138
+ CUTLASS_HOST_DEVICE
139
+ ScalingAndAmaxHelper(Element scale_) : abs_max(0.), scale(scale_) { }
140
+
141
+ // Compute the absolute maximum value between `abs_max` and the entries
142
+ // of `frag` for predicated-on entries of `iterator`. Return a scaled
143
+ // version of `inp`.
144
+ CUTLASS_DEVICE
145
+ Fragment operator()(const Iterator& iterator, const Fragment& frag) {
146
+ using PredicateGroup = Array<Element, Iterator::ThreadMap::kElementsPerAccess>;
147
+ PredicateGroup const *frag_ptr = reinterpret_cast<PredicateGroup const *>(&frag);
148
+
149
+ typename Iterator::Mask mask;
150
+ iterator.get_mask(mask);
151
+
152
+ CUTLASS_PRAGMA_UNROLL
153
+ for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
154
+
155
+ CUTLASS_PRAGMA_UNROLL
156
+ for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
157
+
158
+ CUTLASS_PRAGMA_UNROLL
159
+ for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
160
+ int frag_row_idx =
161
+ (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
162
+
163
+ int row_offset = row * ThreadMap::Delta::kRow
164
+ + group * ThreadMap::Delta::kGroup
165
+ + cluster * ThreadMap::Delta::kCluster;
166
+
167
+ bool row_guard = ((row_offset + iterator.thread_start_row()) < iterator.extent_row());
168
+
169
+ CUTLASS_PRAGMA_UNROLL
170
+ for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
171
+ bool guard = row_guard && mask.predicates[column];
172
+
173
+ if (guard) {
174
+ int access_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;
175
+ CUTLASS_PRAGMA_UNROLL
176
+ for (int i = 0; i < PredicateGroup::kElements; ++i) {
177
+ abs_max = max_op(abs_max, abs_op(frag_ptr[access_idx][i]));
178
+ }
179
+ }
180
+ }
181
+ }
182
+ }
183
+ }
184
+
185
+ // Perform scaling
186
+ return multiply(scale, frag);
187
+ }
188
+
189
+ CUTLASS_HOST_DEVICE
190
+ Element get_abs_max() const {
191
+ return abs_max;
192
+ }
193
+
194
+ CUTLASS_HOST_DEVICE
195
+ void set_scaling_factor(Element scale_) {
196
+ scale = scale_;
197
+ }
198
+ };
199
+
200
+ } // namespace detail
201
+
202
+ /////////////////////////////////////////////////////////////////////////////////////////////////
203
+
204
+ template <
205
+ typename Shape_, ///< Shape of threadblock tile (concept: GemmShape)
206
+ typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
207
+ int PartitionsK, ///< Number of partitions of the K dimension
208
+ typename OutputTileIterator_, ///< Tile iterator reading and writing output tensors
209
+ typename AuxOutputTileIterator_, ///< Tile iterator writing auxiliary output tensors
210
+ typename ElementVector_, ///< Data type of bias vector
211
+ typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators
212
+ typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM
213
+ typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading from SMEM
214
+ typename OutputOp_, ///< Output operator
215
+ typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
216
+ int FragmentsPerPartition = 1, ///< Used to coarsen the epilogue granularity
217
+ int IterationsUnroll = ///< Used to reduce binary size when epilogue op is large
218
+ (!IsEpilogueFunctorHeavy<OutputOp_>::value)
219
+ >
220
+ class EpilogueWithAbsMax :
221
+ public EpilogueBase<
222
+ Shape_,
223
+ typename WarpMmaOperator_::Shape,
224
+ PartitionsK,
225
+ AccumulatorFragmentIterator_,
226
+ WarpTileIterator_,
227
+ Padding_,
228
+ FragmentsPerPartition> {
229
+
230
+ public:
231
+
232
+ using Base = EpilogueBase<
233
+ Shape_,
234
+ typename WarpMmaOperator_::Shape,
235
+ PartitionsK,
236
+ AccumulatorFragmentIterator_,
237
+ WarpTileIterator_,
238
+ Padding_,
239
+ FragmentsPerPartition>;
240
+
241
+ static bool const kIsSingleSource = true;
242
+ using Shape = Shape_;
243
+ using WarpMmaOperator = WarpMmaOperator_;
244
+ static int const kPartitionsK = PartitionsK;
245
+ using OutputTileIterator = OutputTileIterator_;
246
+ using AuxOutputTileIterator = AuxOutputTileIterator_;
247
+ using ElementVector = ElementVector_;
248
+ using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
249
+ using WarpTileIterator = WarpTileIterator_;
250
+ using SharedLoadIterator = SharedLoadIterator_;
251
+ using OutputOp = OutputOp_;
252
+ using Padding = Padding_;
253
+
254
+ using Layout = layout::RowMajor;
255
+ using LongIndex = typename Layout::LongIndex;
256
+
257
+ /// The complete warp-level accumulator tile
258
+ using AccumulatorTile = typename Base::AccumulatorTile;
259
+
260
+ /// Accumulator element
261
+ using ElementAccumulator = typename WarpTileIterator::Element;
262
+
263
+ /// Data type used for absolute maximum value
264
+ using ElementAbsmax = typename OutputOp::ElementAbsmax;
265
+
266
+ /// Compute data type produced by the output op
267
+ using ElementCompute = typename OutputOp::ElementCompute;
268
+
269
+ /// Compute fragment
270
+ using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
271
+
272
+ /// Helpers for (optionally) computing absolute maximums and scaling output and auxiliary output
273
+ using OutputScaler = detail::ScalingAndAmaxHelper<OutputTileIterator,
274
+ FragmentCompute,
275
+ OutputOp::kIsScalingAndAmaxOutputNeeded>;
276
+
277
+ using AuxOutputScaler = detail::ScalingAndAmaxHelper<AuxOutputTileIterator,
278
+ FragmentCompute,
279
+ OutputOp::kIsScalingAndAmaxAuxOutputNeeded>;
280
+
281
+ /// Thread map used by output tile iterators
282
+ using ThreadMap = typename OutputTileIterator::ThreadMap;
283
+
284
+ /// Fragment object used to store the broadcast values
285
+ using BroadcastFragment = Array<
286
+ ElementCompute,
287
+ ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
288
+
289
+ /// Output element
290
+ using ElementOutput = typename OutputTileIterator::Element;
291
+
292
+ /// Data type of auxiliary output
293
+ using ElementAuxOutput = typename AuxOutputTileIterator::Element;
294
+
295
+ /// Output access size
296
+ static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
297
+
298
+ /// Tensor reference to destination tensor
299
+ using TensorRef = typename OutputTileIterator::TensorRef;
300
+
301
+ /// Tensor reference to sync tensor
302
+ using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
303
+
304
+ /// Const tensor reference to source tensor
305
+ using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
306
+
307
+ /// Array type used to output
308
+ using OutputAccessType = Array<
309
+ typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
310
+
311
+ /// Array type used by output functor
312
+ using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
313
+
314
+ /// Array type used by output functor
315
+ using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
316
+
317
+ /// Auxiliary output access type
318
+ using AuxAccessType = Array<ElementAuxOutput, OutputTileIterator::kElementsPerAccess>;
319
+
320
+ /// Number of warps
321
+ using WarpCount = typename Base::WarpCount;
322
+
323
+ /// Shared memory allocation from epilogue base class
324
+ using BaseSharedStorage = typename Base::SharedStorage;
325
+
326
+ static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
327
+ static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
328
+
329
+ /// Used for the broadcast
330
+ struct BroadcastDetail {
331
+
332
+ /// Number of threads per warp
333
+ static int const kWarpSize = 32;
334
+
335
+ static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
336
+
337
+ /// Number of distinct scalar column indices handled by each thread
338
+ static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
339
+
340
+ /// Number of distinct scalar row indices handled by each thread
341
+ static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
342
+
343
+ /// Number of threads per threadblock
344
+ static int const kThreadCount = kWarpSize * WarpCount::kCount;
345
+
346
+ /// Number of distinct threads per row of output tile
347
+ static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
348
+
349
+ /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
350
+ static int const kThreadRows = kThreadCount / kThreadsPerRow;
351
+
352
+ /// I'm not sure what I meant here.
353
+ static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
354
+
355
+ /// Shape of the shared memory allocation for the epilogue
356
+ using StorageShape = MatrixShape<
357
+ kThreadRows,
358
+ Shape::kN
359
+ >;
360
+
361
+ /// Debug printing
362
+ CUTLASS_DEVICE
363
+ static void print() {
364
+ #if 0
365
+ printf("BroadcastDetail {\n");
366
+ printf(
367
+ " kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
368
+ "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
369
+ kColumnsPerThread,
370
+ kRowsPerThread,
371
+ kThreadCount,
372
+ kThreadsPerRow,
373
+ kThreadRows,
374
+ kThreadAccessesPerRow,
375
+ StorageShape::kRow,
376
+ StorageShape::kColumn,
377
+ StorageShape::kCount
378
+ );
379
+ printf("};\n");
380
+ #endif
381
+ }
382
+ };
383
+
384
+ /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
385
+ struct SharedStorage {
386
+ union {
387
+ BaseSharedStorage base;
388
+ };
389
+
390
+ CUTLASS_HOST_DEVICE
391
+ SharedStorage() { }
392
+ };
393
+
394
+ public:
395
+
396
+
397
+ static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
398
+ "Mismatch between shared load iterator and output tile iterator.");
399
+
400
+ static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
401
+
402
+ static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
403
+ "Divisibility");
404
+
405
+ private:
406
+
407
+ /// Loads fragment from shared memory aligned with output tensor
408
+ SharedLoadIterator shared_load_iterator_;
409
+
410
+ /// Thread index within the threadblock
411
+ int thread_idx_;
412
+
413
+ public:
414
+
415
+ /// Constructor
416
+ CUTLASS_DEVICE
417
+ EpilogueWithAbsMax(
418
+ SharedStorage &shared_storage, ///< Shared storage object
419
+ int thread_idx, ///< ID of a thread within the threadblock
420
+ int warp_idx, ///< ID of warp within threadblock
421
+ int lane_idx ///< Id of thread within warp
422
+ ):
423
+ Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
424
+ shared_load_iterator_(shared_storage.base.reference(), thread_idx),
425
+ thread_idx_(thread_idx)
426
+ {
427
+
428
+ }
429
+
430
+ /// Streams the result to global memory
431
+ CUTLASS_DEVICE
432
+ void operator()(
433
+ OutputOp &output_op, ///< Output operator
434
+ ElementVector const * broadcast_ptr, ///< Broadcast vector
435
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
436
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
437
+ OutputTileIterator source_iterator, ///< Tile iterator for source accumulator matrix
438
+ AuxOutputTileIterator aux_iterator, ///< Tile iterator for destination auxiliary output
439
+ MatrixCoord const &problem_size = ///< Problem size needed to guard against out-of-bounds accesses
440
+ MatrixCoord(Shape::kM, Shape::kN),
441
+ MatrixCoord const &threadblock_offset = ///< Threadblock's initial offset within the problem size space
442
+ MatrixCoord()) {
443
+
444
+ BroadcastFragment broadcast_fragment;
445
+
446
+ load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
447
+
448
+ OutputScaler output_scaler(output_op.get_scale_d());
449
+
450
+ AuxOutputScaler aux_scaler(output_op.get_scale_aux());
451
+
452
+ if (!output_op.is_source_needed()) {
453
+ compute_source_not_needed_(
454
+ output_op,
455
+ broadcast_fragment,
456
+ destination_iterator,
457
+ accumulators,
458
+ aux_iterator,
459
+ output_scaler,
460
+ aux_scaler);
461
+ }
462
+ else {
463
+ compute_source_needed_(
464
+ output_op,
465
+ broadcast_fragment,
466
+ destination_iterator,
467
+ accumulators,
468
+ source_iterator,
469
+ aux_iterator,
470
+ output_scaler,
471
+ aux_scaler);
472
+ }
473
+
474
+ // Store the absolute maximum values of the output and auxiliar tensors, if needed.
475
+ if (output_op.get_ptr_output_abs_max() != nullptr) {
476
+ ElementAbsmax local_abs_max =
477
+ NumericConverter<ElementAbsmax, ElementCompute, OutputOp::kRound>{}(output_scaler.get_abs_max());
478
+ atomic_maximum<ElementAbsmax>{}(
479
+ output_op.get_ptr_output_abs_max(), local_abs_max);
480
+ }
481
+
482
+ if (output_op.get_ptr_aux_output_abs_max() != nullptr) {
483
+ ElementAbsmax local_abs_max =
484
+ NumericConverter<ElementAbsmax, ElementCompute, OutputOp::kRound>{}(aux_scaler.get_abs_max());
485
+ atomic_maximum<ElementAbsmax>{}(
486
+ output_op.get_ptr_aux_output_abs_max(), local_abs_max);
487
+ }
488
+ }
489
+
490
+ private:
491
+
492
+ CUTLASS_DEVICE
493
+ void load_broadcast_fragment_(
494
+ BroadcastFragment & broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns
495
+ ElementVector const * broadcast_ptr, ///< Broadcast vector
496
+ MatrixCoord const &problem_size, ///< Problem size needed to guard against out-of-bounds accesses
497
+ MatrixCoord const &threadblock_offset ///< Threadblock's initial offset within the problem size space
498
+ ) {
499
+
500
+ broadcast_fragment.clear();
501
+
502
+ // If no pointer is supplied, set with all zeros and avoid memory accesses
503
+ if (!broadcast_ptr) {
504
+ return;
505
+ }
506
+
507
+ int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
508
+
509
+ int thread_column_idx = threadblock_offset.column() + thread_initial_column;
510
+ broadcast_ptr += thread_initial_column;
511
+
512
+ NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
513
+ using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
514
+ using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;
515
+
516
+ ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);
517
+
518
+ CUTLASS_PRAGMA_UNROLL
519
+ for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
520
+
521
+ AccessType loaded;
522
+
523
+ loaded.clear();
524
+
525
+ if (thread_column_idx < problem_size.column()) {
526
+ loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
527
+ }
528
+
529
+ ComputeFragmentType cvt = converter(loaded);
530
+ frag_ptr[j] = cvt;
531
+
532
+ thread_column_idx += ThreadMap::Delta::kColumn;
533
+ broadcast_ptr += ThreadMap::Delta::kColumn;
534
+ }
535
+ }
536
+
537
+ template <class Seq>
538
+ struct acc2smem_source_not_needed;
539
+
540
+ template <size_t... Seq>
541
+ struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
542
+ template <int Advance>
543
+ CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
544
+ WarpTileIterator &warp_tile_iterator) {
545
+ CUTLASS_PRAGMA_UNROLL
546
+ for (int i = 0; i < Advance; i++) {
547
+ ++accum_fragment_iterator;
548
+ }
549
+
550
+ CUTLASS_PRAGMA_UNROLL
551
+ for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
552
+ typename AccumulatorFragmentIterator::Fragment accum_fragment;
553
+
554
+ accum_fragment_iterator.load(accum_fragment);
555
+ ++accum_fragment_iterator;
556
+
557
+ warp_tile_iterator.store(accum_fragment);
558
+ if (p < Base::kFragmentsPerIteration - 1) {
559
+ warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
560
+ }
561
+ }
562
+
563
+ if (Base::kFragmentsPerIteration > 1) {
564
+ warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
565
+ (1 - Base::kFragmentsPerIteration));
566
+ }
567
+ }
568
+
569
+ CUTLASS_DEVICE
570
+ static void push(size_t pos,
571
+ AccumulatorFragmentIterator const &iterator_begin,
572
+ WarpTileIterator &warp_tile_iterator) {
573
+ int dummy[] = {
574
+ (pos == (Seq * Base::kFragmentsPerIteration)) &&
575
+ (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};
576
+
577
+ CUTLASS_UNUSED(dummy[0]);
578
+ }
579
+ };
580
+
581
+ /// Streams the result to global memory
582
+ CUTLASS_DEVICE
583
+ void compute_source_not_needed_(
584
+ OutputOp &output_op, ///< Output operator
585
+ BroadcastFragment const &broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns
586
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
587
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
588
+ AuxOutputTileIterator aux_iterator, ///< Tile iterator for destination auxiliary output
589
+ OutputScaler& output_scaler, ///< Helper for (optionally) computing the absolute maximum and scaling output
590
+ AuxOutputScaler& aux_scaler ///< Helper for (optionally) computing the absolute maximum and scaling the auxiliary output
591
+ ) {
592
+
593
+ //
594
+ // Iterator over warp-level accumulator fragment
595
+ //
596
+
597
+ AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
598
+
599
+ //
600
+ // Iterate over accumulator tile
601
+ //
602
+
603
+ // CUTLASS_PRAGMA_UNROLL
604
+ #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
605
+ for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {
606
+
607
+ //
608
+ // Convert and store fragment
609
+ //
610
+
611
+
612
+ __syncthreads();
613
+
614
+ acc2smem_source_not_needed<
615
+ cutlass::make_index_sequence<OutputTileIterator::kIterations /
616
+ Base::kFragmentsPerIteration>>::push(iter,
617
+ accum_fragment_iterator,
618
+ this->warp_tile_iterator_);
619
+
620
+ __syncthreads();
621
+
622
+ //
623
+ // Load fragments from shared memory
624
+ //
625
+
626
+ CUTLASS_PRAGMA_UNROLL
627
+ for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
628
+
629
+
630
+ typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
631
+
632
+ shared_load_iterator_.load(aligned_accum_fragment[0]);
633
+
634
+ if (p < Base::kFragmentsPerIteration - 1) {
635
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
636
+ }
637
+ else if (kPartitionsK > 1) {
638
+
639
+ plus <typename SharedLoadIterator::Fragment> add_fragments;
640
+
641
+ CUTLASS_PRAGMA_UNROLL
642
+ for ( int i = 1; i < kPartitionsK; ++i) {
643
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
644
+ shared_load_iterator_.load(aligned_accum_fragment[i]);
645
+ aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
646
+ }
647
+
648
+ shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
649
+ }
650
+
651
+ //
652
+ // Apply output operation
653
+ //
654
+
655
+ FragmentCompute frag_Z_compute;
656
+ FragmentCompute frag_Aux_compute;
657
+
658
+ apply_output_operator_source_not_needed_(
659
+ frag_Z_compute,
660
+ frag_Aux_compute,
661
+ output_op,
662
+ aligned_accum_fragment[0],
663
+ broadcast_fragment);
664
+
665
+ //
666
+ // Conditionally store fragments
667
+ //
668
+
669
+ // (Optionally) compute the absolute maximum of frag_Z and scale frag_Z
670
+ frag_Z_compute = output_scaler(destination_iterator, frag_Z_compute);
671
+ NumericArrayConverter<typename OutputTileIterator::Fragment::Element, ElementCompute,
672
+ OutputTileIterator::Fragment::kElements> cvt_to_dst;
673
+ typename OutputTileIterator::Fragment frag_Z = cvt_to_dst(frag_Z_compute);
674
+
675
+ // Always store the output
676
+ destination_iterator.store(frag_Z);
677
+ ++destination_iterator;
678
+
679
+ // Only store the auxiliary output if scaling and absolute-maximum calculation were needed
680
+ if (OutputOp::kIsScalingAndAmaxAuxOutputNeeded) {
681
+ frag_Aux_compute = aux_scaler(aux_iterator, frag_Aux_compute);
682
+
683
+ NumericArrayConverter<typename AuxOutputTileIterator::Fragment::Element, ElementCompute,
684
+ AuxOutputTileIterator::Fragment::kElements> cvt_to_aux;
685
+ typename AuxOutputTileIterator::Fragment frag_Aux = cvt_to_aux(frag_Aux_compute);
686
+ aux_iterator.store(frag_Aux);
687
+ ++aux_iterator;
688
+ }
689
+ }
690
+
691
+ if (Base::kFragmentsPerIteration > 1) {
692
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
693
+ }
694
+ }
695
+ }
696
+
697
+
698
+ template<class Seq>
699
+ struct acc2smem_source_needed;
700
+
701
+ template <size_t... Seq>
702
+ struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
703
+ template<int Advance>
704
+ CUTLASS_DEVICE
705
+ static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
706
+ WarpTileIterator &warp_tile_iterator) {
707
+ CUTLASS_PRAGMA_UNROLL
708
+ for (int i = 0; i < Advance; i++) {
709
+ ++accum_fragment_iterator;
710
+ }
711
+
712
+ typename AccumulatorFragmentIterator::Fragment accum_fragment;
713
+ accum_fragment_iterator.load(accum_fragment);
714
+ warp_tile_iterator.store(accum_fragment);
715
+ }
716
+
717
+ CUTLASS_DEVICE
718
+ static void push(size_t pos,
719
+ AccumulatorFragmentIterator const &iterator_begin,
720
+ WarpTileIterator &warp_tile_iterator) {
721
+ int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
722
+ }
723
+ };
724
+
725
+
726
+ /// Streams the result to global memory
727
+ CUTLASS_DEVICE
728
+ void compute_source_needed_(
729
+ OutputOp &output_op, ///< Output operator
730
+ BroadcastFragment const &broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns
731
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
732
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
733
+ OutputTileIterator source_iterator, ///< Tile iterator for source accumulator matrix
734
+ AuxOutputTileIterator aux_iterator, ///< Tile iterator for destination auxiliary output
735
+ OutputScaler& output_scaler, ///< Helper for (optionally) computing the absolute maximum and scaling output
736
+ AuxOutputScaler& aux_scaler ///< Helper for (optionally) computing the absolute maximum and scaling the auxiliary output
737
+ ) {
738
+
739
+ typename OutputTileIterator::Fragment source_fragment;
740
+ source_fragment.clear();
741
+
742
+ //
743
+ // Iterator over warp-level accumulator fragment
744
+ //
745
+
746
+ AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
747
+
748
+ //
749
+ // Iterate over accumulator tile
750
+ //
751
+
752
+ #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
753
+ for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
754
+
755
+ //
756
+ // Load the source
757
+ //
758
+
759
+ source_iterator.load(source_fragment);
760
+ ++source_iterator;
761
+
762
+ //
763
+ // Convert and store fragment
764
+ //
765
+
766
+ __syncthreads();
767
+
768
+ acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
769
+ iter, accum_fragment_iterator, this->warp_tile_iterator_);
770
+
771
+ __syncthreads();
772
+
773
+ //
774
+ // Load fragments from shared memory
775
+ //
776
+
777
+ typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
778
+
779
+ shared_load_iterator_.load(aligned_accum_fragment[0]);
780
+
781
+ // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
782
+ if (kPartitionsK > 1)
783
+ {
784
+ plus <typename SharedLoadIterator::Fragment> add_fragments;
785
+ const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
786
+
787
+ CUTLASS_PRAGMA_UNROLL
788
+ for ( int i = 1; i < kPartitionsK; ++i) {
789
+ shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
790
+ shared_load_iterator_.load(aligned_accum_fragment[i]);
791
+ aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
792
+ }
793
+
794
+ shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
795
+ }
796
+
797
+ //
798
+ // Apply output operation
799
+ //
800
+
801
+ FragmentCompute frag_Z_compute;
802
+ FragmentCompute frag_Aux_compute;
803
+
804
+ apply_output_operator_(
805
+ frag_Z_compute,
806
+ frag_Aux_compute,
807
+ output_op,
808
+ aligned_accum_fragment[0],
809
+ source_fragment,
810
+ broadcast_fragment);
811
+
812
+ //
813
+ // Conditionally store fragments
814
+ //
815
+
816
+ // (Optionally) compute the absolute maximum of frag_Z and scale frag_Z
817
+ frag_Z_compute = output_scaler(destination_iterator, frag_Z_compute);
818
+ NumericArrayConverter<typename OutputTileIterator::Fragment::Element, ElementCompute,
819
+ OutputTileIterator::Fragment::kElements> cvt_to_dst;
820
+ typename OutputTileIterator::Fragment frag_Z = cvt_to_dst(frag_Z_compute);
821
+
822
+ // Always store the output
823
+ destination_iterator.store(frag_Z);
824
+ ++destination_iterator;
825
+
826
+ // Only store the auxiliary output if scaling and absolute-maximum calculation were needed
827
+ if (OutputOp::kIsScalingAndAmaxAuxOutputNeeded) {
828
+ frag_Aux_compute = aux_scaler(aux_iterator, frag_Aux_compute);
829
+
830
+ NumericArrayConverter<typename AuxOutputTileIterator::Fragment::Element, ElementCompute,
831
+ AuxOutputTileIterator::Fragment::kElements> cvt_to_aux;
832
+ typename AuxOutputTileIterator::Fragment frag_Aux = cvt_to_aux(frag_Aux_compute);
833
+ aux_iterator.store(frag_Aux);
834
+ ++aux_iterator;
835
+ }
836
+ }
837
+ }
838
+
839
+ /// Helper to invoke the output functor over each vector of output
840
+ CUTLASS_DEVICE
841
+ void apply_output_operator_(
842
+ FragmentCompute &frag_Z,
843
+ FragmentCompute &frag_Aux,
844
+ OutputOp &output_op,
845
+ typename SharedLoadIterator::Fragment const &frag_AB,
846
+ typename OutputTileIterator::Fragment const &frag_C,
847
+ BroadcastFragment const &frag_Broadcast) {
848
+
849
+ using AccessTypeZ = Array<ElementCompute, kElementsPerAccess>;
850
+ using AccessTypeAux = Array<ElementCompute, kElementsPerAccess>;
851
+ using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
852
+
853
+ AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
854
+ AccessTypeAux *frag_Aux_ptr = reinterpret_cast<AccessTypeAux *>(&frag_Aux);
855
+
856
+ AccumulatorAccessType const *frag_AB_ptr =
857
+ reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
858
+
859
+ OutputAccessType const *frag_C_ptr =
860
+ reinterpret_cast<OutputAccessType const *>(&frag_C);
861
+
862
+ AccessTypeBroadcast const *frag_Broadcast_ptr =
863
+ reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
864
+
865
+ int const kOutputOpIterations =
866
+ OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
867
+
868
+ CUTLASS_PRAGMA_UNROLL
869
+ for (int i = 0; i < kOutputOpIterations; ++i) {
870
+ output_op(
871
+ frag_Z_ptr[i],
872
+ frag_Aux_ptr[i],
873
+ frag_AB_ptr[i],
874
+ frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn],
875
+ frag_C_ptr[i]);
876
+ }
877
+ }
878
+
879
+ /// Helper to invoke the output functor over each vector of output
880
+ CUTLASS_DEVICE
881
+ void apply_output_operator_source_not_needed_(
882
+ FragmentCompute &frag_Z,
883
+ FragmentCompute &frag_Aux,
884
+ OutputOp &output_op,
885
+ typename SharedLoadIterator::Fragment const &frag_AB,
886
+ BroadcastFragment const &frag_Broadcast) {
887
+
888
+ using AccessTypeZ = Array<ElementCompute, kElementsPerAccess>;
889
+ using AccessTypeAux = Array<ElementCompute, kElementsPerAccess>;
890
+ using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
891
+
892
+ AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
893
+ AccessTypeAux *frag_Aux_ptr = reinterpret_cast<AccessTypeAux *>(&frag_Aux);
894
+
895
+ AccumulatorAccessType const *frag_AB_ptr =
896
+ reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
897
+
898
+ AccessTypeBroadcast const *frag_Broadcast_ptr =
899
+ reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
900
+
901
+ int const kOutputOpIterations =
902
+ OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
903
+
904
+ CUTLASS_PRAGMA_UNROLL
905
+ for (int i = 0; i < kOutputOpIterations; ++i) {
906
+
907
+ output_op(
908
+ frag_Z_ptr[i],
909
+ frag_Aux_ptr[i],
910
+ frag_AB_ptr[i],
911
+ frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
912
+ }
913
+ }
914
+ };
915
+
916
+ ////////////////////////////////////////////////////////////////////////////////
917
+
918
+ } // namespace threadblock
919
+ } // namespace epilogue
920
+ } // namespace cutlass
921
+
922
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu130-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h ADDED
@@ -0,0 +1,1717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+
33
+ \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
34
+
35
+ The epilogue rearranges the result of a matrix product through shared memory to match canonical
36
+ tensor layouts in global memory. Epilogues support conversion and reduction operations.
37
+
38
+ */
39
+
40
+ #pragma once
41
+ #include "cutlass/cutlass.h"
42
+ #include CUDA_STD_HEADER(cassert)
43
+
44
+ #if defined(__CUDACC_RTC__)
45
+ #include CUDA_STD_HEADER(utility)
46
+ #else
47
+ #include <utility>
48
+ #endif
49
+
50
+ #include "cutlass/array.h"
51
+ #include "cutlass/numeric_types.h"
52
+ #include "cutlass/numeric_conversion.h"
53
+ #include "cutlass/tensor_coord.h"
54
+ #include "cutlass/aligned_buffer.h"
55
+ #include "cutlass/functional.h"
56
+ #include "cutlass/fast_math.h"
57
+ #include "cutlass/layout/vector.h"
58
+ #include "cutlass/layout/tensor.h"
59
+
60
+ #include "cutlass/gemm/gemm.h"
61
+
62
+ #include "cutlass/transform/pitch_linear_thread_map.h"
63
+ #include "cutlass/transform/threadblock/regular_tile_iterator.h"
64
+
65
+ #include "cutlass/epilogue/threadblock/epilogue_base.h"
66
+ #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
67
+
68
+ #include "cutlass/numeric_types.h"
69
+
70
+ /////////////////////////////////////////////////////////////////////////////////////////////////
71
+
72
+ namespace cutlass {
73
+ namespace epilogue {
74
+ namespace threadblock {
75
+
76
+ /////////////////////////////////////////////////////////////////////////////////////////////////
77
+
78
/// This base class is meant to define the concept required of the
/// EpilogueWithBroadcast::OutputOp. It documents the required nested types,
/// constants, and operator() overloads; the method bodies here are empty
/// placeholders — concrete output functors supply the real implementations.
template <
  typename ElementC_,
  typename ElementAccumulator_,
  typename ElementCompute_,
  typename ElementZ_,
  typename ElementT_,
  int ElementsPerAccess,
  bool StoreZ = true,
  bool StoreT = true
>
struct EpilogueWithBroadcastOpBase {

  using ElementOutput = ElementC_;
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;
  using ElementZ = ElementZ_;
  using ElementT = ElementT_;
  static int const kElementsPerAccess = ElementsPerAccess;

  // Per-access fragment types seen by the functor's operator().
  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
  using FragmentC = Array<ElementOutput, kElementsPerAccess>;
  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
  using FragmentT = Array<ElementT, kElementsPerAccess>;

  /// If true, the 'Z' tensor is stored
  static bool const kStoreZ = StoreZ;

  /// If true, the 'T' tensor is stored
  static bool const kStoreT = StoreT;

  /// Parameters structure - required
  struct Params { };

  //
  // Methods
  //

  /// Constructor from Params
  EpilogueWithBroadcastOpBase(Params const &params_) { }

  /// Determine if the source is needed. May return false if
  bool is_source_needed() const {
    return true;
  }

  /// Sets the split-K partition index/count (no-op in this base concept).
  CUTLASS_HOST_DEVICE
  void set_k_partition(int k_partition, int k_partition_count) { }

  /// Applies the operation when is_source_needed() is true
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentZ &frag_Z,
    FragmentT &frag_T,
    FragmentAccumulator const &AB,
    FragmentC const &frag_C1,
    FragmentC const &frag_C2,
    FragmentCompute const &V) const {

  }

  /// Applies the operation when is_source_needed() is false
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentZ &frag_Z,
    FragmentT &frag_T,
    FragmentAccumulator const &AB,
    FragmentCompute const &V) const {

  }
};
151
+
152
+ ////////////////////////////////////////////////////////////////////////////////
153
+
154
/// Epilogue operator with bias vector broadcast over columns.
///
/// Computes the following:
///
///
///   Z, T = OutputOp(AB, C, Broadcast)
///
///   if (ElementwiseOp::kStoreZ) {
///     store(converted_u);
///   }
///
///   if (ElementwiseOp::kStoreT) {
///     store(v);
///   }
///
/// Primary template declaration; partial specializations (selected on
/// IsSingleSource) provide the implementations.
template <
  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
  int PartitionsK,                          ///< Number of partitions of the K dimension
  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors (z)
  typename TensorTileIterator_,             ///< Additional tile iterator for tensor-valued operands (t)
  typename ElementVector_,                  ///< Pointer to broadcast vector
  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
  typename OutputOp_,                       ///< Output operator - concept is EpilogueWithBroadcastOp
  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
  bool IsSingleSource = OutputOp_::kIsSingleSource
>
class EpilogueWithBroadcast;
187
+
188
+ template <
189
+ typename Shape_,
190
+ typename WarpMmaOperator_,
191
+ int PartitionsK,
192
+ typename OutputTileIterator_,
193
+ typename TensorTileIterator_,
194
+ typename ElementVector_,
195
+ typename AccumulatorFragmentIterator_,
196
+ typename WarpTileIterator_,
197
+ typename SharedLoadIterator_,
198
+ typename OutputOp_,
199
+ typename Padding_,
200
+ int FragmentsPerPartition,
201
+ int IterationsUnroll
202
+ >
203
+ class EpilogueWithBroadcast<
204
+ Shape_,
205
+ WarpMmaOperator_,
206
+ PartitionsK,
207
+ OutputTileIterator_,
208
+ TensorTileIterator_,
209
+ ElementVector_,
210
+ AccumulatorFragmentIterator_,
211
+ WarpTileIterator_,
212
+ SharedLoadIterator_,
213
+ OutputOp_,
214
+ Padding_,
215
+ FragmentsPerPartition,
216
+ IterationsUnroll,
217
+ false
218
+ > :
219
+ public EpilogueBase<
220
+ Shape_,
221
+ typename WarpMmaOperator_::Shape,
222
+ PartitionsK,
223
+ AccumulatorFragmentIterator_,
224
+ WarpTileIterator_,
225
+ Padding_,
226
+ FragmentsPerPartition> {
227
+
228
+ public:
229
+
230
+ using Base = EpilogueBase<
231
+ Shape_,
232
+ typename WarpMmaOperator_::Shape,
233
+ PartitionsK,
234
+ AccumulatorFragmentIterator_,
235
+ WarpTileIterator_,
236
+ Padding_,
237
+ FragmentsPerPartition>;
238
+
239
+ static bool const kIsSingleSource = false;
240
+ using Shape = Shape_;
241
+ using WarpMmaOperator = WarpMmaOperator_;
242
+ static int const kPartitionsK = PartitionsK;
243
+ using OutputTileIterator = OutputTileIterator_;
244
+ using TensorTileIterator = TensorTileIterator_;
245
+ using ElementVector = ElementVector_;
246
+ using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
247
+ using WarpTileIterator = WarpTileIterator_;
248
+ using SharedLoadIterator = SharedLoadIterator_;
249
+ using OutputOp = OutputOp_;
250
+ using Padding = Padding_;
251
+
252
+ using Layout = layout::RowMajor;
253
+ using LongIndex = typename Layout::LongIndex;
254
+
255
+ /// The complete warp-level accumulator tile
256
+ using AccumulatorTile = typename Base::AccumulatorTile;
257
+
258
+ /// Accumulator element
259
+ using ElementAccumulator = typename WarpTileIterator::Element;
260
+
261
+ /// Compute data type produced by the output op
262
+ using ElementCompute = typename OutputOp::ElementCompute;
263
+
264
+ /// Compute fragment
265
+ using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
266
+
267
+ /// Thread map used by output tile iterators
268
+ using ThreadMap = typename OutputTileIterator::ThreadMap;
269
+
270
+ /// Fragment object used to store the broadcast values
271
+ using BroadcastFragment = Array<
272
+ ElementCompute,
273
+ ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
274
+
275
+ /// Output element
276
+ using ElementOutput = typename OutputTileIterator::Element;
277
+
278
+ /// Data type of additional tensor
279
+ using ElementTensor = typename TensorTileIterator::Element;
280
+
281
+ /// Output access size
282
+ static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
283
+
284
+ /// Tensor reference to destination tensor
285
+ using TensorRef = typename OutputTileIterator::TensorRef;
286
+
287
+ /// Tensor reference to sync tensor
288
+ using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
289
+
290
+ /// Const tensor reference to source tensor
291
+ using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
292
+
293
+ /// Array type used to output
294
+ using OutputAccessType = Array<
295
+ typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
296
+
297
+ /// Array type used by output functor
298
+ using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
299
+
300
+ /// Array type used by output functor
301
+ using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
302
+
303
+ /// Tensor access type
304
+ using TensorAccessType = Array<ElementTensor, OutputTileIterator::kElementsPerAccess>;
305
+
306
+ /// Number of warps
307
+ using WarpCount = typename Base::WarpCount;
308
+
309
+ /// Shared memory allocation from epilogue base class
310
+ using BaseSharedStorage = typename Base::SharedStorage;
311
+
312
+ static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
313
+ static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
314
+
315
  /// Used for the broadcast. Compile-time geometry describing how the
  /// threadblock's threads map onto columns of the broadcast vector.
  struct BroadcastDetail {

    /// Number of threads per warp
    static int const kWarpSize = 32;

    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

    /// Number of distinct scalar column indices handled by each thread
    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;

    /// Number of distinct scalar row indices handled by each thread
    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;

    /// Number of threads per threadblock
    static int const kThreadCount = kWarpSize * WarpCount::kCount;

    /// Number of distinct threads per row of output tile
    static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);

    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
    static int const kThreadRows = kThreadCount / kThreadsPerRow;

    /// I'm not sure what I meant here.
    // NOTE(review): upstream comment preserved verbatim — appears to be the
    // number of accesses each thread makes when sweeping a full row; confirm
    // against callers before relying on it.
    static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);

    /// Shape of the shared memory allocation for the epilogue
    using StorageShape = MatrixShape<
      kThreadRows,
      Shape::kN
    >;

    /// Debug printing (compiled out; enable the #if to dump the geometry)
    CUTLASS_DEVICE
    static void print() {
#if 0
      printf("BroadcastDetail {\n");
      printf(
        "  kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
        "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
        kColumnsPerThread,
        kRowsPerThread,
        kThreadCount,
        kThreadsPerRow,
        kThreadRows,
        kThreadAccessesPerRow,
        StorageShape::kRow,
        StorageShape::kColumn,
        StorageShape::kCount
      );
      printf("};\n");
#endif
    }
  };
369
+
370
  /// Shared storage structure (shadows base) with additional SMEM buffer for reduction.
  // NOTE(review): the union currently wraps only the base storage; the union
  // form allows additional aliased buffers to be added without growing SMEM.
  struct SharedStorage {
    union {
      BaseSharedStorage base;   // storage used by the EpilogueBase accumulator staging
    };

    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };
379
+
380
+ public:
381
+
382
+
383
+ static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
384
+ "Mismatch between shared load iterator and output tile iterator.");
385
+
386
+ static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
387
+
388
+ static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
389
+ "Divisibility");
390
+
391
+ private:
392
+
393
+ /// Loads fragment from shared memory aligned with output tensor
394
+ SharedLoadIterator shared_load_iterator_;
395
+
396
+ /// Thread index within the threadblock
397
+ int thread_idx_;
398
+
399
+ public:
400
+
401
  /// Constructor
  CUTLASS_DEVICE
  EpilogueWithBroadcast(
    SharedStorage &shared_storage,                    ///< Shared storage object
    int thread_idx,                                   ///< ID of a thread within the threadblock
    int warp_idx,                                     ///< ID of warp within threadblock
    int lane_idx                                      ///< Id of thread within warp
  ):
    Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
    // The shared-memory load iterator reads back from the same base storage
    // that the base class's warp tile iterator writes accumulators into.
    shared_load_iterator_(shared_storage.base.reference(), thread_idx),
    thread_idx_(thread_idx)
  {

  }
415
+
416
  /// Streams the result to global memory.
  /// Loads the per-column broadcast fragment once, then dispatches to either
  /// the source-free or source-consuming compute path depending on
  /// output_op.is_source_needed().
  CUTLASS_DEVICE
  void operator()(
    OutputOp const &output_op,                        ///< Output operator
    ElementVector const * broadcast_ptr,              ///< Broadcast vector
    OutputTileIterator destination_iterator,          ///< Tile iterator for destination
    AccumulatorTile const &accumulators,              ///< Complete warp-level accumulator tile
    OutputTileIterator source_iterator1,              ///< Tile iterator for first source accumulator matrix
    OutputTileIterator source_iterator2,              ///< Tile iterator for second source accumulator matrix
    TensorTileIterator tensor_iterator,               ///< Threadblock tile iterator for additional tensor operand
    MatrixCoord const &problem_size =                 ///< Problem size needed to guard against out-of-bounds accesses
        MatrixCoord(Shape::kM, Shape::kN),
    MatrixCoord const &threadblock_offset =           ///< Threadblock's initial offset within the problem size space
        MatrixCoord()) {

    BroadcastFragment broadcast_fragment;

    // Load (and convert) this thread's slice of the broadcast vector up front;
    // it is reused for every output tile iteration below.
    load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);

    if (!output_op.is_source_needed()) {
      // Source tensors are not read: skip loading C1/C2 entirely.
      compute_source_not_needed_(
        output_op,
        broadcast_fragment,
        destination_iterator,
        accumulators,
        tensor_iterator);
    }
    else {
      compute_source_needed_(
        output_op,
        broadcast_fragment,
        destination_iterator,
        accumulators,
        source_iterator1,
        source_iterator2,
        tensor_iterator);
    }
  }
454
+
455
+ private:
456
+
457
  /// Loads this thread's slice of the broadcast vector from global memory,
  /// converting it to ElementCompute. Out-of-bounds columns and a null
  /// broadcast pointer both yield zeros.
  CUTLASS_DEVICE
  void load_broadcast_fragment_(
    BroadcastFragment & broadcast_fragment,      ///< Fragment containing the accumulated partial reduction over columns
    ElementVector const * broadcast_ptr,         ///< Broadcast vector
    MatrixCoord const &problem_size,             ///< Problem size needed to guard against out-of-bounds accesses
    MatrixCoord const &threadblock_offset        ///< Threadblock's initial offset within the problem size space
  ) {

    broadcast_fragment.clear();

    // If no pointer is supplied, set with all zeros and avoid memory accesses
    if (!broadcast_ptr) {
      return;
    }

    // Column offset of this thread's first access within the threadblock tile.
    int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();

    int thread_column_idx = threadblock_offset.column() + thread_initial_column;
    broadcast_ptr += thread_initial_column;

    NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
    using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
    using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;

    ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);

    CUTLASS_PRAGMA_UNROLL
    for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {

      AccessType loaded;

      loaded.clear();

      // Guard against reading past the problem's column extent; out-of-range
      // accesses keep the cleared (zero) value.
      if (thread_column_idx < problem_size.column()) {
        loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
      }

      // Convert from the vector's element type to the compute type.
      ComputeFragmentType cvt = converter(loaded);
      frag_ptr[j] = cvt;

      // Advance by the thread map's column stride for the next access.
      thread_column_idx += ThreadMap::Delta::kColumn;
      broadcast_ptr += ThreadMap::Delta::kColumn;
    }
  }
501
+
502
  /// Helper that copies accumulator fragments into shared memory for the
  /// source-not-needed path. The index_sequence specialization turns the
  /// runtime iteration index into a compile-time Advance so the fragment
  /// iterator offset is fully unrolled.
  template <class Seq>
  struct acc2smem_source_not_needed;

  template <size_t... Seq>
  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
    /// Advances the accumulator fragment iterator by Advance positions, then
    /// stores kFragmentsPerIteration fragments into SMEM, stepping the warp
    /// tile iterator's pointer between fragments and restoring it afterwards.
    template <int Advance>
    CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
                                      WarpTileIterator &warp_tile_iterator) {
      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < Advance; i++) {
        ++accum_fragment_iterator;
      }

      CUTLASS_PRAGMA_UNROLL
      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
        typename AccumulatorFragmentIterator::Fragment accum_fragment;

        accum_fragment_iterator.load(accum_fragment);
        ++accum_fragment_iterator;

        warp_tile_iterator.store(accum_fragment);
        if (p < Base::kFragmentsPerIteration - 1) {
          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
        }
      }

      // Rewind the pointer so the iterator is unchanged on exit.
      if (Base::kFragmentsPerIteration > 1) {
        warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
                                              (1 - Base::kFragmentsPerIteration));
      }
    }

    /// Dispatches to helper<Advance> where Advance matches the runtime 'pos'.
    /// The array-initializer pack expansion instantiates one comparison per
    /// sequence element; exactly one (pos == Seq * kFragmentsPerIteration)
    /// evaluates true and invokes its helper via the comma operator.
    CUTLASS_DEVICE
    static void push(size_t pos,
                     AccumulatorFragmentIterator const &iterator_begin,
                     WarpTileIterator &warp_tile_iterator) {
      int dummy[] = {
          (pos == (Seq * Base::kFragmentsPerIteration)) &&
          (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};

      CUTLASS_UNUSED(dummy[0]);
    }
  };
545
+
546
  /// Streams the result to global memory.
  /// Source-free path: for each output tile iteration, stages accumulator
  /// fragments through shared memory (with __syncthreads barriers around the
  /// store/load), reduces across K partitions if split-K is in use, applies the
  /// output operator with the broadcast fragment, and conditionally stores the
  /// Z and T fragments.
  CUTLASS_DEVICE
  void compute_source_not_needed_(
    OutputOp const &output_op,                    ///< Output operator
    BroadcastFragment const &broadcast_fragment,  ///< Fragment containing the accumulated partial reduction over columns
    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
    TensorTileIterator tensor_iterator            ///< Threadblock tile iterator for additional tensor operand
  ) {

    //
    // Iterator over warp-level accumulator fragment
    //

    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);

    //
    // Iterate over accumulator tile
    //

    // Unrolling is disabled (factor 1) when the epilogue functor is heavy to
    // limit binary size; otherwise the loop is fully unrolled.
    // CUTLASS_PRAGMA_UNROLL
    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
    for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {

      //
      // Convert and store fragment
      //

      // Barrier before reusing SMEM: prior loads must complete before overwrite.
      __syncthreads();

      acc2smem_source_not_needed<
          cutlass::make_index_sequence<OutputTileIterator::kIterations /
                                       Base::kFragmentsPerIteration>>::push(iter,
                                                                            accum_fragment_iterator,
                                                                            this->warp_tile_iterator_);

      // Barrier after store: SMEM must be fully written before any thread loads.
      __syncthreads();

      //
      // Load fragments from shared memory
      //

      CUTLASS_PRAGMA_UNROLL
      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {

        typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];

        shared_load_iterator_.load(aligned_accum_fragment[0]);

        if (p < Base::kFragmentsPerIteration - 1) {
          // More fragments remain this iteration: advance to the next SMEM tile.
          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
        }
        else if (kPartitionsK > 1) {

          // Split-K: accumulate the partial sums from the other K partitions.
          plus <typename SharedLoadIterator::Fragment> add_fragments;

          CUTLASS_PRAGMA_UNROLL
          for ( int i = 1; i < kPartitionsK; ++i) {
            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
            shared_load_iterator_.load(aligned_accum_fragment[i]);
            aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
          }

          // Rewind the iterator to its position before the partition sweep.
          shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
        }

        //
        // Apply output operation
        //

        typename OutputTileIterator::Fragment frag_Z;
        typename TensorTileIterator::Fragment frag_T;

        apply_output_operator_source_not_needed_(
          frag_Z,
          frag_T,
          output_op,
          aligned_accum_fragment[0],
          broadcast_fragment);

        //
        // Conditionally store fragments
        //

        if (OutputOp::kStoreZ) {
          destination_iterator.store(frag_Z);
          ++destination_iterator;
        }

        if (OutputOp::kStoreT) {
          tensor_iterator.store(frag_T);
          ++tensor_iterator;
        }
      }

      // Rewind the SMEM load iterator for the next outer iteration.
      if (Base::kFragmentsPerIteration > 1) {
        shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
      }
    }
  }
648
+
649
+
650
+ template<class Seq>
651
+ struct acc2smem_source_needed;
652
+
653
+ template <size_t... Seq>
654
+ struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
655
+ template<int Advance>
656
+ CUTLASS_DEVICE
657
+ static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
658
+ WarpTileIterator &warp_tile_iterator) {
659
+ CUTLASS_PRAGMA_UNROLL
660
+ for (int i = 0; i < Advance; i++) {
661
+ ++accum_fragment_iterator;
662
+ }
663
+
664
+ typename AccumulatorFragmentIterator::Fragment accum_fragment;
665
+ accum_fragment_iterator.load(accum_fragment);
666
+ warp_tile_iterator.store(accum_fragment);
667
+ }
668
+
669
+ CUTLASS_DEVICE
670
+ static void push(size_t pos,
671
+ AccumulatorFragmentIterator const &iterator_begin,
672
+ WarpTileIterator &warp_tile_iterator) {
673
+ int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
674
+ }
675
+ };
676
+
677
+
678
  /// Streams the result to global memory.
  /// Source-consuming path: per output tile iteration, loads both source (C)
  /// fragments, stages the accumulator fragment through shared memory (with
  /// __syncthreads barriers around the store/load), reduces across K
  /// partitions if split-K is in use, applies the output operator with the
  /// broadcast fragment, and conditionally stores the Z and T fragments.
  CUTLASS_DEVICE
  void compute_source_needed_(
    OutputOp const &output_op,                    ///< Output operator
    BroadcastFragment const &broadcast_fragment,  ///< Fragment containing the accumulated partial reduction over columns
    OutputTileIterator destination_iterator,      ///< Tile iterator for destination
    AccumulatorTile const &accumulators,          ///< Complete warp-level accumulator tile
    OutputTileIterator source_iterator1,          ///< Tile iterator for first source accumulator matrix
    OutputTileIterator source_iterator2,          ///< Tile iterator for second source accumulator matrix
    TensorTileIterator tensor_iterator            ///< Threadblock tile iterator for additional tensor operand
  ) {

    // Cleared up front so masked-off (out-of-bounds) accesses contribute zeros.
    typename OutputTileIterator::Fragment source_fragment1;
    source_fragment1.clear();
    typename OutputTileIterator::Fragment source_fragment2;
    source_fragment2.clear();

    //
    // Iterator over warp-level accumulator fragment
    //

    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);

    //
    // Iterate over accumulator tile
    //

    // Unrolling is disabled (factor 1) when the epilogue functor is heavy.
    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {

      //
      // Load the source
      //

      source_iterator1.load(source_fragment1);
      ++source_iterator1;

      source_iterator2.load(source_fragment2);
      ++source_iterator2;

      //
      // Convert and store fragment
      //

      // Barrier before reusing SMEM: prior loads must complete before overwrite.
      __syncthreads();

      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
          iter, accum_fragment_iterator, this->warp_tile_iterator_);

      // Barrier after store: SMEM must be fully written before any thread loads.
      __syncthreads();

      //
      // Load fragments from shared memory
      //

      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];

      shared_load_iterator_.load(aligned_accum_fragment[0]);

      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
      if (kPartitionsK > 1)
      {
        plus <typename SharedLoadIterator::Fragment> add_fragments;
        const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;

        CUTLASS_PRAGMA_UNROLL
        for ( int i = 1; i < kPartitionsK; ++i) {
          shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
          shared_load_iterator_.load(aligned_accum_fragment[i]);
          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
        }

        // Rewind the iterator to its position before the partition sweep.
        shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
      }

      //
      // Apply output operation
      //

      typename OutputTileIterator::Fragment frag_Z;
      typename TensorTileIterator::Fragment frag_T;

      apply_output_operator_(
        frag_Z,
        frag_T,
        output_op,
        aligned_accum_fragment[0],
        source_fragment1,
        source_fragment2,
        broadcast_fragment);

      //
      // Conditionally store fragments
      //

      if (OutputOp::kStoreZ) {
        destination_iterator.store(frag_Z);
        ++destination_iterator;
      }

      if (OutputOp::kStoreT) {
        tensor_iterator.store(frag_T);
        ++tensor_iterator;
      }
    }
  }
784
+
785
  /// Helper to invoke the output functor over each vector of output.
  /// Source-consuming variant: the functor sees the accumulator fragment, both
  /// source (C1/C2) fragments, and the per-column broadcast fragment, writing
  /// its results into frag_Z and frag_T.
  CUTLASS_DEVICE
  void apply_output_operator_(
    typename OutputTileIterator::Fragment &frag_Z,
    typename TensorTileIterator::Fragment &frag_T,
    OutputOp const &output_op,
    typename SharedLoadIterator::Fragment const &frag_AB,
    typename OutputTileIterator::Fragment const &frag_C1,
    typename OutputTileIterator::Fragment const &frag_C2,
    BroadcastFragment const &frag_Broadcast) {

    // Per-access vector views of each fragment's element type.
    using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
    using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
    using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;

    // Reinterpret the flat fragments as arrays of per-access vectors so the
    // output functor can be applied one vector at a time.
    AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
    AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);

    AccumulatorAccessType const *frag_AB_ptr =
      reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);

    OutputAccessType const *frag_C1_ptr =
      reinterpret_cast<OutputAccessType const *>(&frag_C1);

    OutputAccessType const *frag_C2_ptr =
      reinterpret_cast<OutputAccessType const *>(&frag_C2);

    AccessTypeBroadcast const *frag_Broadcast_ptr =
      reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);

    // One functor invocation per output access vector.
    int const kOutputOpIterations =
      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kOutputOpIterations; ++i) {
      // The broadcast fragment spans only one row of column iterations, so it
      // is reused (mod kColumn) across the row iterations of the fragment.
      output_op(
        frag_Z_ptr[i],
        frag_T_ptr[i],
        frag_AB_ptr[i],
        frag_C1_ptr[i],
        frag_C2_ptr[i],
        frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
    }
  }
829
+
830
+ /// Helper to invoke the output functor over each vector of output
831
+ CUTLASS_DEVICE
832
+ void apply_output_operator_source_not_needed_(
833
+ typename OutputTileIterator::Fragment &frag_Z,
834
+ typename TensorTileIterator::Fragment &frag_T,
835
+ OutputOp const &output_op,
836
+ typename SharedLoadIterator::Fragment const &frag_AB,
837
+ BroadcastFragment const &frag_Broadcast) {
838
+
839
+ using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
840
+ using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
841
+ using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
842
+
843
+ AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
844
+ AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
845
+
846
+ AccumulatorAccessType const *frag_AB_ptr =
847
+ reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
848
+
849
+ AccessTypeBroadcast const *frag_Broadcast_ptr =
850
+ reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
851
+
852
+ int const kOutputOpIterations =
853
+ OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
854
+
855
+ CUTLASS_PRAGMA_UNROLL
856
+ for (int i = 0; i < kOutputOpIterations; ++i) {
857
+
858
+ output_op(
859
+ frag_Z_ptr[i],
860
+ frag_T_ptr[i],
861
+ frag_AB_ptr[i],
862
+ frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
863
+ }
864
+ }
865
+
866
+ public:
867
+ /// Stream-K reduce helper
868
+ CUTLASS_DEVICE
869
+ void reduce(
870
+ int reduce_fragment_idx, ///< Reduce fragment index
871
+ OutputOp const &output_op, ///< Output operator
872
+ ElementVector const * broadcast_ptr, ///< Broadcast vector
873
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
874
+ OutputTileIterator source_iterator1, ///< Tile iterator for first source accumulator matrix
875
+ OutputTileIterator source_iterator2, ///< Tile iterator for second source accumulator matrix
876
+ TensorTileIterator tensor_iterator, ///< Threadblock tile iterator for additional tensor operand
877
+ MatrixCoord const &problem_size = ///< Problem size needed to guard against out-of-bounds accesses
878
+ MatrixCoord(Shape::kM, Shape::kN),
879
+ MatrixCoord const &threadblock_offset = ///< Threadblock's initial offset within the problem size space
880
+ MatrixCoord())
881
+ {
882
+
883
+ BroadcastFragment broadcast_fragment;
884
+ load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
885
+
886
+ // Initialize/load source-fragment data
887
+ typename OutputTileIterator::Fragment source_fragment1;
888
+ source_fragment1.clear();
889
+ typename OutputTileIterator::Fragment source_fragment2;
890
+ source_fragment2.clear();
891
+
892
+ if (output_op.is_source_needed())
893
+ {
894
+ source_iterator1 += reduce_fragment_idx;
895
+ source_iterator1.load(source_fragment1);
896
+
897
+ source_iterator2 += reduce_fragment_idx;
898
+ source_iterator2.load(source_fragment2);
899
+ }
900
+
901
+ // Load fragment from shared memory
902
+ typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
903
+ shared_load_iterator_.load(aligned_accum_fragment[0]);
904
+
905
+ // Add fragments shared by other k partitions
906
+ if (kPartitionsK > 1)
907
+ {
908
+ plus <typename SharedLoadIterator::Fragment> add_fragments;
909
+
910
+ CUTLASS_PRAGMA_UNROLL
911
+ for ( int i = 1; i < kPartitionsK; ++i) {
912
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
913
+ shared_load_iterator_.load(aligned_accum_fragment[i]);
914
+ aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
915
+ }
916
+ }
917
+
918
+ //
919
+ // Apply output operation
920
+ //
921
+
922
+ typename OutputTileIterator::Fragment frag_Z;
923
+ typename TensorTileIterator::Fragment frag_T;
924
+
925
+ if (!output_op.is_source_needed()) {
926
+ apply_output_operator_source_not_needed_(
927
+ frag_Z,
928
+ frag_T,
929
+ output_op,
930
+ aligned_accum_fragment[0],
931
+ broadcast_fragment);
932
+ } else {
933
+ apply_output_operator_(
934
+ frag_Z,
935
+ frag_T,
936
+ output_op,
937
+ aligned_accum_fragment[0],
938
+ source_fragment1,
939
+ source_fragment2,
940
+ broadcast_fragment);
941
+ }
942
+
943
+ //
944
+ // Conditionally store fragments
945
+ //
946
+
947
+ if (OutputOp::kStoreZ) {
948
+ destination_iterator += reduce_fragment_idx;
949
+ destination_iterator.store(frag_Z);
950
+ }
951
+
952
+ if (OutputOp::kStoreT) {
953
+ tensor_iterator += reduce_fragment_idx;
954
+ tensor_iterator.store(frag_T);
955
+ }
956
+ }
957
+ };
958
+
959
+
960
+ template <
961
+ typename Shape_,
962
+ typename WarpMmaOperator_,
963
+ int PartitionsK,
964
+ typename OutputTileIterator_,
965
+ typename TensorTileIterator_,
966
+ typename ElementVector_,
967
+ typename AccumulatorFragmentIterator_,
968
+ typename WarpTileIterator_,
969
+ typename SharedLoadIterator_,
970
+ typename OutputOp_,
971
+ typename Padding_,
972
+ int FragmentsPerPartition,
973
+ int IterationsUnroll
974
+ >
975
+ class EpilogueWithBroadcast<
976
+ Shape_,
977
+ WarpMmaOperator_,
978
+ PartitionsK,
979
+ OutputTileIterator_,
980
+ TensorTileIterator_,
981
+ ElementVector_,
982
+ AccumulatorFragmentIterator_,
983
+ WarpTileIterator_,
984
+ SharedLoadIterator_,
985
+ OutputOp_,
986
+ Padding_,
987
+ FragmentsPerPartition,
988
+ IterationsUnroll,
989
+ true
990
+ > :
991
+ public EpilogueBase<
992
+ Shape_,
993
+ typename WarpMmaOperator_::Shape,
994
+ PartitionsK,
995
+ AccumulatorFragmentIterator_,
996
+ WarpTileIterator_,
997
+ Padding_,
998
+ FragmentsPerPartition> {
999
+
1000
+ public:
1001
+
1002
+ using Base = EpilogueBase<
1003
+ Shape_,
1004
+ typename WarpMmaOperator_::Shape,
1005
+ PartitionsK,
1006
+ AccumulatorFragmentIterator_,
1007
+ WarpTileIterator_,
1008
+ Padding_,
1009
+ FragmentsPerPartition>;
1010
+
1011
+ static bool const kIsSingleSource = true;
1012
+ using Shape = Shape_;
1013
+ using WarpMmaOperator = WarpMmaOperator_;
1014
+ static int const kPartitionsK = PartitionsK;
1015
+ using OutputTileIterator = OutputTileIterator_;
1016
+ using TensorTileIterator = TensorTileIterator_;
1017
+ using ElementVector = ElementVector_;
1018
+ using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
1019
+ using WarpTileIterator = WarpTileIterator_;
1020
+ using SharedLoadIterator = SharedLoadIterator_;
1021
+ using OutputOp = OutputOp_;
1022
+ using Padding = Padding_;
1023
+
1024
+ using Layout = layout::RowMajor;
1025
+ using LongIndex = typename Layout::LongIndex;
1026
+
1027
+ /// The complete warp-level accumulator tile
1028
+ using AccumulatorTile = typename Base::AccumulatorTile;
1029
+
1030
+ /// Accumulator element
1031
+ using ElementAccumulator = typename WarpTileIterator::Element;
1032
+
1033
+ /// Compute data type produced by the output op
1034
+ using ElementCompute = typename OutputOp::ElementCompute;
1035
+
1036
+ /// Compute fragment
1037
+ using FragmentCompute = Array<ElementCompute, OutputTileIterator::Fragment::kElements>;
1038
+
1039
+ /// Thread map used by output tile iterators
1040
+ using ThreadMap = typename OutputTileIterator::ThreadMap;
1041
+
1042
+ /// Fragment object used to store the broadcast values
1043
+ using BroadcastFragment = Array<
1044
+ ElementCompute,
1045
+ ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
1046
+
1047
+ /// Output element
1048
+ using ElementOutput = typename OutputTileIterator::Element;
1049
+
1050
+ /// Data type of additional tensor
1051
+ using ElementTensor = typename TensorTileIterator::Element;
1052
+
1053
+ /// Output access size
1054
+ static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
1055
+
1056
+ /// Tensor reference to destination tensor
1057
+ using TensorRef = typename OutputTileIterator::TensorRef;
1058
+
1059
+ /// Tensor reference to sync tensor
1060
+ using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
1061
+
1062
+ /// Const tensor reference to source tensor
1063
+ using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
1064
+
1065
+ /// Array type used to output
1066
+ using OutputAccessType = Array<
1067
+ typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
1068
+
1069
+ /// Array type used by output functor
1070
+ using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
1071
+
1072
+ /// Array type used by output functor
1073
+ using ComputeAccessType = Array<ElementCompute, OutputTileIterator::kElementsPerAccess>;
1074
+
1075
+ /// Tensor access type
1076
+ using TensorAccessType = Array<ElementTensor, OutputTileIterator::kElementsPerAccess>;
1077
+
1078
+ /// Number of warps
1079
+ using WarpCount = typename Base::WarpCount;
1080
+
1081
+ /// Shared memory allocation from epilogue base class
1082
+ using BaseSharedStorage = typename Base::SharedStorage;
1083
+
1084
+ static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
1085
+ static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
1086
+
1087
+ /// Used for the broadcast
1088
+ struct BroadcastDetail {
1089
+
1090
+ /// Number of threads per warp
1091
+ static int const kWarpSize = 32;
1092
+
1093
+ static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
1094
+
1095
+ /// Number of distinct scalar column indices handled by each thread
1096
+ static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
1097
+
1098
+ /// Number of distinct scalar row indices handled by each thread
1099
+ static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
1100
+
1101
+ /// Number of threads per threadblock
1102
+ static int const kThreadCount = kWarpSize * WarpCount::kCount;
1103
+
1104
+ /// Number of distinct threads per row of output tile
1105
+ static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread);
1106
+
1107
+ /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
1108
+ static int const kThreadRows = kThreadCount / kThreadsPerRow;
1109
+
1110
+ /// I'm not sure what I meant here.
1111
+ static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
1112
+
1113
+ /// Shape of the shared memory allocation for the epilogue
1114
+ using StorageShape = MatrixShape<
1115
+ kThreadRows,
1116
+ Shape::kN
1117
+ >;
1118
+
1119
+ /// Debug printing
1120
+ CUTLASS_DEVICE
1121
+ static void print() {
1122
+ #if 0
1123
+ printf("BroadcastDetail {\n");
1124
+ printf(
1125
+ " kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n"
1126
+ "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n",
1127
+ kColumnsPerThread,
1128
+ kRowsPerThread,
1129
+ kThreadCount,
1130
+ kThreadsPerRow,
1131
+ kThreadRows,
1132
+ kThreadAccessesPerRow,
1133
+ StorageShape::kRow,
1134
+ StorageShape::kColumn,
1135
+ StorageShape::kCount
1136
+ );
1137
+ printf("};\n");
1138
+ #endif
1139
+ }
1140
+ };
1141
+
1142
+ /// Shared storage structure (shadows base) with additional SMEM buffer for reduction
1143
+ struct SharedStorage {
1144
+ union {
1145
+ BaseSharedStorage base;
1146
+ };
1147
+
1148
+ CUTLASS_HOST_DEVICE
1149
+ SharedStorage() { }
1150
+ };
1151
+
1152
+ public:
1153
+
1154
+
1155
+ static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
1156
+ "Mismatch between shared load iterator and output tile iterator.");
1157
+
1158
+ static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
1159
+
1160
+ static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
1161
+ "Divisibility");
1162
+
1163
+ private:
1164
+
1165
+ /// Loads fragment from shared memory aligned with output tensor
1166
+ SharedLoadIterator shared_load_iterator_;
1167
+
1168
+ /// Thread index within the threadblock
1169
+ int thread_idx_;
1170
+
1171
+ public:
1172
+
1173
+ /// Constructor
1174
+ CUTLASS_DEVICE
1175
+ EpilogueWithBroadcast(
1176
+ SharedStorage &shared_storage, ///< Shared storage object
1177
+ int thread_idx, ///< ID of a thread within the threadblock
1178
+ int warp_idx, ///< ID of warp within threadblock
1179
+ int lane_idx ///< Id of thread within warp
1180
+ ):
1181
+ Base(shared_storage.base, thread_idx, warp_idx, lane_idx),
1182
+ shared_load_iterator_(shared_storage.base.reference(), thread_idx),
1183
+ thread_idx_(thread_idx)
1184
+ {
1185
+
1186
+ }
1187
+
1188
+ /// Streams the result to global memory
1189
+ CUTLASS_DEVICE
1190
+ void operator()(
1191
+ OutputOp const &output_op, ///< Output operator
1192
+ ElementVector const * broadcast_ptr, ///< Broadcast vector
1193
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
1194
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
1195
+ OutputTileIterator source_iterator, ///< Tile iterator for source accumulator matrix
1196
+ TensorTileIterator tensor_iterator, ///< Threadblock tile iterator for additional tensor operand
1197
+ MatrixCoord const &problem_size = ///< Problem size needed to guard against out-of-bounds accesses
1198
+ MatrixCoord(Shape::kM, Shape::kN),
1199
+ MatrixCoord const &threadblock_offset = ///< Threadblock's initial offset within the problem size space
1200
+ MatrixCoord()) {
1201
+
1202
+ BroadcastFragment broadcast_fragment;
1203
+
1204
+ load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
1205
+
1206
+ if (!output_op.is_source_needed()) {
1207
+ compute_source_not_needed_(
1208
+ output_op,
1209
+ broadcast_fragment,
1210
+ destination_iterator,
1211
+ accumulators,
1212
+ tensor_iterator);
1213
+ }
1214
+ else {
1215
+ compute_source_needed_(
1216
+ output_op,
1217
+ broadcast_fragment,
1218
+ destination_iterator,
1219
+ accumulators,
1220
+ source_iterator,
1221
+ tensor_iterator);
1222
+ }
1223
+ }
1224
+
1225
+ private:
1226
+
1227
+ CUTLASS_DEVICE
1228
+ void load_broadcast_fragment_(
1229
+ BroadcastFragment & broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns
1230
+ ElementVector const * broadcast_ptr, ///< Broadcast vector
1231
+ MatrixCoord const &problem_size, ///< Problem size needed to guard against out-of-bounds accesses
1232
+ MatrixCoord const &threadblock_offset ///< Threadblock's initial offset within the problem size space
1233
+ ) {
1234
+
1235
+ broadcast_fragment.clear();
1236
+
1237
+ // If no pointer is supplied, set with all zeros and avoid memory accesses
1238
+ if (!broadcast_ptr) {
1239
+ return;
1240
+ }
1241
+
1242
+ int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
1243
+
1244
+ int thread_column_idx = threadblock_offset.column() + thread_initial_column;
1245
+ broadcast_ptr += thread_initial_column;
1246
+
1247
+ NumericArrayConverter<ElementCompute, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
1248
+ using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
1249
+ using ComputeFragmentType = Array<ElementCompute, BroadcastDetail::kElementsPerAccess>;
1250
+
1251
+ ComputeFragmentType *frag_ptr = reinterpret_cast<ComputeFragmentType *>(&broadcast_fragment);
1252
+
1253
+ CUTLASS_PRAGMA_UNROLL
1254
+ for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
1255
+
1256
+ AccessType loaded;
1257
+
1258
+ loaded.clear();
1259
+
1260
+ if (thread_column_idx < problem_size.column()) {
1261
+ loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
1262
+ }
1263
+
1264
+ ComputeFragmentType cvt = converter(loaded);
1265
+ frag_ptr[j] = cvt;
1266
+
1267
+ thread_column_idx += ThreadMap::Delta::kColumn;
1268
+ broadcast_ptr += ThreadMap::Delta::kColumn;
1269
+ }
1270
+ }
1271
+
1272
+ template <class Seq>
1273
+ struct acc2smem_source_not_needed;
1274
+
1275
+ template <size_t... Seq>
1276
+ struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
1277
+ template <int Advance>
1278
+ CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
1279
+ WarpTileIterator &warp_tile_iterator) {
1280
+ CUTLASS_PRAGMA_UNROLL
1281
+ for (int i = 0; i < Advance; i++) {
1282
+ ++accum_fragment_iterator;
1283
+ }
1284
+
1285
+ CUTLASS_PRAGMA_UNROLL
1286
+ for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
1287
+ typename AccumulatorFragmentIterator::Fragment accum_fragment;
1288
+
1289
+ accum_fragment_iterator.load(accum_fragment);
1290
+ ++accum_fragment_iterator;
1291
+
1292
+ warp_tile_iterator.store(accum_fragment);
1293
+ if (p < Base::kFragmentsPerIteration - 1) {
1294
+ warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
1295
+ }
1296
+ }
1297
+
1298
+ if (Base::kFragmentsPerIteration > 1) {
1299
+ warp_tile_iterator.add_pointer_offset(kSmemPointerOffset *
1300
+ (1 - Base::kFragmentsPerIteration));
1301
+ }
1302
+ }
1303
+
1304
+ CUTLASS_DEVICE
1305
+ static void push(size_t pos,
1306
+ AccumulatorFragmentIterator const &iterator_begin,
1307
+ WarpTileIterator &warp_tile_iterator) {
1308
+ int dummy[] = {
1309
+ (pos == (Seq * Base::kFragmentsPerIteration)) &&
1310
+ (helper<Seq * Base::kFragmentsPerIteration>(iterator_begin, warp_tile_iterator), 0)...};
1311
+
1312
+ CUTLASS_UNUSED(dummy[0]);
1313
+ }
1314
+ };
1315
+
1316
+ /// Streams the result to global memory
1317
+ CUTLASS_DEVICE
1318
+ void compute_source_not_needed_(
1319
+ OutputOp const &output_op, ///< Output operator
1320
+ BroadcastFragment const &broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns
1321
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
1322
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
1323
+ TensorTileIterator tensor_iterator ///< Threadblock tile iterator for additioanl tensor operand
1324
+ ) {
1325
+
1326
+ //
1327
+ // Iterator over warp-level accumulator fragment
1328
+ //
1329
+
1330
+ AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
1331
+
1332
+ //
1333
+ // Iterate over accumulator tile
1334
+ //
1335
+
1336
+ // CUTLASS_PRAGMA_UNROLL
1337
+ #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration : 1)
1338
+ for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) {
1339
+
1340
+ //
1341
+ // Convert and store fragment
1342
+ //
1343
+
1344
+
1345
+ __syncthreads();
1346
+
1347
+ acc2smem_source_not_needed<
1348
+ cutlass::make_index_sequence<OutputTileIterator::kIterations /
1349
+ Base::kFragmentsPerIteration>>::push(iter,
1350
+ accum_fragment_iterator,
1351
+ this->warp_tile_iterator_);
1352
+
1353
+ __syncthreads();
1354
+
1355
+ //
1356
+ // Load fragments from shared memory
1357
+ //
1358
+
1359
+ CUTLASS_PRAGMA_UNROLL
1360
+ for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
1361
+
1362
+
1363
+ typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
1364
+
1365
+ shared_load_iterator_.load(aligned_accum_fragment[0]);
1366
+
1367
+ if (p < Base::kFragmentsPerIteration - 1) {
1368
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
1369
+ }
1370
+ else if (kPartitionsK > 1) {
1371
+
1372
+ plus <typename SharedLoadIterator::Fragment> add_fragments;
1373
+
1374
+ CUTLASS_PRAGMA_UNROLL
1375
+ for ( int i = 1; i < kPartitionsK; ++i) {
1376
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
1377
+ shared_load_iterator_.load(aligned_accum_fragment[i]);
1378
+ aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
1379
+ }
1380
+
1381
+ shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
1382
+ }
1383
+
1384
+ //
1385
+ // Apply output operation
1386
+ //
1387
+
1388
+ typename OutputTileIterator::Fragment frag_Z;
1389
+ typename TensorTileIterator::Fragment frag_T;
1390
+
1391
+ apply_output_operator_source_not_needed_(
1392
+ frag_Z,
1393
+ frag_T,
1394
+ output_op,
1395
+ aligned_accum_fragment[0],
1396
+ broadcast_fragment);
1397
+
1398
+ //
1399
+ // Conditionally store fragments
1400
+ //
1401
+
1402
+ if (OutputOp::kStoreZ) {
1403
+ destination_iterator.store(frag_Z);
1404
+ ++destination_iterator;
1405
+ }
1406
+
1407
+ if (OutputOp::kStoreT) {
1408
+ tensor_iterator.store(frag_T);
1409
+ ++tensor_iterator;
1410
+ }
1411
+ }
1412
+
1413
+ if (Base::kFragmentsPerIteration > 1) {
1414
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
1415
+ }
1416
+ }
1417
+ }
1418
+
1419
+
1420
+ template<class Seq>
1421
+ struct acc2smem_source_needed;
1422
+
1423
+ template <size_t... Seq>
1424
+ struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
1425
+ template<int Advance>
1426
+ CUTLASS_DEVICE
1427
+ static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
1428
+ WarpTileIterator &warp_tile_iterator) {
1429
+ CUTLASS_PRAGMA_UNROLL
1430
+ for (int i = 0; i < Advance; i++) {
1431
+ ++accum_fragment_iterator;
1432
+ }
1433
+
1434
+ typename AccumulatorFragmentIterator::Fragment accum_fragment;
1435
+ accum_fragment_iterator.load(accum_fragment);
1436
+ warp_tile_iterator.store(accum_fragment);
1437
+ }
1438
+
1439
+ CUTLASS_DEVICE
1440
+ static void push(size_t pos,
1441
+ AccumulatorFragmentIterator const &iterator_begin,
1442
+ WarpTileIterator &warp_tile_iterator) {
1443
+ int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
1444
+ }
1445
+ };
1446
+
1447
+
1448
+ /// Streams the result to global memory
1449
+ CUTLASS_DEVICE
1450
+ void compute_source_needed_(
1451
+ OutputOp const &output_op, ///< Output operator
1452
+ BroadcastFragment const &broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns
1453
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
1454
+ AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile
1455
+ OutputTileIterator source_iterator, ///< Tile iterator for source accumulator matrix
1456
+ TensorTileIterator tensor_iterator ///< Threadblock tile iterator for additioanl tensor operand
1457
+ ) {
1458
+
1459
+ typename OutputTileIterator::Fragment source_fragment;
1460
+ source_fragment.clear();
1461
+
1462
+ //
1463
+ // Iterator over warp-level accumulator fragment
1464
+ //
1465
+
1466
+ AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
1467
+
1468
+ //
1469
+ // Iterate over accumulator tile
1470
+ //
1471
+
1472
+ #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
1473
+ for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
1474
+
1475
+ //
1476
+ // Load the source
1477
+ //
1478
+
1479
+ source_iterator.load(source_fragment);
1480
+ ++source_iterator;
1481
+
1482
+ //
1483
+ // Convert and store fragment
1484
+ //
1485
+
1486
+ __syncthreads();
1487
+
1488
+ acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
1489
+ iter, accum_fragment_iterator, this->warp_tile_iterator_);
1490
+
1491
+ __syncthreads();
1492
+
1493
+ //
1494
+ // Load fragments from shared memory
1495
+ //
1496
+
1497
+ typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
1498
+
1499
+ shared_load_iterator_.load(aligned_accum_fragment[0]);
1500
+
1501
+ // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
1502
+ if (kPartitionsK > 1)
1503
+ {
1504
+ plus <typename SharedLoadIterator::Fragment> add_fragments;
1505
+ const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK;
1506
+
1507
+ CUTLASS_PRAGMA_UNROLL
1508
+ for ( int i = 1; i < kPartitionsK; ++i) {
1509
+ shared_load_iterator_.add_tile_offset({tile_row_offset , 0});
1510
+ shared_load_iterator_.load(aligned_accum_fragment[i]);
1511
+ aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
1512
+ }
1513
+
1514
+ shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0});
1515
+ }
1516
+
1517
+ //
1518
+ // Apply output operation
1519
+ //
1520
+
1521
+ typename OutputTileIterator::Fragment frag_Z;
1522
+ typename TensorTileIterator::Fragment frag_T;
1523
+
1524
+ apply_output_operator_(
1525
+ frag_Z,
1526
+ frag_T,
1527
+ output_op,
1528
+ aligned_accum_fragment[0],
1529
+ source_fragment,
1530
+ broadcast_fragment);
1531
+
1532
+ //
1533
+ // Conditionally store fragments
1534
+ //
1535
+
1536
+ if (OutputOp::kStoreZ) {
1537
+ destination_iterator.store(frag_Z);
1538
+ ++destination_iterator;
1539
+ }
1540
+
1541
+ if (OutputOp::kStoreT) {
1542
+ tensor_iterator.store(frag_T);
1543
+ ++tensor_iterator;
1544
+ }
1545
+ }
1546
+ }
1547
+
1548
+ /// Helper to invoke the output functor over each vector of output
1549
+ CUTLASS_DEVICE
1550
+ void apply_output_operator_(
1551
+ typename OutputTileIterator::Fragment &frag_Z,
1552
+ typename TensorTileIterator::Fragment &frag_T,
1553
+ OutputOp const &output_op,
1554
+ typename SharedLoadIterator::Fragment const &frag_AB,
1555
+ typename OutputTileIterator::Fragment const &frag_C,
1556
+ BroadcastFragment const &frag_Broadcast) {
1557
+
1558
+ using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
1559
+ using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
1560
+ using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
1561
+
1562
+ AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
1563
+ AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
1564
+
1565
+ AccumulatorAccessType const *frag_AB_ptr =
1566
+ reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
1567
+
1568
+ OutputAccessType const *frag_C_ptr =
1569
+ reinterpret_cast<OutputAccessType const *>(&frag_C);
1570
+
1571
+ AccessTypeBroadcast const *frag_Broadcast_ptr =
1572
+ reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
1573
+
1574
+ int const kOutputOpIterations =
1575
+ OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
1576
+
1577
+ CUTLASS_PRAGMA_UNROLL
1578
+ for (int i = 0; i < kOutputOpIterations; ++i) {
1579
+ output_op(
1580
+ frag_Z_ptr[i],
1581
+ frag_T_ptr[i],
1582
+ frag_AB_ptr[i],
1583
+ frag_C_ptr[i],
1584
+ frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
1585
+ }
1586
+ }
1587
+
1588
+ /// Helper to invoke the output functor over each vector of output
1589
+ CUTLASS_DEVICE
1590
+ void apply_output_operator_source_not_needed_(
1591
+ typename OutputTileIterator::Fragment &frag_Z,
1592
+ typename TensorTileIterator::Fragment &frag_T,
1593
+ OutputOp const &output_op,
1594
+ typename SharedLoadIterator::Fragment const &frag_AB,
1595
+ BroadcastFragment const &frag_Broadcast) {
1596
+
1597
+ using AccessTypeZ = Array<typename OutputTileIterator::Element, kElementsPerAccess>;
1598
+ using AccessTypeT = Array<typename TensorTileIterator::Element, kElementsPerAccess>;
1599
+ using AccessTypeBroadcast = Array<ElementCompute, kElementsPerAccess>;
1600
+
1601
+ AccessTypeZ *frag_Z_ptr = reinterpret_cast<AccessTypeZ *>(&frag_Z);
1602
+ AccessTypeT *frag_T_ptr = reinterpret_cast<AccessTypeT *>(&frag_T);
1603
+
1604
+ AccumulatorAccessType const *frag_AB_ptr =
1605
+ reinterpret_cast<AccumulatorAccessType const *>(&frag_AB);
1606
+
1607
+ AccessTypeBroadcast const *frag_Broadcast_ptr =
1608
+ reinterpret_cast<AccessTypeBroadcast const *>(&frag_Broadcast);
1609
+
1610
+ int const kOutputOpIterations =
1611
+ OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;
1612
+
1613
+ CUTLASS_PRAGMA_UNROLL
1614
+ for (int i = 0; i < kOutputOpIterations; ++i) {
1615
+
1616
+ output_op(
1617
+ frag_Z_ptr[i],
1618
+ frag_T_ptr[i],
1619
+ frag_AB_ptr[i],
1620
+ frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]);
1621
+ }
1622
+ }
1623
+
1624
+
1625
+ public:
1626
+ /// Stream-K reduce helper
1627
+ CUTLASS_DEVICE
1628
+ void reduce(
1629
+ int reduce_fragment_idx, ///< Reduce fragment index
1630
+ OutputOp const &output_op, ///< Output operator
1631
+ ElementVector const * broadcast_ptr, ///< Broadcast vector
1632
+ OutputTileIterator destination_iterator, ///< Tile iterator for destination
1633
+ OutputTileIterator source_iterator, ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
1634
+ TensorTileIterator tensor_iterator, ///< Threadblock tile iterator for additional tensor operand
1635
+ MatrixCoord const &problem_size = ///< Problem size needed to guard against out-of-bounds accesses
1636
+ MatrixCoord(Shape::kM, Shape::kN),
1637
+ MatrixCoord const &threadblock_offset = ///< Threadblock's initial offset within the problem size space
1638
+ MatrixCoord())
1639
+ {
1640
+
1641
+ BroadcastFragment broadcast_fragment;
1642
+ load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset);
1643
+
1644
+ // Initialize/load source-fragment data
1645
+ typename OutputTileIterator::Fragment source_fragment;
1646
+ source_fragment.clear();
1647
+
1648
+ if (output_op.is_source_needed())
1649
+ {
1650
+ source_iterator += reduce_fragment_idx;
1651
+ source_iterator.load(source_fragment);
1652
+ }
1653
+
1654
+ // Load fragment from shared memory
1655
+ typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
1656
+ shared_load_iterator_.load(aligned_accum_fragment[0]);
1657
+
1658
+ // Add fragments shared by other k partitions
1659
+ if (kPartitionsK > 1)
1660
+ {
1661
+ plus <typename SharedLoadIterator::Fragment> add_fragments;
1662
+
1663
+ CUTLASS_PRAGMA_UNROLL
1664
+ for ( int i = 1; i < kPartitionsK; ++i) {
1665
+ shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
1666
+ shared_load_iterator_.load(aligned_accum_fragment[i]);
1667
+ aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
1668
+ }
1669
+ }
1670
+
1671
+ //
1672
+ // Apply output operation
1673
+ //
1674
+
1675
+ typename OutputTileIterator::Fragment frag_Z;
1676
+ typename TensorTileIterator::Fragment frag_T;
1677
+
1678
+ if (!output_op.is_source_needed()) {
1679
+ apply_output_operator_source_not_needed_(
1680
+ frag_Z,
1681
+ frag_T,
1682
+ output_op,
1683
+ aligned_accum_fragment[0],
1684
+ broadcast_fragment);
1685
+ } else {
1686
+ apply_output_operator_(
1687
+ frag_Z,
1688
+ frag_T,
1689
+ output_op,
1690
+ aligned_accum_fragment[0],
1691
+ source_fragment,
1692
+ broadcast_fragment);
1693
+ }
1694
+
1695
+ //
1696
+ // Conditionally store fragments
1697
+ //
1698
+
1699
+ if (OutputOp::kStoreZ) {
1700
+ destination_iterator.store(frag_Z);
1701
+ ++destination_iterator;
1702
+ }
1703
+
1704
+ if (OutputOp::kStoreT) {
1705
+ tensor_iterator.store(frag_T);
1706
+ ++tensor_iterator;
1707
+ }
1708
+ }
1709
+ };
1710
+
1711
+ ////////////////////////////////////////////////////////////////////////////////
1712
+
1713
+ } // namespace threadblock
1714
+ } // namespace epilogue
1715
+ } // namespace cutlass
1716
+
1717
+ ////////////////////////////////////////////////////////////////////////////////