Kernels
wyldecat Claude Opus 4.6 (1M context) committed on
Commit
0045757
·
1 Parent(s): 60a628a

feat: add grouped poly norm CUDA kernel with scores and hidden_clamp fusion

Browse files

Hand-written CUDA kernel for GroupedFusedMulPolyNorm (MoE).
Fuses polynomial normalization, mul, scores multiplication, and
hidden_clamp into a single kernel launch per forward/backward.

- 4 kernel variants: fwd/bwd × vectorized(width=8)/scalar
- scores: nullable, always fp32, fused into fwd output and bwd gradients
- hidden_clamp: < 0 disabled, >= 0 clamps input/mul/output with correct
backward gradient masking (recompute output for mask, no extra memory)
- ROCm compatible (64-bit warp sync mask)
- C++ op registration for torch.compile (register_fake in Python layer)
- ~3x faster than torch.compile'd PyTorch reference on B200

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

activation/grouped_poly_norm.cu ADDED
@@ -0,0 +1,657 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/Functions.h>
2
+ #include <ATen/cuda/CUDAContext.h>
3
+ #include <c10/cuda/CUDAGuard.h>
4
+ #include <torch/all.h>
5
+
6
+ #include "assert_utils.h"
7
+ #include "cuda_compat.h"
8
+ #include "dispatch_utils.h"
9
+
10
+ namespace motif {
11
+
12
// Fixed-size vector wrapper whose alignment equals its total byte size,
// so a vec_t<half, 8> or vec_t<float, 4> load/store compiles to a single
// aligned 128-bit memory transaction (coalesced across the warp).
template <typename type, int N> struct alignas(sizeof(type) * N) vec_t {
  type data[N];
};
15
+
16
// Binary search over per-expert cumulative row offsets: returns the index
// of the first entry in `offsets` that is strictly greater than `row`,
// i.e. the expert that owns this row. `offsets` must be non-decreasing.
//
// Fix: the original used a fixed 12-iteration loop (`#pragma unroll 6`,
// break on lo >= hi), which silently returns a wrong index whenever
// num_experts > 4096. A plain while-loop has no such cap and costs the
// same O(log num_experts) iterations.
__device__ __forceinline__ int find_expert(const int32_t *__restrict__ offsets,
                                           int num_experts, int row) {
  int lo = 0, hi = num_experts;
  while (lo < hi) {
    int mid = (lo + hi) >> 1;
    if (offsets[mid] <= row)
      lo = mid + 1;
    else
      hi = mid;
  }
  return lo;
}
30
+
31
// Butterfly (XOR-shuffle) sum reduction of a float4 within one warp.
// After log2(WARP_SIZE) rounds every lane holds the full warp sum in all
// four components. All lanes of the warp must participate.
__device__ __forceinline__ float4 warp_reduce_f4(float4 v) {
#ifndef USE_ROCM
  constexpr unsigned int FULL_MASK = 0xffffffff;
#else
  // ROCm wavefronts are 64 lanes wide, so the participation mask is 64-bit.
  constexpr unsigned long long FULL_MASK = 0xffffffffffffffffULL;
#endif
#pragma unroll
  for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
    v.x += __shfl_xor_sync(FULL_MASK, v.x, mask);
    v.y += __shfl_xor_sync(FULL_MASK, v.y, mask);
    v.z += __shfl_xor_sync(FULL_MASK, v.z, mask);
    v.w += __shfl_xor_sync(FULL_MASK, v.w, mask);
  }
  return v;
}
46
+
47
// Block-wide sum reduction of a float4, two-level: warp-local reduce, then
// warp 0 reduces the per-warp partials, and the result is broadcast through
// shared memory so EVERY thread returns the full block sum.
// Contains __syncthreads(): all threads of the block must call this, and
// BLOCK_SIZE must be a multiple of WARP_SIZE.
template <int BLOCK_SIZE>
__device__ __forceinline__ float4 block_reduce_f4(float4 v) {
  constexpr int NUM_WARPS = BLOCK_SIZE / WARP_SIZE;
  __shared__ float4 warp_results[NUM_WARPS];

  v = warp_reduce_f4(v);
  const int warp_id = threadIdx.x / WARP_SIZE;
  const int lane_id = threadIdx.x % WARP_SIZE;

  // One partial per warp.
  if (lane_id == 0) warp_results[warp_id] = v;
  __syncthreads();

  // Warp 0 loads the partials (zero-padding the unused lanes) and reduces.
  if (warp_id == 0 && lane_id < NUM_WARPS)
    v = warp_results[lane_id];
  else
    v = make_float4(0.f, 0.f, 0.f, 0.f);

  if (warp_id == 0) v = warp_reduce_f4(v);

  // Broadcast the final sum to the whole block. The trailing barrier also
  // makes back-to-back calls within one kernel safe: `warp_results` is not
  // rewritten until every thread has consumed `result`.
  __shared__ float4 result;
  if (threadIdx.x == 0) result = v;
  __syncthreads();
  return result;
}
71
+
72
// ---------------------------------------------------------------------------
// Grouped PolyNorm Forward — vectorized (width > 0)
// Pass 1: accumulate sum_x2, sum_x4, sum_x6 for RMS stats
// Pass 2: compute poly * mul output, save inv_rms for backward
//
// Launch: one block per row (grid.x = N rows). Preconditions: D divisible by
// `width`, and input/mul/output aligned for vec_t loads (16B when
// width*sizeof(scalar_t) == 16). `scores` may be null (treated as 1.0).
// `hidden_clamp < 0` disables all clamping; otherwise input, mul, and the
// final output are each clamped to [-hidden_clamp, hidden_clamp].
// The three inverse RMS values per row are written to `inv_rms` ([N, 3])
// for reuse in the backward kernel.
// ---------------------------------------------------------------------------
template <typename scalar_t, typename acc_t, int width, int BLOCK_SIZE>
__global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
grouped_poly_norm_fwd_kernel(
    scalar_t *__restrict__ output,
    acc_t *__restrict__ inv_rms,
    const scalar_t *__restrict__ input,
    const scalar_t *__restrict__ mul,
    const scalar_t *__restrict__ weight,
    const scalar_t *__restrict__ bias,
    const int32_t *__restrict__ offsets,
    const float *__restrict__ scores, // nullable, always fp32
    const acc_t eps, const int D, const int num_experts,
    const int expert_offset,
    const acc_t hidden_clamp) { // < 0 = disabled
  using v_t = vec_t<scalar_t, width>;
  const bool do_clamp = (hidden_clamp >= acc_t(0));

  const int row = blockIdx.x;
  const int vec_d = D / width;
  const int64_t base = (int64_t)row * vec_d; // int64 to avoid overflow on large N*D

  const v_t *__restrict__ in_v = reinterpret_cast<const v_t *>(input) + base;

  // Per-row expert lookup; expert_offset shifts into the global expert table
  // (e.g. for expert-parallel shards).
  const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
  const acc_t w0 = weight[eidx * 3 + 0];
  const acc_t w1 = weight[eidx * 3 + 1];
  const acc_t w2 = weight[eidx * 3 + 2];
  const acc_t b_val = bias[eidx];
  const acc_t score = (scores != nullptr) ? (acc_t)scores[row] : acc_t(1);

  // Pass 1: RMS stats (on clamped input if enabled)
  acc_t s2 = 0, s4 = 0, s6 = 0;
  for (int i = threadIdx.x; i < vec_d; i += BLOCK_SIZE) {
    v_t xv = in_v[i];
#pragma unroll
    for (int j = 0; j < width; ++j) {
      acc_t x = xv.data[j];
      if (do_clamp) x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
      acc_t x2 = x * x;
      s2 += x2;
      s4 += x2 * x2;
      s6 += x2 * x2 * x2;
    }
  }

  float4 sums = block_reduce_f4<BLOCK_SIZE>(make_float4(s2, s4, s6, 0.f));

  // inverse RMS of x, x^2, x^3 respectively: 1 / sqrt(mean(x^(2k)) + eps)
  const acc_t inv_d = acc_t(1) / D;
  const acc_t ir1 = rsqrtf(sums.x * inv_d + eps);
  const acc_t ir2 = rsqrtf(sums.y * inv_d + eps);
  const acc_t ir3 = rsqrtf(sums.z * inv_d + eps);

  // Save inv_rms for backward
  if (threadIdx.x == 0) {
    inv_rms[row * 3 + 0] = ir1;
    inv_rms[row * 3 + 1] = ir2;
    inv_rms[row * 3 + 2] = ir3;
  }

  // Fold the per-row inverse RMS into the polynomial weights once.
  const acc_t w2ir1 = w2 * ir1;
  const acc_t w1ir2 = w1 * ir2;
  const acc_t w0ir3 = w0 * ir3;

  // Pass 2: output = poly * mul (with clamping)
  const v_t *__restrict__ m_v = reinterpret_cast<const v_t *>(mul) + base;
  v_t *__restrict__ out_v = reinterpret_cast<v_t *>(output) + base;

  for (int i = threadIdx.x; i < vec_d; i += BLOCK_SIZE) {
    v_t xv = in_v[i];
    v_t mv = m_v[i];
    v_t ov;
#pragma unroll
    for (int j = 0; j < width; ++j) {
      acc_t x = xv.data[j];
      if (do_clamp) x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
      acc_t m = (acc_t)mv.data[j];
      if (do_clamp) m = fminf(fmaxf(m, -hidden_clamp), hidden_clamp);
      acc_t x2 = x * x;
      acc_t x3 = x2 * x;
      // poly = w0*x^3/rms(x^3) + w1*x^2/rms(x^2) + w2*x/rms(x) + bias
      acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
      acc_t out_val = poly * m * score;
      if (do_clamp) out_val = fminf(fmaxf(out_val, -hidden_clamp), hidden_clamp);
      ov.data[j] = (scalar_t)out_val;
    }
    out_v[i] = ov;
  }
}
164
+
165
// Scalar fallback forward: same math as grouped_poly_norm_fwd_kernel but
// element-wise, used when D is not divisible by the vector width or the
// dtype/alignment rules out vectorized loads. One block per row.
template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
__global__ void __launch_bounds__(BLOCK_SIZE)
grouped_poly_norm_fwd_scalar(
    scalar_t *__restrict__ output,
    acc_t *__restrict__ inv_rms,
    const scalar_t *__restrict__ input,
    const scalar_t *__restrict__ mul,
    const scalar_t *__restrict__ weight,
    const scalar_t *__restrict__ bias,
    const int32_t *__restrict__ offsets,
    const float *__restrict__ scores, // nullable, always fp32
    const acc_t eps, const int D, const int num_experts,
    const int expert_offset,
    const acc_t hidden_clamp) {
  const bool do_clamp = (hidden_clamp >= acc_t(0));
  const int row = blockIdx.x;
  const int64_t off = (int64_t)row * D;

  // Per-row expert coefficients and optional routing score.
  const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
  const acc_t w0 = weight[eidx * 3], w1 = weight[eidx * 3 + 1], w2 = weight[eidx * 3 + 2];
  const acc_t b_val = bias[eidx];
  const acc_t score = (scores != nullptr) ? (acc_t)scores[row] : acc_t(1);

  // Pass 1: second/fourth/sixth moments of the (optionally clamped) input.
  acc_t s2 = 0, s4 = 0, s6 = 0;
  for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
    acc_t x = input[off + i];
    if (do_clamp) x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
    acc_t x2 = x * x;
    s2 += x2; s4 += x2 * x2; s6 += x2 * x2 * x2;
  }

  float4 sums = block_reduce_f4<BLOCK_SIZE>(make_float4(s2, s4, s6, 0.f));
  const acc_t inv_d = acc_t(1) / D;
  const acc_t ir1 = rsqrtf(sums.x * inv_d + eps);
  const acc_t ir2 = rsqrtf(sums.y * inv_d + eps);
  const acc_t ir3 = rsqrtf(sums.z * inv_d + eps);

  // Persist inverse RMS values for the backward pass.
  if (threadIdx.x == 0) {
    inv_rms[row * 3] = ir1; inv_rms[row * 3 + 1] = ir2; inv_rms[row * 3 + 2] = ir3;
  }

  // Pass 2: out = (w0*x^3*ir3 + w1*x^2*ir2 + w2*x*ir1 + b) * mul * score.
  const acc_t w2ir1 = w2 * ir1, w1ir2 = w1 * ir2, w0ir3 = w0 * ir3;
  for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
    acc_t x = input[off + i];
    if (do_clamp) x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
    acc_t m = (acc_t)mul[off + i];
    if (do_clamp) m = fminf(fmaxf(m, -hidden_clamp), hidden_clamp);
    acc_t x2 = x * x, x3 = x2 * x;
    acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
    acc_t out_val = poly * m * score;
    if (do_clamp) out_val = fminf(fmaxf(out_val, -hidden_clamp), hidden_clamp);
    output[off + i] = (scalar_t)out_val;
  }
}
220
+
221
// ---------------------------------------------------------------------------
// Grouped PolyNorm Backward — vectorized (width > 0)
// Weight/bias grads use atomicAdd directly (no temp buffer + scatter_add).
//
// One block per row. Consumes the inv_rms values saved by the forward pass
// (so `eps` is unused here). When clamping is enabled, the forward output is
// recomputed per element to derive the output-clamp mask instead of storing
// it — gradients are zeroed wherever clamp saturated input, mul, or output.
// grad_scores must be null iff scores is null.
// ---------------------------------------------------------------------------
template <typename scalar_t, typename acc_t, int width, int BLOCK_SIZE>
__global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
grouped_poly_norm_bwd_kernel(
    scalar_t *__restrict__ grad_input,
    scalar_t *__restrict__ grad_mul,
    float *__restrict__ weight_grad, // [num_total_experts, 3] fp32
    float *__restrict__ bias_grad,   // [num_total_experts] fp32
    const scalar_t *__restrict__ grad_output,
    const scalar_t *__restrict__ input,
    const scalar_t *__restrict__ mul,
    const scalar_t *__restrict__ weight,
    const scalar_t *__restrict__ bias,
    const int32_t *__restrict__ offsets,
    const acc_t *__restrict__ inv_rms,
    const float *__restrict__ scores,  // nullable, always fp32
    acc_t *__restrict__ grad_scores,   // nullable (null when scores is null)
    const acc_t eps, const int D, const int num_experts,
    const int expert_offset,
    const acc_t hidden_clamp) {
  using v_t = vec_t<scalar_t, width>;
  const bool do_clamp = (hidden_clamp >= acc_t(0));

  const int row = blockIdx.x;
  const int vec_d = D / width;
  const int64_t base = (int64_t)row * vec_d;

  const v_t *__restrict__ in_v = reinterpret_cast<const v_t *>(input) + base;
  const v_t *__restrict__ go_v = reinterpret_cast<const v_t *>(grad_output) + base;
  const v_t *__restrict__ m_v = reinterpret_cast<const v_t *>(mul) + base;

  const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
  const acc_t w0 = weight[eidx * 3 + 0];
  const acc_t w1 = weight[eidx * 3 + 1];
  const acc_t w2 = weight[eidx * 3 + 2];
  const acc_t b_val = bias[eidx];
  const acc_t score = (scores != nullptr) ? (acc_t)scores[row] : acc_t(1);

  // Inverse RMS of x, x^2, x^3 saved by the forward kernel.
  const acc_t ir1 = inv_rms[row * 3 + 0];
  const acc_t ir2 = inv_rms[row * 3 + 1];
  const acc_t ir3 = inv_rms[row * 3 + 2];

  const acc_t w2ir1 = w2 * ir1;
  const acc_t w1ir2 = w1 * ir2;
  const acc_t w0ir3 = w0 * ir3;

  // ---- Pass 1: dot products (with clamp masks) ----
  // sdpx^k = sum_j dp_j * x_j^k where dp = go * m * score; these feed the
  // RMS-normalization correction terms and (sdp) the bias gradient.
  acc_t sdpx = 0, sdpx2 = 0, sdpx3 = 0, sdp = 0;

  for (int i = threadIdx.x; i < vec_d; i += BLOCK_SIZE) {
    v_t xv = in_v[i];
    v_t gv = go_v[i];
    v_t mv = m_v[i];

#pragma unroll
    for (int j = 0; j < width; ++j) {
      acc_t x_orig = xv.data[j];
      acc_t x = do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
      acc_t m_orig = (acc_t)mv.data[j];
      acc_t m = do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
      acc_t go = (acc_t)gv.data[j];

      // Output clamp mask: recompute pre-clamp output
      if (do_clamp) {
        acc_t x2 = x * x, x3 = x2 * x;
        acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
        acc_t out_pre = poly * m * score;
        if (fabsf(out_pre) > hidden_clamp) go = acc_t(0);
      }

      acc_t x2 = x * x;
      acc_t dp = go * m * score;
      sdp += dp;
      sdpx += dp * x;
      sdpx2 += dp * x2;
      sdpx3 += dp * x2 * x;
    }
  }

  float4 sums = block_reduce_f4<BLOCK_SIZE>(make_float4(sdpx, sdpx2, sdpx3, sdp));

  const acc_t inv_d = acc_t(1) / D;
  const acc_t s1 = sums.x * inv_d;
  const acc_t s2 = sums.y * inv_d;
  const acc_t s3 = sums.z * inv_d;
  const acc_t bias_grad_val = sums.w; // d out / d bias = m * score, summed

  // Correction coefficients from differentiating through each 1/rms term.
  const acc_t cx = w2 * s1 * ir1 * ir1;
  const acc_t cx2 = w1 * s2 * ir2 * ir2;
  const acc_t cx3 = w0 * s3 * ir3 * ir3;

  // ---- Pass 2: grad_input + grad_mul + weight grads + grad_scores ----
  acc_t dw0 = 0, dw1 = 0, dw2 = 0, gs_acc = 0;

  v_t *__restrict__ gi_v = reinterpret_cast<v_t *>(grad_input) + base;
  v_t *__restrict__ gm_v = reinterpret_cast<v_t *>(grad_mul) + base;

  for (int i = threadIdx.x; i < vec_d; i += BLOCK_SIZE) {
    v_t xv = in_v[i];
    v_t gv = go_v[i];
    v_t mv = m_v[i];
    v_t gi, gm;

#pragma unroll
    for (int j = 0; j < width; ++j) {
      acc_t x_orig = xv.data[j];
      acc_t x = do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
      acc_t m_orig = (acc_t)mv.data[j];
      acc_t m = do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
      acc_t x2 = x * x;
      acc_t x3 = x2 * x;
      acc_t go = (acc_t)gv.data[j];

      // Output clamp mask
      acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1;
      if (do_clamp) {
        acc_t out_pre = (poly + b_val) * m * score;
        if (fabsf(out_pre) > hidden_clamp) go = acc_t(0);
      }

      acc_t dp = go * m * score;

      // grad_mul with mul clamp mask
      acc_t gm_val = go * (poly + b_val) * score;
      if (do_clamp && fabsf(m_orig) > hidden_clamp) gm_val = acc_t(0);
      gm.data[j] = (scalar_t)gm_val;

      // grad_input with input clamp mask: per-power terms
      // d/dx [w_k * x^k / rms(x^k)] = k*x^(k-1)*ir_k*(w_k*dp - x^k*c_k)
      acc_t g = ir1 * (w2 * dp - x * cx);
      g += acc_t(2) * x * ir2 * (w1 * dp - x2 * cx2);
      g += acc_t(3) * x2 * ir3 * (w0 * dp - x3 * cx3);
      if (do_clamp && fabsf(x_orig) > hidden_clamp) g = acc_t(0);
      gi.data[j] = (scalar_t)g;

      dw0 += dp * x3 * ir3;
      dw1 += dp * x2 * ir2;
      dw2 += dp * x * ir1;
      gs_acc += go * (poly + b_val) * m; // grad_scores accumulator
    }
  }

  // Reduce weight grads + grad_scores (.w channel)
  float4 wg = block_reduce_f4<BLOCK_SIZE>(make_float4(dw0, dw1, dw2, gs_acc));

  // Single atomic per block per expert slot; grad_scores is per-row so a
  // plain store suffices.
  if (threadIdx.x == 0) {
    atomicAdd(&weight_grad[eidx * 3 + 0], wg.x);
    atomicAdd(&weight_grad[eidx * 3 + 1], wg.y);
    atomicAdd(&weight_grad[eidx * 3 + 2], wg.z);
    atomicAdd(&bias_grad[eidx], bias_grad_val);
    if (grad_scores != nullptr) {
      grad_scores[row] = wg.w;
    }
  }
}
381
+
382
// ---------------------------------------------------------------------------
// Scalar fallback (width == 0)
// Element-wise version of grouped_poly_norm_bwd_kernel for shapes/dtypes
// that cannot use vectorized loads. One block per row; `eps` unused
// (inv_rms comes precomputed from the forward pass).
// ---------------------------------------------------------------------------
template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
__global__ void __launch_bounds__(BLOCK_SIZE)
grouped_poly_norm_bwd_scalar(
    scalar_t *__restrict__ grad_input,
    scalar_t *__restrict__ grad_mul,
    float *__restrict__ weight_grad,
    float *__restrict__ bias_grad,
    const scalar_t *__restrict__ grad_output,
    const scalar_t *__restrict__ input,
    const scalar_t *__restrict__ mul,
    const scalar_t *__restrict__ weight,
    const scalar_t *__restrict__ bias,
    const int32_t *__restrict__ offsets,
    const acc_t *__restrict__ inv_rms,
    const float *__restrict__ scores,  // nullable, always fp32
    acc_t *__restrict__ grad_scores,   // nullable
    const acc_t eps, const int D, const int num_experts,
    const int expert_offset,
    const acc_t hidden_clamp) {
  const bool do_clamp = (hidden_clamp >= acc_t(0));
  const int row = blockIdx.x;
  const int64_t off = (int64_t)row * D;

  // Per-row expert coefficients, forward RMS stats, and routing score.
  const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
  const acc_t w0 = weight[eidx * 3], w1 = weight[eidx * 3 + 1], w2 = weight[eidx * 3 + 2];
  const acc_t b_val = bias[eidx];
  const acc_t ir1 = inv_rms[row * 3], ir2 = inv_rms[row * 3 + 1], ir3 = inv_rms[row * 3 + 2];
  const acc_t w2ir1 = w2 * ir1, w1ir2 = w1 * ir2, w0ir3 = w0 * ir3;
  const acc_t score = (scores != nullptr) ? (acc_t)scores[row] : acc_t(1);

  // Pass 1: dot products with clamp masks
  // (sum of dp*x^k, dp = go*m*score, go zeroed where the output clamped)
  acc_t sdpx = 0, sdpx2 = 0, sdpx3 = 0, sdp = 0;
  for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
    acc_t x_orig = input[off + i];
    acc_t x = do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
    acc_t m_orig = (acc_t)mul[off + i];
    acc_t m = do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
    acc_t go = (acc_t)grad_output[off + i];

    // Recompute pre-clamp forward output to derive the output-clamp mask.
    if (do_clamp) {
      acc_t x2 = x * x, x3 = x2 * x;
      acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
      acc_t out_pre = poly * m * score;
      if (fabsf(out_pre) > hidden_clamp) go = acc_t(0);
    }

    acc_t x2 = x * x;
    acc_t dp = go * m * score;
    sdp += dp; sdpx += dp * x; sdpx2 += dp * x2; sdpx3 += dp * x2 * x;
  }

  float4 sums = block_reduce_f4<BLOCK_SIZE>(make_float4(sdpx, sdpx2, sdpx3, sdp));
  const acc_t inv_d = acc_t(1) / D;
  const acc_t s1 = sums.x * inv_d, s2 = sums.y * inv_d, s3 = sums.z * inv_d;
  // Correction coefficients from differentiating the 1/rms(x^k) factors.
  const acc_t cx = w2 * s1 * ir1 * ir1, cx2 = w1 * s2 * ir2 * ir2, cx3 = w0 * s3 * ir3 * ir3;

  // Pass 2: grads with clamp masks
  acc_t dw0 = 0, dw1 = 0, dw2 = 0, gs_acc = 0;
  for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
    acc_t x_orig = input[off + i], m_orig = (acc_t)mul[off + i];
    acc_t x = do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
    acc_t m = do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
    acc_t go = (acc_t)grad_output[off + i];
    acc_t x2 = x * x, x3 = x2 * x;
    acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1;

    // Output clamp mask
    if (do_clamp) {
      acc_t out_pre = (poly + b_val) * m * score;
      if (fabsf(out_pre) > hidden_clamp) go = acc_t(0);
    }

    acc_t dp = go * m * score;

    // grad_mul, zeroed where the mul input itself was clamp-saturated.
    acc_t gm_val = go * (poly + b_val) * score;
    if (do_clamp && fabsf(m_orig) > hidden_clamp) gm_val = acc_t(0);
    grad_mul[off + i] = (scalar_t)gm_val;

    // grad_input, zeroed where the input itself was clamp-saturated.
    acc_t g = ir1 * (w2 * dp - x * cx) + acc_t(2) * x * ir2 * (w1 * dp - x2 * cx2)
            + acc_t(3) * x2 * ir3 * (w0 * dp - x3 * cx3);
    if (do_clamp && fabsf(x_orig) > hidden_clamp) g = acc_t(0);
    grad_input[off + i] = (scalar_t)g;

    dw0 += dp * x3 * ir3; dw1 += dp * x2 * ir2; dw2 += dp * x * ir1;
    gs_acc += go * (poly + b_val) * m;
  }

  // One atomic per block into the shared expert gradient slots; the
  // per-row grad_scores entry needs only a plain store.
  float4 wg = block_reduce_f4<BLOCK_SIZE>(make_float4(dw0, dw1, dw2, gs_acc));
  if (threadIdx.x == 0) {
    atomicAdd(&weight_grad[eidx * 3 + 0], wg.x);
    atomicAdd(&weight_grad[eidx * 3 + 1], wg.y);
    atomicAdd(&weight_grad[eidx * 3 + 2], wg.z);
    atomicAdd(&bias_grad[eidx], sums.w);
    if (grad_scores != nullptr) {
      grad_scores[row] = wg.w;
    }
  }
}
483
+
484
+ } // namespace motif
485
+
486
// ---------------------------------------------------------------------------
// Internal helpers — shared kernel dispatch
// ---------------------------------------------------------------------------
// Launch helper for the vectorized forward kernel: dispatches on the input
// floating dtype and instantiates the kernel with the given vector width.
// Expects `grid`, `block`, `stream`, `scores_ptr` and the tensor args to be
// in scope at the expansion site (_fwd_impl).
#define FWD_LAUNCH(width_val, scalar_type_name)                              \
  MOTIF_DISPATCH_FLOATING_TYPES(                                             \
      input.scalar_type(), scalar_type_name, [&] {                           \
        motif::grouped_poly_norm_fwd_kernel<scalar_t, float, width_val, BLOCK> \
            <<<grid, block, 0, stream>>>(                                    \
                output.data_ptr<scalar_t>(), inv_rms.data_ptr<float>(),      \
                input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),        \
                weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),      \
                offsets.data_ptr<int32_t>(), scores_ptr,                     \
                (float)eps, D, num_experts, (int)expert_offset,              \
                (float)hidden_clamp);                                        \
      })
501
+
502
// Shared forward implementation for the scored/unscored public entry points.
// Allocates the output and per-row inv_rms ([N, 3], fp32) buffers and picks
// a kernel variant: width-8 vectors for 2-byte dtypes, width-4 for 4-byte
// dtypes (D permitting), otherwise the scalar fallback.
// scores_ptr may be null (score treated as 1); hidden_clamp < 0 disables
// clamping. Returns {output, inv_rms}.
static std::tuple<torch::Tensor, torch::Tensor>
_fwd_impl(const torch::Tensor &input, const torch::Tensor &mul,
          const torch::Tensor &weight, const torch::Tensor &bias,
          const torch::Tensor &offsets, const float *scores_ptr,
          double eps, int64_t expert_offset, double hidden_clamp) {
  const int D = input.size(-1);
  const int64_t N = input.size(0);
  const int num_experts = offsets.size(0);
  constexpr int BLOCK = 128;
  dim3 grid(N); dim3 block(BLOCK);

  auto output = torch::empty_like(input);
  auto inv_rms = torch::empty({N, 3}, input.options().dtype(torch::kFloat));

  // Empty batch (possible for MoE shards that received no tokens):
  // a launch with gridDim.x == 0 is an invalid configuration, so return
  // the (empty) outputs without launching.
  if (N == 0) return {output, inv_rms};

  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (D % 8 == 0 && input.element_size() == 2)
    FWD_LAUNCH(8, "grouped_poly_norm_fwd");
  else if (D % 4 == 0 && input.element_size() == 4)
    FWD_LAUNCH(4, "grouped_poly_norm_fwd");
  else {
    MOTIF_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "grouped_poly_norm_fwd_scalar", [&] {
          motif::grouped_poly_norm_fwd_scalar<scalar_t, float, BLOCK>
              <<<grid, block, 0, stream>>>(
                  output.data_ptr<scalar_t>(), inv_rms.data_ptr<float>(),
                  input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),
                  weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
                  offsets.data_ptr<int32_t>(), scores_ptr,
                  (float)eps, D, num_experts, (int)expert_offset,
                  (float)hidden_clamp);
        });
  }
  return {output, inv_rms};
}
537
+ #undef FWD_LAUNCH
538
+
539
// Launch helper for the vectorized backward kernel: dispatches on the input
// floating dtype and instantiates `kernel_name` with the given vector width.
// Expects `grid`, `block`, `stream`, the grad/output tensors, `scores_ptr`
// and `gs_ptr` to be in scope at the expansion site (_bwd_impl).
#define BWD_LAUNCH(width_val, scalar_type_name, kernel_name)                 \
  MOTIF_DISPATCH_FLOATING_TYPES(                                             \
      input.scalar_type(), scalar_type_name, [&] {                           \
        motif::kernel_name<scalar_t, float, width_val, BLOCK>                \
            <<<grid, block, 0, stream>>>(                                    \
                input_grad.data_ptr<scalar_t>(),                             \
                mul_grad.data_ptr<scalar_t>(),                               \
                wg_f32.data_ptr<float>(), bg_f32.data_ptr<float>(),          \
                grad_output.data_ptr<scalar_t>(),                            \
                input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),        \
                weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),      \
                offsets.data_ptr<int32_t>(), inv_rms.data_ptr<float>(),      \
                scores_ptr, gs_ptr,                                          \
                (float)eps, D, num_experts, (int)expert_offset,              \
                (float)hidden_clamp);                                        \
      })
555
+
556
// Shared backward implementation for the scored/unscored public entry points.
// Weight/bias gradients are accumulated in fp32 scratch (zero-initialized,
// atomicAdd from the kernels) and cast back to the parameter dtype at the
// end. gs_ptr (per-row grad_scores, fp32) may be null together with
// scores_ptr. Returns {grad_input, grad_mul, weight_grad, bias_grad,
// <empty>} — the grad_scores tensor itself is assembled by the caller.
static std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
                  torch::Tensor>
_bwd_impl(const torch::Tensor &grad_output, const torch::Tensor &input,
          const torch::Tensor &mul, const torch::Tensor &weight,
          const torch::Tensor &bias, const torch::Tensor &offsets,
          const torch::Tensor &inv_rms, const float *scores_ptr,
          float *gs_ptr, int64_t N,
          double eps, int64_t expert_offset, double hidden_clamp) {
  const int D = input.size(-1);
  const int num_experts = offsets.size(0);
  constexpr int BLOCK = 128;
  dim3 grid(N); dim3 block(BLOCK);

  auto input_grad = torch::empty_like(input);
  auto mul_grad = torch::empty_like(mul);
  auto wg_f32 = torch::zeros({weight.size(0), 3}, input.options().dtype(torch::kFloat));
  auto bg_f32 = torch::zeros({bias.size(0)}, input.options().dtype(torch::kFloat));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Empty batch: gridDim.x == 0 is an invalid launch configuration. The
  // zero-filled weight/bias grads are already the correct result, so skip
  // the launch entirely.
  if (N > 0) {
    if (D % 8 == 0 && input.element_size() == 2)
      BWD_LAUNCH(8, "grouped_poly_norm_bwd", grouped_poly_norm_bwd_kernel);
    else if (D % 4 == 0 && input.element_size() == 4)
      BWD_LAUNCH(4, "grouped_poly_norm_bwd", grouped_poly_norm_bwd_kernel);
    else {
      MOTIF_DISPATCH_FLOATING_TYPES(
          input.scalar_type(), "grouped_poly_norm_bwd_scalar", [&] {
            motif::grouped_poly_norm_bwd_scalar<scalar_t, float, BLOCK>
                <<<grid, block, 0, stream>>>(
                    input_grad.data_ptr<scalar_t>(),
                    mul_grad.data_ptr<scalar_t>(),
                    wg_f32.data_ptr<float>(), bg_f32.data_ptr<float>(),
                    grad_output.data_ptr<scalar_t>(),
                    input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),
                    weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
                    offsets.data_ptr<int32_t>(), inv_rms.data_ptr<float>(),
                    scores_ptr, gs_ptr,
                    (float)eps, D, num_experts, (int)expert_offset,
                    (float)hidden_clamp);
          });
    }
  }

  // Cast fp32 accumulators back to the parameter dtype; bias grad gains a
  // trailing singleton dim to match the bias parameter layout.
  auto weight_grad = wg_f32.to(weight.dtype());
  auto bias_grad = bg_f32.unsqueeze(-1).to(bias.dtype());
  // gs_f32 handled by caller
  return {input_grad, mul_grad, weight_grad, bias_grad, torch::Tensor()};
}
603
+ #undef BWD_LAUNCH
604
+
605
// ---------------------------------------------------------------------------
// Public API: without scores
// ---------------------------------------------------------------------------
// Forward without routing scores: score is implicitly 1 and hidden_clamp is
// disabled (passed as -1.0). Returns {output, inv_rms}; inv_rms must be fed
// back into grouped_poly_norm_backward.
std::tuple<torch::Tensor, torch::Tensor>
grouped_poly_norm_forward(
    const torch::Tensor &input, const torch::Tensor &mul,
    const torch::Tensor &weight, const torch::Tensor &bias,
    const torch::Tensor &offsets, double eps, int64_t expert_offset) {
  return _fwd_impl(input, mul, weight, bias, offsets, nullptr, eps, expert_offset, -1.0);
}
615
+
616
// Backward without routing scores (scores/grad_scores null, clamp disabled).
// Returns {grad_input, grad_mul, weight_grad, bias_grad}.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
grouped_poly_norm_backward(
    const torch::Tensor &grad_output, const torch::Tensor &input,
    const torch::Tensor &mul, const torch::Tensor &weight,
    const torch::Tensor &bias, const torch::Tensor &offsets,
    const torch::Tensor &inv_rms, double eps, int64_t expert_offset) {
  const int64_t N = input.size(0);
  auto [ig, mg, wg, bg, _] = _bwd_impl(
      grad_output, input, mul, weight, bias, offsets, inv_rms,
      nullptr, nullptr, N, eps, expert_offset, -1.0);
  return {ig, mg, wg, bg};
}
628
+
629
// ---------------------------------------------------------------------------
// Public API: with scores
// ---------------------------------------------------------------------------
// Forward with per-row routing scores fused into the output, plus optional
// hidden_clamp (< 0 disables). `scores` must be fp32 (data_ptr<float> throws
// otherwise) — assumes it is contiguous with one value per row; verify at
// the Python call site. Returns {output, inv_rms}.
std::tuple<torch::Tensor, torch::Tensor>
grouped_poly_norm_forward_scored(
    const torch::Tensor &input, const torch::Tensor &mul,
    const torch::Tensor &weight, const torch::Tensor &bias,
    const torch::Tensor &offsets, const torch::Tensor &scores,
    double eps, int64_t expert_offset, double hidden_clamp) {
  return _fwd_impl(input, mul, weight, bias, offsets,
                   scores.data_ptr<float>(), eps, expert_offset, hidden_clamp);
}
641
+
642
// Backward with scores: additionally returns grad_scores, computed in fp32
// and reshaped to [N, 1]. `scores` must be fp32.
// Returns {grad_input, grad_mul, weight_grad, bias_grad, grad_scores}.
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
grouped_poly_norm_backward_scored(
    const torch::Tensor &grad_output, const torch::Tensor &input,
    const torch::Tensor &mul, const torch::Tensor &weight,
    const torch::Tensor &bias, const torch::Tensor &offsets,
    const torch::Tensor &inv_rms, const torch::Tensor &scores,
    double eps, int64_t expert_offset, double hidden_clamp) {
  const int64_t N = input.size(0);
  // Per-row grad_scores buffer filled by the kernel (one store per block).
  auto gs_f32 = torch::empty({N}, input.options().dtype(torch::kFloat));
  auto [ig, mg, wg, bg, _] = _bwd_impl(
      grad_output, input, mul, weight, bias, offsets, inv_rms,
      scores.data_ptr<float>(), gs_f32.data_ptr<float>(), N,
      eps, expert_offset, hidden_clamp);
  auto gs = gs_f32.unsqueeze(-1);
  return {ig, mg, wg, bg, gs};
}
build.toml CHANGED
@@ -19,6 +19,7 @@ src = [
19
  "activation/fused_mul_poly_norm.cu",
20
  "activation/rms_norm.cu",
21
  "activation/fused_add_rms_norm.cu",
 
22
  "activation/cuda_compat.h",
23
  "activation/dispatch_utils.h",
24
  "activation/assert_utils.h",
@@ -33,6 +34,7 @@ src = [
33
  "activation/fused_mul_poly_norm.cu",
34
  "activation/rms_norm.cu",
35
  "activation/fused_add_rms_norm.cu",
 
36
  "activation/cuda_compat.h",
37
  "activation/dispatch_utils.h",
38
  "activation/assert_utils.h",
 
19
  "activation/fused_mul_poly_norm.cu",
20
  "activation/rms_norm.cu",
21
  "activation/fused_add_rms_norm.cu",
22
+ "activation/grouped_poly_norm.cu",
23
  "activation/cuda_compat.h",
24
  "activation/dispatch_utils.h",
25
  "activation/assert_utils.h",
 
34
  "activation/fused_mul_poly_norm.cu",
35
  "activation/rms_norm.cu",
36
  "activation/fused_add_rms_norm.cu",
37
+ "activation/grouped_poly_norm.cu",
38
  "activation/cuda_compat.h",
39
  "activation/dispatch_utils.h",
40
  "activation/assert_utils.h",
torch-ext/torch_binding.cpp CHANGED
@@ -48,6 +48,36 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
48
  "(Tensor, Tensor)");
49
  ops.impl("fused_add_rms_norm_backward", torch::kCUDA,
50
  &fused_add_rms_norm_backward);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  }
52
 
53
  REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
 
48
  "(Tensor, Tensor)");
49
  ops.impl("fused_add_rms_norm_backward", torch::kCUDA,
50
  &fused_add_rms_norm_backward);
51
+
52
+ // grouped_poly_norm (without scores)
53
+ ops.def("grouped_poly_norm_forward("
54
+ "Tensor input, Tensor mul, Tensor weight, "
55
+ "Tensor bias, Tensor offsets, "
56
+ "float eps, int expert_offset) -> (Tensor, Tensor)");
57
+ ops.impl("grouped_poly_norm_forward", torch::kCUDA,
58
+ &grouped_poly_norm_forward);
59
+
60
+ ops.def("grouped_poly_norm_backward("
61
+ "Tensor grad_output, Tensor input, Tensor mul, Tensor weight, "
62
+ "Tensor bias, Tensor offsets, Tensor inv_rms, "
63
+ "float eps, int expert_offset) -> (Tensor, Tensor, Tensor, Tensor)");
64
+ ops.impl("grouped_poly_norm_backward", torch::kCUDA,
65
+ &grouped_poly_norm_backward);
66
+
67
+ // grouped_poly_norm (with scores)
68
+ ops.def("grouped_poly_norm_forward_scored("
69
+ "Tensor input, Tensor mul, Tensor weight, "
70
+ "Tensor bias, Tensor offsets, Tensor scores, "
71
+ "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor)");
72
+ ops.impl("grouped_poly_norm_forward_scored", torch::kCUDA,
73
+ &grouped_poly_norm_forward_scored);
74
+
75
+ ops.def("grouped_poly_norm_backward_scored("
76
+ "Tensor grad_output, Tensor input, Tensor mul, Tensor weight, "
77
+ "Tensor bias, Tensor offsets, Tensor inv_rms, Tensor scores, "
78
+ "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor, Tensor, Tensor, Tensor)");
79
+ ops.impl("grouped_poly_norm_backward_scored", torch::kCUDA,
80
+ &grouped_poly_norm_backward_scored);
81
  }
82
 
83
  REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h CHANGED
@@ -35,3 +35,33 @@ std::tuple<torch::Tensor, torch::Tensor> fused_add_rms_norm_backward(
35
  const torch::Tensor &output_grad, const torch::Tensor &add_output_grad,
36
  const torch::Tensor &input, const torch::Tensor &weight, double eps,
37
  bool need_input_grad);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  const torch::Tensor &output_grad, const torch::Tensor &add_output_grad,
36
  const torch::Tensor &input, const torch::Tensor &weight, double eps,
37
  bool need_input_grad);
38
+
39
+ // Without scores
40
+ std::tuple<torch::Tensor, torch::Tensor>
41
+ grouped_poly_norm_forward(
42
+ const torch::Tensor &input, const torch::Tensor &mul,
43
+ const torch::Tensor &weight, const torch::Tensor &bias,
44
+ const torch::Tensor &offsets, double eps, int64_t expert_offset);
45
+
46
+ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
47
+ grouped_poly_norm_backward(
48
+ const torch::Tensor &grad_output, const torch::Tensor &input,
49
+ const torch::Tensor &mul, const torch::Tensor &weight,
50
+ const torch::Tensor &bias, const torch::Tensor &offsets,
51
+ const torch::Tensor &inv_rms, double eps, int64_t expert_offset);
52
+
53
+ // With scores (hidden_clamp < 0 = disabled)
54
+ std::tuple<torch::Tensor, torch::Tensor>
55
+ grouped_poly_norm_forward_scored(
56
+ const torch::Tensor &input, const torch::Tensor &mul,
57
+ const torch::Tensor &weight, const torch::Tensor &bias,
58
+ const torch::Tensor &offsets, const torch::Tensor &scores,
59
+ double eps, int64_t expert_offset, double hidden_clamp);
60
+
61
+ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
62
+ grouped_poly_norm_backward_scored(
63
+ const torch::Tensor &grad_output, const torch::Tensor &input,
64
+ const torch::Tensor &mul, const torch::Tensor &weight,
65
+ const torch::Tensor &bias, const torch::Tensor &offsets,
66
+ const torch::Tensor &inv_rms, const torch::Tensor &scores,
67
+ double eps, int64_t expert_offset, double hidden_clamp);