style: apply yapf, isort, and clang-format

Browse files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (10)

README.md +0 -103
activation/grouped_poly_norm.cu +192 -171
benchmarks/cases/grouped_mul_poly.py +28 -22
benchmarks/profile_bwd.py +0 -146
benchmarks/run_cases.py +39 -16
registration.h +3 -2
setup.py +3 -3
tests/test_fused_mul_grouped_poly_norm.py +168 -61
torch-ext/activation/grouped_poly_norm.py +42 -35
torch-ext/torch_binding.cpp +14 -10

README.md CHANGED

@@ -251,109 +251,6 @@ print(poly_norm(x))
 > | Forward | 0.7 ms | 2.1 ms | **3.0x** |
 > | Backward | 1.4 ms | 3.7 ms | **2.6x** |
-#### B200 Results (bf16)
-<details>
-<summary>Forward Performance</summary>
-| batch_size | seq_len | Naive (us) | Compiled (us) | CUDA (us) | CUDA vs Naive |
-|-----------|---------|-----------|--------------|------------|-----------------|
-| 1 | 1024 | 294.54 | 73.46 | 64.33 | 4.58x |
-| 1 | 2048 | 373.50 | 94.88 | 65.26 | 5.72x |
-| 1 | 4096 | 372.65 | 94.90 | 66.90 | 5.57x |
-| 1 | 8192 | 486.98 | 102.33 | 72.71 | 6.70x |
-| 2 | 4096 | 486.66 | 101.87 | 72.27 | 6.73x |
-| 2 | 8192 | 950.62 | 106.96 | 90.06 | 10.56x |
-| 4 | 4096 | 950.72 | 107.17 | 71.28 | 13.34x |
-| 4 | 8192 | 1779.12 | 198.91 | 96.93 | 18.35x |
-| 8 | 4096 | 1778.73 | 199.10 | 96.88 | 18.36x |
-| 8 | 8192 | 3384.03 | 381.91 | 179.57 | 18.85x |
-</details>
-<details>
-<summary>Backward Performance</summary>
-| batch_size | seq_len | Naive (us) | Compiled (us) | CUDA (us) | CUDA vs Naive |
-|-----------|---------|-----------|--------------|------------|-----------------|
-| 1 | 1024 | 1690.61 | 999.66 | 1017.66 | 1.66x |
-| 1 | 8192 | 1680.39 | 906.43 | 906.41 | 1.85x |
-| 2 | 8192 | 2466.73 | 870.74 | 862.78 | 2.86x |
-| 4 | 4096 | 2466.04 | 942.62 | 945.68 | 2.61x |
-| 4 | 8192 | 4543.10 | 941.01 | 908.30 | 5.00x |
-| 8 | 4096 | 4542.91 | 814.73 | 900.01 | 5.05x |
-| 8 | 8192 | 8599.41 | 956.81 | 955.07 | 9.00x |
-</details>
-<details>
-<summary>Forward + Backward Combined</summary>
-| batch_size | seq_len | Naive (us) | Compiled (us) | CUDA (us) | CUDA vs Naive | CUDA vs Compiled |
-|-----------|---------|-----------|--------------|------------|-----------------|-------------------|
-| 1 | 1024 | 1985.15 | 1073.12 | 1081.99 | 1.83x | 0.99x |
-| 1 | 4096 | 2085.10 | 974.32 | 960.73 | 2.17x | 1.01x |
-| 1 | 8192 | 2167.37 | 1008.76 | 979.12 | 2.21x | 1.03x |
-| 2 | 4096 | 2083.49 | 1001.03 | 965.30 | 2.16x | 1.04x |
-| 2 | 8192 | 3417.35 | 977.70 | 952.84 | 3.59x | 1.03x |
-| 4 | 4096 | 3416.76 | 1049.79 | 1016.97 | 3.36x | 1.03x |
-| 4 | 8192 | 6322.22 | 1139.92 | 1005.23 | 6.29x | 1.13x |
-| 8 | 4096 | 6321.64 | 1013.83 | 996.89 | 6.34x | 1.02x |
-| 8 | 8192 | 11983.44 | 1338.71 | 1134.64 | 10.56x | 1.18x |
-</details>
-#### B200 Results (fp32)
-<details>
-<summary>Forward Performance</summary>
-| batch_size | seq_len | Naive (us) | Compiled (us) | CUDA (us) | CUDA vs Naive |
-|-----------|---------|-----------|--------------|------------|-----------------|
-| 1 | 1024 | 318.05 | 83.29 | 64.24 | 4.95x |
-| 1 | 2048 | 311.14 | 95.19 | 63.64 | 4.89x |
-| 1 | 8192 | 401.78 | 101.61 | 68.21 | 5.89x |
-| 2 | 4096 | 403.42 | 100.97 | 68.01 | 5.93x |
-| 2 | 8192 | 803.31 | 130.51 | 68.21 | 11.78x |
-| 4 | 4096 | 802.86 | 130.61 | 66.97 | 11.99x |
-| 4 | 8192 | 1505.96 | 246.77 | 100.49 | 14.99x |
-| 8 | 4096 | 1507.87 | 246.84 | 100.23 | 15.04x |
-| 8 | 8192 | 2856.93 | 476.34 | 184.40 | 15.49x |
-</details>
-<details>
-<summary>Backward Performance</summary>
-| batch_size | seq_len | Naive (us) | Compiled (us) | CUDA (us) | CUDA vs Naive |
-|-----------|---------|-----------|--------------|------------|-----------------|
-| 1 | 1024 | 1604.25 | 989.30 | 1114.12 | 1.44x |
-| 1 | 8192 | 1996.40 | 1117.71 | 1115.47 | 1.79x |
-| 2 | 8192 | 2353.87 | 1119.41 | 1118.57 | 2.10x |
-| 4 | 4096 | 2358.47 | 1102.23 | 1125.16 | 2.10x |
-| 4 | 8192 | 4346.92 | 1125.33 | 1135.36 | 3.83x |
-| 8 | 4096 | 4347.47 | 1104.27 | 1119.63 | 3.88x |
-| 8 | 8192 | 8226.50 | 1172.66 | 1197.68 | 6.87x |
-</details>
-<details>
-<summary>Forward + Backward Combined</summary>
-| batch_size | seq_len | Naive (us) | Compiled (us) | CUDA (us) | CUDA vs Naive | CUDA vs Compiled |
-|-----------|---------|-----------|--------------|------------|-----------------|-------------------|
-| 1 | 1024 | 1922.30 | 1072.59 | 1178.36 | 1.63x | 0.91x |
-| 1 | 4096 | 2367.77 | 1208.69 | 1192.07 | 1.99x | 1.01x |
-| 1 | 8192 | 2398.19 | 1219.32 | 1183.69 | 2.03x | 1.03x |
-| 2 | 4096 | 2401.39 | 1248.87 | 1154.72 | 2.08x | 1.08x |
-| 2 | 8192 | 3157.18 | 1249.92 | 1186.77 | 2.66x | 1.05x |
-| 4 | 4096 | 3161.33 | 1232.84 | 1192.13 | 2.65x | 1.03x |
-| 4 | 8192 | 5852.88 | 1372.10 | 1235.86 | 4.74x | 1.11x |
-| 8 | 4096 | 5855.34 | 1351.11 | 1219.85 | 4.80x | 1.11x |
-| 8 | 8192 | 11083.43 | 1649.00 | 1382.07 | 8.02x | 1.19x |
-</details>
 ## Pre-commit Hooks
 This project uses [pre-commit](https://pre-commit.com/) to automatically check and format code before commits.

 > | Forward | 0.7 ms | 2.1 ms | **3.0x** |
 > | Backward | 1.4 ms | 3.7 ms | **2.6x** |
 ## Pre-commit Hooks
 This project uses [pre-commit](https://pre-commit.com/) to automatically check and format code before commits.

activation/grouped_poly_norm.cu CHANGED

@@ -18,7 +18,8 @@ __device__ __forceinline__ int find_expert(const int32_t *__restrict__ offsets,
   int lo = 0, hi = num_experts;
 #pragma unroll 6
   for (int i = 0; i < 12; ++i) {
-    if (lo >= hi) break;
     int mid = (lo + hi) >> 1;
     if (offsets[mid] <= row)
       lo = mid + 1;
@@ -53,7 +54,8 @@ __device__ __forceinline__ float4 block_reduce_f4(float4 v) {
   const int warp_id = threadIdx.x / WARP_SIZE;
   const int lane_id = threadIdx.x % WARP_SIZE;
-  if (lane_id == 0) warp_results[warp_id] = v;
   __syncthreads();
   if (warp_id == 0 && lane_id < NUM_WARPS)
@@ -61,10 +63,12 @@ __device__ __forceinline__ float4 block_reduce_f4(float4 v) {
   else
     v = make_float4(0.f, 0.f, 0.f, 0.f);
-  if (warp_id == 0) v = warp_reduce_f4(v);
   __shared__ float4 result;
-  if (threadIdx.x == 0) result = v;
   __syncthreads();
   return result;
 }
@@ -77,17 +81,14 @@ __device__ __forceinline__ float4 block_reduce_f4(float4 v) {
 template <typename scalar_t, typename acc_t, int width, int BLOCK_SIZE>
 __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
     grouped_poly_norm_fwd_kernel(
-        scalar_t *__restrict__ output,
-        acc_t *__restrict__ inv_rms,
-        const scalar_t *__restrict__ input,
-        const scalar_t *__restrict__ mul,
-        const scalar_t *__restrict__ weight,
-        const scalar_t *__restrict__ bias,
         const int32_t *__restrict__ offsets,
-        const float *__restrict__ scores,  // nullable, always fp32
         const acc_t eps, const int D, const int num_experts,
         const int expert_offset,
-        const acc_t hidden_clamp) {  // < 0 = disabled
   using v_t = vec_t<scalar_t, width>;
   const bool do_clamp = (hidden_clamp >= acc_t(0));
@@ -111,7 +112,8 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
 #pragma unroll
     for (int j = 0; j < width; ++j) {
       acc_t x = xv.data[j];
-      if (do_clamp) x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
       acc_t x2 = x * x;
       s2 += x2;
       s4 += x2 * x2;
@@ -148,14 +150,17 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
 #pragma unroll
     for (int j = 0; j < width; ++j) {
       acc_t x = xv.data[j];
-      if (do_clamp) x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
       acc_t m = (acc_t)mv.data[j];
-      if (do_clamp) m = fminf(fmaxf(m, -hidden_clamp), hidden_clamp);
       acc_t x2 = x * x;
       acc_t x3 = x2 * x;
       acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
       acc_t out_val = poly * m * score;
-      if (do_clamp) out_val = fminf(fmaxf(out_val, -hidden_clamp), hidden_clamp);
       ov.data[j] = (scalar_t)out_val;
     }
     out_v[i] = ov;
@@ -164,34 +169,33 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
 // Scalar fallback forward
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
-__global__ void __launch_bounds__(BLOCK_SIZE)
-    grouped_poly_norm_fwd_scalar(
-        scalar_t *__restrict__ output,
-        acc_t *__restrict__ inv_rms,
-        const scalar_t *__restrict__ input,
-        const scalar_t *__restrict__ mul,
-        const scalar_t *__restrict__ weight,
-        const scalar_t *__restrict__ bias,
-        const int32_t *__restrict__ offsets,
-        const float *__restrict__ scores,  // nullable, always fp32
-        const acc_t eps, const int D, const int num_experts,
-        const int expert_offset,
-        const acc_t hidden_clamp) {
   const bool do_clamp = (hidden_clamp >= acc_t(0));
   const int row = blockIdx.x;
   const int64_t off = (int64_t)row * D;
   const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
-  const acc_t w0 = weight[eidx * 3], w1 = weight[eidx * 3 + 1], w2 = weight[eidx * 3 + 2];
   const acc_t b_val = bias[eidx];
   const acc_t score = (scores != nullptr) ? (acc_t)scores[row] : acc_t(1);
   acc_t s2 = 0, s4 = 0, s6 = 0;
   for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
     acc_t x = input[off + i];
-    if (do_clamp) x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
     acc_t x2 = x * x;
-    s2 += x2; s4 += x2 * x2; s6 += x2 * x2 * x2;
   }
   float4 sums = block_reduce_f4<BLOCK_SIZE>(make_float4(s2, s4, s6, 0.f));
@@ -201,19 +205,24 @@ __global__ void __launch_bounds__(BLOCK_SIZE)
   const acc_t ir3 = rsqrtf(sums.z * inv_d + eps);
   if (threadIdx.x == 0) {
-    inv_rms[row * 3] = ir1; inv_rms[row * 3 + 1] = ir2; inv_rms[row * 3 + 2] = ir3;
   }
   const acc_t w2ir1 = w2 * ir1, w1ir2 = w1 * ir2, w0ir3 = w0 * ir3;
   for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
     acc_t x = input[off + i];
-    if (do_clamp) x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
     acc_t m = (acc_t)mul[off + i];
-    if (do_clamp) m = fminf(fmaxf(m, -hidden_clamp), hidden_clamp);
     acc_t x2 = x * x, x3 = x2 * x;
     acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
     acc_t out_val = poly * m * score;
-    if (do_clamp) out_val = fminf(fmaxf(out_val, -hidden_clamp), hidden_clamp);
     output[off + i] = (scalar_t)out_val;
   }
 }
@@ -225,22 +234,17 @@ __global__ void __launch_bounds__(BLOCK_SIZE)
 template <typename scalar_t, typename acc_t, int width, int BLOCK_SIZE>
 __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
     grouped_poly_norm_bwd_kernel(
-        scalar_t *__restrict__ grad_input,
-        scalar_t *__restrict__ grad_mul,
-        float *__restrict__ weight_grad,    // [num_total_experts, 3] fp32
-        float *__restrict__ bias_grad,      // [num_total_experts] fp32
         const scalar_t *__restrict__ grad_output,
-        const scalar_t *__restrict__ input,
-        const scalar_t *__restrict__ mul,
-        const scalar_t *__restrict__ weight,
-        const scalar_t *__restrict__ bias,
-        const int32_t *__restrict__ offsets,
-        const acc_t *__restrict__ inv_rms,
-        const float *__restrict__ scores,  // nullable, always fp32
-        acc_t *__restrict__ grad_scores,      // nullable (null when scores is null)
         const acc_t eps, const int D, const int num_experts,
-        const int expert_offset,
-        const acc_t hidden_clamp) {
   using v_t = vec_t<scalar_t, width>;
   const bool do_clamp = (hidden_clamp >= acc_t(0));
@@ -249,7 +253,8 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
   const int64_t base = (int64_t)row * vec_d;
   const v_t *__restrict__ in_v = reinterpret_cast<const v_t *>(input) + base;
-  const v_t *__restrict__ go_v = reinterpret_cast<const v_t *>(grad_output) + base;
   const v_t *__restrict__ m_v = reinterpret_cast<const v_t *>(mul) + base;
   const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
@@ -278,9 +283,11 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
 #pragma unroll
     for (int j = 0; j < width; ++j) {
       acc_t x_orig = xv.data[j];
-      acc_t x = do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
       acc_t m_orig = (acc_t)mv.data[j];
-      acc_t m = do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
       acc_t go = (acc_t)gv.data[j];
       // Output clamp mask: recompute pre-clamp output
@@ -288,7 +295,8 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
         acc_t x2 = x * x, x3 = x2 * x;
         acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
         acc_t out_pre = poly * m * score;
-        if (fabsf(out_pre) > hidden_clamp) go = acc_t(0);
       }
       acc_t x2 = x * x;
@@ -300,7 +308,8 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
     }
   }
-  float4 sums = block_reduce_f4<BLOCK_SIZE>(make_float4(sdpx, sdpx2, sdpx3, sdp));
   const acc_t inv_d = acc_t(1) / D;
   const acc_t s1 = sums.x * inv_d;
@@ -327,9 +336,11 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
 #pragma unroll
     for (int j = 0; j < width; ++j) {
       acc_t x_orig = xv.data[j];
-      acc_t x = do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
       acc_t m_orig = (acc_t)mv.data[j];
-      acc_t m = do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
       acc_t x2 = x * x;
       acc_t x3 = x2 * x;
       acc_t go = (acc_t)gv.data[j];
@@ -338,27 +349,30 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
       acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1;
       if (do_clamp) {
         acc_t out_pre = (poly + b_val) * m * score;
-        if (fabsf(out_pre) > hidden_clamp) go = acc_t(0);
       }
       acc_t dp = go * m * score;
       // grad_mul with mul clamp mask
       acc_t gm_val = go * (poly + b_val) * score;
-      if (do_clamp && fabsf(m_orig) > hidden_clamp) gm_val = acc_t(0);
       gm.data[j] = (scalar_t)gm_val;
       // grad_input with input clamp mask
       acc_t g = ir1 * (w2 * dp - x * cx);
       g += acc_t(2) * x * ir2 * (w1 * dp - x2 * cx2);
       g += acc_t(3) * x2 * ir3 * (w0 * dp - x3 * cx3);
-      if (do_clamp && fabsf(x_orig) > hidden_clamp) g = acc_t(0);
       gi.data[j] = (scalar_t)g;
       dw0 += dp * x3 * ir3;
       dw1 += dp * x2 * ir2;
       dw2 += dp * x * ir1;
-      gs_acc += go * (poly + b_val) * m;  // grad_scores accumulator
     }
     gi_v[i] = gi;
@@ -383,32 +397,27 @@ __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
 // Scalar fallback (width == 0)
 // ---------------------------------------------------------------------------
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
-__global__ void __launch_bounds__(BLOCK_SIZE)
-    grouped_poly_norm_bwd_scalar(
-        scalar_t *__restrict__ grad_input,
-        scalar_t *__restrict__ grad_mul,
-        float *__restrict__ weight_grad,
-        float *__restrict__ bias_grad,
-        const scalar_t *__restrict__ grad_output,
-        const scalar_t *__restrict__ input,
-        const scalar_t *__restrict__ mul,
-        const scalar_t *__restrict__ weight,
-        const scalar_t *__restrict__ bias,
-        const int32_t *__restrict__ offsets,
-        const acc_t *__restrict__ inv_rms,
-        const float *__restrict__ scores,  // nullable, always fp32
-        acc_t *__restrict__ grad_scores,      // nullable
-        const acc_t eps, const int D, const int num_experts,
-        const int expert_offset,
-        const acc_t hidden_clamp) {
   const bool do_clamp = (hidden_clamp >= acc_t(0));
   const int row = blockIdx.x;
   const int64_t off = (int64_t)row * D;
   const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
-  const acc_t w0 = weight[eidx * 3], w1 = weight[eidx * 3 + 1], w2 = weight[eidx * 3 + 2];
   const acc_t b_val = bias[eidx];
-  const acc_t ir1 = inv_rms[row * 3], ir2 = inv_rms[row * 3 + 1], ir3 = inv_rms[row * 3 + 2];
   const acc_t w2ir1 = w2 * ir1, w1ir2 = w1 * ir2, w0ir3 = w0 * ir3;
   const acc_t score = (scores != nullptr) ? (acc_t)scores[row] : acc_t(1);
@@ -416,34 +425,44 @@ __global__ void __launch_bounds__(BLOCK_SIZE)
   acc_t sdpx = 0, sdpx2 = 0, sdpx3 = 0, sdp = 0;
   for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
     acc_t x_orig = input[off + i];
-    acc_t x = do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
     acc_t m_orig = (acc_t)mul[off + i];
-    acc_t m = do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
     acc_t go = (acc_t)grad_output[off + i];
     if (do_clamp) {
       acc_t x2 = x * x, x3 = x2 * x;
       acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
       acc_t out_pre = poly * m * score;
-      if (fabsf(out_pre) > hidden_clamp) go = acc_t(0);
     }
     acc_t x2 = x * x;
     acc_t dp = go * m * score;
-    sdp += dp; sdpx += dp * x; sdpx2 += dp * x2; sdpx3 += dp * x2 * x;
   }
-  float4 sums = block_reduce_f4<BLOCK_SIZE>(make_float4(sdpx, sdpx2, sdpx3, sdp));
   const acc_t inv_d = acc_t(1) / D;
   const acc_t s1 = sums.x * inv_d, s2 = sums.y * inv_d, s3 = sums.z * inv_d;
-  const acc_t cx = w2 * s1 * ir1 * ir1, cx2 = w1 * s2 * ir2 * ir2, cx3 = w0 * s3 * ir3 * ir3;
   // Pass 2: grads with clamp masks
   acc_t dw0 = 0, dw1 = 0, dw2 = 0, gs_acc = 0;
   for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
     acc_t x_orig = input[off + i], m_orig = (acc_t)mul[off + i];
-    acc_t x = do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
-    acc_t m = do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
     acc_t go = (acc_t)grad_output[off + i];
     acc_t x2 = x * x, x3 = x2 * x;
     acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1;
@@ -451,21 +470,27 @@ __global__ void __launch_bounds__(BLOCK_SIZE)
     // Output clamp mask
     if (do_clamp) {
       acc_t out_pre = (poly + b_val) * m * score;
-      if (fabsf(out_pre) > hidden_clamp) go = acc_t(0);
     }
     acc_t dp = go * m * score;
     acc_t gm_val = go * (poly + b_val) * score;
-    if (do_clamp && fabsf(m_orig) > hidden_clamp) gm_val = acc_t(0);
     grad_mul[off + i] = (scalar_t)gm_val;
-    acc_t g = ir1 * (w2 * dp - x * cx) + acc_t(2) * x * ir2 * (w1 * dp - x2 * cx2)
-              + acc_t(3) * x2 * ir3 * (w0 * dp - x3 * cx3);
-    if (do_clamp && fabsf(x_orig) > hidden_clamp) g = acc_t(0);
     grad_input[off + i] = (scalar_t)g;
-    dw0 += dp * x3 * ir3; dw1 += dp * x2 * ir2; dw2 += dp * x * ir1;
     gs_acc += go * (poly + b_val) * m;
   }
@@ -487,28 +512,27 @@ __global__ void __launch_bounds__(BLOCK_SIZE)
 // Internal helpers — shared kernel dispatch
 // ---------------------------------------------------------------------------
 #define FWD_LAUNCH(width_val, scalar_type_name)                                \
-  MOTIF_DISPATCH_FLOATING_TYPES(                                               \
-      input.scalar_type(), scalar_type_name, [&] {                             \
-        motif::grouped_poly_norm_fwd_kernel<scalar_t, float, width_val, BLOCK> \
-            <<<grid, block, 0, stream>>>(                                      \
-                output.data_ptr<scalar_t>(), inv_rms.data_ptr<float>(),        \
-                input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),          \
-                weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),        \
-                offsets.data_ptr<int32_t>(), scores_ptr,                        \
-                (float)eps, D, num_experts, (int)expert_offset,                \
-                (float)hidden_clamp);                                          \
-      })
 static std::tuple<torch::Tensor, torch::Tensor>
 _fwd_impl(const torch::Tensor &input, const torch::Tensor &mul,
-           const torch::Tensor &weight, const torch::Tensor &bias,
-           const torch::Tensor &offsets, const float *scores_ptr,
-           double eps, int64_t expert_offset, double hidden_clamp) {
   const int D = input.size(-1);
   const int64_t N = input.size(0);
   const int num_experts = offsets.size(0);
   constexpr int BLOCK = 128;
-  dim3 grid(N); dim3 block(BLOCK);
   auto output = torch::empty_like(input);
   auto inv_rms = torch::empty({N, 3}, input.options().dtype(torch::kFloat));
@@ -527,9 +551,8 @@ _fwd_impl(const torch::Tensor &input, const torch::Tensor &mul,
                   output.data_ptr<scalar_t>(), inv_rms.data_ptr<float>(),
                   input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),
                   weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
-                  offsets.data_ptr<int32_t>(), scores_ptr,
-                  (float)eps, D, num_experts, (int)expert_offset,
-                  (float)hidden_clamp);
         });
   }
   return {output, inv_rms};
@@ -537,39 +560,37 @@ _fwd_impl(const torch::Tensor &input, const torch::Tensor &mul,
 #undef FWD_LAUNCH
 #define BWD_LAUNCH(width_val, scalar_type_name, kernel_name)                   \
-  MOTIF_DISPATCH_FLOATING_TYPES(                                               \
-      input.scalar_type(), scalar_type_name, [&] {                             \
-        motif::kernel_name<scalar_t, float, width_val, BLOCK>                  \
-            <<<grid, block, 0, stream>>>(                                      \
-                input_grad.data_ptr<scalar_t>(),                               \
-                mul_grad.data_ptr<scalar_t>(),                                 \
-                wg_f32.data_ptr<float>(), bg_f32.data_ptr<float>(),            \
-                grad_output.data_ptr<scalar_t>(),                              \
-                input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),          \
-                weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),        \
-                offsets.data_ptr<int32_t>(), inv_rms.data_ptr<float>(),        \
-                scores_ptr, gs_ptr,                                            \
-                (float)eps, D, num_experts, (int)expert_offset,                \
-                (float)hidden_clamp);                                          \
-      })
 static std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
-                   torch::Tensor>
 _bwd_impl(const torch::Tensor &grad_output, const torch::Tensor &input,
-           const torch::Tensor &mul, const torch::Tensor &weight,
-           const torch::Tensor &bias, const torch::Tensor &offsets,
-           const torch::Tensor &inv_rms, const float *scores_ptr,
-           float *gs_ptr, int64_t N,
-           double eps, int64_t expert_offset, double hidden_clamp) {
   const int D = input.size(-1);
   const int num_experts = offsets.size(0);
   constexpr int BLOCK = 128;
-  dim3 grid(N); dim3 block(BLOCK);
   auto input_grad = torch::empty_like(input);
   auto mul_grad = torch::empty_like(mul);
-  auto wg_f32 = torch::zeros({weight.size(0), 3}, input.options().dtype(torch::kFloat));
-  auto bg_f32 = torch::zeros({bias.size(0)}, input.options().dtype(torch::kFloat));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -583,15 +604,13 @@ _bwd_impl(const torch::Tensor &grad_output, const torch::Tensor &input,
           motif::grouped_poly_norm_bwd_scalar<scalar_t, float, BLOCK>
               <<<grid, block, 0, stream>>>(
                   input_grad.data_ptr<scalar_t>(),
-                  mul_grad.data_ptr<scalar_t>(),
-                  wg_f32.data_ptr<float>(), bg_f32.data_ptr<float>(),
-                  grad_output.data_ptr<scalar_t>(),
                   input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),
                   weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
                   offsets.data_ptr<int32_t>(), inv_rms.data_ptr<float>(),
-                  scores_ptr, gs_ptr,
-                  (float)eps, D, num_experts, (int)expert_offset,
-                  (float)hidden_clamp);
         });
   }
@@ -606,54 +625,56 @@ _bwd_impl(const torch::Tensor &grad_output, const torch::Tensor &input,
 // Public API: without scores
 // ---------------------------------------------------------------------------
 std::tuple<torch::Tensor, torch::Tensor>
-grouped_poly_norm_forward(
-    const torch::Tensor &input, const torch::Tensor &mul,
-    const torch::Tensor &weight, const torch::Tensor &bias,
-    const torch::Tensor &offsets, double eps, int64_t expert_offset,
-    double hidden_clamp) {
-  return _fwd_impl(input, mul, weight, bias, offsets, nullptr, eps, expert_offset, hidden_clamp);
 }
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
-grouped_poly_norm_backward(
-    const torch::Tensor &grad_output, const torch::Tensor &input,
-    const torch::Tensor &mul, const torch::Tensor &weight,
-    const torch::Tensor &bias, const torch::Tensor &offsets,
-    const torch::Tensor &inv_rms, double eps, int64_t expert_offset,
-    double hidden_clamp) {
   const int64_t N = input.size(0);
-  auto [ig, mg, wg, bg, _] = _bwd_impl(
-      grad_output, input, mul, weight, bias, offsets, inv_rms,
-      nullptr, nullptr, N, eps, expert_offset, hidden_clamp);
   return {ig, mg, wg, bg};
 }
 // ---------------------------------------------------------------------------
 // Public API: with scores
 // ---------------------------------------------------------------------------
-std::tuple<torch::Tensor, torch::Tensor>
-grouped_poly_norm_forward_scored(
     const torch::Tensor &input, const torch::Tensor &mul,
     const torch::Tensor &weight, const torch::Tensor &bias,
-    const torch::Tensor &offsets, const torch::Tensor &scores,
-    double eps, int64_t expert_offset, double hidden_clamp) {
-  return _fwd_impl(input, mul, weight, bias, offsets,
-                    scores.data_ptr<float>(), eps, expert_offset, hidden_clamp);
 }
-std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
 grouped_poly_norm_backward_scored(
     const torch::Tensor &grad_output, const torch::Tensor &input,
     const torch::Tensor &mul, const torch::Tensor &weight,
     const torch::Tensor &bias, const torch::Tensor &offsets,
-    const torch::Tensor &inv_rms, const torch::Tensor &scores,
-    double eps, int64_t expert_offset, double hidden_clamp) {
   const int64_t N = input.size(0);
   auto gs_f32 = torch::empty({N}, input.options().dtype(torch::kFloat));
-  auto [ig, mg, wg, bg, _] = _bwd_impl(
-      grad_output, input, mul, weight, bias, offsets, inv_rms,
-      scores.data_ptr<float>(), gs_f32.data_ptr<float>(), N,
-      eps, expert_offset, hidden_clamp);
   auto gs = gs_f32.unsqueeze(-1);
   return {ig, mg, wg, bg, gs};
 }

   int lo = 0, hi = num_experts;
 #pragma unroll 6
   for (int i = 0; i < 12; ++i) {
+    if (lo >= hi)
+      break;
     int mid = (lo + hi) >> 1;
     if (offsets[mid] <= row)
       lo = mid + 1;
   const int warp_id = threadIdx.x / WARP_SIZE;
   const int lane_id = threadIdx.x % WARP_SIZE;
+  if (lane_id == 0)
+    warp_results[warp_id] = v;
   __syncthreads();
   if (warp_id == 0 && lane_id < NUM_WARPS)
   else
     v = make_float4(0.f, 0.f, 0.f, 0.f);
+  if (warp_id == 0)
+    v = warp_reduce_f4(v);
   __shared__ float4 result;
+  if (threadIdx.x == 0)
+    result = v;
   __syncthreads();
   return result;
 }
 template <typename scalar_t, typename acc_t, int width, int BLOCK_SIZE>
 __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
     grouped_poly_norm_fwd_kernel(
+        scalar_t *__restrict__ output, acc_t *__restrict__ inv_rms,
+        const scalar_t *__restrict__ input, const scalar_t *__restrict__ mul,
+        const scalar_t *__restrict__ weight, const scalar_t *__restrict__ bias,
         const int32_t *__restrict__ offsets,
+        const float *__restrict__ scores, // nullable, always fp32
         const acc_t eps, const int D, const int num_experts,
         const int expert_offset,
+        const acc_t hidden_clamp) { // < 0 = disabled
   using v_t = vec_t<scalar_t, width>;
   const bool do_clamp = (hidden_clamp >= acc_t(0));
 #pragma unroll
     for (int j = 0; j < width; ++j) {
       acc_t x = xv.data[j];
+      if (do_clamp)
+        x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
       acc_t x2 = x * x;
       s2 += x2;
       s4 += x2 * x2;
 #pragma unroll
     for (int j = 0; j < width; ++j) {
       acc_t x = xv.data[j];
+      if (do_clamp)
+        x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
       acc_t m = (acc_t)mv.data[j];
+      if (do_clamp)
+        m = fminf(fmaxf(m, -hidden_clamp), hidden_clamp);
       acc_t x2 = x * x;
       acc_t x3 = x2 * x;
       acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
       acc_t out_val = poly * m * score;
+      if (do_clamp)
+        out_val = fminf(fmaxf(out_val, -hidden_clamp), hidden_clamp);
       ov.data[j] = (scalar_t)out_val;
     }
     out_v[i] = ov;
 // Scalar fallback forward
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+__global__ void __launch_bounds__(BLOCK_SIZE) grouped_poly_norm_fwd_scalar(
+    scalar_t *__restrict__ output, acc_t *__restrict__ inv_rms,
+    const scalar_t *__restrict__ input, const scalar_t *__restrict__ mul,
+    const scalar_t *__restrict__ weight, const scalar_t *__restrict__ bias,
+    const int32_t *__restrict__ offsets,
+    const float *__restrict__ scores, // nullable, always fp32
+    const acc_t eps, const int D, const int num_experts,
+    const int expert_offset, const acc_t hidden_clamp) {
   const bool do_clamp = (hidden_clamp >= acc_t(0));
   const int row = blockIdx.x;
   const int64_t off = (int64_t)row * D;
   const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
+  const acc_t w0 = weight[eidx * 3], w1 = weight[eidx * 3 + 1],
+              w2 = weight[eidx * 3 + 2];
   const acc_t b_val = bias[eidx];
   const acc_t score = (scores != nullptr) ? (acc_t)scores[row] : acc_t(1);
   acc_t s2 = 0, s4 = 0, s6 = 0;
   for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
     acc_t x = input[off + i];
+    if (do_clamp)
+      x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
     acc_t x2 = x * x;
+    s2 += x2;
+    s4 += x2 * x2;
+    s6 += x2 * x2 * x2;
   }
   float4 sums = block_reduce_f4<BLOCK_SIZE>(make_float4(s2, s4, s6, 0.f));
   const acc_t ir3 = rsqrtf(sums.z * inv_d + eps);
   if (threadIdx.x == 0) {
+    inv_rms[row * 3] = ir1;
+    inv_rms[row * 3 + 1] = ir2;
+    inv_rms[row * 3 + 2] = ir3;
   }
   const acc_t w2ir1 = w2 * ir1, w1ir2 = w1 * ir2, w0ir3 = w0 * ir3;
   for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
     acc_t x = input[off + i];
+    if (do_clamp)
+      x = fminf(fmaxf(x, -hidden_clamp), hidden_clamp);
     acc_t m = (acc_t)mul[off + i];
+    if (do_clamp)
+      m = fminf(fmaxf(m, -hidden_clamp), hidden_clamp);
     acc_t x2 = x * x, x3 = x2 * x;
     acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
     acc_t out_val = poly * m * score;
+    if (do_clamp)
+      out_val = fminf(fmaxf(out_val, -hidden_clamp), hidden_clamp);
     output[off + i] = (scalar_t)out_val;
   }
 }
 template <typename scalar_t, typename acc_t, int width, int BLOCK_SIZE>
 __global__ void __launch_bounds__(BLOCK_SIZE, 65536 / (BLOCK_SIZE * 64))
     grouped_poly_norm_bwd_kernel(
+        scalar_t *__restrict__ grad_input, scalar_t *__restrict__ grad_mul,
+        float *__restrict__ weight_grad, // [num_total_experts, 3] fp32
+        float *__restrict__ bias_grad,   // [num_total_experts] fp32
         const scalar_t *__restrict__ grad_output,
+        const scalar_t *__restrict__ input, const scalar_t *__restrict__ mul,
+        const scalar_t *__restrict__ weight, const scalar_t *__restrict__ bias,
+        const int32_t *__restrict__ offsets, const acc_t *__restrict__ inv_rms,
+        const float *__restrict__ scores, // nullable, always fp32
+        acc_t *__restrict__ grad_scores,  // nullable (null when scores is null)
         const acc_t eps, const int D, const int num_experts,
+        const int expert_offset, const acc_t hidden_clamp) {
   using v_t = vec_t<scalar_t, width>;
   const bool do_clamp = (hidden_clamp >= acc_t(0));
   const int64_t base = (int64_t)row * vec_d;
   const v_t *__restrict__ in_v = reinterpret_cast<const v_t *>(input) + base;
+  const v_t *__restrict__ go_v =
+      reinterpret_cast<const v_t *>(grad_output) + base;
   const v_t *__restrict__ m_v = reinterpret_cast<const v_t *>(mul) + base;
   const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
 #pragma unroll
     for (int j = 0; j < width; ++j) {
       acc_t x_orig = xv.data[j];
+      acc_t x =
+          do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
       acc_t m_orig = (acc_t)mv.data[j];
+      acc_t m =
+          do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
       acc_t go = (acc_t)gv.data[j];
       // Output clamp mask: recompute pre-clamp output
         acc_t x2 = x * x, x3 = x2 * x;
         acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
         acc_t out_pre = poly * m * score;
+        if (fabsf(out_pre) > hidden_clamp)
+          go = acc_t(0);
       }
       acc_t x2 = x * x;
     }
   }
+  float4 sums =
+      block_reduce_f4<BLOCK_SIZE>(make_float4(sdpx, sdpx2, sdpx3, sdp));
   const acc_t inv_d = acc_t(1) / D;
   const acc_t s1 = sums.x * inv_d;
 #pragma unroll
     for (int j = 0; j < width; ++j) {
       acc_t x_orig = xv.data[j];
+      acc_t x =
+          do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
       acc_t m_orig = (acc_t)mv.data[j];
+      acc_t m =
+          do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
       acc_t x2 = x * x;
       acc_t x3 = x2 * x;
       acc_t go = (acc_t)gv.data[j];
       acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1;
       if (do_clamp) {
         acc_t out_pre = (poly + b_val) * m * score;
+        if (fabsf(out_pre) > hidden_clamp)
+          go = acc_t(0);
       }
       acc_t dp = go * m * score;
       // grad_mul with mul clamp mask
       acc_t gm_val = go * (poly + b_val) * score;
+      if (do_clamp && fabsf(m_orig) > hidden_clamp)
+        gm_val = acc_t(0);
       gm.data[j] = (scalar_t)gm_val;
       // grad_input with input clamp mask
       acc_t g = ir1 * (w2 * dp - x * cx);
       g += acc_t(2) * x * ir2 * (w1 * dp - x2 * cx2);
       g += acc_t(3) * x2 * ir3 * (w0 * dp - x3 * cx3);
+      if (do_clamp && fabsf(x_orig) > hidden_clamp)
+        g = acc_t(0);
       gi.data[j] = (scalar_t)g;
       dw0 += dp * x3 * ir3;
       dw1 += dp * x2 * ir2;
       dw2 += dp * x * ir1;
+      gs_acc += go * (poly + b_val) * m; // grad_scores accumulator
     }
     gi_v[i] = gi;
 // Scalar fallback (width == 0)
 // ---------------------------------------------------------------------------
 template <typename scalar_t, typename acc_t, int BLOCK_SIZE>
+__global__ void __launch_bounds__(BLOCK_SIZE) grouped_poly_norm_bwd_scalar(
+    scalar_t *__restrict__ grad_input, scalar_t *__restrict__ grad_mul,
+    float *__restrict__ weight_grad, float *__restrict__ bias_grad,
+    const scalar_t *__restrict__ grad_output,
+    const scalar_t *__restrict__ input, const scalar_t *__restrict__ mul,
+    const scalar_t *__restrict__ weight, const scalar_t *__restrict__ bias,
+    const int32_t *__restrict__ offsets, const acc_t *__restrict__ inv_rms,
+    const float *__restrict__ scores, // nullable, always fp32
+    acc_t *__restrict__ grad_scores,  // nullable
+    const acc_t eps, const int D, const int num_experts,
+    const int expert_offset, const acc_t hidden_clamp) {
   const bool do_clamp = (hidden_clamp >= acc_t(0));
   const int row = blockIdx.x;
   const int64_t off = (int64_t)row * D;
   const int eidx = find_expert(offsets, num_experts, row) + expert_offset;
+  const acc_t w0 = weight[eidx * 3], w1 = weight[eidx * 3 + 1],
+              w2 = weight[eidx * 3 + 2];
   const acc_t b_val = bias[eidx];
+  const acc_t ir1 = inv_rms[row * 3], ir2 = inv_rms[row * 3 + 1],
+              ir3 = inv_rms[row * 3 + 2];
   const acc_t w2ir1 = w2 * ir1, w1ir2 = w1 * ir2, w0ir3 = w0 * ir3;
   const acc_t score = (scores != nullptr) ? (acc_t)scores[row] : acc_t(1);
   acc_t sdpx = 0, sdpx2 = 0, sdpx3 = 0, sdp = 0;
   for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
     acc_t x_orig = input[off + i];
+    acc_t x =
+        do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
     acc_t m_orig = (acc_t)mul[off + i];
+    acc_t m =
+        do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
     acc_t go = (acc_t)grad_output[off + i];
     if (do_clamp) {
       acc_t x2 = x * x, x3 = x2 * x;
       acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1 + b_val;
       acc_t out_pre = poly * m * score;
+      if (fabsf(out_pre) > hidden_clamp)
+        go = acc_t(0);
     }
     acc_t x2 = x * x;
     acc_t dp = go * m * score;
+    sdp += dp;
+    sdpx += dp * x;
+    sdpx2 += dp * x2;
+    sdpx3 += dp * x2 * x;
   }
+  float4 sums =
+      block_reduce_f4<BLOCK_SIZE>(make_float4(sdpx, sdpx2, sdpx3, sdp));
   const acc_t inv_d = acc_t(1) / D;
   const acc_t s1 = sums.x * inv_d, s2 = sums.y * inv_d, s3 = sums.z * inv_d;
+  const acc_t cx = w2 * s1 * ir1 * ir1, cx2 = w1 * s2 * ir2 * ir2,
+              cx3 = w0 * s3 * ir3 * ir3;
   // Pass 2: grads with clamp masks
   acc_t dw0 = 0, dw1 = 0, dw2 = 0, gs_acc = 0;
   for (int i = threadIdx.x; i < D; i += BLOCK_SIZE) {
     acc_t x_orig = input[off + i], m_orig = (acc_t)mul[off + i];
+    acc_t x =
+        do_clamp ? fminf(fmaxf(x_orig, -hidden_clamp), hidden_clamp) : x_orig;
+    acc_t m =
+        do_clamp ? fminf(fmaxf(m_orig, -hidden_clamp), hidden_clamp) : m_orig;
     acc_t go = (acc_t)grad_output[off + i];
     acc_t x2 = x * x, x3 = x2 * x;
     acc_t poly = x3 * w0ir3 + x2 * w1ir2 + x * w2ir1;
     // Output clamp mask
     if (do_clamp) {
       acc_t out_pre = (poly + b_val) * m * score;
+      if (fabsf(out_pre) > hidden_clamp)
+        go = acc_t(0);
     }
     acc_t dp = go * m * score;
     acc_t gm_val = go * (poly + b_val) * score;
+    if (do_clamp && fabsf(m_orig) > hidden_clamp)
+      gm_val = acc_t(0);
     grad_mul[off + i] = (scalar_t)gm_val;
+    acc_t g = ir1 * (w2 * dp - x * cx) +
+              acc_t(2) * x * ir2 * (w1 * dp - x2 * cx2) +
+              acc_t(3) * x2 * ir3 * (w0 * dp - x3 * cx3);
+    if (do_clamp && fabsf(x_orig) > hidden_clamp)
+      g = acc_t(0);
     grad_input[off + i] = (scalar_t)g;
+    dw0 += dp * x3 * ir3;
+    dw1 += dp * x2 * ir2;
+    dw2 += dp * x * ir1;
     gs_acc += go * (poly + b_val) * m;
   }
 // Internal helpers — shared kernel dispatch
 // ---------------------------------------------------------------------------
 // FWD_LAUNCH(width_val, scalar_type_name): dispatches on input.scalar_type()
 // through MOTIF_DISPATCH_FLOATING_TYPES and launches
 // motif::grouped_poly_norm_fwd_kernel<scalar_t, float, width_val, BLOCK>
 // with the <<<grid, block, 0, stream>>> configuration.
 //
 // The expansion uses names that are NOT macro parameters and must therefore
 // be visible at the point of use: grid, block, stream, BLOCK, output,
 // inv_rms, input, mul, weight, bias, offsets, scores_ptr, eps, D,
 // num_experts, expert_offset and hidden_clamp.
 // eps and hidden_clamp are cast to float and expert_offset to int before
 // being handed to the kernel; inv_rms is always accessed as float.
 #define FWD_LAUNCH(width_val, scalar_type_name)                                \
+  MOTIF_DISPATCH_FLOATING_TYPES(input.scalar_type(), scalar_type_name, [&] {   \
+    motif::grouped_poly_norm_fwd_kernel<scalar_t, float, width_val, BLOCK>     \
+        <<<grid, block, 0, stream>>>(                                          \
+            output.data_ptr<scalar_t>(), inv_rms.data_ptr<float>(),            \
+            input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),              \
+            weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),            \
+            offsets.data_ptr<int32_t>(), scores_ptr, (float)eps, D,            \
+            num_experts, (int)expert_offset, (float)hidden_clamp);             \
+  })
 static std::tuple<torch::Tensor, torch::Tensor>
 _fwd_impl(const torch::Tensor &input, const torch::Tensor &mul,
+          const torch::Tensor &weight, const torch::Tensor &bias,
+          const torch::Tensor &offsets, const float *scores_ptr, double eps,
+          int64_t expert_offset, double hidden_clamp) {
   const int D = input.size(-1);
   const int64_t N = input.size(0);
   const int num_experts = offsets.size(0);
   constexpr int BLOCK = 128;
+  dim3 grid(N);
+  dim3 block(BLOCK);
   auto output = torch::empty_like(input);
   auto inv_rms = torch::empty({N, 3}, input.options().dtype(torch::kFloat));
                   output.data_ptr<scalar_t>(), inv_rms.data_ptr<float>(),
                   input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),
                   weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
+                  offsets.data_ptr<int32_t>(), scores_ptr, (float)eps, D,
+                  num_experts, (int)expert_offset, (float)hidden_clamp);
         });
   }
   return {output, inv_rms};
 #undef FWD_LAUNCH
 // BWD_LAUNCH(width_val, scalar_type_name, kernel_name): dispatches on
 // input.scalar_type() through MOTIF_DISPATCH_FLOATING_TYPES and launches
 // motif::kernel_name<scalar_t, float, width_val, BLOCK> with the
 // <<<grid, block, 0, stream>>> configuration.
 //
 // The expansion uses names that are NOT macro parameters and must therefore
 // be visible at the point of use: grid, block, stream, BLOCK, input_grad,
 // mul_grad, wg_f32, bg_f32, grad_output, input, mul, weight, bias, offsets,
 // inv_rms, scores_ptr, gs_ptr, eps, D, num_experts, expert_offset and
 // hidden_clamp.
 // wg_f32, bg_f32 and inv_rms are accessed as float regardless of scalar_t;
 // eps and hidden_clamp are cast to float and expert_offset to int.
 #define BWD_LAUNCH(width_val, scalar_type_name, kernel_name)                   \
+  MOTIF_DISPATCH_FLOATING_TYPES(input.scalar_type(), scalar_type_name, [&] {   \
+    motif::kernel_name<scalar_t, float, width_val, BLOCK>                      \
+        <<<grid, block, 0, stream>>>(                                          \
+            input_grad.data_ptr<scalar_t>(), mul_grad.data_ptr<scalar_t>(),    \
+            wg_f32.data_ptr<float>(), bg_f32.data_ptr<float>(),                \
+            grad_output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),      \
+            mul.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),             \
+            bias.data_ptr<scalar_t>(), offsets.data_ptr<int32_t>(),            \
+            inv_rms.data_ptr<float>(), scores_ptr, gs_ptr, (float)eps, D,      \
+            num_experts, (int)expert_offset, (float)hidden_clamp);             \
+  })
 static std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
+                  torch::Tensor>
 _bwd_impl(const torch::Tensor &grad_output, const torch::Tensor &input,
+          const torch::Tensor &mul, const torch::Tensor &weight,
+          const torch::Tensor &bias, const torch::Tensor &offsets,
+          const torch::Tensor &inv_rms, const float *scores_ptr, float *gs_ptr,
+          int64_t N, double eps, int64_t expert_offset, double hidden_clamp) {
   const int D = input.size(-1);
   const int num_experts = offsets.size(0);
   constexpr int BLOCK = 128;
+  dim3 grid(N);
+  dim3 block(BLOCK);
   auto input_grad = torch::empty_like(input);
   auto mul_grad = torch::empty_like(mul);
+  auto wg_f32 =
+      torch::zeros({weight.size(0), 3}, input.options().dtype(torch::kFloat));
+  auto bg_f32 =
+      torch::zeros({bias.size(0)}, input.options().dtype(torch::kFloat));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
           motif::grouped_poly_norm_bwd_scalar<scalar_t, float, BLOCK>
               <<<grid, block, 0, stream>>>(
                   input_grad.data_ptr<scalar_t>(),
+                  mul_grad.data_ptr<scalar_t>(), wg_f32.data_ptr<float>(),
+                  bg_f32.data_ptr<float>(), grad_output.data_ptr<scalar_t>(),
                   input.data_ptr<scalar_t>(), mul.data_ptr<scalar_t>(),
                   weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
                   offsets.data_ptr<int32_t>(), inv_rms.data_ptr<float>(),
+                  scores_ptr, gs_ptr, (float)eps, D, num_experts,
+                  (int)expert_offset, (float)hidden_clamp);
         });
   }
 // Public API: without scores
 // ---------------------------------------------------------------------------
 // Forward pass without per-row scores.
 //
 // Thin wrapper: forwards every argument to _fwd_impl and passes nullptr as
 // the scores pointer. The kernels visible in this file substitute a score of
 // 1 when that pointer is null.
 //
 // Returns whatever _fwd_impl returns, i.e. the pair {output, inv_rms}, where
 // output is allocated like input and inv_rms is a float tensor of shape
 // {N, 3} with N = input.size(0).
 //
 // NOTE(review): the backward kernels treat hidden_clamp < 0 as "clamping
 // disabled"; the forward kernel's definition of do_clamp is not visible
 // here - confirm it follows the same convention.
 std::tuple<torch::Tensor, torch::Tensor>
+grouped_poly_norm_forward(const torch::Tensor &input, const torch::Tensor &mul,
+                          const torch::Tensor &weight,
+                          const torch::Tensor &bias,
+                          const torch::Tensor &offsets, double eps,
+                          int64_t expert_offset, double hidden_clamp) {
+  return _fwd_impl(input, mul, weight, bias, offsets, nullptr, eps,
+                   expert_offset, hidden_clamp);
 }
 // Backward pass without per-row scores.
 //
 // Calls _bwd_impl with both the scores pointer and the grad-scores pointer
 // set to nullptr, using N = input.size(0) as the row count, and returns the
 // first four of the five tensors _bwd_impl produces.
 //
 // NOTE(review): ig/mg/wg/bg are presumably the gradients w.r.t. input, mul,
 // weight and bias (matching input_grad, mul_grad, wg_f32, bg_f32 allocated
 // in _bwd_impl) - the return statement of _bwd_impl is not visible here, so
 // confirm the order and dtypes against it.
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+grouped_poly_norm_backward(const torch::Tensor &grad_output,
+                           const torch::Tensor &input, const torch::Tensor &mul,
+                           const torch::Tensor &weight,
+                           const torch::Tensor &bias,
+                           const torch::Tensor &offsets,
+                           const torch::Tensor &inv_rms, double eps,
+                           int64_t expert_offset, double hidden_clamp) {
   const int64_t N = input.size(0);
   // The fifth result is bound to `_` and intentionally dropped: there is no
   // scores input on this path, so no grad-scores tensor is returned.
+  auto [ig, mg, wg, bg, _] =
+      _bwd_impl(grad_output, input, mul, weight, bias, offsets, inv_rms,
+                nullptr, nullptr, N, eps, expert_offset, hidden_clamp);
   return {ig, mg, wg, bg};
 }
 // ---------------------------------------------------------------------------
 // Public API: with scores
 // ---------------------------------------------------------------------------
 // Forward pass with per-row scores.
 //
 // Identical to the unscored forward except that the raw float pointer of
 // `scores` is handed to _fwd_impl instead of nullptr. `scores` is read via
 // data_ptr<float>(), so it has to be a float32 tensor; the kernels index it
 // by row (scores[row]).
 //
 // Returns whatever _fwd_impl returns, i.e. the pair {output, inv_rms}.
+std::tuple<torch::Tensor, torch::Tensor> grouped_poly_norm_forward_scored(
     const torch::Tensor &input, const torch::Tensor &mul,
     const torch::Tensor &weight, const torch::Tensor &bias,
+    const torch::Tensor &offsets, const torch::Tensor &scores, double eps,
+    int64_t expert_offset, double hidden_clamp) {
+  return _fwd_impl(input, mul, weight, bias, offsets, scores.data_ptr<float>(),
+                   eps, expert_offset, hidden_clamp);
 }
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
+           torch::Tensor>
 grouped_poly_norm_backward_scored(
     const torch::Tensor &grad_output, const torch::Tensor &input,
     const torch::Tensor &mul, const torch::Tensor &weight,
     const torch::Tensor &bias, const torch::Tensor &offsets,
+    const torch::Tensor &inv_rms, const torch::Tensor &scores, double eps,
+    int64_t expert_offset, double hidden_clamp) {
   // Backward pass with per-row scores. Returns {ig, mg, wg, bg, gs}: the
   // first four tensors come straight from _bwd_impl, gs is the grad-scores
   // buffer allocated below, reshaped to {N, 1}.
   // `scores` is read via data_ptr<float>(), so it has to be a float32 tensor.
   const int64_t N = input.size(0);
   // Uninitialized float32 buffer of length N on input's device; it is filled
   // through the raw gs_ptr passed to _bwd_impl.
   // NOTE(review): torch::empty does not zero memory, so this relies on the
   // kernel writing every row - the write-back is not visible here; confirm.
   auto gs_f32 = torch::empty({N}, input.options().dtype(torch::kFloat));
   // The fifth value returned by _bwd_impl is ignored; the grad-scores result
   // is taken from gs_f32 instead.
+  auto [ig, mg, wg, bg, _] =
+      _bwd_impl(grad_output, input, mul, weight, bias, offsets, inv_rms,
+                scores.data_ptr<float>(), gs_f32.data_ptr<float>(), N, eps,
+                expert_offset, hidden_clamp);
   // Append a trailing unit dimension: {N} -> {N, 1}.
   auto gs = gs_f32.unsqueeze(-1);
   return {ig, mg, wg, bg, gs};
 }

benchmarks/cases/grouped_mul_poly.py CHANGED Viewed

@@ -5,10 +5,8 @@ from common.diff_engine import DiffCase
 torch._functorch.config.donated_buffer = False
-from grouped_poly_norm import (
-    fused_mul_grouped_poly_norm,
-    fused_mul_grouped_poly_norm_ref,
-)
 # 384 / 8 (EP) = 48 experts per rank
 # total_tokens = bs * sl, which equals per-rank tokens
@@ -28,9 +26,14 @@ class GroupedRefModule(torch.nn.Module):
         self.expert_offset = expert_offset
     def forward(self, x, mul):
-        return fused_mul_grouped_poly_norm_ref(x, mul, self.weight, self.bias,
-                                               self.offsets, self.eps,
-                                               expert_offset=self.expert_offset)
 class GroupedCUDAModule(torch.nn.Module):
@@ -45,8 +48,12 @@ class GroupedCUDAModule(torch.nn.Module):
         self.expert_offset = expert_offset
     def forward(self, x, mul):
-        return fused_mul_grouped_poly_norm(x, mul, self.weight, self.bias,
-                                           self.offsets, self.eps,
                                            expert_offset=self.expert_offset)
@@ -66,29 +73,28 @@ class GroupedMulPoly(DiffCase):
         probs = torch.ones(num_experts) / num_experts
         assignments = torch.multinomial(probs, total_tokens, replacement=True)
         counts = torch.bincount(assignments, minlength=num_experts).tolist()
-        offsets = torch.cumsum(
-            torch.tensor(counts, dtype=torch.int32), dim=0)
         return {
             "x":
-                torch.randn(total_tokens, hidden, dtype=dtype,
-                            requires_grad=True) * 0.5,
             "mul":
-                torch.randn(total_tokens, hidden, dtype=dtype,
-                            requires_grad=True) * 0.5,
             "weight":
-                torch.ones(num_experts, 3, dtype=dtype) / 3 +
-                torch.randn(num_experts, 3, dtype=dtype) * 0.01,
             "bias":
-                torch.randn(num_experts, 1, dtype=dtype) * 0.01,
             "offsets":
-                offsets,
             "dim":
-                hidden,
             "eps":
-                eps,
             "dtype":
-                dtype,
         }
     def make_naive(self, I):

 torch._functorch.config.donated_buffer = False
+from grouped_poly_norm import (fused_mul_grouped_poly_norm,
+                               fused_mul_grouped_poly_norm_ref)
 # 384 / 8 (EP) = 48 experts per rank
 # total_tokens = bs * sl, which equals per-rank tokens
         self.expert_offset = expert_offset
     def forward(self, x, mul):
+        return fused_mul_grouped_poly_norm_ref(
+            x,
+            mul,
+            self.weight,
+            self.bias,
+            self.offsets,
+            self.eps,
+            expert_offset=self.expert_offset)
 class GroupedCUDAModule(torch.nn.Module):
         self.expert_offset = expert_offset
     def forward(self, x, mul):
+        return fused_mul_grouped_poly_norm(x,
+                                           mul,
+                                           self.weight,
+                                           self.bias,
+                                           self.offsets,
+                                           self.eps,
                                            expert_offset=self.expert_offset)
         probs = torch.ones(num_experts) / num_experts
         assignments = torch.multinomial(probs, total_tokens, replacement=True)
         counts = torch.bincount(assignments, minlength=num_experts).tolist()
+        offsets = torch.cumsum(torch.tensor(counts, dtype=torch.int32), dim=0)
         return {
             "x":
+            torch.randn(total_tokens, hidden, dtype=dtype,
+                        requires_grad=True) * 0.5,
             "mul":
+            torch.randn(total_tokens, hidden, dtype=dtype,
+                        requires_grad=True) * 0.5,
             "weight":
+            torch.ones(num_experts, 3, dtype=dtype) / 3 +
+            torch.randn(num_experts, 3, dtype=dtype) * 0.01,
             "bias":
+            torch.randn(num_experts, 1, dtype=dtype) * 0.01,
             "offsets":
+            offsets,
             "dim":
+            hidden,
             "eps":
+            eps,
             "dtype":
+            dtype,
         }
     def make_naive(self, I):

benchmarks/profile_bwd.py DELETED Viewed

@@ -1,146 +0,0 @@
-"""Profiling script for grouped polynorm backward kernel using torch.profiler."""
-import argparse
-import torch
-import torch.cuda
-from torch.profiler import profile, ProfilerActivity
-from grouped_poly_norm import fused_mul_grouped_poly_norm
-torch.set_default_device("cuda")
-def make_inputs(N, D, num_experts):
-    torch.manual_seed(42)
-    probs = torch.ones(num_experts) / num_experts
-    assignments = torch.multinomial(probs, N, replacement=True)
-    counts = torch.bincount(assignments, minlength=num_experts).tolist()
-    offsets = torch.cumsum(
-        torch.tensor(counts, dtype=torch.int32), dim=0)
-    x = torch.randn(N, D, dtype=torch.bfloat16, requires_grad=True) * 0.5
-    m = torch.randn(N, D, dtype=torch.bfloat16, requires_grad=True) * 0.5
-    w = (torch.ones(num_experts, 3, dtype=torch.bfloat16) / 3
-         ).requires_grad_(True)
-    b = (torch.randn(num_experts, 1, dtype=torch.bfloat16) * 0.01
-         ).requires_grad_(True)
-    return x, m, w, b, offsets
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--tokens", type=int, default=4096)
-    parser.add_argument("--dim", type=int, default=1280)
-    parser.add_argument("--experts", type=int, default=48)
-    parser.add_argument("--output", type=str, default="/tmp/profile")
-    args = parser.parse_args()
-    N, D, num_experts = args.tokens, args.dim, args.experts
-    # Warmup (fresh inputs each time to avoid graph reuse issues)
-    for _ in range(3):
-        x, m, w, b, offsets = make_inputs(N, D, num_experts)
-        out = fused_mul_grouped_poly_norm(x, m, w, b, offsets)
-        out.sum().backward()
-    torch.cuda.synchronize()
-    # Profiled: mimic do_bench — forward once, backward multiple times with retain_graph
-    x, m, w, b, offsets = make_inputs(N, D, num_experts)
-    out = fused_mul_grouped_poly_norm(x, m, w, b, offsets)
-    gin = [x, m] + [w, b]
-    g = [torch.randn_like(out)]
-    # Warmup backward
-    for _ in range(5):
-        torch.autograd.grad(out, gin, g, retain_graph=True, allow_unused=True)
-    torch.cuda.synchronize()
-    with profile(
-        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-        record_shapes=True,
-        with_stack=True,
-    ) as prof:
-        for _ in range(100):
-            torch.autograd.grad(out, gin, g, retain_graph=True, allow_unused=True)
-        torch.cuda.synchronize()
-    # Print kernel-level stats
-    print(f"\n=== Kernel Table (N={N}, D={D}) ===")
-    print(prof.key_averages().table(
-        sort_by="cuda_time_total", row_limit=20))
-    # Export chrome trace
-    trace_path = f"{args.output}_trace_N{N}.json"
-    prof.export_chrome_trace(trace_path)
-    print(f"\nTrace exported to {trace_path}")
-    # === Occupancy analysis from Triton kernel metadata ===
-    print(f"\n=== Occupancy Analysis ===")
-    props = torch.cuda.get_device_properties(0)
-    print(f"GPU: {props.name}")
-    print(f"SMs: {props.multi_processor_count}")
-    print(f"Max threads/SM: {props.max_threads_per_multi_processor}")
-    print(f"Regs/SM: {props.regs_per_multiprocessor}")
-    print(f"Shared mem/block: {props.shared_memory_per_block} bytes")
-    # Get register info from Triton compiled cubins
-    try:
-        import glob
-        import json
-        import subprocess
-        cache_dir = os.path.expanduser("~/.triton/cache")
-        # Find metadata JSON files
-        json_files = sorted(glob.glob(f"{cache_dir}/**/*.json", recursive=True),
-                            key=os.path.getmtime, reverse=True)
-        print(f"\nFound {len(json_files)} compiled kernel metadata files")
-        for jf in json_files[:10]:
-            try:
-                with open(jf) as f:
-                    meta = json.load(f)
-                if isinstance(meta, dict):
-                    n_regs = meta.get('num_regs', meta.get('n_regs', None))
-                    n_spills = meta.get('num_spills', meta.get('n_spills', None))
-                    name = meta.get('name', os.path.basename(jf))
-                    shared = meta.get('shared', None)
-                    if n_regs is not None:
-                        print(f"  {name}: regs={n_regs}, spills={n_spills}, shared={shared}")
-            except Exception:
-                pass
-        # Also try cuobjdump on recent cubins
-        cubin_files = sorted(glob.glob(f"{cache_dir}/**/*.cubin", recursive=True),
-                             key=os.path.getmtime, reverse=True)
-        print(f"\nFound {len(cubin_files)} cubins, inspecting latest:")
-        for cb in cubin_files[:5]:
-            try:
-                result = subprocess.run(
-                    ["cuobjdump", "-res-usage", cb],
-                    capture_output=True, text=True, timeout=5)
-                if result.returncode == 0 and result.stdout.strip():
-                    print(f"\n  {os.path.basename(cb)}:")
-                    for line in result.stdout.strip().split('\n'):
-                        print(f"    {line}")
-            except Exception as e:
-                print(f"  cuobjdump failed: {e}")
-                break
-    except Exception as e:
-        print(f"Cache inspection error: {e}")
-    # Calculate theoretical occupancy for different register counts
-    print("\n=== Theoretical Occupancy (num_warps=4, 128 threads/block) ===")
-    threads_per_block = 128
-    max_threads = props.max_threads_per_multi_processor
-    total_regs = props.regs_per_multiprocessor
-    for n_regs in [64, 96, 128, 160, 192, 224, 256]:
-        regs_per_block = n_regs * threads_per_block
-        max_blocks_by_regs = total_regs // regs_per_block
-        max_blocks_by_threads = max_threads // threads_per_block
-        blocks = min(max_blocks_by_regs, max_blocks_by_threads, 32)
-        active_threads = blocks * threads_per_block
-        occupancy = active_threads / max_threads * 100
-        print(f"  {n_regs:3d} regs/thread -> {blocks:2d} blocks/SM -> "
-              f"{active_threads:4d} threads -> {occupancy:.1f}% occupancy")
-if __name__ == "__main__":
-    main()

benchmarks/run_cases.py CHANGED Viewed

@@ -28,8 +28,10 @@ def plot_result(r_path, columns=None):
     import pandas as pd
     df = pd.read_csv(r_path + ".csv")
     if columns is None:
-        columns = [c for c in ["Naive", "Compiled", "Cuda", "Triton"]
-                   if c in df.columns]
     plt.figure(figsize=(12, 6))
     ax = df.plot(x="config", y=columns, kind="bar", ax=plt.gca())
     ax.set_title("Speedup over torch (higher is better)\n" + make_title_tag(),
@@ -64,6 +66,11 @@ def main():
         default="bf16",
         help="Data type for benchmarking (default: bf16)",
     )
     args = ap.parse_args()
     dtype_map = {
@@ -81,12 +88,14 @@ def main():
     mod = importlib.import_module(f"cases.{args.case}")
     case: DiffCase = mod.CASE
-    calculate_diff(
-        case,
-        batch_size=2,
-        seq_len=128,
-        hidden_size=4096,
-    )
     for dtype_name, dtype in dtypes:
         print(f"\n{'=' * 60}")
@@ -161,28 +170,40 @@ def main():
                 itertools.product(dim, batch_size_range, seq_length_range))
             if is_grouped:
-                csv_line_vals = ("naive", "compiled", "cuda", "speedup")
-                csv_line_names = {
                     "naive": "Naive",
                     "compiled": "Compiled",
                     "cuda": "Triton",
                     "speedup": "SpeedUp",
                 }
             else:
-                csv_line_vals = ("naive", "cuda", "speedup")
-                csv_line_names = {
                     "naive": "Naive",
                     "cuda": "Cuda",
                     "speedup": "SpeedUp",
                 }
             bench = make_fwd_benchmark_for_case(
                 case=case,
                 configs=configs,
                 plot_name=f"{args.case}-{dtype_name}-fwd-perf",
                 dtype=dtype,
-                line_vals=csv_line_vals,
-                line_names=csv_line_names,
             )
             bench.run(print_data=True, save_path=save_dir)
@@ -192,8 +213,10 @@ def main():
                 configs=configs,
                 plot_name=f"{args.case}-{dtype_name}-bwd-perf",
                 dtype=dtype,
-                line_vals=csv_line_vals,
-                line_names=csv_line_names,
             )
             bench.run(print_data=True, save_path=save_dir)

     import pandas as pd
     df = pd.read_csv(r_path + ".csv")
     if columns is None:
+        columns = [
+            c for c in ["Naive", "Compiled", "Cuda", "Triton"]
+            if c in df.columns
+        ]
     plt.figure(figsize=(12, 6))
     ax = df.plot(x="config", y=columns, kind="bar", ax=plt.gca())
     ax.set_title("Speedup over torch (higher is better)\n" + make_title_tag(),
         default="bf16",
         help="Data type for benchmarking (default: bf16)",
     )
+    ap.add_argument(
+        "--profile",
+        action="store_true",
+        help="Export chrome traces for backward benchmarks",
+    )
     args = ap.parse_args()
     dtype_map = {
     mod = importlib.import_module(f"cases.{args.case}")
     case: DiffCase = mod.CASE
+    # Correctness checks across multiple configs
+    for bs, sl, hid in [(2, 128, 4096), (8, 4096, 1280), (1, 32768, 1280)]:
+        print(
+            f"Checking correctness: bs={bs}, sl={sl}, D={hid} "
+            f"(N={bs*sl})...",
+            end=" ")
+        calculate_diff(case, batch_size=bs, seq_len=sl, hidden_size=hid)
+        print("✅")
     for dtype_name, dtype in dtypes:
         print(f"\n{'=' * 60}")
                 itertools.product(dim, batch_size_range, seq_length_range))
             if is_grouped:
+                fwd_line_vals = ("naive", "compiled", "cuda", "speedup")
+                fwd_line_names = {
                     "naive": "Naive",
                     "compiled": "Compiled",
                     "cuda": "Triton",
                     "speedup": "SpeedUp",
                 }
+                bwd_line_vals = ("naive", "compiled", "compiled_cuda",
+                                 "speedup")
+                bwd_line_names = {
+                    "naive": "Naive",
+                    "compiled": "Compiled",
+                    "compiled_cuda": "CompiledCUDA",
+                    "speedup": "SpeedUp",
+                }
             else:
+                fwd_line_vals = ("naive", "cuda", "speedup")
+                fwd_line_names = {
                     "naive": "Naive",
                     "cuda": "Cuda",
                     "speedup": "SpeedUp",
                 }
+                bwd_line_vals = fwd_line_vals
+                bwd_line_names = fwd_line_names
             bench = make_fwd_benchmark_for_case(
                 case=case,
                 configs=configs,
                 plot_name=f"{args.case}-{dtype_name}-fwd-perf",
                 dtype=dtype,
+                line_vals=fwd_line_vals,
+                line_names=fwd_line_names,
+                profile=args.profile,
+                profile_dir=os.path.join(save_dir, "traces"),
             )
             bench.run(print_data=True, save_path=save_dir)
                 configs=configs,
                 plot_name=f"{args.case}-{dtype_name}-bwd-perf",
                 dtype=dtype,
+                line_vals=bwd_line_vals,
+                line_names=bwd_line_names,
+                profile=args.profile,
+                profile_dir=os.path.join(save_dir, "traces"),
             )
             bench.run(print_data=True, save_path=save_dir)

registration.h CHANGED Viewed

@@ -2,8 +2,8 @@
 // Local build compatibility shim for kernel-builder's registration.h
-#include <torch/library.h>
 #include <torch/extension.h>
 // TORCH_LIBRARY_EXPAND may not be defined in all PyTorch versions
 #ifndef TORCH_LIBRARY_EXPAND
@@ -11,4 +11,5 @@
 #endif
 // Generate the PyInit_<name> entry point for the shared library
-#define REGISTER_EXTENSION(name) PYBIND11_MODULE(name, m) {}

 // Local build compatibility shim for kernel-builder's registration.h
 #include <torch/extension.h>
+#include <torch/library.h>
 // TORCH_LIBRARY_EXPAND may not be defined in all PyTorch versions
 #ifndef TORCH_LIBRARY_EXPAND
 #endif
 // Generate the PyInit_<name> entry point for the shared library
+#define REGISTER_EXTENSION(name)                                               \
+  PYBIND11_MODULE(name, m) {}

setup.py CHANGED Viewed

@@ -44,9 +44,9 @@ NVCC_FLAGS = [
     "--use_fast_math",
     "-std=c++17",
     # Generate code for common architectures
-    "-gencode=arch=compute_80,code=sm_80",   # A100
-    "-gencode=arch=compute_89,code=sm_89",   # L40/4090
-    "-gencode=arch=compute_90,code=sm_90",   # H100
 ]
 # Check for B200 support (sm_100, requires CUDA 12.8+)

     "--use_fast_math",
     "-std=c++17",
     # Generate code for common architectures
+    "-gencode=arch=compute_80,code=sm_80",  # A100
+    "-gencode=arch=compute_89,code=sm_89",  # L40/4090
+    "-gencode=arch=compute_90,code=sm_90",  # H100
 ]
 # Check for B200 support (sm_100, requires CUDA 12.8+)

tests/test_fused_mul_grouped_poly_norm.py CHANGED Viewed

@@ -1,10 +1,6 @@
 import pytest
 import torch
-from grouped_poly_norm import (
-    _has_cuda_ops,
-    fused_mul_grouped_poly_norm_ref,
-)
 if _has_cuda_ops:
     from grouped_poly_norm import fused_mul_grouped_poly_norm
@@ -23,12 +19,19 @@ CUDA_DEVICES = ["cuda:0"]
 def _counts_to_offsets(counts_list, device):
     """Convert list of counts to cumsum offsets tensor."""
-    return torch.cumsum(
-        torch.tensor(counts_list, device=device, dtype=torch.int32), dim=0).to(torch.int32)
-def _make_inputs(total_tokens, hidden_dim, num_experts, dtype, device,
-                 seed=42, expert_offset=0):
     """Create deterministic test inputs with random token distribution."""
     torch.manual_seed(seed)
@@ -57,40 +60,64 @@ def _make_scores(total_tokens, device, dtype=torch.float32):
     return torch.rand(total_tokens, 1, device=device, dtype=dtype) * 0.5 + 0.5
-def _run_ref(input_t, mul_t, weight, bias, offsets, expert_offset=0,
-             scores=None, hidden_clamp=None):
     """Run reference forward + backward, return output and grads."""
     inp = input_t.clone().detach().requires_grad_(True)
     m = mul_t.clone().detach().requires_grad_(True)
     w = weight.clone().detach().requires_grad_(True)
     b = bias.clone().detach().requires_grad_(True)
-    s = scores.clone().detach().requires_grad_(True) if scores is not None else None
-    out = fused_mul_grouped_poly_norm_ref(inp, m, w, b, offsets,
                                           expert_offset=expert_offset,
-                                          scores=s, hidden_clamp=hidden_clamp)
     out.sum().backward()
     grads = (out, inp.grad, m.grad, w.grad, b.grad)
-    return grads + (s.grad,) if s is not None else grads + (None,)
-def _run_cuda(input_t, mul_t, weight, bias, offsets, expert_offset=0,
-                scores=None, hidden_clamp=None):
     """Run CUDA forward + backward, return output and grads."""
     inp = input_t.clone().detach().requires_grad_(True)
     m = mul_t.clone().detach().requires_grad_(True)
     w = weight.clone().detach().requires_grad_(True)
     b = bias.clone().detach().requires_grad_(True)
-    s = scores.clone().detach().requires_grad_(True) if scores is not None else None
-    out = fused_mul_grouped_poly_norm(inp, m, w, b, offsets,
                                       expert_offset=expert_offset,
-                                      scores=s, hidden_clamp=hidden_clamp)
     out.sum().backward()
     grads = (out, inp.grad, m.grad, w.grad, b.grad)
-    return grads + (s.grad,) if s is not None else grads + (None,)
 @pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
@@ -113,15 +140,26 @@ def test_fused_mul_grouped_poly_norm_forward(
     """CUDA forward output should match PyTorch reference."""
     torch.set_default_device(device)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
-        num_tokens, d, num_experts, dtype, device, seed,
         expert_offset=expert_offset)
-    out_ref = fused_mul_grouped_poly_norm_ref(input_t, mul_t, weight, bias,
                                               offsets,
                                               expert_offset=expert_offset)
-    out_tri = fused_mul_grouped_poly_norm(input_t, mul_t, weight, bias,
-                                         offsets,
-                                         expert_offset=expert_offset)
     assert out_ref.shape == out_tri.shape == (num_tokens, d)
     assert out_ref.dtype == out_tri.dtype == dtype
@@ -152,7 +190,12 @@ def test_fused_mul_grouped_poly_norm_backward(
     """CUDA backward gradients should match PyTorch reference."""
     torch.set_default_device(device)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
-        num_tokens, d, num_experts, dtype, device, seed,
         expert_offset=expert_offset)
     _, inp_grad_ref, mul_grad_ref, w_grad_ref, b_grad_ref, _ = _run_ref(
@@ -195,12 +238,18 @@ def test_fused_mul_grouped_poly_norm_zero_token_experts(
     bias = torch.zeros(total_experts, 1, device=device, dtype=dtype)
     offsets = _counts_to_offsets(counts, device)
-    out_ref = fused_mul_grouped_poly_norm_ref(input_t, mul_t, weight, bias,
                                               offsets,
                                               expert_offset=expert_offset)
-    out_tri = fused_mul_grouped_poly_norm(input_t, mul_t, weight, bias,
-                                         offsets,
-                                         expert_offset=expert_offset)
     if dtype == torch.float32:
         assert_close(out_ref, out_tri, atol=1e-4, rtol=1e-4)
@@ -208,12 +257,18 @@ def test_fused_mul_grouped_poly_norm_zero_token_experts(
         assert_close(out_ref, out_tri, atol=1e-2, rtol=1e-2)
     # Check backward with zero-token experts
-    _, _, _, w_grad_ref, b_grad_ref, _ = _run_ref(input_t, mul_t, weight, bias,
                                                    offsets,
                                                    expert_offset=expert_offset)
-    _, _, _, w_grad_tri, b_grad_tri, _ = _run_cuda(input_t, mul_t, weight, bias,
-                                                      offsets,
-                                                      expert_offset=expert_offset)
     if dtype == torch.float32:
         atol, rtol = 1e-3, 1e-3
@@ -270,7 +325,11 @@ def test_fused_mul_grouped_poly_norm_no_nan_inf(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_fused_mul_grouped_poly_norm_scores_forward(
-    num_tokens, d, num_experts, dtype, device,
 ):
     """Forward with scores should match reference."""
     torch.set_default_device(device)
@@ -278,10 +337,18 @@ def test_fused_mul_grouped_poly_norm_scores_forward(
         num_tokens, d, num_experts, dtype, device)
     scores = _make_scores(num_tokens, device)
-    out_ref = fused_mul_grouped_poly_norm_ref(
-        input_t, mul_t, weight, bias, offsets, scores=scores)
-    out_tri = fused_mul_grouped_poly_norm(
-        input_t, mul_t, weight, bias, offsets, scores=scores)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
     assert_close(out_ref, out_tri, atol=atol, rtol=rtol)
@@ -294,7 +361,11 @@ def test_fused_mul_grouped_poly_norm_scores_forward(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_fused_mul_grouped_poly_norm_scores_backward(
-    num_tokens, d, num_experts, dtype, device,
 ):
     """Backward with scores should match reference."""
     torch.set_default_device(device)
@@ -302,10 +373,18 @@ def test_fused_mul_grouped_poly_norm_scores_backward(
         num_tokens, d, num_experts, dtype, device)
     scores = _make_scores(num_tokens, device)
-    out_ref, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
-        input_t, mul_t, weight, bias, offsets, scores=scores)
-    out_tri, ig_tri, mg_tri, wg_tri, bg_tri, sg_tri = _run_cuda(
-        input_t, mul_t, weight, bias, offsets, scores=scores)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
     # weight/bias grads use atomicAdd accumulation across tokens,
@@ -332,7 +411,12 @@ CLAMP_VALUES = [10.0, 1.0, 0.5]
 @pytest.mark.parametrize("hidden_clamp", CLAMP_VALUES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_fused_mul_grouped_poly_norm_hidden_clamp_forward(
-    num_tokens, d, num_experts, dtype, hidden_clamp, device,
 ):
     """Forward with hidden_clamp should match reference."""
     torch.set_default_device(device)
@@ -340,12 +424,20 @@ def test_fused_mul_grouped_poly_norm_hidden_clamp_forward(
         num_tokens, d, num_experts, dtype, device)
     scores = _make_scores(num_tokens, device)
-    out_ref = fused_mul_grouped_poly_norm_ref(
-        input_t, mul_t, weight, bias, offsets,
-        scores=scores, hidden_clamp=hidden_clamp)
-    out_tri = fused_mul_grouped_poly_norm(
-        input_t, mul_t, weight, bias, offsets,
-        scores=scores, hidden_clamp=hidden_clamp)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
     assert_close(out_ref, out_tri, atol=atol, rtol=rtol)
@@ -359,7 +451,12 @@ def test_fused_mul_grouped_poly_norm_hidden_clamp_forward(
 @pytest.mark.parametrize("hidden_clamp", CLAMP_VALUES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_fused_mul_grouped_poly_norm_hidden_clamp_backward(
-    num_tokens, d, num_experts, dtype, hidden_clamp, device,
 ):
     """Backward with hidden_clamp should match reference."""
     torch.set_default_device(device)
@@ -368,11 +465,21 @@ def test_fused_mul_grouped_poly_norm_hidden_clamp_backward(
     scores = _make_scores(num_tokens, device)
     out_ref, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
-        input_t, mul_t, weight, bias, offsets,
-        scores=scores, hidden_clamp=hidden_clamp)
     out_tri, ig_tri, mg_tri, wg_tri, bg_tri, sg_tri = _run_cuda(
-        input_t, mul_t, weight, bias, offsets,
-        scores=scores, hidden_clamp=hidden_clamp)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
     # weight/bias grads use atomicAdd accumulation across tokens,

 import pytest
 import torch
+from grouped_poly_norm import _has_cuda_ops, fused_mul_grouped_poly_norm_ref
 if _has_cuda_ops:
     from grouped_poly_norm import fused_mul_grouped_poly_norm
 def _counts_to_offsets(counts_list, device):
     """Convert list of counts to cumsum offsets tensor."""
+    return torch.cumsum(torch.tensor(counts_list,
+                                     device=device,
+                                     dtype=torch.int32),
+                        dim=0).to(torch.int32)
+def _make_inputs(total_tokens,
+                 hidden_dim,
+                 num_experts,
+                 dtype,
+                 device,
+                 seed=42,
+                 expert_offset=0):
     """Create deterministic test inputs with random token distribution."""
     torch.manual_seed(seed)
     return torch.rand(total_tokens, 1, device=device, dtype=dtype) * 0.5 + 0.5
+def _run_ref(input_t,
+             mul_t,
+             weight,
+             bias,
+             offsets,
+             expert_offset=0,
+             scores=None,
+             hidden_clamp=None):
     """Run reference forward + backward, return output and grads."""
     inp = input_t.clone().detach().requires_grad_(True)
     m = mul_t.clone().detach().requires_grad_(True)
     w = weight.clone().detach().requires_grad_(True)
     b = bias.clone().detach().requires_grad_(True)
+    s = scores.clone().detach().requires_grad_(
+        True) if scores is not None else None
+    out = fused_mul_grouped_poly_norm_ref(inp,
+                                          m,
+                                          w,
+                                          b,
+                                          offsets,
                                           expert_offset=expert_offset,
+                                          scores=s,
+                                          hidden_clamp=hidden_clamp)
     out.sum().backward()
     grads = (out, inp.grad, m.grad, w.grad, b.grad)
+    return grads + (s.grad, ) if s is not None else grads + (None, )
+def _run_cuda(input_t,
+              mul_t,
+              weight,
+              bias,
+              offsets,
+              expert_offset=0,
+              scores=None,
+              hidden_clamp=None):
     """Run CUDA forward + backward, return output and grads."""
     inp = input_t.clone().detach().requires_grad_(True)
     m = mul_t.clone().detach().requires_grad_(True)
     w = weight.clone().detach().requires_grad_(True)
     b = bias.clone().detach().requires_grad_(True)
+    s = scores.clone().detach().requires_grad_(
+        True) if scores is not None else None
+    out = fused_mul_grouped_poly_norm(inp,
+                                      m,
+                                      w,
+                                      b,
+                                      offsets,
                                       expert_offset=expert_offset,
+                                      scores=s,
+                                      hidden_clamp=hidden_clamp)
     out.sum().backward()
     grads = (out, inp.grad, m.grad, w.grad, b.grad)
+    return grads + (s.grad, ) if s is not None else grads + (None, )
 @pytest.mark.skipif(not _has_cuda_ops, reason="CUDA ops not available")
     """CUDA forward output should match PyTorch reference."""
     torch.set_default_device(device)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
+        num_tokens,
+        d,
+        num_experts,
+        dtype,
+        device,
+        seed,
         expert_offset=expert_offset)
+    out_ref = fused_mul_grouped_poly_norm_ref(input_t,
+                                              mul_t,
+                                              weight,
+                                              bias,
                                               offsets,
                                               expert_offset=expert_offset)
+    out_tri = fused_mul_grouped_poly_norm(input_t,
+                                          mul_t,
+                                          weight,
+                                          bias,
+                                          offsets,
+                                          expert_offset=expert_offset)
     assert out_ref.shape == out_tri.shape == (num_tokens, d)
     assert out_ref.dtype == out_tri.dtype == dtype
     """CUDA backward gradients should match PyTorch reference."""
     torch.set_default_device(device)
     input_t, mul_t, weight, bias, offsets = _make_inputs(
+        num_tokens,
+        d,
+        num_experts,
+        dtype,
+        device,
+        seed,
         expert_offset=expert_offset)
     _, inp_grad_ref, mul_grad_ref, w_grad_ref, b_grad_ref, _ = _run_ref(
     bias = torch.zeros(total_experts, 1, device=device, dtype=dtype)
     offsets = _counts_to_offsets(counts, device)
+    out_ref = fused_mul_grouped_poly_norm_ref(input_t,
+                                              mul_t,
+                                              weight,
+                                              bias,
                                               offsets,
                                               expert_offset=expert_offset)
+    out_tri = fused_mul_grouped_poly_norm(input_t,
+                                          mul_t,
+                                          weight,
+                                          bias,
+                                          offsets,
+                                          expert_offset=expert_offset)
     if dtype == torch.float32:
         assert_close(out_ref, out_tri, atol=1e-4, rtol=1e-4)
         assert_close(out_ref, out_tri, atol=1e-2, rtol=1e-2)
     # Check backward with zero-token experts
+    _, _, _, w_grad_ref, b_grad_ref, _ = _run_ref(input_t,
+                                                  mul_t,
+                                                  weight,
+                                                  bias,
+                                                  offsets,
+                                                  expert_offset=expert_offset)
+    _, _, _, w_grad_tri, b_grad_tri, _ = _run_cuda(input_t,
+                                                   mul_t,
+                                                   weight,
+                                                   bias,
                                                    offsets,
                                                    expert_offset=expert_offset)
     if dtype == torch.float32:
         atol, rtol = 1e-3, 1e-3
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_fused_mul_grouped_poly_norm_scores_forward(
+    num_tokens,
+    d,
+    num_experts,
+    dtype,
+    device,
 ):
     """Forward with scores should match reference."""
     torch.set_default_device(device)
         num_tokens, d, num_experts, dtype, device)
     scores = _make_scores(num_tokens, device)
+    out_ref = fused_mul_grouped_poly_norm_ref(input_t,
+                                              mul_t,
+                                              weight,
+                                              bias,
+                                              offsets,
+                                              scores=scores)
+    out_tri = fused_mul_grouped_poly_norm(input_t,
+                                          mul_t,
+                                          weight,
+                                          bias,
+                                          offsets,
+                                          scores=scores)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
     assert_close(out_ref, out_tri, atol=atol, rtol=rtol)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_fused_mul_grouped_poly_norm_scores_backward(
+    num_tokens,
+    d,
+    num_experts,
+    dtype,
+    device,
 ):
     """Backward with scores should match reference."""
     torch.set_default_device(device)
         num_tokens, d, num_experts, dtype, device)
     scores = _make_scores(num_tokens, device)
+    out_ref, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(input_t,
+                                                               mul_t,
+                                                               weight,
+                                                               bias,
+                                                               offsets,
+                                                               scores=scores)
+    out_tri, ig_tri, mg_tri, wg_tri, bg_tri, sg_tri = _run_cuda(input_t,
+                                                                mul_t,
+                                                                weight,
+                                                                bias,
+                                                                offsets,
+                                                                scores=scores)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
     # weight/bias grads use atomicAdd accumulation across tokens,
 @pytest.mark.parametrize("hidden_clamp", CLAMP_VALUES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_fused_mul_grouped_poly_norm_hidden_clamp_forward(
+    num_tokens,
+    d,
+    num_experts,
+    dtype,
+    hidden_clamp,
+    device,
 ):
     """Forward with hidden_clamp should match reference."""
     torch.set_default_device(device)
         num_tokens, d, num_experts, dtype, device)
     scores = _make_scores(num_tokens, device)
+    out_ref = fused_mul_grouped_poly_norm_ref(input_t,
+                                              mul_t,
+                                              weight,
+                                              bias,
+                                              offsets,
+                                              scores=scores,
+                                              hidden_clamp=hidden_clamp)
+    out_tri = fused_mul_grouped_poly_norm(input_t,
+                                          mul_t,
+                                          weight,
+                                          bias,
+                                          offsets,
+                                          scores=scores,
+                                          hidden_clamp=hidden_clamp)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
     assert_close(out_ref, out_tri, atol=atol, rtol=rtol)
 @pytest.mark.parametrize("hidden_clamp", CLAMP_VALUES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_fused_mul_grouped_poly_norm_hidden_clamp_backward(
+    num_tokens,
+    d,
+    num_experts,
+    dtype,
+    hidden_clamp,
+    device,
 ):
     """Backward with hidden_clamp should match reference."""
     torch.set_default_device(device)
     scores = _make_scores(num_tokens, device)
     out_ref, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
+        input_t,
+        mul_t,
+        weight,
+        bias,
+        offsets,
+        scores=scores,
+        hidden_clamp=hidden_clamp)
     out_tri, ig_tri, mg_tri, wg_tri, bg_tri, sg_tri = _run_cuda(
+        input_t,
+        mul_t,
+        weight,
+        bias,
+        offsets,
+        scores=scores,
+        hidden_clamp=hidden_clamp)
     atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
     # weight/bias grads use atomicAdd accumulation across tokens,

torch-ext/activation/grouped_poly_norm.py CHANGED Viewed

@@ -37,37 +37,42 @@ _has_cuda_ops = _ops is not None and hasattr(_ops, 'grouped_poly_norm_forward')
 # Register fake (meta) tensor implementations for torch.compile
 if _has_cuda_ops:
     try:
         @torch.library.register_fake("_activation::grouped_poly_norm_forward")
         def _fwd_fake(input, mul, weight, bias, offsets, eps, expert_offset,
-                       hidden_clamp):
             return (torch.empty_like(input),
-                    torch.empty(input.shape[0], 3, dtype=torch.float32,
                                 device=input.device))
         @torch.library.register_fake("_activation::grouped_poly_norm_backward")
         def _bwd_fake(grad_output, input, mul, weight, bias, offsets, inv_rms,
-                       eps, expert_offset, hidden_clamp):
-            return (torch.empty_like(input),
-                    torch.empty_like(mul),
-                    torch.empty_like(weight),
-                    torch.empty_like(bias))
-        @torch.library.register_fake("_activation::grouped_poly_norm_forward_scored")
-        def _fwd_scored_fake(input, mul, weight, bias, offsets, scores,
-                              eps, expert_offset, hidden_clamp):
             return (torch.empty_like(input),
-                    torch.empty(input.shape[0], 3, dtype=torch.float32,
                                 device=input.device))
-        @torch.library.register_fake("_activation::grouped_poly_norm_backward_scored")
         def _bwd_scored_fake(grad_output, input, mul, weight, bias, offsets,
-                              inv_rms, scores, eps, expert_offset,
-                              hidden_clamp):
-            return (torch.empty_like(input),
-                    torch.empty_like(mul),
-                    torch.empty_like(weight),
-                    torch.empty_like(bias),
-                    torch.empty(input.shape[0], 1, dtype=torch.float32,
                                 device=input.device))
     except Exception:
         pass  # already registered
@@ -111,7 +116,8 @@ def fused_mul_grouped_poly_norm_ref(
     orig_dtype = input.dtype
     token_positions = torch.arange(input.shape[0], device=input.device)
-    expert_idx = torch.bucketize(token_positions, offsets, right=True) + expert_offset
     weight_fp32 = weight.float()
     bias_fp32 = bias.float()
@@ -182,25 +188,25 @@ if _has_cuda_ops:
         """With scores — same pattern, adds scores + hidden_clamp."""
         @staticmethod
-        def forward(input, mul, weight, bias, offsets, scores,
-                    eps, expert_offset, hidden_clamp):
             input = input.contiguous()
             mul = mul.contiguous()
             assert scores.dtype == torch.float32, \
                 f"scores must be float32, got {scores.dtype}"
             scores = scores.contiguous()
             output, inv_rms = _ops.grouped_poly_norm_forward_scored(
-                input, mul, weight, bias, offsets, scores, eps,
-                expert_offset, hidden_clamp)
             return output, inv_rms
         @staticmethod
         def setup_context(ctx, inputs, output):
-            (input, mul, weight, bias, offsets, scores,
-             eps, expert_offset, hidden_clamp) = inputs
             _, inv_rms = output
-            ctx.save_for_backward(input, mul, weight, bias, offsets,
-                                  inv_rms, scores)
             ctx.eps = eps
             ctx.expert_offset = expert_offset
             ctx.hidden_clamp = hidden_clamp
@@ -244,13 +250,14 @@ if _has_cuda_ops:
         """
         clamp_val = -1.0 if hidden_clamp is None else float(hidden_clamp)
         if scores is not None:
-            output, _ = _GroupedPolyNormScoredFn.apply(
-                input, mul, weight, bias, offsets, scores, eps,
-                expert_offset, clamp_val)
         else:
-            output, _ = _GroupedPolyNormFn.apply(
-                input, mul, weight, bias, offsets, eps, expert_offset,
-                clamp_val)
         return output
 else:

 # Register fake (meta) tensor implementations for torch.compile
 if _has_cuda_ops:
     try:
         @torch.library.register_fake("_activation::grouped_poly_norm_forward")
         def _fwd_fake(input, mul, weight, bias, offsets, eps, expert_offset,
+                      hidden_clamp):
             return (torch.empty_like(input),
+                    torch.empty(input.shape[0],
+                                3,
+                                dtype=torch.float32,
                                 device=input.device))
         @torch.library.register_fake("_activation::grouped_poly_norm_backward")
         def _bwd_fake(grad_output, input, mul, weight, bias, offsets, inv_rms,
+                      eps, expert_offset, hidden_clamp):
+            return (torch.empty_like(input), torch.empty_like(mul),
+                    torch.empty_like(weight), torch.empty_like(bias))
+        @torch.library.register_fake(
+            "_activation::grouped_poly_norm_forward_scored")
+        def _fwd_scored_fake(input, mul, weight, bias, offsets, scores, eps,
+                             expert_offset, hidden_clamp):
             return (torch.empty_like(input),
+                    torch.empty(input.shape[0],
+                                3,
+                                dtype=torch.float32,
                                 device=input.device))
+        @torch.library.register_fake(
+            "_activation::grouped_poly_norm_backward_scored")
         def _bwd_scored_fake(grad_output, input, mul, weight, bias, offsets,
+                             inv_rms, scores, eps, expert_offset,
+                             hidden_clamp):
+            return (torch.empty_like(input), torch.empty_like(mul),
+                    torch.empty_like(weight), torch.empty_like(bias),
+                    torch.empty(input.shape[0],
+                                1,
+                                dtype=torch.float32,
                                 device=input.device))
     except Exception:
         pass  # already registered
     orig_dtype = input.dtype
     token_positions = torch.arange(input.shape[0], device=input.device)
+    expert_idx = torch.bucketize(token_positions, offsets,
+                                 right=True) + expert_offset
     weight_fp32 = weight.float()
     bias_fp32 = bias.float()
         """With scores — same pattern, adds scores + hidden_clamp."""
         @staticmethod
+        def forward(input, mul, weight, bias, offsets, scores, eps,
+                    expert_offset, hidden_clamp):
             input = input.contiguous()
             mul = mul.contiguous()
             assert scores.dtype == torch.float32, \
                 f"scores must be float32, got {scores.dtype}"
             scores = scores.contiguous()
             output, inv_rms = _ops.grouped_poly_norm_forward_scored(
+                input, mul, weight, bias, offsets, scores, eps, expert_offset,
+                hidden_clamp)
             return output, inv_rms
         @staticmethod
         def setup_context(ctx, inputs, output):
+            (input, mul, weight, bias, offsets, scores, eps, expert_offset,
+             hidden_clamp) = inputs
             _, inv_rms = output
+            ctx.save_for_backward(input, mul, weight, bias, offsets, inv_rms,
+                                  scores)
             ctx.eps = eps
             ctx.expert_offset = expert_offset
             ctx.hidden_clamp = hidden_clamp
         """
         clamp_val = -1.0 if hidden_clamp is None else float(hidden_clamp)
         if scores is not None:
+            output, _ = _GroupedPolyNormScoredFn.apply(input, mul, weight,
+                                                       bias, offsets, scores,
+                                                       eps, expert_offset,
+                                                       clamp_val)
         else:
+            output, _ = _GroupedPolyNormFn.apply(input, mul, weight, bias,
+                                                 offsets, eps, expert_offset,
+                                                 clamp_val)
         return output
 else:

torch-ext/torch_binding.cpp CHANGED Viewed

@@ -50,32 +50,36 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
            &fused_add_rms_norm_backward);
   // grouped_poly_norm (without scores, hidden_clamp < 0 = disabled)
-  ops.def("grouped_poly_norm_forward("
-          "Tensor input, Tensor mul, Tensor weight, "
-          "Tensor bias, Tensor offsets, "
-          "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor)");
   ops.impl("grouped_poly_norm_forward", torch::kCUDA,
            &grouped_poly_norm_forward);
   ops.def("grouped_poly_norm_backward("
           "Tensor grad_output, Tensor input, Tensor mul, Tensor weight, "
           "Tensor bias, Tensor offsets, Tensor inv_rms, "
-          "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor, Tensor, Tensor)");
   ops.impl("grouped_poly_norm_backward", torch::kCUDA,
            &grouped_poly_norm_backward);
   // grouped_poly_norm (with scores)
-  ops.def("grouped_poly_norm_forward_scored("
-          "Tensor input, Tensor mul, Tensor weight, "
-          "Tensor bias, Tensor offsets, Tensor scores, "
-          "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor)");
   ops.impl("grouped_poly_norm_forward_scored", torch::kCUDA,
            &grouped_poly_norm_forward_scored);
   ops.def("grouped_poly_norm_backward_scored("
           "Tensor grad_output, Tensor input, Tensor mul, Tensor weight, "
           "Tensor bias, Tensor offsets, Tensor inv_rms, Tensor scores, "
-          "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor, Tensor, Tensor, Tensor)");
   ops.impl("grouped_poly_norm_backward_scored", torch::kCUDA,
            &grouped_poly_norm_backward_scored);
 }

            &fused_add_rms_norm_backward);
   // grouped_poly_norm (without scores, hidden_clamp < 0 = disabled)
+  ops.def(
+      "grouped_poly_norm_forward("
+      "Tensor input, Tensor mul, Tensor weight, "
+      "Tensor bias, Tensor offsets, "
+      "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor)");
   ops.impl("grouped_poly_norm_forward", torch::kCUDA,
            &grouped_poly_norm_forward);
   ops.def("grouped_poly_norm_backward("
           "Tensor grad_output, Tensor input, Tensor mul, Tensor weight, "
           "Tensor bias, Tensor offsets, Tensor inv_rms, "
+          "float eps, int expert_offset, float hidden_clamp) -> (Tensor, "
+          "Tensor, Tensor, Tensor)");
   ops.impl("grouped_poly_norm_backward", torch::kCUDA,
            &grouped_poly_norm_backward);
   // grouped_poly_norm (with scores)
+  ops.def(
+      "grouped_poly_norm_forward_scored("
+      "Tensor input, Tensor mul, Tensor weight, "
+      "Tensor bias, Tensor offsets, Tensor scores, "
+      "float eps, int expert_offset, float hidden_clamp) -> (Tensor, Tensor)");
   ops.impl("grouped_poly_norm_forward_scored", torch::kCUDA,
            &grouped_poly_norm_forward_scored);
   ops.def("grouped_poly_norm_backward_scored("
           "Tensor grad_output, Tensor input, Tensor mul, Tensor weight, "
           "Tensor bias, Tensor offsets, Tensor inv_rms, Tensor scores, "
+          "float eps, int expert_offset, float hidden_clamp) -> (Tensor, "
+          "Tensor, Tensor, Tensor, Tensor)");
   ops.impl("grouped_poly_norm_backward_scored", torch::kCUDA,
            &grouped_poly_norm_backward_scored);
 }