Kernels
wyldecat Claude Opus 4.6 (1M context) committed on
Commit
09ecd67
·
1 Parent(s): 9dcee96

fix: unify all backward kernels to input-based math + fix test import

Browse files

- Always save input in setup_context (no output-based path)
- bwd_fused: input-based math (d_sum = sum(dy*x*w), grad = scale*dy*w - dxx*x)
- bwd_large_input_grad: input-based math
- bwd_large_weight_grad: input-based (weight_grad = dy*x*scale, needs inv_rms)
- bwd_scalar: input-based (unchanged)
- pytest.ini: add pythonpath to prevent activation/ CUDA dir namespace shadow

Eliminates bf16 precision loss from output-based x recovery (y/(w*scale)).
All 48 test_rms_norm tests pass. No performance regression.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

activation/rms_norm.cu CHANGED
@@ -223,20 +223,17 @@ __global__ void rms_norm_fwd_scalar(scalar_t *__restrict__ out,
223
  // Backward: single-pass with register caching (NVECS vecs per thread)
224
  // ---------------------------------------------------------------------------
225
  // ---------------------------------------------------------------------------
226
- // Backward input_grad (dim > 8192): output-based, 2-pass
227
  // Architecture: 1 block/token, 256 threads, __launch_bounds__(256, 4)
228
- // Uses forward output y instead of input x (saved by autograd):
229
- // d_sum = (1/scale) * sum(dy * y) no weight read in pass 1
230
- // input_grad = scale*dy*w - dxx*y/(w*scale)
231
- // Pass 1: read dy + y → d_sum (warp shuffle reduction)
232
- // Pass 2: read dy + y + w (L1 hit) → write input_grad
233
  // Weight grad computed by separate bwd_large_weight_grad kernel
234
  // ---------------------------------------------------------------------------
235
  template <typename scalar_t, typename acc_t, int width>
236
  __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_input_grad(
237
  scalar_t *__restrict__ input_grad,
238
  const scalar_t *__restrict__ output_grad, // dy
239
- const scalar_t *__restrict__ output, // y = x * w * scale
240
  const scalar_t *__restrict__ weight, const acc_t *__restrict__ inv_rms,
241
  const int d) {
242
  using vec_t = type_vec_t<scalar_t, width>;
@@ -245,8 +242,7 @@ __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_input_grad(
245
  const int vec_d = d / width;
246
  const int64_t vec_offset = token_idx * vec_d;
247
 
248
- const vec_t *__restrict__ output_vec =
249
- reinterpret_cast<const vec_t *>(output);
250
  const vec_t *__restrict__ output_grad_vec =
251
  reinterpret_cast<const vec_t *>(output_grad);
252
  const vec_t *__restrict__ weight_vec =
@@ -254,46 +250,44 @@ __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_input_grad(
254
 
255
  acc_t scale = inv_rms[token_idx];
256
 
257
- // Pass 1: d_sum = (1/scale) * sum(dy * y) only reads dy and y!
258
- acc_t dy_y_sum = 0.0f;
259
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
260
- vec_t y_vec = output_vec[vec_offset + vidx];
261
  vec_t dy_vec = output_grad_vec[vec_offset + vidx];
 
262
  #pragma unroll
263
  for (int i = 0; i < width; ++i) {
264
- dy_y_sum += static_cast<acc_t>(dy_vec.data[i]) *
265
- static_cast<acc_t>(y_vec.data[i]);
 
266
  }
267
  }
268
 
269
- dy_y_sum = block_reduce_sum(dy_y_sum);
270
 
271
- // d_sum = dy_y_sum / scale, dxx = d_sum * scale^3 / d = dy_y_sum * scale^2 /
272
- // d
273
  __shared__ acc_t s_dxx;
274
  if (threadIdx.x == 0) {
275
- s_dxx = dy_y_sum * scale * scale / d;
276
  }
277
  __syncthreads();
278
  acc_t dxx = s_dxx;
279
 
280
- // Pass 2: input_grad = scale * dy * w - dxx * y / (w * scale)
281
  vec_t *__restrict__ input_grad_vec = reinterpret_cast<vec_t *>(input_grad);
282
 
283
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
284
- vec_t y_vec = output_vec[vec_offset + vidx];
285
  vec_t dy_vec = output_grad_vec[vec_offset + vidx];
286
  vec_t w_vec = weight_vec[vidx];
287
 
288
  vec_t in_grad_vec;
289
  #pragma unroll
290
  for (int i = 0; i < width; ++i) {
291
- acc_t y = y_vec.data[i];
292
  acc_t dy = dy_vec.data[i];
293
  acc_t w = w_vec.data[i];
294
- // x = y / (w * scale), so: scale*dy*w - dxx*x = scale*dy*w -
295
- // dxx*y/(w*scale)
296
- in_grad_vec.data[i] = scale * dy * w - dxx * y / (w * scale);
297
  }
298
  input_grad_vec[vec_offset + vidx] = in_grad_vec;
299
  }
@@ -302,8 +296,7 @@ __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_input_grad(
302
  // ---------------------------------------------------------------------------
303
  // Backward fused (dim ≤ 8192): multi-token block, input_grad + weight_grad
304
  // Architecture: 32 tokens/block, 128 threads, dynamic shared mem [d] fp32
305
- // Uses saved input (not output) input-based math:
306
- // d_sum = sum(dy * x * w), input_grad = scale*dy*w - dxx*x
307
  // 2-token batched float2 reduction (syncthreads halved)
308
  // Weight grad accumulated in shared memory → atomicAdd to global
309
  // Single kernel: no separate weight_grad kernel needed
@@ -313,13 +306,13 @@ __global__ void rms_norm_bwd_fused(
313
  scalar_t *__restrict__ input_grad,
314
  acc_t *__restrict__ weight_grad_acc, // [d] — atomicAdd target
315
  const scalar_t *__restrict__ output_grad,
316
- const scalar_t *__restrict__ output, // forward output y
317
- const scalar_t *__restrict__ weight, const acc_t *__restrict__ inv_rms,
318
- const int d, const int64_t num_tokens, const int tpb) {
319
 
320
  using vec_t = type_vec_t<scalar_t, width>;
321
 
322
- extern __shared__ acc_t wg_shared[]; // [d] float32 — accumulates sum(dy*y)
323
 
324
  const int vec_d = d / width;
325
 
@@ -328,8 +321,7 @@ __global__ void rms_norm_bwd_fused(
328
  wg_shared[idx] = 0.0f;
329
  __syncthreads();
330
 
331
- const vec_t *__restrict__ output_vec =
332
- reinterpret_cast<const vec_t *>(output);
333
  const vec_t *__restrict__ output_grad_vec =
334
  reinterpret_cast<const vec_t *>(output_grad);
335
  const vec_t *__restrict__ weight_vec =
@@ -345,50 +337,52 @@ __global__ void rms_norm_bwd_fused(
345
  acc_t s0 = inv_rms[t], s1 = inv_rms[t + 1];
346
  int64_t off0 = t * vec_d, off1 = (t + 1) * vec_d;
347
 
348
- // Pass 1: dy_y_sum = sum(dy * y) no weight needed for d_sum!
349
- acc_t dys0 = 0.0f, dys1 = 0.0f;
350
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
351
- vec_t y0 = output_vec[off0 + vidx];
352
  vec_t dy0 = output_grad_vec[off0 + vidx];
353
- vec_t y1 = output_vec[off1 + vidx];
354
  vec_t dy1 = output_grad_vec[off1 + vidx];
 
355
  #pragma unroll
356
  for (int i = 0; i < width; ++i) {
357
- dys0 +=
358
- static_cast<acc_t>(dy0.data[i]) * static_cast<acc_t>(y0.data[i]);
359
- dys1 +=
360
- static_cast<acc_t>(dy1.data[i]) * static_cast<acc_t>(y1.data[i]);
 
361
  }
362
  }
363
 
364
- float2 sums = block_reduce_sum2(make_float2(dys0, dys1));
365
 
366
- // dxx = dy_y_sum * scale^2 / d
367
  __shared__ acc_t sd0, sd1;
368
  if (threadIdx.x == 0) {
369
- sd0 = sums.x * s0 * s0 / d;
370
- sd1 = sums.y * s1 * s1 / d;
371
  }
372
  __syncthreads();
373
  acc_t dxx0 = sd0, dxx1 = sd1;
374
 
375
- // Pass 2: input_grad + wg_shared accumulate (L1 hit on y, dy)
376
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
377
- vec_t y0 = output_vec[off0 + vidx];
378
  vec_t dy0 = output_grad_vec[off0 + vidx];
379
- vec_t y1 = output_vec[off1 + vidx];
380
  vec_t dy1 = output_grad_vec[off1 + vidx];
381
  vec_t w = weight_vec[vidx];
382
 
383
  vec_t g0, g1;
384
  #pragma unroll
385
  for (int i = 0; i < width; ++i) {
386
- acc_t yi0 = y0.data[i], di0 = dy0.data[i], wi = w.data[i];
387
- acc_t yi1 = y1.data[i], di1 = dy1.data[i];
388
- g0.data[i] = s0 * di0 * wi - dxx0 * yi0 / (wi * s0);
389
- g1.data[i] = s1 * di1 * wi - dxx1 * yi1 / (wi * s1);
390
- // wg_shared accumulates sum(dy * y), divide by w at the end
391
- wg_shared[vidx * width + i] += di0 * yi0 + di1 * yi1;
392
  }
393
  input_grad_vec[off0 + vidx] = g0;
394
  input_grad_vec[off1 + vidx] = g1;
@@ -399,44 +393,44 @@ __global__ void rms_norm_bwd_fused(
399
  if (t < token_end) {
400
  acc_t scale = inv_rms[t];
401
  int64_t vec_offset = t * vec_d;
402
- acc_t dy_y_sum = 0.0f;
403
 
404
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
405
- vec_t y_vec = output_vec[vec_offset + vidx];
406
  vec_t dy_vec = output_grad_vec[vec_offset + vidx];
 
407
  #pragma unroll
408
  for (int i = 0; i < width; ++i)
409
- dy_y_sum += static_cast<acc_t>(dy_vec.data[i]) *
410
- static_cast<acc_t>(y_vec.data[i]);
 
411
  }
412
- dy_y_sum = block_reduce_sum(dy_y_sum);
413
 
414
  __shared__ acc_t s_dxx;
415
  if (threadIdx.x == 0)
416
- s_dxx = dy_y_sum * scale * scale / d;
417
  __syncthreads();
418
  acc_t dxx = s_dxx;
419
 
420
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
421
- vec_t y_vec = output_vec[vec_offset + vidx];
422
  vec_t dy_vec = output_grad_vec[vec_offset + vidx];
423
- vec_t w_vec = weight_vec[vidx];
424
  vec_t gv;
425
  #pragma unroll
426
  for (int i = 0; i < width; ++i) {
427
- acc_t y = y_vec.data[i], dy = dy_vec.data[i], w = w_vec.data[i];
428
- gv.data[i] = scale * dy * w - dxx * y / (w * scale);
429
- wg_shared[vidx * width + i] += dy * y;
 
430
  }
431
  input_grad_vec[vec_offset + vidx] = gv;
432
  }
433
  }
434
 
435
- // AtomicAdd accumulated weight grad: wg_shared has sum(dy*y), divide by w
436
- const scalar_t *__restrict__ w_ptr = weight;
437
  for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
438
- acc_t w = static_cast<acc_t>(w_ptr[idx]);
439
- atomicAdd(&weight_grad_acc[idx], wg_shared[idx] / w);
440
  }
441
  }
442
 
@@ -444,7 +438,7 @@ __global__ void rms_norm_bwd_fused(
444
  // Backward weight_grad (dim > 8192): column-parallel with vec8 loads
445
  // Architecture: 2D grid (col_blocks × token_chunks), 256 threads
446
  // Each thread handles 8 columns (vec8), iterates over chunk_size tokens
447
- // Output-based: weight_grad_i = sum_t(dy*y) / w_i — no inv_rms needed
448
  // Writes partial_wg[chunk, d], host reduces with at::sum_out
449
  // __launch_bounds__(256, 4)
450
  // ---------------------------------------------------------------------------
@@ -452,11 +446,11 @@ template <typename scalar_t, typename acc_t, int TILE_T, int VEC_W>
452
  __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_weight_grad(
453
  acc_t *__restrict__ partial_wg, // [num_chunks, d]
454
  const scalar_t *__restrict__ output_grad, // [num_tokens, d]
455
- const scalar_t *__restrict__ output, // [num_tokens, d] — forward output y
456
- const scalar_t *__restrict__ weight, // [d]
 
457
  const int d, const int64_t num_tokens, const int64_t chunk_size) {
458
- // weight_grad_i = (1/w_i) * sum_t(dy * y)
459
- // No inv_rms needed! Just dy, y, and w.
460
 
461
  using vec_t = type_vec_t<scalar_t, VEC_W>;
462
 
@@ -465,19 +459,15 @@ __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_weight_grad(
465
  const int64_t token_start = blockIdx.y * chunk_size;
466
  const int64_t token_end = min(token_start + chunk_size, num_tokens);
467
 
468
- const vec_t *__restrict__ output_vec =
469
- reinterpret_cast<const vec_t *>(output);
470
  const vec_t *__restrict__ grad_vec =
471
  reinterpret_cast<const vec_t *>(output_grad);
472
- const vec_t *__restrict__ weight_vec =
473
- reinterpret_cast<const vec_t *>(weight);
474
  const int vec_d = d / VEC_W;
475
 
476
- // Accumulate sum_t(dy * y) in registers
477
- acc_t dy_y_acc[VEC_W];
478
  #pragma unroll
479
  for (int i = 0; i < VEC_W; i++)
480
- dy_y_acc[i] = 0.0f;
481
 
482
  int vec_col = blockIdx.x * blockDim.x + threadIdx.x;
483
 
@@ -487,23 +477,22 @@ __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_weight_grad(
487
  if (vec_col < vec_d) {
488
  for (int r = 0; r < tile_size; r++) {
489
  int64_t t = t_base + r;
490
- vec_t y_v = output_vec[t * vec_d + vec_col];
 
491
  vec_t dy_v = grad_vec[t * vec_d + vec_col];
492
  #pragma unroll
493
  for (int i = 0; i < VEC_W; i++) {
494
- dy_y_acc[i] += static_cast<acc_t>(dy_v.data[i]) *
495
- static_cast<acc_t>(y_v.data[i]);
496
  }
497
  }
498
  }
499
  }
500
 
501
- // Write results: weight_grad_i = dy_y_acc_i / w_i
502
  if (vec_col < vec_d) {
503
- vec_t w_v = weight_vec[vec_col];
504
  for (int i = 0; i < VEC_W; i++) {
505
- acc_t w = static_cast<acc_t>(w_v.data[i]);
506
- partial_wg[blockIdx.y * d + col_base + i] = dy_y_acc[i] / w;
507
  }
508
  }
509
  }
@@ -665,24 +654,22 @@ rms_norm_backward(const torch::Tensor &output_grad, // [..., d]
665
  block_size = ((block_size + 31) / 32) * 32;
666
  block_size = std::max(block_size, 32);
667
  size_t smem = d * sizeof(float);
668
- // For dim <= 8192: 'output' arg is actually input (saved by Python
669
- // autograd)
670
  MOTIF_DISPATCH_FLOATING_TYPES(
671
  output.scalar_type(), "rms_norm_bwd_fused", [&] {
672
  motif::rms_norm_bwd_fused<scalar_t, float, 8>
673
  <<<num_blocks_mt, block_size, smem, stream>>>(
674
  input_grad.data_ptr<scalar_t>(), wg_acc.data_ptr<float>(),
675
  output_grad.data_ptr<scalar_t>(),
676
- output.data_ptr<scalar_t>(), // actually input for dim<=8192
677
- weight.data_ptr<scalar_t>(), inv_rms.data_ptr<float>(), d,
678
- num_tokens, tpb);
679
  });
680
 
681
  if (weight_grad.defined()) {
682
  weight_grad.copy_(wg_acc);
683
  }
684
  } else {
685
- // Large dims (d > 8192): output-based bwd + column-parallel weight grad
686
  MOTIF_DISPATCH_FLOATING_TYPES(
687
  output.scalar_type(), "rms_norm_bwd_large_input_grad", [&] {
688
  motif::rms_norm_bwd_large_input_grad<scalar_t, float, 8>
@@ -712,7 +699,7 @@ rms_norm_backward(const torch::Tensor &output_grad, // [..., d]
712
  partial_wg.data_ptr<float>(),
713
  output_grad.data_ptr<scalar_t>(),
714
  output.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
715
- d, num_tokens, chunk_size);
716
  });
717
 
718
  torch::Tensor acc =
 
223
  // Backward: single-pass with register caching (NVECS vecs per thread)
224
  // ---------------------------------------------------------------------------
225
  // ---------------------------------------------------------------------------
226
+ // Backward input_grad (dim > 8192): input-based, 2-pass
227
  // Architecture: 1 block/token, 256 threads, __launch_bounds__(256, 4)
228
+ // Pass 1: read dy + x + w → d_sum = sum(dy * x * w)
229
+ // Pass 2: read dy + x + w (L1 hit) → input_grad = scale*dy*w - dxx*x
 
 
 
230
  // Weight grad computed by separate bwd_large_weight_grad kernel
231
  // ---------------------------------------------------------------------------
232
  template <typename scalar_t, typename acc_t, int width>
233
  __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_input_grad(
234
  scalar_t *__restrict__ input_grad,
235
  const scalar_t *__restrict__ output_grad, // dy
236
+ const scalar_t *__restrict__ input, // x
237
  const scalar_t *__restrict__ weight, const acc_t *__restrict__ inv_rms,
238
  const int d) {
239
  using vec_t = type_vec_t<scalar_t, width>;
 
242
  const int vec_d = d / width;
243
  const int64_t vec_offset = token_idx * vec_d;
244
 
245
+ const vec_t *__restrict__ input_vec = reinterpret_cast<const vec_t *>(input);
 
246
  const vec_t *__restrict__ output_grad_vec =
247
  reinterpret_cast<const vec_t *>(output_grad);
248
  const vec_t *__restrict__ weight_vec =
 
250
 
251
  acc_t scale = inv_rms[token_idx];
252
 
253
+ // Pass 1: d_sum = sum(dy * x * w)
254
+ acc_t d_sum = 0.0f;
255
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
256
+ vec_t x_vec = input_vec[vec_offset + vidx];
257
  vec_t dy_vec = output_grad_vec[vec_offset + vidx];
258
+ vec_t w_vec = weight_vec[vidx];
259
  #pragma unroll
260
  for (int i = 0; i < width; ++i) {
261
+ d_sum += static_cast<acc_t>(dy_vec.data[i]) *
262
+ static_cast<acc_t>(x_vec.data[i]) *
263
+ static_cast<acc_t>(w_vec.data[i]);
264
  }
265
  }
266
 
267
+ d_sum = block_reduce_sum(d_sum);
268
 
 
 
269
  __shared__ acc_t s_dxx;
270
  if (threadIdx.x == 0) {
271
+ s_dxx = d_sum * scale * scale * scale / d;
272
  }
273
  __syncthreads();
274
  acc_t dxx = s_dxx;
275
 
276
+ // Pass 2: input_grad = scale * dy * w - dxx * x
277
  vec_t *__restrict__ input_grad_vec = reinterpret_cast<vec_t *>(input_grad);
278
 
279
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
280
+ vec_t x_vec = input_vec[vec_offset + vidx];
281
  vec_t dy_vec = output_grad_vec[vec_offset + vidx];
282
  vec_t w_vec = weight_vec[vidx];
283
 
284
  vec_t in_grad_vec;
285
  #pragma unroll
286
  for (int i = 0; i < width; ++i) {
287
+ acc_t x = x_vec.data[i];
288
  acc_t dy = dy_vec.data[i];
289
  acc_t w = w_vec.data[i];
290
+ in_grad_vec.data[i] = scale * dy * w - dxx * x;
 
 
291
  }
292
  input_grad_vec[vec_offset + vidx] = in_grad_vec;
293
  }
 
296
  // ---------------------------------------------------------------------------
297
  // Backward fused (dim ≤ 8192): multi-token block, input_grad + weight_grad
298
  // Architecture: 32 tokens/block, 128 threads, dynamic shared mem [d] fp32
299
+ // Input-based math: d_sum = sum(dy * x * w), input_grad = scale*dy*w - dxx*x
 
300
  // 2-token batched float2 reduction (syncthreads halved)
301
  // Weight grad accumulated in shared memory → atomicAdd to global
302
  // Single kernel: no separate weight_grad kernel needed
 
306
  scalar_t *__restrict__ input_grad,
307
  acc_t *__restrict__ weight_grad_acc, // [d] — atomicAdd target
308
  const scalar_t *__restrict__ output_grad,
309
+ const scalar_t *__restrict__ input, const scalar_t *__restrict__ weight,
310
+ const acc_t *__restrict__ inv_rms, const int d, const int64_t num_tokens,
311
+ const int tpb) {
312
 
313
  using vec_t = type_vec_t<scalar_t, width>;
314
 
315
+ extern __shared__ acc_t wg_shared[]; // [d] float32 — accumulates weight grad
316
 
317
  const int vec_d = d / width;
318
 
 
321
  wg_shared[idx] = 0.0f;
322
  __syncthreads();
323
 
324
+ const vec_t *__restrict__ input_vec = reinterpret_cast<const vec_t *>(input);
 
325
  const vec_t *__restrict__ output_grad_vec =
326
  reinterpret_cast<const vec_t *>(output_grad);
327
  const vec_t *__restrict__ weight_vec =
 
337
  acc_t s0 = inv_rms[t], s1 = inv_rms[t + 1];
338
  int64_t off0 = t * vec_d, off1 = (t + 1) * vec_d;
339
 
340
+ // Pass 1: d_sum = sum(dy * x * w)
341
+ acc_t dsum0 = 0.0f, dsum1 = 0.0f;
342
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
343
+ vec_t x0 = input_vec[off0 + vidx];
344
  vec_t dy0 = output_grad_vec[off0 + vidx];
345
+ vec_t x1 = input_vec[off1 + vidx];
346
  vec_t dy1 = output_grad_vec[off1 + vidx];
347
+ vec_t w = weight_vec[vidx];
348
  #pragma unroll
349
  for (int i = 0; i < width; ++i) {
350
+ acc_t wi = w.data[i];
351
+ dsum0 += static_cast<acc_t>(dy0.data[i]) *
352
+ static_cast<acc_t>(x0.data[i]) * wi;
353
+ dsum1 += static_cast<acc_t>(dy1.data[i]) *
354
+ static_cast<acc_t>(x1.data[i]) * wi;
355
  }
356
  }
357
 
358
+ float2 sums = block_reduce_sum2(make_float2(dsum0, dsum1));
359
 
360
+ // dxx = d_sum * scale^3 / d
361
  __shared__ acc_t sd0, sd1;
362
  if (threadIdx.x == 0) {
363
+ sd0 = sums.x * s0 * s0 * s0 / d;
364
+ sd1 = sums.y * s1 * s1 * s1 / d;
365
  }
366
  __syncthreads();
367
  acc_t dxx0 = sd0, dxx1 = sd1;
368
 
369
+ // Pass 2: input_grad + wg_shared accumulate (L1 hit on x, dy)
370
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
371
+ vec_t x0 = input_vec[off0 + vidx];
372
  vec_t dy0 = output_grad_vec[off0 + vidx];
373
+ vec_t x1 = input_vec[off1 + vidx];
374
  vec_t dy1 = output_grad_vec[off1 + vidx];
375
  vec_t w = weight_vec[vidx];
376
 
377
  vec_t g0, g1;
378
  #pragma unroll
379
  for (int i = 0; i < width; ++i) {
380
+ acc_t xi0 = x0.data[i], di0 = dy0.data[i], wi = w.data[i];
381
+ acc_t xi1 = x1.data[i], di1 = dy1.data[i];
382
+ g0.data[i] = s0 * di0 * wi - dxx0 * xi0;
383
+ g1.data[i] = s1 * di1 * wi - dxx1 * xi1;
384
+ // weight_grad = dy * x * scale
385
+ wg_shared[vidx * width + i] += di0 * xi0 * s0 + di1 * xi1 * s1;
386
  }
387
  input_grad_vec[off0 + vidx] = g0;
388
  input_grad_vec[off1 + vidx] = g1;
 
393
  if (t < token_end) {
394
  acc_t scale = inv_rms[t];
395
  int64_t vec_offset = t * vec_d;
396
+ acc_t d_sum = 0.0f;
397
 
398
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
399
+ vec_t x_vec = input_vec[vec_offset + vidx];
400
  vec_t dy_vec = output_grad_vec[vec_offset + vidx];
401
+ vec_t w_vec = weight_vec[vidx];
402
  #pragma unroll
403
  for (int i = 0; i < width; ++i)
404
+ d_sum += static_cast<acc_t>(dy_vec.data[i]) *
405
+ static_cast<acc_t>(x_vec.data[i]) *
406
+ static_cast<acc_t>(w_vec.data[i]);
407
  }
408
+ d_sum = block_reduce_sum(d_sum);
409
 
410
  __shared__ acc_t s_dxx;
411
  if (threadIdx.x == 0)
412
+ s_dxx = d_sum * scale * scale * scale / d;
413
  __syncthreads();
414
  acc_t dxx = s_dxx;
415
 
416
  for (int64_t vidx = threadIdx.x; vidx < vec_d; vidx += blockDim.x) {
417
+ vec_t x_vec = input_vec[vec_offset + vidx];
418
  vec_t dy_vec = output_grad_vec[vec_offset + vidx];
 
419
  vec_t gv;
420
  #pragma unroll
421
  for (int i = 0; i < width; ++i) {
422
+ acc_t x = x_vec.data[i], dy = dy_vec.data[i];
423
+ gv.data[i] =
424
+ scale * dy * static_cast<acc_t>(weight_vec[vidx].data[i]) - dxx * x;
425
+ wg_shared[vidx * width + i] += dy * x * scale;
426
  }
427
  input_grad_vec[vec_offset + vidx] = gv;
428
  }
429
  }
430
 
431
+ // AtomicAdd accumulated weight grad
 
432
  for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
433
+ atomicAdd(&weight_grad_acc[idx], wg_shared[idx]);
 
434
  }
435
  }
436
 
 
438
  // Backward weight_grad (dim > 8192): column-parallel with vec8 loads
439
  // Architecture: 2D grid (col_blocks × token_chunks), 256 threads
440
  // Each thread handles 8 columns (vec8), iterates over chunk_size tokens
441
+ // Input-based: weight_grad_i = sum_t(dy * x * scale)
442
  // Writes partial_wg[chunk, d], host reduces with at::sum_out
443
  // __launch_bounds__(256, 4)
444
  // ---------------------------------------------------------------------------
 
446
  __global__ __launch_bounds__(256, 4) void rms_norm_bwd_large_weight_grad(
447
  acc_t *__restrict__ partial_wg, // [num_chunks, d]
448
  const scalar_t *__restrict__ output_grad, // [num_tokens, d]
449
+ const scalar_t *__restrict__ input, // [num_tokens, d]
450
+ const scalar_t *__restrict__ weight, // [d]
451
+ const acc_t *__restrict__ inv_rms, // [num_tokens]
452
  const int d, const int64_t num_tokens, const int64_t chunk_size) {
453
+ // weight_grad_i = sum_t(dy_t * x_t * scale_t)
 
454
 
455
  using vec_t = type_vec_t<scalar_t, VEC_W>;
456
 
 
459
  const int64_t token_start = blockIdx.y * chunk_size;
460
  const int64_t token_end = min(token_start + chunk_size, num_tokens);
461
 
462
+ const vec_t *__restrict__ input_vec = reinterpret_cast<const vec_t *>(input);
 
463
  const vec_t *__restrict__ grad_vec =
464
  reinterpret_cast<const vec_t *>(output_grad);
 
 
465
  const int vec_d = d / VEC_W;
466
 
467
+ acc_t wg_acc[VEC_W];
 
468
  #pragma unroll
469
  for (int i = 0; i < VEC_W; i++)
470
+ wg_acc[i] = 0.0f;
471
 
472
  int vec_col = blockIdx.x * blockDim.x + threadIdx.x;
473
 
 
477
  if (vec_col < vec_d) {
478
  for (int r = 0; r < tile_size; r++) {
479
  int64_t t = t_base + r;
480
+ acc_t scale = inv_rms[t];
481
+ vec_t x_v = input_vec[t * vec_d + vec_col];
482
  vec_t dy_v = grad_vec[t * vec_d + vec_col];
483
  #pragma unroll
484
  for (int i = 0; i < VEC_W; i++) {
485
+ wg_acc[i] += static_cast<acc_t>(dy_v.data[i]) *
486
+ static_cast<acc_t>(x_v.data[i]) * scale;
487
  }
488
  }
489
  }
490
  }
491
 
492
+ // Write results directly
493
  if (vec_col < vec_d) {
 
494
  for (int i = 0; i < VEC_W; i++) {
495
+ partial_wg[blockIdx.y * d + col_base + i] = wg_acc[i];
 
496
  }
497
  }
498
  }
 
654
  block_size = ((block_size + 31) / 32) * 32;
655
  block_size = std::max(block_size, 32);
656
  size_t smem = d * sizeof(float);
657
+ // 'output' C++ arg receives input (saved by Python autograd)
 
658
  MOTIF_DISPATCH_FLOATING_TYPES(
659
  output.scalar_type(), "rms_norm_bwd_fused", [&] {
660
  motif::rms_norm_bwd_fused<scalar_t, float, 8>
661
  <<<num_blocks_mt, block_size, smem, stream>>>(
662
  input_grad.data_ptr<scalar_t>(), wg_acc.data_ptr<float>(),
663
  output_grad.data_ptr<scalar_t>(),
664
+ output.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
665
+ inv_rms.data_ptr<float>(), d, num_tokens, tpb);
 
666
  });
667
 
668
  if (weight_grad.defined()) {
669
  weight_grad.copy_(wg_acc);
670
  }
671
  } else {
672
+ // Large dims (d > 8192): input-based bwd + column-parallel weight grad
673
  MOTIF_DISPATCH_FLOATING_TYPES(
674
  output.scalar_type(), "rms_norm_bwd_large_input_grad", [&] {
675
  motif::rms_norm_bwd_large_input_grad<scalar_t, float, 8>
 
699
  partial_wg.data_ptr<float>(),
700
  output_grad.data_ptr<scalar_t>(),
701
  output.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
702
+ inv_rms.data_ptr<float>(), d, num_tokens, chunk_size);
703
  });
704
 
705
  torch::Tensor acc =
tests/pytest.ini CHANGED
@@ -1,3 +1,4 @@
1
  [pytest]
2
  log_cli = true
3
  log_cli_level = INFO
 
 
1
  [pytest]
2
  log_cli = true
3
  log_cli_level = INFO
4
+ pythonpath = ../torch-ext
torch-ext/activation/rms_norm.py CHANGED
@@ -15,13 +15,7 @@ class RMSNormFunction(torch.autograd.Function):
15
  def setup_context(ctx, inputs, outputs):
16
  input, weight, eps = inputs
17
  output, inv_rms = outputs
18
- d = input.size(-1)
19
- # Large dims: save output (output-based backward, less memory traffic)
20
- # Small dims: save input (multitoken backward, avoids division overhead)
21
- if d > 8192:
22
- ctx.save_for_backward(output, weight, inv_rms)
23
- else:
24
- ctx.save_for_backward(input, weight, inv_rms)
25
  ctx.eps = eps
26
 
27
  @staticmethod
 
15
  def setup_context(ctx, inputs, outputs):
16
  input, weight, eps = inputs
17
  output, inv_rms = outputs
18
+ ctx.save_for_backward(input, weight, inv_rms)
 
 
 
 
 
 
19
  ctx.eps = eps
20
 
21
  @staticmethod