Kernels
wyldecat Claude Opus 4.6 (1M context) committed on
Commit
3f2678c
·
1 Parent(s): 60615a0

style: fix yapf/isort formatting for CI --all-files check

Browse files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

tests/test_fused_mul_grouped_poly_norm.py CHANGED
@@ -498,8 +498,14 @@ def test_fused_mul_grouped_poly_norm_hidden_clamp_backward(
498
  PADDING_SIZES = [64, 256]
499
 
500
 
501
- def _make_padded_inputs(num_valid_tokens, num_padding, hidden_dim, num_experts,
502
- dtype, device, seed=42, expert_offset=0):
 
 
 
 
 
 
503
  """Create inputs with extra padding rows (large values) beyond valid tokens."""
504
  torch.manual_seed(seed)
505
  probs = torch.ones(num_experts) / num_experts
@@ -530,7 +536,8 @@ def _make_padded_inputs(num_valid_tokens, num_padding, hidden_dim, num_experts,
530
  @pytest.mark.parametrize("num_experts", [8])
531
  @pytest.mark.parametrize("dtype", DTYPES)
532
  @pytest.mark.parametrize("device", CUDA_DEVICES)
533
- def test_padded_forward(num_tokens, num_padding, d, num_experts, dtype, device):
 
534
  """Forward with padded input: valid rows correct, padding rows zero."""
535
  torch.set_default_device(device)
536
  input_t, mul_t, weight, bias, offsets = _make_padded_inputs(
@@ -547,7 +554,9 @@ def test_padded_forward(num_tokens, num_padding, d, num_experts, dtype, device):
547
  assert out_ref.shape == (M, d)
548
 
549
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
550
- assert_close(out_cuda[:num_valid], out_ref[:num_valid], atol=atol,
 
 
551
  rtol=rtol)
552
  assert out_cuda[num_valid:].abs().max() == 0, \
553
  f"Padding rows not zero: max={out_cuda[num_valid:].abs().max().item()}"
@@ -569,8 +578,9 @@ def test_padded_backward(num_tokens, num_padding, d, num_experts, dtype,
569
  num_valid = int(offsets[-1].item())
570
 
571
  # Reference backward on valid-only rows
572
- _, ig_ref, mg_ref, wg_ref, bg_ref, _ = _run_ref(
573
- input_t[:num_valid], mul_t[:num_valid], weight, bias, offsets)
 
574
 
575
  # CUDA backward on full padded input
576
  _, ig_cuda, mg_cuda, wg_cuda, bg_cuda, _ = _run_cuda(
@@ -611,13 +621,23 @@ def test_padded_forward_scored(num_tokens, num_padding, d, num_experts, dtype,
611
  M = num_tokens + num_padding
612
  scores = _make_scores(M, device)
613
 
614
- out_ref = fused_mul_grouped_poly_norm_ref(input_t, mul_t, weight, bias,
615
- offsets, scores=scores)
616
- out_cuda = fused_mul_grouped_poly_norm(input_t, mul_t, weight, bias,
617
- offsets, scores=scores)
 
 
 
 
 
 
 
 
618
 
619
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
620
- assert_close(out_cuda[:num_valid], out_ref[:num_valid], atol=atol,
 
 
621
  rtol=rtol)
622
  assert out_cuda[num_valid:].abs().max() == 0, \
623
  "Padding rows not zero with scores"
@@ -642,12 +662,20 @@ def test_padded_backward_scored(num_tokens, num_padding, d, num_experts, dtype,
642
 
643
  # Reference on valid-only
644
  _, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
645
- input_t[:num_valid], mul_t[:num_valid], weight, bias, offsets,
 
 
 
 
646
  scores=scores[:num_valid])
647
 
648
  # CUDA on full padded
649
- _, ig_cuda, mg_cuda, wg_cuda, bg_cuda, sg_cuda = _run_cuda(
650
- input_t, mul_t, weight, bias, offsets, scores=scores)
 
 
 
 
651
 
652
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
653
  wg_atol = 5e-4 if dtype == torch.float32 else 5e-2
@@ -681,15 +709,25 @@ def test_padded_forward_scored_clamp(num_tokens, num_padding, d, num_experts,
681
  M = num_tokens + num_padding
682
  scores = _make_scores(M, device)
683
 
684
- out_ref = fused_mul_grouped_poly_norm_ref(input_t, mul_t, weight, bias,
685
- offsets, scores=scores,
 
 
 
 
686
  hidden_clamp=hidden_clamp)
687
- out_cuda = fused_mul_grouped_poly_norm(input_t, mul_t, weight, bias,
688
- offsets, scores=scores,
 
 
 
 
689
  hidden_clamp=hidden_clamp)
690
 
691
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
692
- assert_close(out_cuda[:num_valid], out_ref[:num_valid], atol=atol,
 
 
693
  rtol=rtol)
694
  assert out_cuda[num_valid:].abs().max() == 0, \
695
  "Padding rows not zero with scores+clamp"
@@ -715,12 +753,22 @@ def test_padded_backward_scored_clamp(num_tokens, num_padding, d, num_experts,
715
 
716
  # Reference on valid-only
717
  _, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
718
- input_t[:num_valid], mul_t[:num_valid], weight, bias, offsets,
719
- scores=scores[:num_valid], hidden_clamp=hidden_clamp)
 
 
 
 
 
720
 
721
  # CUDA on full padded
722
  _, ig_cuda, mg_cuda, wg_cuda, bg_cuda, sg_cuda = _run_cuda(
723
- input_t, mul_t, weight, bias, offsets, scores=scores,
 
 
 
 
 
724
  hidden_clamp=hidden_clamp)
725
 
726
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
 
498
  PADDING_SIZES = [64, 256]
499
 
500
 
501
+ def _make_padded_inputs(num_valid_tokens,
502
+ num_padding,
503
+ hidden_dim,
504
+ num_experts,
505
+ dtype,
506
+ device,
507
+ seed=42,
508
+ expert_offset=0):
509
  """Create inputs with extra padding rows (large values) beyond valid tokens."""
510
  torch.manual_seed(seed)
511
  probs = torch.ones(num_experts) / num_experts
 
536
  @pytest.mark.parametrize("num_experts", [8])
537
  @pytest.mark.parametrize("dtype", DTYPES)
538
  @pytest.mark.parametrize("device", CUDA_DEVICES)
539
+ def test_padded_forward(num_tokens, num_padding, d, num_experts, dtype,
540
+ device):
541
  """Forward with padded input: valid rows correct, padding rows zero."""
542
  torch.set_default_device(device)
543
  input_t, mul_t, weight, bias, offsets = _make_padded_inputs(
 
554
  assert out_ref.shape == (M, d)
555
 
556
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
557
+ assert_close(out_cuda[:num_valid],
558
+ out_ref[:num_valid],
559
+ atol=atol,
560
  rtol=rtol)
561
  assert out_cuda[num_valid:].abs().max() == 0, \
562
  f"Padding rows not zero: max={out_cuda[num_valid:].abs().max().item()}"
 
578
  num_valid = int(offsets[-1].item())
579
 
580
  # Reference backward on valid-only rows
581
+ _, ig_ref, mg_ref, wg_ref, bg_ref, _ = _run_ref(input_t[:num_valid],
582
+ mul_t[:num_valid], weight,
583
+ bias, offsets)
584
 
585
  # CUDA backward on full padded input
586
  _, ig_cuda, mg_cuda, wg_cuda, bg_cuda, _ = _run_cuda(
 
621
  M = num_tokens + num_padding
622
  scores = _make_scores(M, device)
623
 
624
+ out_ref = fused_mul_grouped_poly_norm_ref(input_t,
625
+ mul_t,
626
+ weight,
627
+ bias,
628
+ offsets,
629
+ scores=scores)
630
+ out_cuda = fused_mul_grouped_poly_norm(input_t,
631
+ mul_t,
632
+ weight,
633
+ bias,
634
+ offsets,
635
+ scores=scores)
636
 
637
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
638
+ assert_close(out_cuda[:num_valid],
639
+ out_ref[:num_valid],
640
+ atol=atol,
641
  rtol=rtol)
642
  assert out_cuda[num_valid:].abs().max() == 0, \
643
  "Padding rows not zero with scores"
 
662
 
663
  # Reference on valid-only
664
  _, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
665
+ input_t[:num_valid],
666
+ mul_t[:num_valid],
667
+ weight,
668
+ bias,
669
+ offsets,
670
  scores=scores[:num_valid])
671
 
672
  # CUDA on full padded
673
+ _, ig_cuda, mg_cuda, wg_cuda, bg_cuda, sg_cuda = _run_cuda(input_t,
674
+ mul_t,
675
+ weight,
676
+ bias,
677
+ offsets,
678
+ scores=scores)
679
 
680
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
681
  wg_atol = 5e-4 if dtype == torch.float32 else 5e-2
 
709
  M = num_tokens + num_padding
710
  scores = _make_scores(M, device)
711
 
712
+ out_ref = fused_mul_grouped_poly_norm_ref(input_t,
713
+ mul_t,
714
+ weight,
715
+ bias,
716
+ offsets,
717
+ scores=scores,
718
  hidden_clamp=hidden_clamp)
719
+ out_cuda = fused_mul_grouped_poly_norm(input_t,
720
+ mul_t,
721
+ weight,
722
+ bias,
723
+ offsets,
724
+ scores=scores,
725
  hidden_clamp=hidden_clamp)
726
 
727
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (1e-2, 1e-2)
728
+ assert_close(out_cuda[:num_valid],
729
+ out_ref[:num_valid],
730
+ atol=atol,
731
  rtol=rtol)
732
  assert out_cuda[num_valid:].abs().max() == 0, \
733
  "Padding rows not zero with scores+clamp"
 
753
 
754
  # Reference on valid-only
755
  _, ig_ref, mg_ref, wg_ref, bg_ref, sg_ref = _run_ref(
756
+ input_t[:num_valid],
757
+ mul_t[:num_valid],
758
+ weight,
759
+ bias,
760
+ offsets,
761
+ scores=scores[:num_valid],
762
+ hidden_clamp=hidden_clamp)
763
 
764
  # CUDA on full padded
765
  _, ig_cuda, mg_cuda, wg_cuda, bg_cuda, sg_cuda = _run_cuda(
766
+ input_t,
767
+ mul_t,
768
+ weight,
769
+ bias,
770
+ offsets,
771
+ scores=scores,
772
  hidden_clamp=hidden_clamp)
773
 
774
  atol, rtol = (1e-4, 1e-4) if dtype == torch.float32 else (5e-2, 5e-2)
torch-ext/activation/_ops.py CHANGED
@@ -1,14 +1,14 @@
1
  """Op loader — works with both kernel-builder (.abi3.so) and local setup.py builds."""
2
 
3
  import importlib
 
4
  import torch
5
 
6
  # Try kernel-builder build first (namespace like _activation_HASH)
7
  # Fall back to local setup.py build (_activation)
8
  _lib = None
9
- for _name in sorted(
10
- [m for m in dir() if m.startswith("_activation")], reverse=True
11
- ):
12
  _lib = importlib.import_module(f".{_name}", __package__)
13
  break
14
 
@@ -28,8 +28,7 @@ if _lib is None:
28
  ops = getattr(torch.ops, _mod_name)
29
  else:
30
  raise ImportError(
31
- "No activation extension found. Build with: pip install -e ."
32
- )
33
  else:
34
  ops = getattr(torch.ops, _lib.__name__.split(".")[-1])
35
 
 
1
  """Op loader — works with both kernel-builder (.abi3.so) and local setup.py builds."""
2
 
3
  import importlib
4
+
5
  import torch
6
 
7
  # Try kernel-builder build first (namespace like _activation_HASH)
8
  # Fall back to local setup.py build (_activation)
9
  _lib = None
10
+ for _name in sorted([m for m in dir() if m.startswith("_activation")],
11
+ reverse=True):
 
12
  _lib = importlib.import_module(f".{_name}", __package__)
13
  break
14
 
 
28
  ops = getattr(torch.ops, _mod_name)
29
  else:
30
  raise ImportError(
31
+ "No activation extension found. Build with: pip install -e .")
 
32
  else:
33
  ops = getattr(torch.ops, _lib.__name__.split(".")[-1])
34
 
torch-ext/activation/grouped_poly_norm.py CHANGED
@@ -146,8 +146,10 @@ def fused_mul_grouped_poly_norm_ref(
146
  if hidden_clamp is not None:
147
  result = result.clamp(-hidden_clamp, hidden_clamp)
148
  if input.shape[0] > num_valid:
149
- padding = torch.zeros(input.shape[0] - num_valid, input.shape[-1],
150
- dtype=orig_dtype, device=input.device)
 
 
151
  return torch.cat([result.to(orig_dtype), padding], dim=0)
152
  return result.to(orig_dtype)
153
 
 
146
  if hidden_clamp is not None:
147
  result = result.clamp(-hidden_clamp, hidden_clamp)
148
  if input.shape[0] > num_valid:
149
+ padding = torch.zeros(input.shape[0] - num_valid,
150
+ input.shape[-1],
151
+ dtype=orig_dtype,
152
+ device=input.device)
153
  return torch.cat([result.to(orig_dtype), padding], dim=0)
154
  return result.to(orig_dtype)
155